summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Robin Watts <robin.watts@artifex.com> 2012-11-19 16:42:21 +0000
committer Robin Watts <robin.watts@artifex.com> 2012-11-21 00:56:55 +0000
commit b0b0b7717c05587b11f58a20d9d63bcccaca43ca (patch)
tree e2eb6d48d57cc83cbb034a35d91c12ad15353c76
parent 06ab03422110773744d6028b2d128702d2fbcfce (diff)
download mupdf-b0b0b7717c05587b11f58a20d9d63bcccaca43ca.tar.xz
ARM optimised fast_cmyk_to_rgb code.
-rw-r--r-- fitz/res_colorspace.c 222
1 files changed, 207 insertions, 15 deletions
diff --git a/fitz/res_colorspace.c b/fitz/res_colorspace.c
index 50c0a8f7..826909a0 100644
--- a/fitz/res_colorspace.c
+++ b/fitz/res_colorspace.c
@@ -2,6 +2,28 @@
#define SLOWCMYK
+#ifdef NDK_PROFILER
+extern void __gnu_mcount_nc(void);
+#define ENTER_PG "push {lr}\nbl __gnu_mcount_nc\n"
+#else
+#define ENTER_PG
+#endif
+
+/* If we're compiling as thumb code, then we need to tell the compiler
+ * to enter and exit ARM mode around our assembly sections. If we move
+ * the ARM functions to a separate file and arrange for it to be compiled
+ * without thumb mode, we can save some time on entry.
+ *
+ * All three macros expand to assembler text (or to nothing) that is
+ * pasted into an asm() string by adjacent string literal concatenation:
+ *
+ * ENTER_PG (defined above): when NDK_PROFILER is defined, pushes lr and
+ *   does a branch-with-link to __gnu_mcount_nc; otherwise it is empty.
+ *   NOTE(review): presumably this is the profiler's function entry hook
+ *   and it is expected to pop the pushed lr itself - confirm.
+ *
+ * ENTER_ARM (ARCH_ARM and ARCH_THUMB): aligns to 4 bytes, copies pc into
+ *   r12 and does "bx r12", then switches the assembler to .arm at local
+ *   label 0. r12 is overwritten, so code using this macro must not expect
+ *   r12 to survive it. ENTER_PG is appended after the mode switch.
+ *
+ * ENTER_THUMB (ARCH_ARM and ARCH_THUMB): defines local label 9 and
+ *   switches the assembler back to .thumb. ENTER_PG is appended here too.
+ *
+ * With ARCH_ARM but without ARCH_THUMB, both ENTER_ARM and ENTER_THUMB are
+ * empty, which also means ENTER_PG is not emitted through them. Without
+ * ARCH_ARM, neither ENTER_ARM nor ENTER_THUMB is defined at all.
+ */
+#ifdef ARCH_ARM
+#ifdef ARCH_THUMB
+#define ENTER_ARM ".balign 4\nmov r12,pc\nbx r12\n0:.arm\n" ENTER_PG
+#define ENTER_THUMB "9:.thumb\n" ENTER_PG
+#else
+#define ENTER_ARM
+#define ENTER_THUMB
+#endif
+#endif
+
void
fz_free_colorspace_imp(fz_context *ctx, fz_storable *cs_)
{
@@ -316,26 +338,192 @@ static void fast_cmyk_to_gray(fz_pixmap *dst, fz_pixmap *src)
}
}
-#if 0
+#ifdef ARCH_ARM
+/* Convert n pixels from src to dst using hand written ARM assembly.
+ *
+ * dst: output, 4 bytes written per pixel (r, g, b, then the 5th source
+ *      byte copied through unchanged).
+ * src: input, 5 bytes read per pixel (c, m, y, k, then one extra byte;
+ *      NOTE(review): presumably alpha - confirm against the pixmap layout).
+ * n:   number of pixels. If n <= 0 the function returns without touching
+ *      either buffer.
+ *
+ * The function is declared naked, so the assembly does its own register
+ * save (r4-r11, lr) on entry and returns by popping that saved lr into pc.
+ *
+ * Per pixel there are three shortcuts before the full conversion:
+ *   - k == 255            -> r = g = b = 0
+ *   - c == m == y == k == 0 -> r = g = b = 255
+ *   - same c,m,y,k as the pixel decoded immediately before (kept in r12)
+ *     -> reuse the r,g,b still held in r8-r10.
+ * Otherwise the eight products of {c,1-c} x {m,1-m} x {y,1-y} are formed,
+ * each is split by k into an x0 and an x1 part, and the parts are summed
+ * with fixed per-channel weights.
+ */
+static void
+fast_cmyk_to_rgb_ARM(unsigned char *dst, unsigned char *src, int n)
+__attribute__((naked));
+
static void
fast_cmyk_to_rgb_ARM(unsigned char *dst, unsigned char *src, int n)
{
asm volatile(
ENTER_ARM
- "stmfd r13!,{r4-r12,r14} \n"
- "@ r0 = dst \n"
- "@ r1 = src \n"
- "@ r2 = weights \n"
- "mov r4, #0 @ r4 = CMYK = 0 \n"
- "mvn r5, #0xFF000000 @ r5 = RGB = FFFFFF \n"
- "1: \n"
- "ldr r3, [r1], #4 @ r3 = cmyk \n"
- "cmp r3, r4 @ if (cmyk == CMYK) \n"
- "beq match @ goto match \n"
- "cmp r3, #0 @ if (cmyk = 0000) \n"
- "beq black @
-
- "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
+ "stmfd r13!,{r4-r11,r14} \n"
+ "@ r0 = dst \n"
+ "@ r1 = src \n"
+ "@ r2 = n \n"
+ "cmp r2, #0 @ if (n <= 0) \n"
+ "ble 9f @ nothing to convert \n"
+ "mov r12, #0 @ r12= CMYK = 0 \n"
+ "b 2f @ enter loop \n"
+ "1: @ White or Black \n"
+ "@ Cunning trick: On entry r11 = 0 if black, r11 = FF if white \n"
+ "ldrb r7, [r1],#1 @ r7 = s[4] \n"
+ "strb r11,[r0],#1 @ d[0] = r \n"
+ "strb r11,[r0],#1 @ d[1] = g \n"
+ "strb r11,[r0],#1 @ d[2] = b \n"
+ "strb r7, [r0],#1 @ d[3] = s[4] \n"
+ "subs r2, r2, #1 @ r2 = n-- \n"
+ "beq 9f \n"
+ "2: @ Main loop starts here \n"
+ "ldrb r3, [r1], #4 @ r3 = c \n"
+ "ldrb r6, [r1, #-1] @ r6 = k \n"
+ "ldrb r5, [r1, #-2] @ r5 = y \n"
+ "ldrb r4, [r1, #-3] @ r4 = m \n"
+ "eors r11,r6, #0xFF @ if (k == 255) \n"
+ "beq 1b @ goto black \n"
+ "orr r7, r3, r4, LSL #8 \n"
+ "orr r14,r5, r6, LSL #8 \n"
+ "orrs r7, r7, r14,LSL #16 @ r7 = cmyk \n"
+ "beq 1b @ if (cmyk == 0) white \n"
+ "@ At this point, we have to decode a new pixel \n"
+ "@ r0 = dst r1 = src r2 = n r7 = cmyk \n"
+ "3: @ unmatched \n"
+ "stmfd r13!,{r0-r1,r7} @ stash regs for space \n"
+ "add r3, r3, r3, LSR #7 @ r3 = c += c>>7 \n"
+ "add r4, r4, r4, LSR #7 @ r4 = m += m>>7 \n"
+ "add r5, r5, r5, LSR #7 @ r5 = y += y>>7 \n"
+ "add r6, r6, r6, LSR #7 @ r6 = k += k>>7 \n"
+ "mov r5, r5, LSR #1 @ sacrifice 1 bit of Y \n"
+ "mul r8, r3, r4 @ r8 = cm = c * m \n"
+ "rsb r9, r8, r4, LSL #8 @ r9 = c1m = (m<<8) - cm \n"
+ "rsb r3, r8, r3, LSL #8 @ r3 = cm1 = (c<<8) - cm \n"
+ "rsb r4, r4, #0x100 @ r4 = 256-m \n"
+ "rsb r4, r3, r4, LSL #8 @ r4 = c1m1 =((256-m)<<8)-cm1 \n"
+ "mul r7, r4, r5 @ r7 = c1m1y = c1m1 * y \n"
+ "rsb r4, r7, r4, LSL #7 @ r4 = c1m1y1 = (c1m1<<7)-c1m1y \n"
+ "mul r10,r9, r5 @ r10= c1my = c1m * y \n"
+ "rsb r9, r10,r9, LSL #7 @ r9 = c1my1 = (c1m<<7) - c1my \n"
+ "mul r11,r3, r5 @ r11= cm1y = cm1 * y \n"
+ "rsb r3, r11,r3, LSL #7 @ r3 = cm1y1 = (cm1<<7) - cm1y \n"
+ "mul r5, r8, r5 @ r5 = cmy = cm * y \n"
+ "rsb r8, r5, r8, LSL #7 @ r8 = cmy1 = (cm<<7) - cmy \n"
+ "@ Register recap: \n"
+ "@ r3 = cm1y1 \n"
+ "@ r4 = c1m1y1 \n"
+ "@ r5 = cmy \n"
+ "@ r6 = k \n"
+ "@ r7 = c1m1y \n"
+ "@ r8 = cmy1 \n"
+ "@ r9 = c1my1 \n"
+ "@ r10= c1my \n"
+ "@ r11= cm1y \n"
+ "@ The actual matrix multiplication \n"
+ "mul r14,r4, r6 @ r14= x1 = c1m1y1 * k \n"
+ "rsb r4, r14,r4, LSL #8 @ r4 = x0 = (c1m1y1<<8) - x1 \n"
+ "add r4, r4, r14,LSR #8-5 @ r4 = b = x0 + 32*(x1>>8) \n"
+ "sub r1, r4, r14,LSR #8 @ r1 = g = x0 + 31*(x1>>8) \n"
+ "add r0, r1, r14,LSR #8-2 @ r0 = r = x0 + 35*(x1>>8) \n"
+ " \n"
+ "mul r14,r7, r6 @ r14= x1 = c1m1y * k \n"
+ "rsb r7, r14,r7, LSL #8 @ r7 = x0 = (c1m1y<<8) - x1 \n"
+ "add r0, r0, r7 @ r0 = r += x0 \n"
+ "add r1, r1, r7 @ r1 = g += (x0>>8 * 256) \n"
+ "sub r1, r1, r7, LSR #8-3 @ 248 \n"
+ "sub r1, r1, r7, LSR #8-2 @ 244 \n"
+ "sub r1, r1, r7, LSR #8 @ 243 \n"
+ "sub r7, r14,r14,LSR #3 @ r7 = 28*(x1>>5) \n"
+ "add r0, r0, r7, LSR #8-5 @ r0 = r += 28 * x1 \n"
+ "sub r7, r7, r14,LSR #4 @ r7 = 26*(x1>>5) \n"
+ "add r1, r1, r7, LSR #8-5 @ r1 = g += 26 * x1 \n"
+ " \n"
+ "mul r14,r9, r6 @ r14= x1 = c1my1 * k \n"
+ "sub r9, r9, r14,LSR #8 @ r9 = x0>>8 = c1my1 - (x1>>8) \n"
+ "add r0, r0, r14,LSR #8-5 @ r0 = r += (x1>>8)*32 \n"
+ "add r0, r0, r14,LSR #8-2 @ r0 = r += (x1>>8)*36 \n"
+ "mov r14,#237 @ r14= 237 \n"
+ "mla r0,r14,r9,r0 @ r0 = r += x0*237 \n"
+ "mov r14,#141 @ r14= 141 \n"
+ "mla r4,r14,r9,r4 @ r4 = b += x0*141 \n"
+ " \n"
+ "mul r14,r10,r6 @ r14= x1 = c1my * k \n"
+ "sub r10,r10,r14,LSR #8 @ r10= x0>>8 = c1my - (x1>>8) \n"
+ "add r0, r0, r14,LSR #8-5 @ r0 = r += 32 * x1 \n"
+ "add r0, r0, r14,LSR #8-1 @ r0 = r += 34 * x1 \n"
+ "mov r14,#238 @ r14= 238 \n"
+ "mla r0,r14,r10,r0 @ r0 = r += 238 * x0 \n"
+ "mov r14,#28 @ r14= 28 \n"
+ "mla r1,r14,r10,r1 @ r1 = g += 28 * x0 \n"
+ "mov r14,#36 @ r14= 36 \n"
+ "mla r4,r14,r10,r4 @ r4 = b += 36 * x0 \n"
+ " \n"
+ "mul r14,r3, r6 @ r14= x1 = cm1y1 * k \n"
+ "sub r3, r3, r14,LSR #8 @ r3 = x0>>8 = cm1y1 - (x1>>8) \n"
+ "add r1, r1, r14,LSR #8-4 @ r1 = g += 16*x1 \n"
+ "sub r1, r1, r14,LSR #8 @ 15*x1 \n"
+ "add r4, r4, r14,LSR #8-5 @ r4 = b += 32*x1 \n"
+ "add r4, r4, r14,LSR #8-2 @ 36*x1 \n"
+ "mov r14,#174 @ r14= 174 \n"
+ "mla r1, r14,r3, r1 @ r1 = g += 174 * x0 \n"
+ "mov r14,#240 @ r14= 240 \n"
+ "mla r4, r14,r3, r4 @ r4 = b += 240 * x0 \n"
+ " \n"
+ "mul r14,r11,r6 @ r14= x1 = cm1y * k \n"
+ "sub r11,r11,r14,LSR #8 @ r11= x0>>8 = cm1y - (x1>>8) \n"
+ "add r1, r1, r14,LSR #8-4 @ r1 = g += x1 * 16 \n"
+ "add r1, r1, r14,LSR #8 @ x1 * 17 \n"
+ "add r1, r1, r14,LSR #8-1 @ x1 * 19 \n"
+ "mov r14,#167 @ r14 = 167 \n"
+ "mla r1, r14,r11,r1 @ r1 = g += 167 * x0 \n"
+ "mov r14,#80 @ r14 = 80 \n"
+ "mla r4, r14,r11,r4 @ r4 = b += 80 * x0 \n"
+ " \n"
+ "mul r14,r8, r6 @ r14= x1 = cmy1 * k \n"
+ "sub r8, r8, r14,LSR #8 @ r8 = x0>>8 = cmy1 - (x1>>8) \n"
+ "add r4, r4, r14,LSR #8-1 @ r4 = b += x1 * 2 \n"
+ "mov r14,#46 @ r14=46 \n"
+ "mla r0, r14,r8, r0 @ r0 = r += 46 * x0 \n"
+ "mov r14,#49 @ r14=49 \n"
+ "mla r1, r14,r8, r1 @ r1 = g += 49 * x0 \n"
+ "mov r14,#147 @ r14=147 \n"
+ "mla r4, r14,r8, r4 @ r4 = b += 147 * x0 \n"
+ " \n"
+ "rsb r6, r6, #256 @ r6 = k = 256-k \n"
+ "mul r14,r5, r6 @ r14= x0 = cmy * (256-k) \n"
+ "mov r11,#54 @ r11= 54 \n"
+ "mov r14,r14,LSR #8 @ r14= (x0>>8) \n"
+ "mov r8,#57 @ r8 = 57 \n"
+ "mla r0,r14,r11,r0 @ r0 = r += 54*x0 \n"
+ "mla r1,r14,r11,r1 @ r1 = g += 54*x0 \n"
+ "mla r4,r14,r8, r4 @ r4 = b += 57*x0 \n"
+ " \n"
+ "sub r8, r0, r0, LSR #8 @ r8 = r -= (r>>8) \n"
+ "sub r9, r1, r1, LSR #8 @ r9 = g -= (g>>8) \n"
+ "sub r10,r4, r4, LSR #8 @ r10= b -= (b>>8) \n"
+ "ldmfd r13!,{r0-r1,r12} \n"
+ "mov r8, r8, LSR #23 @ r8 = r>>23 \n"
+ "mov r9, r9, LSR #23 @ r9 = g>>23 \n"
+ "mov r10,r10,LSR #23 @ r10= b>>23 \n"
+ "ldrb r14,[r1],#1 @ r14= s[4] \n"
+ "strb r8, [r0],#1 @ d[0] = r \n"
+ "strb r9, [r0],#1 @ d[1] = g \n"
+ "strb r10,[r0],#1 @ d[2] = b \n"
+ "strb r14,[r0],#1 @ d[3] = s[4] \n"
+ "subs r2, r2, #1 @ r2 = n-- \n"
+ "beq 9f \n"
+ "@ At this point, we've just decoded a pixel \n"
+ "@ r0 = dst r1 = src r2 = n r8 = r r9 = g r10= b r12= CMYK \n"
+ "4: \n"
+ "ldrb r3, [r1], #4 @ r3 = c \n"
+ "ldrb r6, [r1, #-1] @ r6 = k \n"
+ "ldrb r5, [r1, #-2] @ r5 = y \n"
+ "ldrb r4, [r1, #-3] @ r4 = m \n"
+ "eors r11,r6, #0xFF @ if (k == 255) \n"
+ "beq 1b @ goto black \n"
+ "orr r7, r3, r4, LSL #8 \n"
+ "orr r14,r5, r6, LSL #8 \n"
+ "orrs r7, r7, r14,LSL #16 @ r7 = cmyk \n"
+ "beq 1b @ if (cmyk == 0) white \n"
+ "cmp r7, r12 @ if (cmyk != CMYK) \n"
+ "bne 3b @ not the same, loop \n"
+ "@ If we get here, we just matched a pixel we have just decoded \n"
+ "ldrb r3, [r1],#1 @ r3 = s[4] \n"
+ "strb r8, [r0],#1 @ d[0] = r \n"
+ "strb r9, [r0],#1 @ d[1] = g \n"
+ "strb r10,[r0],#1 @ d[2] = b \n"
+ "strb r3, [r0],#1 @ d[3] = s[4] \n"
+ "subs r2, r2, #1 @ r2 = n-- \n"
+ "bne 4b \n"
+ "9: \n"
+ "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
ENTER_THUMB
);
}
@@ -346,6 +534,9 @@ static void fast_cmyk_to_rgb(fz_context *ctx, fz_pixmap *dst, fz_pixmap *src)
unsigned char *s = src->samples;
unsigned char *d = dst->samples;
int n = src->w * src->h;
+#ifdef ARCH_ARM
+ fast_cmyk_to_rgb_ARM(d, s, n);
+#else
unsigned int C,M,Y,K,r,g,b;
C = 0;
@@ -488,6 +679,7 @@ static void fast_cmyk_to_rgb(fz_context *ctx, fz_pixmap *dst, fz_pixmap *src)
s += 5;
d += 4;
}
+#endif
}
static void fast_cmyk_to_bgr(fz_context *ctx, fz_pixmap *dst, fz_pixmap *src)