diff options
author | Robin Watts <robin.watts@artifex.com> | 2012-11-19 16:42:21 +0000 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2012-11-21 00:56:55 +0000 |
commit | b0b0b7717c05587b11f58a20d9d63bcccaca43ca (patch) | |
tree | e2eb6d48d57cc83cbb034a35d91c12ad15353c76 | |
parent | 06ab03422110773744d6028b2d128702d2fbcfce (diff) | |
download | mupdf-b0b0b7717c05587b11f58a20d9d63bcccaca43ca.tar.xz |
ARM optimised fast_cmyk_to_rgb code.
-rw-r--r-- | fitz/res_colorspace.c | 222 |
1 file changed, 207 insertions, 15 deletions
diff --git a/fitz/res_colorspace.c b/fitz/res_colorspace.c index 50c0a8f7..826909a0 100644 --- a/fitz/res_colorspace.c +++ b/fitz/res_colorspace.c @@ -2,6 +2,28 @@ #define SLOWCMYK +#ifdef NDK_PROFILER +extern void __gnu_mcount_nc(void); +#define ENTER_PG "push {lr}\nbl __gnu_mcount_nc\n" +#else +#define ENTER_PG +#endif + +/* If we're compiling as thumb code, then we need to tell the compiler + * to enter and exit ARM mode around our assembly sections. If we move + * the ARM functions to a separate file and arrange for it to be compiled + * without thumb mode, we can save some time on entry. + */ +#ifdef ARCH_ARM +#ifdef ARCH_THUMB +#define ENTER_ARM ".balign 4\nmov r12,pc\nbx r12\n0:.arm\n" ENTER_PG +#define ENTER_THUMB "9:.thumb\n" ENTER_PG +#else +#define ENTER_ARM +#define ENTER_THUMB +#endif +#endif + void fz_free_colorspace_imp(fz_context *ctx, fz_storable *cs_) { @@ -316,26 +338,192 @@ static void fast_cmyk_to_gray(fz_pixmap *dst, fz_pixmap *src) } } -#if 0 +#ifdef ARCH_ARM +static void +fast_cmyk_to_rgb_ARM(unsigned char *dst, unsigned char *src, int n) +__attribute__((naked)); + static void fast_cmyk_to_rgb_ARM(unsigned char *dst, unsigned char *src, int n) { asm volatile( ENTER_ARM - "stmfd r13!,{r4-r12,r14} \n" - "@ r0 = dst \n" - "@ r1 = src \n" - "@ r2 = weights \n" - "mov r4, #0 @ r4 = CMYK = 0 \n" - "mvn r5, #0xFF000000 @ r5 = RGB = FFFFFF \n" - "1: \n" - "ldr r3, [r1], #4 @ r3 = cmyk \n" - "cmp r3, r4 @ if (cmyk == CMYK) \n" - "beq match @ goto match \n" - "cmp r3, #0 @ if (cmyk = 0000) \n" - "beq black @ - - "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n" + "stmfd r13!,{r4-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = n \n" + "mov r12, #0 @ r12= CMYK = 0 \n" + "b 2f @ enter loop \n" + "1: @ White or Black \n" + "@ Cunning trick: On entry r11 = 0 if black, r11 = FF if white \n" + "ldrb r7, [r1],#1 @ r7 = s[4] \n" + "strb r11,[r0],#1 @ d[0] = r \n" + "strb r11,[r0],#1 @ d[1] = g \n" + "strb r11,[r0],#1 @ d[2] = b \n" + "strb r7, 
[r0],#1 @ d[3] = s[4] \n" + "subs r2, r2, #1 @ r2 = n-- \n" + "beq 9f \n" + "2: @ Main loop starts here \n" + "ldrb r3, [r1], #4 @ r3 = c \n" + "ldrb r6, [r1, #-1] @ r6 = k \n" + "ldrb r5, [r1, #-2] @ r5 = y \n" + "ldrb r4, [r1, #-3] @ r4 = m \n" + "eors r11,r6, #0xFF @ if (k == 255) \n" + "beq 1b @ goto black \n" + "orr r7, r3, r4, LSL #8 \n" + "orr r14,r5, r6, LSL #8 \n" + "orrs r7, r7, r14,LSL #16 @ r7 = cmyk \n" + "beq 1b @ if (cmyk == 0) white \n" + "@ At this point, we have to decode a new pixel \n" + "@ r0 = dst r1 = src r2 = n r7 = cmyk \n" + "3: @ unmatched \n" + "stmfd r13!,{r0-r1,r7} @ stash regs for space \n" + "add r3, r3, r3, LSR #7 @ r3 = c += c>>7 \n" + "add r4, r4, r4, LSR #7 @ r4 = m += m>>7 \n" + "add r5, r5, r5, LSR #7 @ r5 = y += y>>7 \n" + "add r6, r6, r6, LSR #7 @ r6 = k += k>>7 \n" + "mov r5, r5, LSR #1 @ sacrifice 1 bit of Y \n" + "mul r8, r3, r4 @ r8 = cm = c * m \n" + "rsb r9, r8, r4, LSL #8 @ r9 = c1m = (m<<8) - cm \n" + "rsb r3, r8, r3, LSL #8 @ r3 = cm1 = (c<<8) - cm \n" + "rsb r4, r4, #0x100 @ r4 = 256-m \n" + "rsb r4, r3, r4, LSL #8 @ r4 = c1m1 =((256-m)<<8)-cm1 \n" + "mul r7, r4, r5 @ r7 = c1m1y = c1m1 * y \n" + "rsb r4, r7, r4, LSL #7 @ r4 = c1m1y1 = (c1m1<<7)-c1m1y \n" + "mul r10,r9, r5 @ r10= c1my = c1m * y \n" + "rsb r9, r10,r9, LSL #7 @ r9 = c1my1 = (c1m<<7) - c1my \n" + "mul r11,r3, r5 @ r11= cm1y = cm1 * y \n" + "rsb r3, r11,r3, LSL #7 @ r3 = cm1y1 = (cm1<<7) - cm1y \n" + "mul r5, r8, r5 @ r5 = cmy = cm * y \n" + "rsb r8, r5, r8, LSL #7 @ r8 = cmy1 = (cm<<7) - cmy \n" + "@ Register recap: \n" + "@ r3 = cm1y1 \n" + "@ r4 = c1m1y1 \n" + "@ r5 = cmy \n" + "@ r6 = k \n" + "@ r7 = c1m1y \n" + "@ r8 = cmy1 \n" + "@ r9 = c1my1 \n" + "@ r10= c1my \n" + "@ r11= cm1y \n" + "@ The actual matrix multiplication \n" + "mul r14,r4, r6 @ r14= x1 = c1m1y1 * k \n" + "rsb r4, r14,r4, LSL #8 @ r4 = x0 = (c1m1y1<<8) - x1 \n" + "add r4, r4, r14,LSR #8-5 @ r4 = b = x0 + 32*(x1>>8) \n" + "sub r1, r4, r14,LSR #8 @ r1 = g = x0 + 31*(x1>>8) \n" + "add 
r0, r1, r14,LSR #8-2 @ r0 = r = x0 + 35*(x1>>8) \n" + " \n" + "mul r14,r7, r6 @ r14= x1 = c1m1y * k \n" + "rsb r7, r14,r7, LSL #8 @ r7 = x0 = (c1m1y<<8) - x1 \n" + "add r0, r0, r7 @ r0 = r += x0 \n" + "add r1, r1, r7 @ r1 = g += (x0>>8 * 256) \n" + "sub r1, r1, r7, LSR #8-3 @ 248 \n" + "sub r1, r1, r7, LSR #8-2 @ 244 \n" + "sub r1, r1, r7, LSR #8 @ 243 \n" + "sub r7, r14,r14,LSR #3 @ r7 = 28*(x1>>5) \n" + "add r0, r0, r7, LSR #8-5 @ r0 = r += 28 * x1 \n" + "sub r7, r7, r14,LSR #4 @ r7 = 26*(x1>>5) \n" + "add r1, r1, r7, LSR #8-5 @ r1 = g += 26 * x1 \n" + " \n" + "mul r14,r9, r6 @ r14= x1 = c1my1 * k \n" + "sub r9, r9, r14,LSR #8 @ r9 = x0>>8 = c1my1 - (x1>>8) \n" + "add r0, r0, r14,LSR #8-5 @ r0 = r += (x1>>8)*32 \n" + "add r0, r0, r14,LSR #8-2 @ r0 = r += (x1>>8)*36 \n" + "mov r14,#237 @ r14= 237 \n" + "mla r0,r14,r9,r0 @ r0 = r += x0*237 \n" + "mov r14,#141 @ r14= 141 \n" + "mla r4,r14,r9,r4 @ r4 = b += x0*141 \n" + " \n" + "mul r14,r10,r6 @ r14= x1 = c1my * k \n" + "sub r10,r10,r14,LSR #8 @ r10= x0>>8 = c1my - (x1>>8) \n" + "add r0, r0, r14,LSR #8-5 @ r0 = r += 32 * x1 \n" + "add r0, r0, r14,LSR #8-1 @ r0 = r += 34 * x1 \n" + "mov r14,#238 @ r14= 238 \n" + "mla r0,r14,r10,r0 @ r0 = r += 238 * x0 \n" + "mov r14,#28 @ r14= 28 \n" + "mla r1,r14,r10,r1 @ r1 = g += 28 * x0 \n" + "mov r14,#36 @ r14= 36 \n" + "mla r4,r14,r10,r4 @ r4 = b += 36 * x0 \n" + " \n" + "mul r14,r3, r6 @ r14= x1 = cm1y1 * k \n" + "sub r3, r3, r14,LSR #8 @ r3 = x0>>8 = cm1y1 - (x1>>8) \n" + "add r1, r1, r14,LSR #8-4 @ r1 = g += 16*x1 \n" + "sub r1, r1, r14,LSR #8 @ 15*x1 \n" + "add r4, r4, r14,LSR #8-5 @ r4 = b += 32*x1 \n" + "add r4, r4, r14,LSR #8-2 @ 36*x1 \n" + "mov r14,#174 @ r14= 174 \n" + "mla r1, r14,r3, r1 @ r1 = g += 174 * x0 \n" + "mov r14,#240 @ r14= 240 \n" + "mla r4, r14,r3, r4 @ r4 = b += 240 * x0 \n" + " \n" + "mul r14,r11,r6 @ r14= x1 = cm1y * k \n" + "sub r11,r11,r14,LSR #8 @ r11= x0>>8 = cm1y - (x1>>8) \n" + "add r1, r1, r14,LSR #8-4 @ r1 = g += x1 * 16 \n" + "add r1, r1, 
r14,LSR #8 @ x1 * 17 \n" + "add r1, r1, r14,LSR #8-1 @ x1 * 19 \n" + "mov r14,#167 @ r14 = 167 \n" + "mla r1, r14,r11,r1 @ r1 = g += 167 * x0 \n" + "mov r14,#80 @ r14 = 80 \n" + "mla r4, r14,r11,r4 @ r4 = b += 80 * x0 \n" + " \n" + "mul r14,r8, r6 @ r14= x1 = cmy1 * k \n" + "sub r8, r8, r14,LSR #8 @ r8 = x0>>8 = cmy1 - (x1>>8) \n" + "add r4, r4, r14,LSR #8-1 @ r4 = b += x1 * 2 \n" + "mov r14,#46 @ r14=46 \n" + "mla r0, r14,r8, r0 @ r0 = r += 46 * x0 \n" + "mov r14,#49 @ r14=49 \n" + "mla r1, r14,r8, r1 @ r1 = g += 49 * x0 \n" + "mov r14,#147 @ r14=147 \n" + "mla r4, r14,r8, r4 @ r4 = b += 147 * x0 \n" + " \n" + "rsb r6, r6, #256 @ r6 = k = 256-k \n" + "mul r14,r5, r6 @ r14= x0 = cmy * (256-k) \n" + "mov r11,#54 @ r11= 54 \n" + "mov r14,r14,LSR #8 @ r14= (x0>>8) \n" + "mov r8,#57 @ r8 = 57 \n" + "mla r0,r14,r11,r0 @ r0 = r += 54*x0 \n" + "mla r1,r14,r11,r1 @ r1 = g += 54*x0 \n" + "mla r4,r14,r8, r4 @ r4 = b += 57*x0 \n" + " \n" + "sub r8, r0, r0, LSR #8 @ r8 = r -= (r>>8) \n" + "sub r9, r1, r1, LSR #8 @ r9 = g -= (g>>8) \n" + "sub r10,r4, r4, LSR #8 @ r10= b -= (b>>8) \n" + "ldmfd r13!,{r0-r1,r12} \n" + "mov r8, r8, LSR #23 @ r8 = r>>23 \n" + "mov r9, r9, LSR #23 @ r9 = g>>23 \n" + "mov r10,r10,LSR #23 @ r10= b>>23 \n" + "ldrb r14,[r1],#1 @ r14= s[4] \n" + "strb r8, [r0],#1 @ d[0] = r \n" + "strb r9, [r0],#1 @ d[1] = g \n" + "strb r10,[r0],#1 @ d[2] = b \n" + "strb r14,[r0],#1 @ d[3] = s[4] \n" + "subs r2, r2, #1 @ r2 = n-- \n" + "beq 9f \n" + "@ At this point, we've just decoded a pixel \n" + "@ r0 = dst r1 = src r2 = n r8 = r r9 = g r10= b r12= CMYK \n" + "4: \n" + "ldrb r3, [r1], #4 @ r3 = c \n" + "ldrb r6, [r1, #-1] @ r6 = k \n" + "ldrb r5, [r1, #-2] @ r5 = y \n" + "ldrb r4, [r1, #-3] @ r4 = m \n" + "eors r11,r6, #0xFF @ if (k == 255) \n" + "beq 1b @ goto black \n" + "orr r7, r3, r4, LSL #8 \n" + "orr r14,r5, r6, LSL #8 \n" + "orrs r7, r7, r14,LSL #16 @ r7 = cmyk \n" + "beq 1b @ if (cmyk == 0) white \n" + "cmp r7, r12 @ if (cmyk != CMYK) \n" + "bne 3b @ not the 
same, loop \n" + "@ If we get here, we just matched a pixel we have just decoded \n" + "ldrb r3, [r1],#1 @ r3 = s[4] \n" + "strb r8, [r0],#1 @ d[0] = r \n" + "strb r9, [r0],#1 @ d[1] = g \n" + "strb r10,[r0],#1 @ d[2] = b \n" + "strb r3, [r0],#1 @ d[3] = s[4] \n" + "subs r2, r2, #1 @ r2 = n-- \n" + "bne 4b \n" + "9: \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" ENTER_THUMB ); } @@ -346,6 +534,9 @@ static void fast_cmyk_to_rgb(fz_context *ctx, fz_pixmap *dst, fz_pixmap *src) unsigned char *s = src->samples; unsigned char *d = dst->samples; int n = src->w * src->h; +#ifdef ARCH_ARM + fast_cmyk_to_rgb_ARM(d, s, n); +#else unsigned int C,M,Y,K,r,g,b; C = 0; @@ -488,6 +679,7 @@ static void fast_cmyk_to_rgb(fz_context *ctx, fz_pixmap *dst, fz_pixmap *src) s += 5; d += 4; } +#endif } static void fast_cmyk_to_bgr(fz_context *ctx, fz_pixmap *dst, fz_pixmap *src) |