From d16f386cbacfc18dd97be703a4af6e71c7f4e925 Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Tue, 20 Aug 2013 11:25:44 +0000 Subject: Optimise fz_paint_span_with_color and fz_paint_solid_color The 2 biggest hotspots in benchmarking on the Raspberry pi at 1200dpi. --- source/fitz/draw-paint.c | 595 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 544 insertions(+), 51 deletions(-) diff --git a/source/fitz/draw-paint.c b/source/fitz/draw-paint.c index e7cfbdb4..30b9364f 100644 --- a/source/fitz/draw-paint.c +++ b/source/fitz/draw-paint.c @@ -86,19 +86,192 @@ fz_paint_solid_alpha(byte * restrict dp, int w, int alpha) } } -void -fz_paint_solid_color(byte * restrict dp, int n, int w, byte *color) +static inline void +fz_paint_solid_color_2(byte * restrict dp, int w, byte *color) +{ + int sa = FZ_EXPAND(color[1]); + if (sa == 0) + return; + if (sa == 256) + { + while (w--) + { + dp[0] = color[0]; + dp[1] = 255; + dp += 2; + } + } + else + { + while (w--) + { + dp[0] = FZ_BLEND(color[0], dp[0], sa); + dp[1] = FZ_BLEND(255, dp[1], sa); + dp += 2; + } + } +} + +#if 0 +/* This should be a #ifdef ARCH_ARM section, but in tests this seems to perform + * more slowly than the C. I have no idea why, but I am leaving it disabled + * for now. Once it has been in git, I may remove it. */ +static void +fz_paint_solid_color_4(byte * restrict dp, int w, byte *color) + __attribute__((naked)); + +static void +fz_paint_solid_color_4(byte * restrict dp, int w, byte *color) { + asm volatile( + ENTER_ARM + "stmfd r13!,{r4,r10-r11,r14} @ r0 = dp, r1 = w, r2 = color \n" + "ldr r2, [r2] @ r2 = aabbggrr = color[0,1,2,3]\n" + "mov r14,#0xFF00 @ r14= 0xFF00 \n" + "orr r14,r14,#0xFF000000 @ r14= 0xff00ff00 = mask \n" + "mov r3, r2, LSR #24 @ r3 = sa = color[3] \n" + "orr r2, r2, #0xFF000000 @ r2 = ffbbggrr \n" + "adds r3, r3, r3, LSR #7 @ r3 = sa = FZ_EXPAND(sa) \n" + "cmpne r1, #0 @ if (sa == 0 || w == 0) \n" + "beq 9f @ return \n" + "cmp r3, #256 @ if (sa == 256) { \n" + "bne 5f @ \n" + "1: @ do { \n" + "str r2, [r0], #4 @ *(int *)dp = rgba, dp+=4 \n" + "subs r1, r1, #1 @ w--; \n" + "bne 1b @ } while (w); \n" + "b 9f @ } else { \n" + "5: @ \n" + "and r4, r2, r14 @ r4 = ga = ff00gg00 \n" + "bic r2, r2, r14 @ r2 = rb = 00bb00rr \n" + "mov r4, r4, LSR #8 @ r4 = ga = 00ff00gg \n" + "6: @ do { \n" + "ldr r12,[r0] @ r12 = *(int *)dp = AABBGGRR \n" + "and r11,r14,r12 @ r11= AA00GG00 \n" + "sub r10,r4, r11,LSR #8 @ r10= aa00gg - AA00GG \n" + "and r12,r14,r12,LSL #8 @ r12= BB00RR00 \n" + "mla r11,r3, r10,r11 @ r11= sa*r10+r11 \n" + "sub r10,r2, r12,LSR #8 @ r9 = bb00rr - BB00RR \n" + "mla r12,r3, r10,r12 @ r12= sa*r10+r12 \n" + "and r11,r11,r14 @ r11= Aa00Gg00 \n" + "and r12,r12,r14 @ r12= Bb00Rr00 \n" + "orr r12,r11,r12,LSR #8 @ r12= AaBbGgRr \n" + "str r12,[r0, #-4] @ dp[-4] = r12 \n" + "subs r1, r1, #1 @ w-- \n" + "bne 6b @ } while (w != 0); \n" + "9: @ } \n" + "ldmfd r13!,{r4,r10-r11,PC} \n" + ENTER_THUMB + ); +} +#else +#ifndef AVOID_SWAR +static inline void +fz_paint_solid_color_4(byte * restrict dp, int w, byte *color) +{ + unsigned int rgba = *(int *)color; + int sa = FZ_EXPAND(color[3]); + if (sa == 0) + return; + rgba |= 0xFF000000; + if (sa == 256) + { + while (w--) + { + *(unsigned int *)dp = rgba; + dp += 4; + } + } + else + { + unsigned int mask = 0xFF00FF00; + unsigned int rb = rgba & (mask>>8); + unsigned int ga = (rgba & mask)>>8; + while (w--) + { + unsigned int RGBA = *(unsigned int *)dp; + unsigned int RB = (RGBA<<8) & mask; + unsigned int GA = RGBA & mask; + RB += (rb-(RB>>8))*sa; + GA += (ga-(GA>>8))*sa; + RB &= mask; + GA &= mask; + *(unsigned int *)dp = (RB>>8) | GA; + dp += 4; + } + } +} +#else +static inline void +fz_paint_solid_color_4(byte * restrict dp, int w, byte *color) +{ + int sa = FZ_EXPAND(color[3]); + if (sa == 0) + return; + if (sa == 256) + { + while (w--) + { + dp[0] = color[0]; + dp[1] = color[1]; + dp[2] = color[2]; + dp[3] = 255; + dp += 4; + } + } + else + { + while (w--) + { + dp[0] = FZ_BLEND(color[0], dp[0], sa); + dp[1] = FZ_BLEND(color[1], dp[1], sa); + dp[2] = FZ_BLEND(color[2], dp[2], sa); + dp[3] = FZ_BLEND(255, dp[3], sa); + dp += 4; + } + } +} +#endif +#endif + +static inline void +fz_paint_solid_color_N(byte * restrict dp, int n, int w, byte *color) +{ + int k; int n1 = n - 1; int sa = FZ_EXPAND(color[n1]); - int k; - while (w--) + if (sa == 0) + return; + if (sa == 256) { - int ma = FZ_COMBINE(FZ_EXPAND(255), sa); - for (k = 0; k < n1; k++) - dp[k] = FZ_BLEND(color[k], dp[k], ma); - dp[k] = FZ_BLEND(255, dp[k], ma); - dp += n; + while (w--) + { + for (k = 0; k < n1; k++) + dp[k] = color[k]; + dp[k] = 255; + dp += n; + } + } + else + { + while (w--) + { + for (k = 0; k < n1; k++) + dp[k] = FZ_BLEND(color[k], dp[k], sa); + dp[k] = FZ_BLEND(255, dp[k], sa); + dp += n; + } + } +} + +void +fz_paint_solid_color(byte * restrict dp, int n, int w, byte *color) +{ + switch (n) + { + case 2: fz_paint_solid_color_2(dp, w, color); break; + case 4: fz_paint_solid_color_4(dp, w, color); break; + default: fz_paint_solid_color_N(dp, n, w, color); break; } } @@ -119,39 +292,274 @@ fz_paint_span_with_color_2(byte * restrict dp, byte * restrict mp, int w, byte * } } +#if 0 +/* This should be a #ifdef ARCH_ARM section, but in tests this seems to perform + * more slowly than the C. I have no idea why, but I am leaving it disabled + * for now. Once it has been in git, I may remove it. */ +static void +fz_paint_span_with_color_4(byte * restrict dp, byte * restrict mp, int w, byte *color) + __attribute__((naked)); + +static void +fz_paint_span_with_color_4(byte * restrict dp, byte * restrict mp, int w, byte *color) +{ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r11,r14} @ r0 = dp, r1 = mp, r2 = w \n" + " @ r3 = color \n" + "ldr r3, [r3] @ r3 = aabbggrr = color[0,1,2,3]\n" + "mov r14,#0xFF00 @ r14= 0xFF00 \n" + "orr r14,r14,#0xFF000000 @ r14= 0xff00ff00 = mask \n" + "mov r7, r3, LSR #24 @ r7 = sa = color[3] \n" + "orr r3, r3, #0xFF000000 @ r3 = ffbbggrr \n" + "and r5, r3, r14 @ r5 = ff00gg00 \n" + "bic r4, r3, r14 @ r4 = 00bb00rr \n" + "mov r5, r5, LSR #8 @ r5 = 00ff00gg \n" + "adds r7, r7, r7, LSR #7 @ r7 = sa = FZ_EXPAND(sa) \n" + "cmpne r2, #0 @ if (sa == 0 || w == 0) \n" + "beq 9f @ return \n" + "cmp r7, #256 @ if (sa == 256) \n" + "beq 1f @ fast_loop \n" + "b 8f @ else slow_loop \n" + "@ fast 'solid' loop (biased towards 0 or 256) \n" + "0: \n" + "subs r2, r2, #1 @ w-- \n" + "beq 9f @ } while (w != 0); \n" + "1: @ do { \n" + "ldrb r12,[r1], #1 @ r12 = ma = *mp++ \n" + "add r0, r0, #4 @ Advance r0 without loading \n" + "@stall \n" + "adds r12,r12,r12,LSR #7 @ r12 = FZ_EXPAND(ma) \n" + "beq 0b @ if (r12 = 0) skip \n" + "cmp r12,#256 @ if (r12 != 256) \n" + "bne 3f @ use non-solid loop \n" + "2: @ \n" + "str r3,[r0, #-4] @ \n" + "subs r2, r2, #1 @ w-- \n" + "bne 1b @ } while (w != 0); \n" + "b 9f @ } else \n" + "3: @ \n" + "ldr r11,[r0, #-4] @ \n" + "b 6f @ \n" + "@ Non-solid loop (biased towards 0 or non-256) \n" + "4: \n" + "subs r2, r2, #1 @ w-- \n" + "beq 9f @ } while (w != 0); \n" + "5: @ do { \n" + "ldrb r12,[r1], #1 @ r12 = ma = *mp++ \n" + "ldr r11,[r0], #4 @ r11 = AABBGGRR = dp[0123] \n" + "@stall \n" + "adds r12,r12,r12,LSR #7 @ r12 = FZ_EXPAND(ma) \n" + "beq 4b @ if (r12 = 0) skip \n" + "cmp r12,#256 @ if (r12 == 256) \n" + "beq 2b @ use solid loop \n" + "6: @ \n" + "and r10,r14,r11 @ r10= AA00GG00 \n" + "sub r9, r5, r10,LSR #8 @ r9 = aa00gg - AA00GG \n" + "and r11,r14,r11,LSL #8 @ r11= BB00RR00 \n" + "mla r10,r12,r9, r10 @ r10= ma*r9+r10 \n" + "sub r9, r4, r11,LSR #8 @ r9 = bb00rr - BB00RR \n" + "mla r11,r12,r9, r11 @ r11= ma*r9+r11 \n" + "and r10,r10,r14 @ r10= Aa00Gg00 \n" + "and r11,r11,r14 @ r11= Bb00Rr00 \n" + "orr r11,r10,r11,LSR #8 @ r11= AaBbGgRr \n" + "str r11,[r0, #-4] @ dp[-4] = r11 \n" + "subs r2, r2, #1 @ w-- \n" + "bne 5b @ } while (w != 0); \n" + "b 9f @ } else \n" + "7: \n" + "subs r2, r2, #1 @ w-- \n" + "beq 9f @ } while (w != 0); \n" + "8: @ do { \n" + "ldrb r12,[r1], #1 @ r12 = ma = *mp++ \n" + "ldr r11,[r0], #4 @ r11 = AABBGGRR = dp[0123] \n" + "@stall \n" + "adds r12,r12,r12,LSR #7 @ r12 = FZ_EXPAND(ma) \n" + "beq 7b @ if (r12 = 0) skip \n" + "mul r12,r7, r12 @ \n" + "and r10,r14,r11 @ r10= AA00GG00 \n" + "mov r12,r12,LSR #8 @ r12 = ma = FZ_COMBINE(r12, sa)\n" + "sub r9, r5, r10,LSR #8 @ r9 = aa00gg - AA00GG \n" + "and r11,r14,r11,LSL #8 @ r11= BB00RR00 \n" + "mla r10,r12,r9, r10 @ r10= ma*r9+r10 \n" + "sub r9, r4, r11,LSR #8 @ r9 = bb00rr - BB00RR \n" + "mla r11,r12,r9, r11 @ r11= ma*r9+r11 \n" + "and r10,r10,r14 @ r10= Aa00Gg00 \n" + "and r11,r11,r14 @ r11= Bb00Rr00 \n" + "orr r11,r10,r11,LSR #8 @ r11= AaBbGgRr \n" + "str r11,[r0, #-4] @ dp[-4] = r11 \n" + "subs r2, r2, #1 @ w-- \n" + "bne 8b @ } while (w != 0); \n" + "9: @ } \n" + "ldmfd r13!,{r4-r11,PC} \n" + ENTER_THUMB + ); +} +#else +#ifndef AVOID_SWAR static inline void fz_paint_span_with_color_4(byte * restrict dp, byte * restrict mp, int w, byte *color) { + unsigned int rgba = *((unsigned int *)color); + unsigned int mask, rb, ga; int sa = FZ_EXPAND(color[3]); + if (sa == 0) + return; + rgba |= 0xFF000000; + mask = 0xFF00FF00; + rb = rgba & (mask>>8); + ga = (rgba & mask)>>8; + if (sa == 256) + { + while (w--) + { + unsigned int ma = *mp++; + dp += 4; + ma = FZ_EXPAND(ma); + if (ma == 0) + { + } + else if (ma == 256) + { + ((unsigned int *)dp)[-1] = rgba; + } + else + { + unsigned int RGBA = ((unsigned int *)dp)[-1]; + unsigned int RB = (RGBA<<8) & mask; + unsigned int GA = RGBA & mask; + RB += (rb-(RB>>8))*ma; + GA += (ga-(GA>>8))*ma; + RB &= mask; + GA &= mask; + ((unsigned int *)dp)[-1] = (RB>>8) | GA; + } + } + } + else + { + while (w--) + { + unsigned int ma = *mp++; + ma = FZ_COMBINE(FZ_EXPAND(ma), sa); + dp += 4; + if (ma != 0) + { + unsigned int RGBA = ((unsigned int*)dp)[-1]; + unsigned int RB = (RGBA<<8) & mask; + unsigned int GA = RGBA & mask; + RB += (rb-(RB>>8))*ma; + GA += (ga-(GA>>8))*ma; + RB &= mask; + GA &= mask; + ((unsigned int *)dp)[-1] = (RB>>8) | GA; + } + } + } +} +#else +static inline void +fz_paint_span_with_color_4(byte * restrict dp, byte * restrict mp, int w, byte *color) +{ int r = color[0]; int g = color[1]; int b = color[2]; - while (w--) + int sa = FZ_EXPAND(color[3]); + if (sa == 0) + return; + if (sa == 256) { - int ma = *mp++; - ma = FZ_COMBINE(FZ_EXPAND(ma), sa); - dp[0] = FZ_BLEND(r, dp[0], ma); - dp[1] = FZ_BLEND(g, dp[1], ma); - dp[2] = FZ_BLEND(b, dp[2], ma); - dp[3] = FZ_BLEND(255, dp[3], ma); - dp += 4; + int ival; + union { + char c[4]; + unsigned int i; + } rgba; + rgba.c[0] = r; + rgba.c[1] = g; + rgba.c[2] = b; + rgba.c[3] = 255; + ival = rgba.i; + while (w--) + { + int ma = *mp++; + ma = FZ_EXPAND(ma); + if (ma == 0) + { + } + else if (ma == 256) + { + ((int *)dp)[0] = ival; + } + else + { + dp[0] = FZ_BLEND(r, dp[0], ma); + dp[1] = FZ_BLEND(g, dp[1], ma); + dp[2] = FZ_BLEND(b, dp[2], ma); + dp[3] = FZ_BLEND(255, dp[3], ma); + } + dp += 4; + } + } + else + { + while (w--) + { + int ma = *mp++; + ma = FZ_COMBINE(FZ_EXPAND(ma), sa); + dp[0] = FZ_BLEND(r, dp[0], ma); + dp[1] = FZ_BLEND(g, dp[1], ma); + dp[2] = FZ_BLEND(b, dp[2], ma); + dp[3] = FZ_BLEND(255, dp[3], ma); + dp += 4; + } } } +#endif +#endif static inline void fz_paint_span_with_color_N(byte * restrict dp, byte * restrict mp, int n, int w, byte *color) { + int k; int n1 = n - 1; int sa = FZ_EXPAND(color[n1]); - int k; - while (w--) + if (sa == 0) + return; + if (sa == 256) { - int ma = *mp++; - ma = FZ_COMBINE(FZ_EXPAND(ma), sa); - for (k = 0; k < n1; k++) - dp[k] = FZ_BLEND(color[k], dp[k], ma); - dp[k] = FZ_BLEND(255, dp[k], ma); - dp += n; + while (w--) + { + int ma = *mp++; + ma = FZ_EXPAND(ma); + if (ma == 0) + { + } + else if (ma == 256) + { + for (k = 0; k < n1; k++) + dp[k] = color[k]; + dp[k] = 255; + } + else + { + for (k = 0; k < n1; k++) + dp[k] = FZ_BLEND(color[k], dp[k], ma); + dp[k] = FZ_BLEND(255, dp[k], ma); + } + dp += n; + } + } + else + { + while (w--) + { + int ma = *mp++; + ma = FZ_COMBINE(FZ_EXPAND(ma), sa); + for (k = 0; k < n1; k++) + dp[k] = FZ_BLEND(color[k], dp[k], ma); + dp[k] = FZ_BLEND(255, dp[k], ma); + dp += n; + } } } @@ -168,6 +576,7 @@ fz_paint_span_with_color(byte * restrict dp, byte * restrict mp, int n, int w, b /* Blend source in mask over destination */ +/* FIXME: There is potential for SWAR optimisation here */ static inline void fz_paint_span_with_mask_2(byte * restrict dp, byte * restrict sp, byte * restrict mp, int w) { @@ -176,16 +585,42 @@ fz_paint_span_with_mask_2(byte * restrict dp, byte * restrict sp, byte * restric int masa; int ma = *mp++; ma = FZ_EXPAND(ma); - masa = FZ_COMBINE(sp[1], ma); - masa = 255 - masa; - masa = FZ_EXPAND(masa); - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; + if (ma == 0) + { + dp += 2; + sp += 2; + } + else if (ma == 256) + { + masa = 255 - sp[1]; + if (masa == 0) + { + *dp++ = *sp++; + *dp++ = *sp++; + } + else + { + masa = FZ_EXPAND(masa); + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + } + } + else + { + masa = FZ_COMBINE(sp[1], ma); + masa = 255 - masa; + masa = FZ_EXPAND(masa); + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + } } } +/* FIXME: There is potential for SWAR optimisation here */ static inline void fz_paint_span_with_mask_4(byte * restrict dp, byte * restrict sp, byte * restrict mp, int w) { @@ -194,17 +629,47 @@ fz_paint_span_with_mask_4(byte * restrict dp, byte * restrict sp, byte * restric int masa; int ma = *mp++; ma = FZ_EXPAND(ma); - masa = FZ_COMBINE(sp[3], ma); - masa = 255 - masa; - masa = FZ_EXPAND(masa); - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; + if (ma == 0) + { + dp += 4; + sp += 4; + } + else if (ma == 256) + { + masa = 255 - sp[3]; + if (masa == 0) + { + *(int*)dp = *(int *)sp; + sp += 4; dp += 4; + } + else + { + masa = FZ_EXPAND(masa); + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + } + } + else + { + /* FIXME: There is potential for SWAR optimisation here */ + masa = FZ_COMBINE(sp[3], ma); + masa = 255 - masa; + masa = FZ_EXPAND(masa); + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + } } } @@ -213,17 +678,45 @@ fz_paint_span_with_mask_N(byte * restrict dp, byte * restrict sp, byte * restric { while (w--) { - int k = n; - int masa; int ma = *mp++; ma = FZ_EXPAND(ma); - masa = FZ_COMBINE(sp[n-1], ma); - masa = 255-masa; - masa = FZ_EXPAND(masa); - while (k--) + if (ma == 0) { - *dp = FZ_COMBINE2(*sp, ma, *dp, masa); - sp++; dp++; + dp += n; + sp += n; + } + else if (ma == 256) + { + int k = n; + int masa = 255 - sp[n-1]; + if (masa == 0) + { + while (k--) + { + *dp++ = *sp++; + } + } + else + { + masa = FZ_EXPAND(masa); + while (k--) + { + *dp = *sp + FZ_COMBINE(*dp, masa); + sp++; dp++; + } + } + } + else + { + int k = n; + int masa = FZ_COMBINE(sp[n-1], ma); + masa = 255-masa; + masa = FZ_EXPAND(masa); + while (k--) + { + *dp = FZ_COMBINE2(*sp, ma, *dp, masa); + sp++; dp++; + } } } } -- cgit v1.2.3