diff options
author | Robin Watts <robin.watts@artifex.com> | 2012-11-16 15:47:34 +0000 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2012-11-19 16:49:27 +0000 |
commit | 019ec348606cf55616b478fda04fc214647e7881 (patch) | |
tree | bc641c158a59c8d9ff86b164fa47084481691207 /draw | |
parent | 6ac6f0b873cffea1817a361764e868e93f043af2 (diff) | |
download | mupdf-019ec348606cf55616b478fda04fc214647e7881.tar.xz |
Unroll inner loop of ARM version of scale_row_to_temp1.
This avoids a stall, and saves time on repeated loops.
Diffstat (limited to 'draw')
-rw-r--r-- | draw/draw_simple_scale.c | 45 |
1 files changed, 24 insertions, 21 deletions
```diff
diff --git a/draw/draw_simple_scale.c b/draw/draw_simple_scale.c
index f79b4c2c..8c438783 100644
--- a/draw/draw_simple_scale.c
+++ b/draw/draw_simple_scale.c
@@ -590,10 +590,9 @@ __attribute__((naked));
 static void
 scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights)
 {
-	/* possible optimisation in here; unroll inner loops to avoid stall. */
 	asm volatile(
 	ENTER_ARM
-	"stmfd r13!,{r4-r5,r9,r14} \n"
+	"stmfd r13!,{r4-r7,r9,r14} \n"
 	"@ r0 = dst \n"
 	"@ r1 = src \n"
 	"@ r2 = weights \n"
@@ -601,7 +600,7 @@ scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights)
 	"ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
 	"ldr r4, [r2] @ r4 = index[0] \n"
 	"cmp r12,#0 @ if (flip) \n"
-	"beq 4f @ { \n"
+	"beq 5f @ { \n"
 	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
 	"add r0, r0, r3 @ dst += count \n"
 	"1: \n"
@@ -609,43 +608,47 @@ scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights)
 	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
 	"mov r5, #128 @ r5 = a = 128 \n"
 	"add r4, r1, r4 @ r4 = min = &src[r4] \n"
-	"cmp r9, #0 @ while (len-- > 0) \n"
-	"beq 3f @ { \n"
-	"2: \n"
+	"subs r9, r9, #1 @ len-- \n"
+	"blt 3f @ while (len >= 0) \n"
+	"2: @ { \n"
+	"ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
+	"ldrgtb r7, [r4], #1 @ r7 = *min++ \n"
 	"ldr r12,[r2], #4 @ r12 = *contrib++ \n"
 	"ldrb r14,[r4], #1 @ r14 = *min++ \n"
-	"subs r9, r9, #1 @ r9 = len-- \n"
-	"@stall on r14 \n"
+	"mlagt r5, r6, r7, r5 @ g += r6 * r7 \n"
+	"subs r9, r9, #2 @ r9 = len -= 2 \n"
 	"mla r5, r12,r14,r5 @ g += r14 * r12 \n"
-	"bgt 2b @ } \n"
+	"bge 2b @ } \n"
 	"3: \n"
 	"mov r5, r5, lsr #8 @ g >>= 8 \n"
 	"strb r5,[r0, #-1]! @ *--dst=a \n"
 	"subs r3, r3, #1 @ i-- \n"
 	"bgt 1b @ \n"
-	"ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n"
-	"4:"
-	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+	"ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
 	"5:"
+	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+	"6:"
 	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
 	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
 	"mov r5, #128 @ r5 = a = 128 \n"
 	"add r4, r1, r4 @ r4 = min = &src[r4] \n"
-	"cmp r9, #0 @ while (len-- > 0) \n"
-	"beq 7f @ { \n"
-	"6: \n"
+	"subs r9, r9, #1 @ len-- \n"
+	"blt 9f @ while (len > 0) \n"
+	"7: @ { \n"
+	"ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
+	"ldrgtb r7, [r4], #1 @ r7 = *min++ \n"
 	"ldr r12,[r2], #4 @ r12 = *contrib++ \n"
 	"ldrb r14,[r4], #1 @ r14 = *min++ \n"
-	"subs r9, r9, #1 @ r9 = len-- \n"
-	"@stall on r14 \n"
+	"mlagt r5, r6,r7,r5 @ a += r6 * r7 \n"
+	"subs r9, r9, #2 @ r9 = len -= 2 \n"
 	"mla r5, r12,r14,r5 @ a += r14 * r12 \n"
-	"bgt 6b @ } \n"
-	"7: \n"
+	"bge 7b @ } \n"
+	"9: \n"
 	"mov r5, r5, LSR #8 @ a >>= 8 \n"
 	"strb r5, [r0], #1 @ *dst++=a \n"
 	"subs r3, r3, #1 @ i-- \n"
-	"bgt 5b @ \n"
-	"ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n"
+	"bgt 6b @ \n"
+	"ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
 	ENTER_THUMB
 	);
 }
```