summary | refs | log | tree | commit | diff
path: root/draw
diff options
context:
space:
mode:
author: Robin Watts <robin.watts@artifex.com> 2012-11-16 15:47:34 +0000
committer: Robin Watts <robin.watts@artifex.com> 2012-11-19 16:49:27 +0000
commit: 019ec348606cf55616b478fda04fc214647e7881 (patch)
tree: bc641c158a59c8d9ff86b164fa47084481691207 /draw
parent: 6ac6f0b873cffea1817a361764e868e93f043af2 (diff)
download: mupdf-019ec348606cf55616b478fda04fc214647e7881.tar.xz
Unroll inner loop of ARM version of scale_row_to_temp1.
This avoids a stall, and saves time on repeated loops.
Diffstat (limited to 'draw')
-rw-r--r-- draw/draw_simple_scale.c 45
1 files changed, 24 insertions, 21 deletions
diff --git a/draw/draw_simple_scale.c b/draw/draw_simple_scale.c
index f79b4c2c..8c438783 100644
--- a/draw/draw_simple_scale.c
+++ b/draw/draw_simple_scale.c
@@ -590,10 +590,9 @@ __attribute__((naked));
static void
scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights)
{
- /* possible optimisation in here; unroll inner loops to avoid stall. */
asm volatile(
ENTER_ARM
- "stmfd r13!,{r4-r5,r9,r14} \n"
+ "stmfd r13!,{r4-r7,r9,r14} \n"
"@ r0 = dst \n"
"@ r1 = src \n"
"@ r2 = weights \n"
@@ -601,7 +600,7 @@ scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights)
"ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
"ldr r4, [r2] @ r4 = index[0] \n"
"cmp r12,#0 @ if (flip) \n"
- "beq 4f @ { \n"
+ "beq 5f @ { \n"
"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
"add r0, r0, r3 @ dst += count \n"
"1: \n"
@@ -609,43 +608,47 @@ scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights)
"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
"mov r5, #128 @ r5 = a = 128 \n"
"add r4, r1, r4 @ r4 = min = &src[r4] \n"
- "cmp r9, #0 @ while (len-- > 0) \n"
- "beq 3f @ { \n"
- "2: \n"
+ "subs r9, r9, #1 @ len-- \n"
+ "blt 3f @ while (len >= 0) \n"
+ "2: @ { \n"
+ "ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
+ "ldrgtb r7, [r4], #1 @ r7 = *min++ \n"
"ldr r12,[r2], #4 @ r12 = *contrib++ \n"
"ldrb r14,[r4], #1 @ r14 = *min++ \n"
- "subs r9, r9, #1 @ r9 = len-- \n"
- "@stall on r14 \n"
+ "mlagt r5, r6, r7, r5 @ g += r6 * r7 \n"
+ "subs r9, r9, #2 @ r9 = len -= 2 \n"
"mla r5, r12,r14,r5 @ g += r14 * r12 \n"
- "bgt 2b @ } \n"
+ "bge 2b @ } \n"
"3: \n"
"mov r5, r5, lsr #8 @ g >>= 8 \n"
"strb r5,[r0, #-1]! @ *--dst=a \n"
"subs r3, r3, #1 @ i-- \n"
"bgt 1b @ \n"
- "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n"
- "4:"
- "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
"5:"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "6:"
"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
"mov r5, #128 @ r5 = a = 128 \n"
"add r4, r1, r4 @ r4 = min = &src[r4] \n"
- "cmp r9, #0 @ while (len-- > 0) \n"
- "beq 7f @ { \n"
- "6: \n"
+ "subs r9, r9, #1 @ len-- \n"
+ "blt 9f @ while (len >= 0) \n"
+ "7: @ { \n"
+ "ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
+ "ldrgtb r7, [r4], #1 @ r7 = *min++ \n"
"ldr r12,[r2], #4 @ r12 = *contrib++ \n"
"ldrb r14,[r4], #1 @ r14 = *min++ \n"
- "subs r9, r9, #1 @ r9 = len-- \n"
- "@stall on r14 \n"
+ "mlagt r5, r6,r7,r5 @ a += r6 * r7 \n"
+ "subs r9, r9, #2 @ r9 = len -= 2 \n"
"mla r5, r12,r14,r5 @ a += r14 * r12 \n"
- "bgt 6b @ } \n"
- "7: \n"
+ "bge 7b @ } \n"
+ "9: \n"
"mov r5, r5, LSR #8 @ a >>= 8 \n"
"strb r5, [r0], #1 @ *dst++=a \n"
"subs r3, r3, #1 @ i-- \n"
- "bgt 5b @ \n"
- "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n"
+ "bgt 6b @ \n"
+ "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
ENTER_THUMB
);
}