From 57e627d1665b738570aa0d5faa8d39eb609ff22a Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Fri, 25 Mar 2016 00:18:40 +0000 Subject: More ARM code thresholding work. Fix do_threshold_1 and implement do_threshold_4. do_threshold_1 checks for white and shortcuts the work if it can. There are 2 ARM variants of do_threshold_4. One for ARMs that support unaligned loads, and one for the (rare configurations) that don't. The former checks for white and shortcuts the work. --- source/fitz/halftone.c | 299 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 250 insertions(+), 49 deletions(-) diff --git a/source/fitz/halftone.c b/source/fitz/halftone.c index 2df8e3e7..186c0aa6 100644 --- a/source/fitz/halftone.c +++ b/source/fitz/halftone.c @@ -147,7 +147,6 @@ static void make_ht_line(unsigned char *buf, fz_halftone *ht, int x, int y, int typedef void (threshold_fn)(const unsigned char *ht_line, const unsigned char *pixmap, unsigned char *out, int w, int ht_len); #ifdef ARCH_ARM - static void do_threshold_1(const unsigned char * restrict ht_line, const unsigned char * restrict pixmap, unsigned char *restrict out, int w, int ht_len) __attribute__((naked)); @@ -166,95 +165,110 @@ do_threshold_1(const unsigned char * restrict ht_line, const unsigned char * res "@ <> = ht_len \n" "ldr r9, [r13,#6*4] @ r9 = ht_len \n" "subs r3, r3, #7 @ r3 = w -= 7 \n" - "blt 2f @ while (w > 0) { \n" + "ble 2f @ while (w > 0) { \n" "mov r12,r9 @ r12= l = ht_len \n" + "b 1f \n" + "9: \n" + "strb r14,[r2], #1 @ *out++ = 0 \n" + "subs r12,r12,#8 @ r12 = l -= 8 \n" + "moveq r12,r9 @ if(l==0) l = ht_len \n" + "subeq r0, r0, r9 @ ht_line -= l \n" + "subs r3, r3, #8 @ w -= 8 \n" + "ble 2f @ } \n" "1: \n" + "ldr r14,[r1], #4 @ r14= pixmap[0..3] \n" + "ldr r5, [r1], #4 @ r5 = pixmap[4..7] \n" + "ldr r6, [r1], #4 @ r6 = pixmap[8..11] \n" + "ldr r7, [r1], #4 @ r7 = pixmap[12..15] \n" + "ldrb r4, [r0], #8 @ r0 = ht_line += 8 \n" + "eors r14,r14,r5, ROR #8 @ if (white) \n" + "teqeq r6, r7, 
ROR #8 @ \n" + "beq 9b @ white \n" + "ldrb r5, [r1, #-16] @ r5 = pixmap[0] \n" + "ldrb r6, [r0, #-7] @ r6 = ht_line[1] \n" + "ldrb r7, [r1, #-14] @ r7 = pixmap[2] \n" "mov r14,#0 @ r14= h = 0 \n" - "ldrb r4, [r0], #1 @ r4 = ht_line[0] \n" - "ldrb r5, [r1], #2 @ r5 = pixmap[0] \n" - "ldrb r6, [r0], #1 @ r6 = ht_line[1] \n" - "ldrb r7, [r1], #2 @ r7 = pixmap[2] \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x80 @ h |= 0x80 \n" - "ldrb r4, [r0], #1 @ r4 = ht_line[2] \n" - "ldrb r5, [r1], #2 @ r5 = pixmap[4] \n" + "ldrb r4, [r0, #-6] @ r4 = ht_line[2] \n" + "ldrb r5, [r1, #-12] @ r5 = pixmap[4] \n" "cmp r7, r6 @ if (r7 < r6) \n" "orrlt r14,r14,#0x40 @ h |= 0x40 \n" - "ldrb r6, [r0], #1 @ r6 = ht_line[3] \n" - "ldrb r7, [r1], #2 @ r7 = pixmap[6] \n" + "ldrb r6, [r0, #-5] @ r6 = ht_line[3] \n" + "ldrb r7, [r1, #-10] @ r7 = pixmap[6] \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x20 @ h |= 0x20 \n" - "ldrb r4, [r0], #1 @ r4 = ht_line[2] \n" - "ldrb r5, [r1], #2 @ r5 = pixmap[4] \n" + "ldrb r4, [r0, #-4] @ r4 = ht_line[4] \n" + "ldrb r5, [r1, #-8] @ r5 = pixmap[8] \n" "cmp r7, r6 @ if (r7 < r6) \n" "orrlt r14,r14,#0x10 @ h |= 0x10 \n" - "ldrb r6, [r0], #1 @ r6 = ht_line[3] \n" - "ldrb r7, [r1], #2 @ r7 = pixmap[6] \n" + "ldrb r6, [r0, #-3] @ r6 = ht_line[5] \n" + "ldrb r7, [r1, #-6] @ r7 = pixmap[10] \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x08 @ h |= 0x08 \n" - "ldrb r4, [r0], #1 @ r4 = ht_line[2] \n" - "ldrb r5, [r1], #2 @ r5 = pixmap[4] \n" + "ldrb r4, [r0, #-2] @ r4 = ht_line[6] \n" + "ldrb r5, [r1, #-4] @ r5 = pixmap[12] \n" "cmp r7, r6 @ if (r7 < r6) \n" "orrlt r14,r14,#0x04 @ h |= 0x04 \n" - "ldrb r6, [r0], #1 @ r6 = ht_line[3] \n" - "ldrb r7, [r1], #2 @ r7 = pixmap[6] \n" + "ldrb r6, [r0, #-1] @ r6 = ht_line[7] \n" + "ldrb r7, [r1, #-2] @ r7 = pixmap[14] \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x02 @ h |= 0x02 \n" "cmp r7, r6 @ if (r7 < r6) \n" "orrlt r14,r14,#0x01 @ h |= 0x01 \n" "subs r12,r12,#8 @ r12 = l -= 8 \n" + "strb 
r14,[r2], #1 @ *out++ = h \n" "moveq r12,r9 @ if(l==0) l = ht_len \n" - "subeq r6, r6, r12,LSL #2 @ ht_line -= l \n" + "subeq r0, r0, r9 @ ht_line -= l \n" "subs r3, r3, #8 @ w -= 8 \n" - "strb r14,[r2], #1 @ *out++ = h \n" "bgt 1b @ } \n" - "1: \n" - "adds r3, r3, #6 @ w += 6 \n" - "blt 3f @ if (w < 0) { \n" + "2: \n" + "adds r3, r3, #7 @ w += 7 \n" + "ble 4f @ if (w >= 0) { \n" "ldrb r4, [r0], #1 @ r4 = ht_line[0] \n" "ldrb r5, [r1], #2 @ r5 = pixmap[0] \n" "mov r14, #0 @ r14= h = 0 \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x80 @ h |= 0x80 \n" - "teq r3, #0 @ \n" - "ldrneb r4, [r0], #1 @ r6 = ht_line[1] \n" - "ldrneb r5, [r1], #2 @ r7 = pixmap[2] \n" - "beq 2f @ \n" + "cmp r3, #1 @ \n" + "ldrgtb r4, [r0], #1 @ r6 = ht_line[1] \n" + "ldrgtb r5, [r1], #2 @ r7 = pixmap[2] \n" + "ble 3f @ \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x40 @ h |= 0x40 \n" - "teq r3, #1 @ \n" - "ldrneb r4, [r0], #1 @ r6 = ht_line[1] \n" - "ldrneb r5, [r1], #2 @ r7 = pixmap[2] \n" - "beq 2f @ \n" + "cmp r3, #2 @ \n" + "ldrgtb r4, [r0], #1 @ r6 = ht_line[2] \n" + "ldrgtb r5, [r1], #2 @ r7 = pixmap[4] \n" + "ble 3f @ \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x20 @ h |= 0x20 \n" - "teq r3, #2 @ \n" - "ldrneb r4, [r0], #1 @ r6 = ht_line[1] \n" - "ldrneb r5, [r1], #2 @ r7 = pixmap[2] \n" - "beq 2f @ \n" + "cmp r3, #3 @ \n" + "ldrgtb r4, [r0], #1 @ r6 = ht_line[3] \n" + "ldrgtb r5, [r1], #2 @ r7 = pixmap[6] \n" + "ble 3f @ \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x10 @ h |= 0x10 \n" - "teq r3, #3 @ \n" - "ldrneb r4, [r0], #1 @ r6 = ht_line[1] \n" - "ldrneb r5, [r1], #2 @ r7 = pixmap[2] \n" - "beq 2f @ \n" + "cmp r3, #4 @ \n" + "ldrgtb r4, [r0], #1 @ r6 = ht_line[4] \n" + "ldrgtb r5, [r1], #2 @ r7 = pixmap[8] \n" + "ble 3f @ \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x08 @ h |= 0x08 \n" - "teq r3, #4 @ \n" - "ldrneb r4, [r0], #1 @ r6 = ht_line[1] \n" - "ldrneb r5, [r1], #2 @ r7 = pixmap[2] \n" - "beq 2f @ \n" + "cmp r3, #5 @ \n" + "ldrgtb r4, [r0], 
#1 @ r6 = ht_line[5] \n" + "ldrgtb r5, [r1], #2 @ r7 = pixmap[10] \n" + "ble 3f @ \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x04 @ h |= 0x04 \n" - "teq r3, #5 @ \n" - "ldrneb r4, [r0], #1 @ r6 = ht_line[1] \n" - "ldrneb r5, [r1], #2 @ r7 = pixmap[2] \n" - "beq 2f @ \n" + "cmp r3, #6 @ \n" + "ldrgtb r4, [r0], #1 @ r6 = ht_line[6] \n" + "ldrgtb r5, [r1], #2 @ r7 = pixmap[12] \n" + "ble 3f @ \n" "cmp r5, r4 @ if (r5 < r4) \n" "orrlt r14,r14,#0x02 @ h |= 0x02 \n" - "2: \n" - "strb r14,[r2] @ *out = h \n" "3: \n" + "strb r14,[r2] @ *out = h \n" + "4: \n" "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n" ENTER_THUMB ); @@ -325,6 +339,192 @@ static void do_threshold_1(const unsigned char * restrict ht_line, const unsigne white = 0xFF. Reversing these tests enables us to maintain that BlackIs1 in bitmaps. */ +#ifdef ARCH_ARM +static void +do_threshold_4(const unsigned char * restrict ht_line, const unsigned char * restrict pixmap, unsigned char *restrict out, int w, int ht_len) +__attribute__((naked)); + +#ifdef ARCH_ARM_CAN_LOAD_UNALIGNED +static void +do_threshold_4(const unsigned char * restrict ht_line, const unsigned char * restrict pixmap, unsigned char *restrict out, int w, int ht_len) +{ + asm volatile( + ENTER_ARM + // Store one more reg than required to keep double stack alignment + "stmfd r13!,{r4-r7,r9,r14} \n" + "@ r0 = ht_line \n" + "@ r1 = pixmap \n" + "@ r2 = out \n" + "@ r3 = w \n" + "@ <> = ht_len \n" + "ldr r9, [r13,#6*4] @ r9 = ht_len \n" + "subs r3, r3, #1 @ r3 = w -= 1 \n" + "ble 2f @ while (w > 0) { \n" + "mov r12,r9 @ r12= l = ht_len \n" + "b 1f @ \n" + "9: @ \n" + "strb r14,[r2], #1 @ *out++ = h \n" + "subs r12,r12,#2 @ r12 = l -= 2 \n" + "moveq r12,r9 @ if(l==0) l = ht_len \n" + "subeq r0, r0, r9, LSL #2 @ ht_line -= l \n" + "subs r3, r3, #2 @ w -= 2 \n" + "beq 2f @ } \n" + "blt 3f @ \n" + "1: \n" + "ldr r5, [r1], #5 @ r5 = pixmap[0..3] \n" + "ldr r7, [r1], #5 @ r7 = pixmap[5..8] \n" + "add r0, r0, #8 @ r0 = ht_line += 8 \n" + "mov 
r14,#0 @ r14= h = 0 \n" + "orrs r5, r5, r7 @ if (r5 | r7 == 0) \n" + "beq 9b @ white \n" + "ldrb r4, [r0, #-8] @ r4 = ht_line[0] \n" + "ldrb r5, [r1, #-10] @ r5 = pixmap[0] \n" + "ldrb r6, [r0, #-7] @ r6 = ht_line[1] \n" + "ldrb r7, [r1, #-9] @ r7 = pixmap[1] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x80 @ h |= 0x80 \n" + "ldrb r4, [r0, #-6] @ r4 = ht_line[2] \n" + "ldrb r5, [r1, #-8] @ r5 = pixmap[2] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x40 @ h |= 0x40 \n" + "ldrb r6, [r0, #-5] @ r6 = ht_line[3] \n" + "ldrb r7, [r1, #-7] @ r7 = pixmap[3] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x20 @ h |= 0x20 \n" + "ldrb r4, [r0, #-4] @ r4 = ht_line[4] \n" + "ldrb r5, [r1, #-5] @ r5 = pixmap[5] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x10 @ h |= 0x10 \n" + "ldrb r6, [r0, #-3] @ r6 = ht_line[5] \n" + "ldrb r7, [r1, #-4] @ r7 = pixmap[6] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x08 @ h |= 0x08 \n" + "ldrb r4, [r0, #-2] @ r4 = ht_line[6] \n" + "ldrb r5, [r1, #-3] @ r5 = pixmap[7] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x04 @ h |= 0x04 \n" + "ldrb r6, [r0, #-1] @ r6 = ht_line[7] \n" + "ldrb r7, [r1, #-2] @ r7 = pixmap[8] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x02 @ h |= 0x02 \n" + "cmp r6, r7 @ if (r7 < r6) \n" + "orrle r14,r14,#0x01 @ h |= 0x01 \n" + "subs r12,r12,#2 @ r12 = l -= 2 \n" + "strb r14,[r2], #1 @ *out++ = h \n" + "moveq r12,r9 @ if(l==0) l = ht_len \n" + "subeq r0, r0, r9, LSL #2 @ ht_line -= l \n" + "subs r3, r3, #2 @ w -= 2 \n" + "bgt 1b @ } \n" + "blt 3f @ \n" + "2: \n" + "ldrb r4, [r0], #1 @ r4 = ht_line[0] \n" + "ldrb r5, [r1], #1 @ r5 = pixmap[0] \n" + "mov r14, #0 @ r14= h = 0 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[1] \n" + "ldrb r7, [r1], #1 @ r7 = pixmap[1] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x80 @ h |= 0x80 \n" + "ldrb r4, [r0], #1 @ r6 = ht_line[2] \n" + "ldrb r5, [r1], #1 @ r7 = pixmap[2] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + 
"orrle r14,r14,#0x40 @ h |= 0x40 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[1] \n" + "ldrb r7, [r1], #2 @ r7 = pixmap[2] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x20 @ h |= 0x20 \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x10 @ h |= 0x10 \n" + "strb r14,[r2] @ *out = h \n" + "3: \n" + "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} +#else +/* Vanilla version, should work on all ARMs */ +static void +do_threshold_4(const unsigned char * restrict ht_line, const unsigned char * restrict pixmap, unsigned char *restrict out, int w, int ht_len) +{ + asm volatile( + ENTER_ARM + // Store one more reg than required to keep double stack alignment + "stmfd r13!,{r4-r7,r9,r14} \n" + "@ r0 = ht_line \n" + "@ r1 = pixmap \n" + "@ r2 = out \n" + "@ r3 = w \n" + "@ <> = ht_len \n" + "ldr r9, [r13,#6*4] @ r9 = ht_len \n" + "subs r3, r3, #1 @ r3 = w -= 1 \n" + "ble 2f @ while (w > 0) { \n" + "mov r12,r9 @ r12= l = ht_len \n" + "1: \n" + "mov r14,#0 @ r14= h = 0 \n" + "ldrb r4, [r0], #1 @ r4 = ht_line[0] \n" + "ldrb r5, [r1], #1 @ r5 = pixmap[0] \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[1] \n" + "ldrb r7, [r1], #1 @ r7 = pixmap[1] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x80 @ h |= 0x80 \n" + "ldrb r4, [r0], #1 @ r4 = ht_line[2] \n" + "ldrb r5, [r1], #1 @ r5 = pixmap[2] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x40 @ h |= 0x40 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[3] \n" + "ldrb r7, [r1], #2 @ r7 = pixmap[3] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x20 @ h |= 0x20 \n" + "ldrb r4, [r0], #1 @ r4 = ht_line[4] \n" + "ldrb r5, [r1], #1 @ r5 = pixmap[4] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x10 @ h |= 0x10 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[5] \n" + "ldrb r7, [r1], #1 @ r7 = pixmap[6] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x08 @ h |= 0x08 \n" + "ldrb r4, [r0], #1 @ r4 = ht_line[6] \n" + "ldrb r5, [r1], #1 @ r5 = pixmap[7] \n" + "cmp r6, r7 @ if (r6 < r7) 
\n" + "orrle r14,r14,#0x04 @ h |= 0x04 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[7] \n" + "ldrb r7, [r1], #2 @ r7 = pixmap[8] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x02 @ h |= 0x02 \n" + "cmp r6, r7 @ if (r7 < r6) \n" + "orrle r14,r14,#0x01 @ h |= 0x01 \n" + "subs r12,r12,#2 @ r12 = l -= 2 \n" + "strb r14,[r2], #1 @ *out++ = h \n" + "moveq r12,r9 @ if(l==0) l = ht_len \n" + "subeq r0, r0, r9, LSL #2 @ ht_line -= l \n" + "subs r3, r3, #2 @ w -= 2 \n" + "bgt 1b @ } \n" + "blt 3f @ \n" + "2: \n" + "ldrb r4, [r0], #1 @ r4 = ht_line[0] \n" + "ldrb r5, [r1], #1 @ r5 = pixmap[0] \n" + "mov r14, #0 @ r14= h = 0 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[1] \n" + "ldrb r7, [r1], #1 @ r7 = pixmap[1] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x80 @ h |= 0x80 \n" + "ldrb r4, [r0], #1 @ r6 = ht_line[2] \n" + "ldrb r5, [r1], #1 @ r7 = pixmap[2] \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x40 @ h |= 0x40 \n" + "ldrb r6, [r0], #1 @ r6 = ht_line[1] \n" + "ldrb r7, [r1], #2 @ r7 = pixmap[2] \n" + "cmp r4, r5 @ if (r4 < r5) \n" + "orrle r14,r14,#0x20 @ h |= 0x20 \n" + "cmp r6, r7 @ if (r6 < r7) \n" + "orrle r14,r14,#0x10 @ h |= 0x10 \n" + "strb r14,[r2] @ *out = h \n" + "3: \n" + "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} +#endif /* UNALIGNED */ +#else static void do_threshold_4(const unsigned char * restrict ht_line, const unsigned char * restrict pixmap, unsigned char * restrict out, int w, int ht_len) { int l = ht_len; @@ -374,6 +574,7 @@ static void do_threshold_4(const unsigned char * restrict ht_line, const unsigne *out = h; } } +#endif fz_bitmap *fz_new_bitmap_from_pixmap(fz_context *ctx, fz_pixmap *pix, fz_halftone *ht) { -- cgit v1.2.3