author     Robin Watts <Robin.Watts@artifex.com>   2011-10-07 12:46:20 +0100
committer  Robin Watts <robin@ghostscript.com>     2011-11-14 15:52:20 +0000
commit     e806b23e27d873516ba2c232c7d5f6c3bbfad370 (patch)
tree       a3a6880f2aa25a4ce6a020b8e7f3c212e7f47f50 /draw
parent     97d00440c043b712a2d16134e3b52850c7b36d47 (diff)
download   mupdf-e806b23e27d873516ba2c232c7d5f6c3bbfad370.tar.xz
ARM opts in draw_scale.c; updated android build opts.
Kammerer reports that 90%+ of CPU time is spent in the image scaling code for his documents on Android. In this commit we provide ARM-optimised cores for the common scaling routines (1/2/4 components). Tests indicate this doubles rendering speed for a bitmap-heavy PDF file on an HTC Desire.

This code is included if ARCH_ARM is defined. If ARCH_THUMB is also defined, extra instructions are added to ensure correct interworking when the rest of the build is Thumb code. We also update the Android jni makefiles to set these defines.

We update ReadMe.txt with more explicit instructions and more modern ndk/sdk versions, and update build.xml in line with new sdk releases.
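For orientation, here is a C sketch of the per-row loop that the new 1-component core implements, reconstructed from the comments in the assembly below. It is a hypothetical reference only (the _ref name and exact shape are ours, not part of this diff); the real non-ARM C path kept in draw_scale.c may differ in detail.

/* Hypothetical reference sketch (not part of this diff): the loop the
 * new scale_row_to_temp1 ARM core implements, reconstructed from the
 * assembly comments. */
static void
scale_row_to_temp1_ref(int *dst, unsigned char *src, fz_weights *weights)
{
	int *contrib = &weights->index[weights->index[0]];
	int i, len;

	if (weights->flip)
	{
		dst += weights->count;
		for (i = weights->count; i > 0; i--)
		{
			unsigned char *min = &src[*contrib++];
			int a = 0;
			for (len = *contrib++; len > 0; len--)
				a += *contrib++ * *min++;
			*--dst = a;
		}
	}
	else
	{
		for (i = weights->count; i > 0; i--)
		{
			unsigned char *min = &src[*contrib++];
			int a = 0;
			for (len = *contrib++; len > 0; len--)
				a += *contrib++ * *min++;
			*dst++ = a;
		}
	}
}

The 2- and 4-component cores follow the same structure, accumulating one sum per component; the flip branch simply walks dst backwards.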
Diffstat (limited to 'draw')
-rw-r--r--   draw/draw_scale.c   368
1 file changed, 285 insertions(+), 83 deletions(-)
diff --git a/draw/draw_scale.c b/draw/draw_scale.c
index 427512ce..bb766398 100644
--- a/draw/draw_scale.c
+++ b/draw/draw_scale.c
@@ -16,6 +16,21 @@ and then positioning it at (frac(x),frac(y)).
*/
#define SINGLE_PIXEL_SPECIALS
+/* If we're compiling as thumb code, then we need to tell the compiler
+ * to enter and exit ARM mode around our assembly sections. If we move
+ * the ARM functions to a separate file and arrange for it to be compiled
+ * without thumb mode, we can save some time on entry.
+ */
+#ifdef ARCH_ARM
+#ifdef ARCH_THUMB
+#define ENTER_ARM ".balign 4\nmov r12,pc\nbx r12\n0:.arm\n"
+#define ENTER_THUMB "9:.thumb\n"
+#else
+#define ENTER_ARM
+#define ENTER_THUMB
+#endif
+#endif
+
#ifdef DEBUG_SCALING
#ifdef WIN32
#include <windows.h>
@@ -247,10 +262,10 @@ typedef struct fz_weights_s fz_weights;
struct fz_weights_s
{
+ int flip;
int count;
int max_len;
int n;
- int flip;
int new_line;
int index[1];
};
@@ -584,6 +599,274 @@ scale_row_to_temp(int *dst, unsigned char *src, fz_weights *weights)
}
}
+#ifdef ARCH_ARM
+
+static void
+scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights)
+__attribute__((naked));
+
+static void
+scale_row_to_temp2(int *dst, unsigned char *src, fz_weights *weights)
+__attribute__((naked));
+
+static void
+scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
+__attribute__((naked));
+
+static void
+scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width, int row)
+__attribute__((naked));
+
+static void
+scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights)
+{
+ /* possible optimisation in here; unroll inner loops to avoid stall. */
+ asm volatile(
+ ENTER_ARM
+ "stmfd r13!,{r4-r5,r9,r14} \n"
+ "@ r0 = dst \n"
+ "@ r1 = src \n"
+ "@ r2 = weights \n"
+ "ldr r12,[r2],#4 @ r12= flip \n"
+ "ldr r3, [r2],#16 @ r3 = count r2 = &index\n"
+ "ldr r4, [r2] @ r4 = index[0] \n"
+ "cmp r12,#0 @ if (flip) \n"
+ "beq 4f @ { \n"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "add r0, r0, r3, LSL #2 @ dst += count \n"
+ "1: \n"
+ "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
+ "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
+ "mov r5, #0 @ r5 = a = 0 \n"
+ "add r4, r1, r4 @ r4 = min = &src[r4] \n"
+ "cmp r9, #0 @ while (len-- > 0) \n"
+ "beq 3f @ { \n"
+ "2: \n"
+ "ldr r12,[r2], #4 @ r12 = *contrib++ \n"
+ "ldrb r14,[r4], #1 @ r14 = *min++ \n"
+ "subs r9, r9, #1 @ r9 = len-- \n"
+ "@stall on r14 \n"
+ "mla r5, r12,r14,r5 @ g += r14 * r12 \n"
+ "bgt 2b @ } \n"
+ "3: \n"
+ "str r5,[r0, #-4]! @ *--dst=a \n"
+ "subs r3, r3, #1 @ i-- \n"
+ "bgt 1b @ \n"
+ "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n"
+ "4:"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "5:"
+ "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
+ "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
+ "mov r5, #0 @ r5 = a = 0 \n"
+ "add r4, r1, r4 @ r4 = min = &src[r4] \n"
+ "cmp r9, #0 @ while (len-- > 0) \n"
+ "beq 7f @ { \n"
+ "6: \n"
+ "ldr r12,[r2], #4 @ r12 = *contrib++ \n"
+ "ldrb r14,[r4], #1 @ r14 = *min++ \n"
+ "subs r9, r9, #1 @ r9 = len-- \n"
+ "@stall on r14 \n"
+ "mla r5, r12,r14,r5 @ a += r14 * r12 \n"
+ "bgt 6b @ } \n"
+ "7: \n"
+ "str r5, [r0], #4 @ *dst++=a \n"
+ "subs r3, r3, #1 @ i-- \n"
+ "bgt 5b @ \n"
+ "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n"
+ ENTER_THUMB
+ );
+}
+
+static void
+scale_row_to_temp2(int *dst, unsigned char *src, fz_weights *weights)
+{
+ asm volatile(
+ ENTER_ARM
+ "stmfd r13!,{r4-r6,r9-r11,r14} \n"
+ "@ r0 = dst \n"
+ "@ r1 = src \n"
+ "@ r2 = weights \n"
+ "ldr r12,[r2],#4 @ r12= flip \n"
+ "ldr r3, [r2],#16 @ r3 = count r2 = &index\n"
+ "ldr r4, [r2] @ r4 = index[0] \n"
+ "cmp r12,#0 @ if (flip) \n"
+ "beq 4f @ { \n"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "add r0, r0, r3, LSL #3 @ dst += 2*count \n"
+ "1: \n"
+ "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
+ "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
+ "mov r5, #0 @ r5 = g = 0 \n"
+ "mov r6, #0 @ r6 = a = 0 \n"
+ "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n"
+ "cmp r9, #0 @ while (len-- > 0) \n"
+ "beq 3f @ { \n"
+ "2: \n"
+ "ldr r14,[r2], #4 @ r14 = *contrib++ \n"
+ "ldrb r11,[r4], #1 @ r11 = *min++ \n"
+ "ldrb r12,[r4], #1 @ r12 = *min++ \n"
+ "subs r9, r9, #1 @ r9 = len-- \n"
+ "mla r5, r14,r11,r5 @ g += r11 * r14 \n"
+ "mla r6, r14,r12,r6 @ a += r12 * r14 \n"
+ "bgt 2b @ } \n"
+ "3: \n"
+ "stmdb r0!,{r5,r6} @ *--dst=a;*--dst=g; \n"
+ "subs r3, r3, #1 @ i-- \n"
+ "bgt 1b @ \n"
+ "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n"
+ "4:"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "5:"
+ "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
+ "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
+ "mov r5, #0 @ r5 = g = 0 \n"
+ "mov r6, #0 @ r6 = a = 0 \n"
+ "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n"
+ "cmp r9, #0 @ while (len-- > 0) \n"
+ "beq 7f @ { \n"
+ "6: \n"
+ "ldr r14,[r2], #4 @ r10 = *contrib++ \n"
+ "ldrb r11,[r4], #1 @ r11 = *min++ \n"
+ "ldrb r12,[r4], #1 @ r12 = *min++ \n"
+ "subs r9, r9, #1 @ r9 = len-- \n"
+ "mla r5, r14,r11,r5 @ g += r11 * r14 \n"
+ "mla r6, r14,r12,r6 @ a += r12 * r14 \n"
+ "bgt 6b @ } \n"
+ "7: \n"
+ "stmia r0!,{r5,r6} @ *dst++=r;*dst++=g; \n"
+ "subs r3, r3, #1 @ i-- \n"
+ "bgt 5b @ \n"
+ "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n"
+ ENTER_THUMB
+ );
+}
+
+static void
+scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
+{
+ asm volatile(
+ ENTER_ARM
+ "stmfd r13!,{r4-r11,r14} \n"
+ "@ r0 = dst \n"
+ "@ r1 = src \n"
+ "@ r2 = weights \n"
+ "ldr r12,[r2],#4 @ r12= flip \n"
+ "ldr r3, [r2],#16 @ r3 = count r2 = &index\n"
+ "ldr r4, [r2] @ r4 = index[0] \n"
+ "cmp r12,#0 @ if (flip) \n"
+ "beq 4f @ { \n"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "add r0, r0, r3, LSL #4 @ dst += 4*count \n"
+ "1: \n"
+ "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
+ "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
+ "mov r5, #0 @ r5 = r = 0 \n"
+ "mov r6, #0 @ r6 = g = 0 \n"
+ "mov r7, #0 @ r7 = b = 0 \n"
+ "mov r8, #0 @ r8 = a = 0 \n"
+ "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
+ "cmp r9, #0 @ while (len-- > 0) \n"
+ "beq 3f @ { \n"
+ "2: \n"
+ "ldr r10,[r2], #4 @ r10 = *contrib++ \n"
+ "ldrb r11,[r4], #1 @ r11 = *min++ \n"
+ "ldrb r12,[r4], #1 @ r12 = *min++ \n"
+ "ldrb r14,[r4], #1 @ r14 = *min++ \n"
+ "mla r5, r10,r11,r5 @ r += r11 * r10 \n"
+ "ldrb r11,[r4], #1 @ r11 = *min++ \n"
+ "mla r6, r10,r12,r6 @ g += r12 * r10 \n"
+ "mla r7, r10,r14,r7 @ b += r14 * r10 \n"
+ "mla r8, r10,r11,r8 @ a += r11 * r10 \n"
+ "subs r9, r9, #1 @ r9 = len-- \n"
+ "bgt 2b @ } \n"
+ "3: \n"
+ "stmdb r0!,{r5,r6,r7,r8} @ *--dst=a;*--dst=b; \n"
+ " @ *--dst=g;*--dst=r; \n"
+ "subs r3, r3, #1 @ i-- \n"
+ "bgt 1b @ \n"
+ "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
+ "4:"
+ "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
+ "5:"
+ "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
+ "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
+ "mov r5, #0 @ r5 = r = 0 \n"
+ "mov r6, #0 @ r6 = g = 0 \n"
+ "mov r7, #0 @ r7 = b = 0 \n"
+ "mov r8, #0 @ r8 = a = 0 \n"
+ "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
+ "cmp r9, #0 @ while (len-- > 0) \n"
+ "beq 7f @ { \n"
+ "6: \n"
+ "ldr r10,[r2], #4 @ r10 = *contrib++ \n"
+ "ldrb r11,[r4], #1 @ r11 = *min++ \n"
+ "ldrb r12,[r4], #1 @ r12 = *min++ \n"
+ "ldrb r14,[r4], #1 @ r14 = *min++ \n"
+ "mla r5, r10,r11,r5 @ r += r11 * r10 \n"
+ "ldrb r11,[r4], #1 @ r11 = *min++ \n"
+ "mla r6, r10,r12,r6 @ g += r12 * r10 \n"
+ "mla r7, r10,r14,r7 @ b += r14 * r10 \n"
+ "mla r8, r10,r11,r8 @ a += r11 * r10 \n"
+ "subs r9, r9, #1 @ r9 = len-- \n"
+ "bgt 6b @ } \n"
+ "7: \n"
+ "stmia r0!,{r5,r6,r7,r8} @ *dst++=r;*dst++=g; \n"
+ " @ *dst++=b;*dst++=a; \n"
+ "subs r3, r3, #1 @ i-- \n"
+ "bgt 5b @ \n"
+ "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
+ ENTER_THUMB
+ );
+}
+
+static void
+scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width, int row)
+{
+ asm volatile(
+ ENTER_ARM
+ "ldr r12,[r13] @ r12= row \n"
+ "add r2, r2, #20 @ r2 = weights->index \n"
+ "stmfd r13!,{r4-r11,r14} \n"
+ "@ r0 = dst \n"
+ "@ r1 = src \n"
+ "@ r2 = &weights->index[0] \n"
+ "@ r3 = width \n"
+ "@ r12= row \n"
+ "ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n"
+ "add r2, r2, #4 @ r2 = &index[1] \n"
+ "mov r6, r3 @ r6 = x = width \n"
+ "ldr r14,[r2, r4, LSL #2]! @ r2 = contrib = index[index[row]+1]\n"
+ " @ r14= len = *contrib \n"
+ "1: \n"
+ "mov r5, r1 @ r5 = min = src \n"
+ "mov r7, #1<<15 @ r7 = val = 1<<15 \n"
+ "movs r8, r14 @ r8 = len2 = len \n"
+ "add r9, r2, #4 @ r9 = contrib2 \n"
+ "ble 3f @ while (len2-- > 0) { \n"
+ "2: \n"
+ "ldr r10,[r9], #4 @ r10 = *contrib2++ \n"
+ "ldr r12,[r5], r3, LSL #2 @ r12 = *min r5 = min += width\n"
+ "subs r8, r8, #1 @ len2-- \n"
+ "@ stall r12 \n"
+ "mla r7, r10,r12,r7 @ val += r12 * r10 \n"
+ "bgt 2b @ } \n"
+ "3: \n"
+ "movs r7, r7, asr #16 @ r7 = val >>= 16 \n"
+ "movlt r7, #0 @ if (r7 < 0) r7 = 0 \n"
+ "cmp r7, #255 @ if (r7 > 255) \n"
+ "add r1, r1, #4 @ src++ \n"
+ "movgt r7, #255 @ r7 = 255 \n"
+ "subs r6, r6, #1 @ x-- \n"
+ "strb r7, [r0], #1 @ *dst++ = val \n"
+ "bgt 1b @ \n"
+ "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
+ ENTER_THUMB
+ );
+}
+
+#else
+
static void
scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights)
{
@@ -672,54 +955,13 @@ static void
scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
{
int *contrib = &weights->index[weights->index[0]];
-#ifndef ARCH_ARM
int len, i;
unsigned char *min;
-#endif
assert(weights->n == 4);
if (weights->flip)
{
dst += 4*weights->count;
-#ifdef ARCH_ARM
- asm volatile(
- "1:"
- "ldr r4, [%2], #4 @ r4 = *contrib++ \n"
- "ldr r9, [%2], #4 @ r9 = len = *contrib++ \n"
- "mov r5, #0 @ r5 = r = 0 \n"
- "mov r6, #0 @ r6 = g = 0 \n"
- "mov r7, #0 @ r7 = b = 0 \n"
- "mov r8, #0 @ r8 = a = 0 \n"
- "add r4, %1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
- "cmp r9, #0 @ while (len-- > 0) \n"
- "beq 3f @ { \n"
- "2: \n"
- "ldr r10,[%2], #4 @ r10 = *contrib++ \n"
- "ldrb r11,[r4], #1 @ r11 = *min++ \n"
- "ldrb r12,[r4], #1 @ r12 = *min++ \n"
- "ldrb r14,[r4], #1 @ r14 = *min++ \n"
- "mla r5, r10,r11,r5 @ r += r11 * r10 \n"
- "ldrb r11,[r4], #1 @ r11 = *min++ \n"
- "mla r6, r10,r12,r6 @ g += r12 * r10 \n"
- "mla r7, r10,r14,r7 @ b += r14 * r10 \n"
- "mla r8, r10,r11,r8 @ a += r11 * r10 \n"
- "subs r9, r9, #1 @ r9 = len-- \n"
- "bgt 2b @ } \n"
- "stmdb %0!,{r5,r6,r7,r8} @ *--dst=a;*--dst=b; \n"
- "3: @ *--dst=g;*--dst=r; \n"
- "subs %3, %3, #1 @ i-- \n"
- "bgt 1b @ \n"
- :
- :
- "r" (dst),
- "r" (src),
- "r" (contrib),
- "r" (weights->count)
- :
- "r4","r5","r6","r7","r8","r9","r10","r11","r12","r14",
- "memory","cc"
- );
-#else
for (i=weights->count; i > 0; i--)
{
int r = 0;
@@ -740,49 +982,9 @@ scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
*--dst = g;
*--dst = r;
}
-#endif
}
else
{
-#ifdef ARCH_ARM
- asm volatile(
- "1:"
- "ldr r4, [%2], #4 @ r4 = *contrib++ \n"
- "ldr r9, [%2], #4 @ r9 = len = *contrib++ \n"
- "mov r5, #0 @ r5 = r = 0 \n"
- "mov r6, #0 @ r6 = g = 0 \n"
- "mov r7, #0 @ r7 = b = 0 \n"
- "mov r8, #0 @ r8 = a = 0 \n"
- "add r4, %1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
- "cmp r9, #0 @ while (len-- > 0) \n"
- "beq 3f @ { \n"
- "2: \n"
- "ldr r10,[%2], #4 @ r10 = *contrib++ \n"
- "ldrb r11,[r4], #1 @ r11 = *min++ \n"
- "ldrb r12,[r4], #1 @ r12 = *min++ \n"
- "ldrb r14,[r4], #1 @ r14 = *min++ \n"
- "mla r5, r10,r11,r5 @ r += r11 * r10 \n"
- "ldrb r11,[r4], #1 @ r11 = *min++ \n"
- "mla r6, r10,r12,r6 @ g += r12 * r10 \n"
- "mla r7, r10,r14,r7 @ b += r14 * r10 \n"
- "mla r8, r10,r11,r8 @ a += r11 * r10 \n"
- "subs r9, r9, #1 @ r9 = len-- \n"
- "bgt 2b @ } \n"
- "stmia %0!,{r5,r6,r7,r8} @ *dst++=r;*dst++=g; \n"
- "3: @ *dst++=b;*dst++=a; \n"
- "subs %3, %3, #1 @ i-- \n"
- "bgt 1b @ \n"
- :
- :
- "r" (dst),
- "r" (src),
- "r" (contrib),
- "r" (weights->count)
- :
- "r4","r5","r6","r7","r8","r9","r10","r11","r12","r14",
- "memory","cc"
- );
-#else
for (i=weights->count; i > 0; i--)
{
int r = 0;
@@ -803,7 +1005,6 @@ scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
*dst++ = b;
*dst++ = a;
}
-#endif
}
}
@@ -836,6 +1037,7 @@ scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width
src++;
}
}
+#endif
#ifdef SINGLE_PIXEL_SPECIALS
static void