author | Robin Watts <robin.watts@artifex.com> | 2011-11-14 18:22:13 +0000
committer | Robin Watts <robin.watts@artifex.com> | 2011-11-15 15:20:54 +0000
commit | 9c0a49060475b2dea1e4c2668bebd1d566113a7b (patch)
tree | 49e45a691cf105f4266d5c6b7242a4a3256c1200 /draw
parent | 60c0544742931da63db623ad7a79ba3758704cc1 (diff)
parent | fd6def85f22b598d4c278e76138ab7dccbb84c36 (diff)
download | mupdf-9c0a49060475b2dea1e4c2668bebd1d566113a7b.tar.xz
Merge branch 'master' into context
Mostly redoing the xps_context to xps_document change and adding
contexts to newly written code.
Conflicts:
apps/pdfapp.c
apps/pdfapp.h
apps/x11_main.c
apps/xpsdraw.c
draw/draw_device.c
draw/draw_scale.c
fitz/base_object.c
fitz/fitz.h
pdf/mupdf.h
pdf/pdf_interpret.c
pdf/pdf_outline.c
pdf/pdf_page.c
xps/muxps.h
xps/xps_doc.c
xps/xps_xml.c
Diffstat (limited to 'draw')
-rw-r--r-- | draw/draw_affine.c | 158
-rw-r--r-- | draw/draw_device.c | 10
-rw-r--r-- | draw/draw_scale.c | 426
-rw-r--r-- | draw/draw_simple_scale.c | 1353
4 files changed, 1795 insertions(+), 152 deletions(-)
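The draw_affine.c hunk below replaces the old per-rectangle rounding with fz_gridfit_matrix(), which, for an axis-aligned matrix, snaps the translation down onto a pixel boundary, widens the scale by the amount moved so coverage never shrinks, and then rounds the scale up to a whole number of pixels, using MY_EPSILON rather than FLT_EPSILON so repeated calls stay idempotent. A minimal standalone sketch of that idea for the positive-scale case on one axis (the helper gridfit_axis and the sample values are illustrative only, not MuPDF API):

```c
#include <math.h>
#include <stdio.h>

#define MY_EPSILON 0.001f	/* same tolerance the patch adopts */

/* Illustrative helper: snap one axis (scale > 0) of an axis-aligned
 * transform so that [trans, trans+scale) covers whole pixels. */
static void gridfit_axis(float *scale, float *trans)
{
	float f;

	/* Move the low edge down onto a pixel boundary; widen the scale
	 * by the amount moved so the covered area never shrinks. */
	f = (float)(int)(*trans);
	if (f - *trans > MY_EPSILON)
		f -= 1.0f;		/* ensure we moved downwards */
	*scale += *trans - f;
	*trans = f;

	/* Round the scale up to an integer so the high edge also lands
	 * on a pixel boundary. */
	f = (float)(int)(*scale);
	if (*scale - f > MY_EPSILON)
		f += 1.0f;
	*scale = f;
}

int main(void)
{
	float a = 99.7f, e = 10.4f;	/* width and x-translation */
	gridfit_axis(&a, &e);
	printf("a=%g e=%g\n", a, e);	/* expect a=101 e=10 */
	return 0;
}
```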
diff --git a/draw/draw_affine.c b/draw/draw_affine.c index 95d28bea..c1ee88f8 100644 --- a/draw/draw_affine.c +++ b/draw/draw_affine.c @@ -454,6 +454,146 @@ fz_paint_affine_color_near(byte *dp, byte *sp, int sw, int sh, int u, int v, int } } +/* RJW: The following code was originally written to be sensitive to + * FLT_EPSILON. Given the way the 'minimum representable difference' + * between 2 floats changes size as we scale, we now pick a larger + * value to ensure idempotency even with rounding problems. The + * value we pick is still far smaller than would ever show up with + * antialiasing. + */ +#define MY_EPSILON 0.001 + +void +fz_gridfit_matrix(fz_matrix *m) +{ + if (fabsf(m->b) < FLT_EPSILON && fabsf(m->c) < FLT_EPSILON) + { + if (m->a > 0) + { + float f; + /* Adjust left hand side onto pixel boundary */ + f = (float)(int)(m->e); + if (f - m->e > MY_EPSILON) + f -= 1.0; /* Ensure it moves left */ + m->a += m->e - f; /* width gets wider as f <= m->e */ + m->e = f; + /* Adjust right hand side onto pixel boundary */ + f = (float)(int)(m->a); + if (m->a - f > MY_EPSILON) + f += 1.0; /* Ensure it moves right */ + m->a = f; + } + else if (m->a < 0) + { + float f; + /* Adjust right hand side onto pixel boundary */ + f = (float)(int)(m->e); + if (m->e - f > MY_EPSILON) + f += 1.0; /* Ensure it moves right */ + m->a += m->e - f; /* width gets wider (more -ve) */ + m->e = f; + /* Adjust left hand side onto pixel boundary */ + f = (float)(int)(m->a); + if (f - m->a > MY_EPSILON) + f -= 1.0; /* Ensure it moves left */ + m->a = f; + } + if (m->d > 0) + { + float f; + /* Adjust top onto pixel boundary */ + f = (float)(int)(m->f); + if (f - m->f > MY_EPSILON) + f -= 1.0; /* Ensure it moves upwards */ + m->d += m->f - f; /* width gets wider as f <= m->f */ + m->f = f; + /* Adjust bottom onto pixel boundary */ + f = (float)(int)(m->d); + if (m->d - f > MY_EPSILON) + f += 1.0; /* Ensure it moves down */ + m->d = f; + } + else if (m->d < 0) + { + float f; + /* Adjust bottom onto pixel boundary */ + f = (float)(int)(m->f); + if (m->f - f > MY_EPSILON) + f += 1.0; /* Ensure it moves down */ + m->d += m->f - f; /* width gets wider (more -ve) */ + m->f = f; + /* Adjust top onto pixel boundary */ + f = (float)(int)(m->d); + if (f - m->d > MY_EPSILON) + f -= 1.0; /* Ensure it moves up */ + m->d = f; + } + } + else if (fabsf(m->a) < FLT_EPSILON && fabsf(m->d) < FLT_EPSILON) + { + if (m->b > 0) + { + float f; + /* Adjust left hand side onto pixel boundary */ + f = (float)(int)(m->f); + if (f - m->f > MY_EPSILON) + f -= 1.0; /* Ensure it moves left */ + m->b += m->f - f; /* width gets wider as f <= m->f */ + m->f = f; + /* Adjust right hand side onto pixel boundary */ + f = (float)(int)(m->b); + if (m->b - f > MY_EPSILON) + f += 1.0; /* Ensure it moves right */ + m->b = f; + } + else if (m->b < 0) + { + float f; + /* Adjust right hand side onto pixel boundary */ + f = (float)(int)(m->f); + if (m->f - f > MY_EPSILON) + f += 1.0; /* Ensure it moves right */ + m->b += m->f - f; /* width gets wider (more -ve) */ + m->f = f; + /* Adjust left hand side onto pixel boundary */ + f = (float)(int)(m->b); + if (f - m->b > MY_EPSILON) + f -= 1.0; /* Ensure it moves left */ + m->b = f; + } + if (m->c > 0) + { + float f; + /* Adjust top onto pixel boundary */ + f = (float)(int)(m->e); + if (f - m->e > MY_EPSILON) + f -= 1.0; /* Ensure it moves upwards */ + m->c += m->e - f; /* width gets wider as f <= m->e */ + m->e = f; + /* Adjust bottom onto pixel boundary */ + f = (float)(int)(m->c); + if (m->c - f > MY_EPSILON) + f += 
1.0; /* Ensure it moves down */ + m->c = f; + } + else if (m->c < 0) + { + float f; + /* Adjust bottom onto pixel boundary */ + f = (float)(int)(m->e); + if (m->e - f > MY_EPSILON) + f += 1.0; /* Ensure it moves down */ + m->c += m->e - f; /* width gets wider (more -ve) */ + m->e = f; + /* Adjust top onto pixel boundary */ + f = (float)(int)(m->c); + if (f - m->c > MY_EPSILON) + f -= 1.0; /* Ensure it moves up */ + m->c = f; + } + } +} + /* Draw an image with an affine transform on destination */ static void @@ -469,15 +609,7 @@ fz_paint_image_imp(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *shape, fz_pixmap void (*paintfn)(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha, byte *color, byte *hp); /* grid fit the image */ - if (fz_is_rectilinear(ctm)) - { - ctm.a = roundup(ctm.a); - ctm.b = roundup(ctm.b); - ctm.c = roundup(ctm.c); - ctm.d = roundup(ctm.d); - ctm.e = floorf(ctm.e); - ctm.f = floorf(ctm.f); - } + fz_gridfit_matrix(&ctm); /* turn on interpolation for upscaled and non-rectilinear transforms */ dolerp = 0; @@ -519,6 +651,14 @@ fz_paint_image_imp(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *shape, fz_pixmap u = (fa * x) + (fc * y) + inv.e * 65536 + ((fa + fc) >> 1); v = (fb * x) + (fd * y) + inv.f * 65536 + ((fb + fd) >> 1); + /* RJW: The following is voodoo. No idea why it works, but it gives + * the best match between scaled/unscaled/interpolated/non-interpolated + * that we have found. */ + if (dolerp) { + u -= 32768; + v -= 32768; + } + dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n; n = dst->n; sp = img->samples; diff --git a/draw/draw_device.c b/draw/draw_device.c index a8da9e19..5407d618 100644 --- a/draw/draw_device.c +++ b/draw/draw_device.c @@ -861,7 +861,10 @@ fz_transform_pixmap(fz_context *ctx, fz_pixmap *image, fz_matrix *ctm, int x, in if (ctm->a != 0 && ctm->b == 0 && ctm->c == 0 && ctm->d != 0) { /* Unrotated or X-flip or Y-flip or XY-flip */ - scaled = fz_scale_pixmap_gridfit(ctx, image, ctm->e, ctm->f, ctm->a, ctm->d, gridfit); + fz_matrix m = *ctm; + if (gridfit) + fz_gridfit_matrix(&m); + scaled = fz_scale_pixmap(ctx, image, m.e, m.f, m.a, m.d); if (scaled == NULL) return NULL; ctm->a = scaled->w; @@ -874,7 +877,10 @@ fz_transform_pixmap(fz_context *ctx, fz_pixmap *image, fz_matrix *ctm, int x, in if (ctm->a == 0 && ctm->b != 0 && ctm->c != 0 && ctm->d == 0) { /* Other orthogonal flip/rotation cases */ - scaled = fz_scale_pixmap_gridfit(ctx, image, ctm->f, ctm->e, ctm->b, ctm->c, gridfit); + fz_matrix m = *ctm; + if (gridfit) + fz_gridfit_matrix(&m); + scaled = fz_scale_pixmap(ctx, image, m.f, m.e, m.b, m.c); if (scaled == NULL) return NULL; ctm->b = scaled->w; diff --git a/draw/draw_scale.c b/draw/draw_scale.c index 4d1b66c5..fd3c3798 100644 --- a/draw/draw_scale.c +++ b/draw/draw_scale.c @@ -16,6 +16,21 @@ and then positioning it at (frac(x),frac(y)). */ #define SINGLE_PIXEL_SPECIALS +/* If we're compiling as thumb code, then we need to tell the compiler + * to enter and exit ARM mode around our assembly sections. If we move + * the ARM functions to a separate file and arrange for it to be compiled + * without thumb mode, we can save some time on entry. 
+ */ +#ifdef ARCH_ARM +#ifdef ARCH_THUMB +#define ENTER_ARM ".balign 4\nmov r12,pc\nbx r12\n0:.arm\n" +#define ENTER_THUMB "9:.thumb\n" +#else +#define ENTER_ARM +#define ENTER_THUMB +#endif +#endif + #ifdef DEBUG_SCALING #ifdef WIN32 #include <windows.h> @@ -247,10 +262,10 @@ typedef struct fz_weights_s fz_weights; struct fz_weights_s { + int flip; int count; int max_len; int n; - int flip; int new_line; int index[1]; }; @@ -584,6 +599,274 @@ scale_row_to_temp(int *dst, unsigned char *src, fz_weights *weights) } } +#ifdef ARCH_ARM + +static void +scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights) +__attribute__((naked)); + +static void +scale_row_to_temp2(int *dst, unsigned char *src, fz_weights *weights) +__attribute__((naked)); + +static void +scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights) +__attribute__((naked)); + +static void +scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width, int row) +__attribute__((naked)); + +static void +scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights) +{ + /* possible optimisation in here; unroll inner loops to avoid stall. */ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r5,r9,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = weights \n" + "ldr r12,[r2],#4 @ r12= flip \n" + "ldr r3, [r2],#16 @ r3 = count r2 = &index\n" + "ldr r4, [r2] @ r4 = index[0] \n" + "cmp r12,#0 @ if (flip) \n" + "beq 4f @ { \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "add r0, r0, r3, LSL #2 @ dst += count \n" + "1: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #0 @ r5 = a = 0 \n" + "add r4, r1, r4 @ r4 = min = &src[r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 3f @ { \n" + "2: \n" + "ldr r12,[r2], #4 @ r12 = *contrib++ \n" + "ldrb r14,[r4], #1 @ r14 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "@stall on r14 \n" + "mla r5, r12,r14,r5 @ g += r14 * r12 \n" + "bgt 2b @ } \n" + "3: \n" + "str r5,[r0, #-4]! 
@ *--dst=a \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n" + "4:" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "5:" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #0 @ r5 = a = 0 \n" + "add r4, r1, r4 @ r4 = min = &src[r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 7f @ { \n" + "6: \n" + "ldr r12,[r2], #4 @ r12 = *contrib++ \n" + "ldrb r14,[r4], #1 @ r14 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "@stall on r14 \n" + "mla r5, r12,r14,r5 @ a += r14 * r12 \n" + "bgt 6b @ } \n" + "7: \n" + "str r5, [r0], #4 @ *dst++=a \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 5b @ \n" + "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +static void +scale_row_to_temp2(int *dst, unsigned char *src, fz_weights *weights) +{ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r6,r9-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = weights \n" + "ldr r12,[r2],#4 @ r12= flip \n" + "ldr r3, [r2],#16 @ r3 = count r2 = &index\n" + "ldr r4, [r2] @ r4 = index[0] \n" + "cmp r12,#0 @ if (flip) \n" + "beq 4f @ { \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "add r0, r0, r3, LSL #3 @ dst += 2*count \n" + "1: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #0 @ r5 = g = 0 \n" + "mov r6, #0 @ r6 = a = 0 \n" + "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 3f @ { \n" + "2: \n" + "ldr r14,[r2], #4 @ r14 = *contrib++ \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "ldrb r12,[r4], #1 @ r12 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "mla r5, r14,r11,r5 @ g += r11 * r14 \n" + "mla r6, r14,r12,r6 @ a += r12 * r14 \n" + "bgt 2b @ } \n" + "3: \n" + "stmdb r0!,{r5,r6} @ *--dst=a;*--dst=g; \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n" + "4:" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "5:" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #0 @ r5 = g = 0 \n" + "mov r6, #0 @ r6 = a = 0 \n" + "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 7f @ { \n" + "6: \n" + "ldr r14,[r2], #4 @ r10 = *contrib++ \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "ldrb r12,[r4], #1 @ r12 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "mla r5, r14,r11,r5 @ g += r11 * r14 \n" + "mla r6, r14,r12,r6 @ a += r12 * r14 \n" + "bgt 6b @ } \n" + "7: \n" + "stmia r0!,{r5,r6} @ *dst++=r;*dst++=g; \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 5b @ \n" + "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +static void +scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights) +{ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = weights \n" + "ldr r12,[r2],#4 @ r12= flip \n" + "ldr r3, [r2],#16 @ r3 = count r2 = &index\n" + "ldr r4, [r2] @ r4 = index[0] \n" + "cmp r12,#0 @ if (flip) \n" + "beq 4f @ { \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "add r0, r0, r3, LSL #4 @ dst += 4*count \n" + "1: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #0 @ r5 = r = 0 \n" + "mov r6, #0 @ r6 = g = 0 \n" + "mov r7, #0 @ r7 = b = 0 \n" + "mov r8, #0 @ r8 = a = 0 \n" + "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 3f @ 
{ \n" + "2: \n" + "ldr r10,[r2], #4 @ r10 = *contrib++ \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "ldrb r12,[r4], #1 @ r12 = *min++ \n" + "ldrb r14,[r4], #1 @ r14 = *min++ \n" + "mla r5, r10,r11,r5 @ r += r11 * r10 \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "mla r6, r10,r12,r6 @ g += r12 * r10 \n" + "mla r7, r10,r14,r7 @ b += r14 * r10 \n" + "mla r8, r10,r11,r8 @ a += r11 * r10 \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "bgt 2b @ } \n" + "3: \n" + "stmdb r0!,{r5,r6,r7,r8} @ *--dst=a;*--dst=b; \n" + " @ *--dst=g;*--dst=r; \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" + "4:" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "5:" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #0 @ r5 = r = 0 \n" + "mov r6, #0 @ r6 = g = 0 \n" + "mov r7, #0 @ r7 = b = 0 \n" + "mov r8, #0 @ r8 = a = 0 \n" + "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 7f @ { \n" + "6: \n" + "ldr r10,[r2], #4 @ r10 = *contrib++ \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "ldrb r12,[r4], #1 @ r12 = *min++ \n" + "ldrb r14,[r4], #1 @ r14 = *min++ \n" + "mla r5, r10,r11,r5 @ r += r11 * r10 \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "mla r6, r10,r12,r6 @ g += r12 * r10 \n" + "mla r7, r10,r14,r7 @ b += r14 * r10 \n" + "mla r8, r10,r11,r8 @ a += r11 * r10 \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "bgt 6b @ } \n" + "7: \n" + "stmia r0!,{r5,r6,r7,r8} @ *dst++=r;*dst++=g; \n" + " @ *dst++=b;*dst++=a; \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 5b @ \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +static void +scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width, int row) +{ + asm volatile( + ENTER_ARM + "ldr r12,[r13] @ r12= row \n" + "add r2, r2, #20 @ r2 = weights->index \n" + "stmfd r13!,{r4-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = &weights->index[0] \n" + "@ r3 = width \n" + "@ r12= row \n" + "ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n" + "add r2, r2, #4 @ r2 = &index[1] \n" + "mov r6, r3 @ r6 = x = width \n" + "ldr r14,[r2, r4, LSL #2]! 
@ r2 = contrib = index[index[row]+1]\n" + " @ r14= len = *contrib \n" + "1: \n" + "mov r5, r1 @ r5 = min = src \n" + "mov r7, #1<<15 @ r7 = val = 1<<15 \n" + "movs r8, r14 @ r8 = len2 = len \n" + "add r9, r2, #4 @ r9 = contrib2 \n" + "ble 3f @ while (len2-- > 0) { \n" + "2: \n" + "ldr r10,[r9], #4 @ r10 = *contrib2++ \n" + "ldr r12,[r5], r3, LSL #2 @ r12 = *min r5 = min += width\n" + "subs r8, r8, #1 @ len2-- \n" + "@ stall r12 \n" + "mla r7, r10,r12,r7 @ val += r12 * r10 \n" + "bgt 2b @ } \n" + "3: \n" + "movs r7, r7, asr #16 @ r7 = val >>= 16 \n" + "movlt r7, #0 @ if (r7 < 0) r7 = 0 \n" + "cmp r7, #255 @ if (r7 > 255) \n" + "add r1, r1, #4 @ src++ \n" + "movgt r7, #255 @ r7 = 255 \n" + "subs r6, r6, #1 @ x-- \n" + "strb r7, [r0], #1 @ *dst++ = val \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +#else + static void scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights) { @@ -672,54 +955,13 @@ static void scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights) { int *contrib = &weights->index[weights->index[0]]; -#ifndef ARCH_ARM int len, i; unsigned char *min; -#endif assert(weights->n == 4); if (weights->flip) { dst += 4*weights->count; -#ifdef ARCH_ARM - asm volatile( - "1:" - "ldr r4, [%2], #4 @ r4 = *contrib++ \n" - "ldr r9, [%2], #4 @ r9 = len = *contrib++ \n" - "mov r5, #0 @ r5 = r = 0 \n" - "mov r6, #0 @ r6 = g = 0 \n" - "mov r7, #0 @ r7 = b = 0 \n" - "mov r8, #0 @ r8 = a = 0 \n" - "add r4, %1, r4, LSL #2 @ r4 = min = &src[4*r4] \n" - "cmp r9, #0 @ while (len-- > 0) \n" - "beq 3f @ { \n" - "2: \n" - "ldr r10,[%2], #4 @ r10 = *contrib++ \n" - "ldrb r11,[r4], #1 @ r11 = *min++ \n" - "ldrb r12,[r4], #1 @ r12 = *min++ \n" - "ldrb r14,[r4], #1 @ r14 = *min++ \n" - "mla r5, r10,r11,r5 @ r += r11 * r10 \n" - "ldrb r11,[r4], #1 @ r11 = *min++ \n" - "mla r6, r10,r12,r6 @ g += r12 * r10 \n" - "mla r7, r10,r14,r7 @ b += r14 * r10 \n" - "mla r8, r10,r11,r8 @ a += r11 * r10 \n" - "subs r9, r9, #1 @ r9 = len-- \n" - "bgt 2b @ } \n" - "stmdb %0!,{r5,r6,r7,r8} @ *--dst=a;*--dst=b; \n" - "3: @ *--dst=g;*--dst=r; \n" - "subs %3, %3, #1 @ i-- \n" - "bgt 1b @ \n" - : - : - "r" (dst), - "r" (src), - "r" (contrib), - "r" (weights->count) - : - "r4","r5","r6","r7","r8","r9","r10","r11","r12","r14", - "memory","cc" - ); -#else for (i=weights->count; i > 0; i--) { int r = 0; @@ -740,49 +982,9 @@ scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights) *--dst = g; *--dst = r; } -#endif } else { -#ifdef ARCH_ARM - asm volatile( - "1:" - "ldr r4, [%2], #4 @ r4 = *contrib++ \n" - "ldr r9, [%2], #4 @ r9 = len = *contrib++ \n" - "mov r5, #0 @ r5 = r = 0 \n" - "mov r6, #0 @ r6 = g = 0 \n" - "mov r7, #0 @ r7 = b = 0 \n" - "mov r8, #0 @ r8 = a = 0 \n" - "add r4, %1, r4, LSL #2 @ r4 = min = &src[4*r4] \n" - "cmp r9, #0 @ while (len-- > 0) \n" - "beq 3f @ { \n" - "2: \n" - "ldr r10,[%2], #4 @ r10 = *contrib++ \n" - "ldrb r11,[r4], #1 @ r11 = *min++ \n" - "ldrb r12,[r4], #1 @ r12 = *min++ \n" - "ldrb r14,[r4], #1 @ r14 = *min++ \n" - "mla r5, r10,r11,r5 @ r += r11 * r10 \n" - "ldrb r11,[r4], #1 @ r11 = *min++ \n" - "mla r6, r10,r12,r6 @ g += r12 * r10 \n" - "mla r7, r10,r14,r7 @ b += r14 * r10 \n" - "mla r8, r10,r11,r8 @ a += r11 * r10 \n" - "subs r9, r9, #1 @ r9 = len-- \n" - "bgt 2b @ } \n" - "stmia %0!,{r5,r6,r7,r8} @ *dst++=r;*dst++=g; \n" - "3: @ *dst++=b;*dst++=a; \n" - "subs %3, %3, #1 @ i-- \n" - "bgt 1b @ \n" - : - : - "r" (dst), - "r" (src), - "r" (contrib), - "r" (weights->count) - : - 
"r4","r5","r6","r7","r8","r9","r10","r11","r12","r14", - "memory","cc" - ); -#else for (i=weights->count; i > 0; i--) { int r = 0; @@ -803,7 +1005,6 @@ scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights) *dst++ = b; *dst++ = a; } -#endif } } @@ -836,6 +1037,7 @@ scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width src++; } } +#endif #ifdef SINGLE_PIXEL_SPECIALS static void @@ -1004,64 +1206,6 @@ scale_single_col(unsigned char *dst, unsigned char *src, fz_weights *weights, in } #endif /* SINGLE_PIXEL_SPECIALS */ -fz_pixmap * -fz_scale_pixmap_gridfit(fz_context *ctx, fz_pixmap *src, float x, float y, float w, float h, int gridfit) -{ - if (gridfit) { - float n; - if (w > 0) { - /* Adjust the left hand edge, leftwards to a pixel boundary */ - n = (float)(int)x; /* n is now on a pixel boundary */ - if (n > x) /* Ensure it's the pixel boundary BELOW x */ - n -= 1.0f; - w += x-n; /* width gets wider as x >= n */ - x = n; - /* Adjust the right hand edge rightwards to a pixel boundary */ - n = (float)(int)w; /* n is now the integer width <= w */ - if (n != w) /* If w isn't an integer already, bump it */ - w = 1.0f + n;/* up to the next integer. */ - } else { - /* Adjust the right hand edge, rightwards to a pixel boundary */ - n = (float)(int)x; /* n is now on a pixel boundary */ - if (n > x) /* Ensure it's the pixel boundary <= x */ - n -= 1.0f; - if (n != x) /* If x isn't on a pixel boundary already, */ - n += 1.0f; /* make n be the pixel boundary above x. */ - w -= n-x; /* Expand width (more negative!) as n >= x */ - x = n; - /* Adjust the left hand edge leftwards to a pixel boundary */ - n = (float)(int)w; - if (n != w) - w = n - 1.0f; - } - if (h > 0) { - /* Adjust the bottom edge, downwards to a pixel boundary */ - n = (float)(int)y; /* n is now on a pixel boundary */ - if (n > y) /* Ensure it's the pixel boundary BELOW y */ - n -= 1.0f; - h += y-n; /* height gets larger as y >= n */ - y = n; - /* Adjust the top edge upwards to a pixel boundary */ - n = (float)(int)h; /* n is now the integer height <= h */ - if (n != h) /* If h isn't an integer already, bump it */ - h = 1.0f + n;/* up to the next integer. */ - } else { - /* Adjust the top edge, upwards to a pixel boundary */ - n = (float)(int)y; /* n is now on a pixel boundary */ - if (n > y) /* Ensure it's the pixel boundary <= y */ - n -= 1.0f; - if (n != y) /* If y isn't on a pixel boundary already, */ - n += 1.0f; /* make n be the pixel boundary above y. */ - h -= n-y; /* Expand height (more negative!) as n >= y */ - y = n; - /* Adjust the bottom edge downwards to a pixel boundary */ - n = (float)(int)h; - if (n != h) - h = n - 1.0f; - } - } - return fz_scale_pixmap(ctx, src, x, y, w, h); -} fz_pixmap * fz_scale_pixmap(fz_context *ctx, fz_pixmap *src, float x, float y, float w, float h) diff --git a/draw/draw_simple_scale.c b/draw/draw_simple_scale.c new file mode 100644 index 00000000..dcd5cd68 --- /dev/null +++ b/draw/draw_simple_scale.c @@ -0,0 +1,1353 @@ +/* +This code does smooth scaling of a pixmap. + +This function returns a new pixmap representing the area starting at (0,0) +given by taking the source pixmap src, scaling it to width w, and height h, +and then positioning it at (frac(x),frac(y)). + +This is a cut-down version of draw_scale.c that only copes with filters +that return values strictly in the 0..1 range, and uses bytes for +intermediate results rather than ints. +*/ + +#include "fitz.h" + +/* Do we special case handling of single pixel high/wide images? 
The + * 'purest' handling is given by not special casing them, but certain + * files that use such images 'stack' them to give full images. Not + * special casing them results in then being fainter and giving noticable + * rounding errors. + */ +#define SINGLE_PIXEL_SPECIALS + +/* If we're compiling as thumb code, then we need to tell the compiler + * to enter and exit ARM mode around our assembly sections. If we move + * the ARM functions to a separate file and arrange for it to be compiled + * without thumb mode, we can save some time on entry. + */ +#ifdef ARCH_ARM +#ifdef ARCH_THUMB +#define ENTER_ARM ".balign 4\nmov r12,pc\nbx r12\n0:.arm\n" +#define ENTER_THUMB "9:.thumb\n" +#else +#define ENTER_ARM +#define ENTER_THUMB +#endif +#endif + +#ifdef DEBUG_SCALING +#ifdef WIN32 +#include <windows.h> +static void debug_print(const char *fmt, ...) +{ + va_list args; + char text[256]; + va_start(args, fmt); + vsprintf(text, fmt, args); + va_end(args); + OutputDebugStringA(text); + printf(text); +} +#else +static void debug_print(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); +} +#endif +#endif +#ifdef DEBUG_SCALING +#define DBUG(A) debug_print A +#else +#define DBUG(A) do {} while(0==1) +#endif + +/* +Consider a row of source samples, src, of width src_w, positioned at x, +scaled to width dst_w. + +src[i] is centred at: x + (i + 0.5)*dst_w/src_w + +Therefore the distance between the centre of the jth output pixel and +the centre of the ith source sample is: + +dist[j,i] = j + 0.5 - (x + (i + 0.5)*dst_w/src_w) + +When scaling up, therefore: + +dst[j] = SUM(filter(dist[j,i]) * src[i]) + (for all ints i) + +This can be simplified by noticing that filters are only non zero within +a given filter width (henceforth called W). So: + +dst[j] = SUM(filter(dist[j,i]) * src[i]) + (for ints i, s.t. (j*src_w/dst_w)-W < i < (j*src_w/dst_w)+W) + +When scaling down, each filtered source sample is stretched to be wider +to avoid aliasing issues. This effectively reduces the distance between +centres. + +dst[j] = SUM(filter(dist[j,i] * F) * F * src[i]) + (where F = dst_w/src_w) + (for ints i, s.t. (j-W)/F < i < (j+W)/F) + +*/ + +typedef struct fz_scale_filter_s fz_scale_filter; + +struct fz_scale_filter_s +{ + int width; + float (*fn)(fz_scale_filter *, float); +}; + +/* Image scale filters */ + +static float +triangle(fz_scale_filter *filter, float f) +{ + if (f >= 1) + return 0; + return 1-f; +} + +static float +box(fz_scale_filter *filter, float f) +{ + if (f >= 0.5f) + return 0; + return 1; +} + +static float +simple(fz_scale_filter *filter, float x) +{ + if (x >= 1) + return 0; + return 1 + (2*x - 3)*x*x; +} + +fz_scale_filter fz_scale_filter_box = { 1, box }; +fz_scale_filter fz_scale_filter_triangle = { 1, triangle }; +fz_scale_filter fz_scale_filter_simple = { 1, simple }; + +/* +We build ourselves a set of tables to contain the precalculated weights +for a given set of scale settings. + +The first dst_w entries in index are the index into index of the +sets of weight for each destination pixel. 
+ +Each of the sets of weights is a set of values consisting of: + the minimum source pixel index used for this destination pixel + the number of weights used for this destination pixel + the weights themselves + +So to calculate dst[i] we do the following: + + weights = &index[index[i]]; + min = *weights++; + len = *weights++; + dst[i] = 0; + while (--len > 0) + dst[i] += src[min++] * *weights++ + +in addition, we guarantee that at the end of this process weights will now +point to the weights value for dst pixel i+1. + +In the simplest version of this algorithm, we would scale the whole image +horizontally first into a temporary buffer, then scale that temporary +buffer again vertically to give us our result. Using such a simple +algorithm would mean that could use the same style of weights for both +horizontal and vertical scaling. + +Unfortunately, this would also require a large temporary buffer, +particularly in the case where we are scaling up. + +We therefore modify the algorithm as follows; we scale scanlines from the +source image horizontally into a temporary buffer, until we have all the +contributors for a given output scanline. We then produce that output +scanline from the temporary buffer. In this way we restrict the height +of the temporary buffer to a small fraction of the final size. + +Unfortunately, this means that the pseudo code for recombining a +scanline of fully scaled pixels is as follows: + + weights = &index[index[y]]; + min = *weights++; + len = *weights++; + for (x=0 to dst_w) + min2 = min + len2 = len + weights2 = weights + dst[x] = 0; + while (--len2 > 0) + dst[x] += temp[x][(min2++) % tmp_buf_height] * *weights2++ + +i.e. it requires a % operation for every source pixel - this is typically +expensive. + +To avoid this, we alter the order in which vertical weights are stored, +so that they are ordered in the same order as the temporary buffer lines +would appear. This simplifies the algorithm to: + + weights = &index[index[y]]; + min = *weights++; + len = *weights++; + for (x=0 to dst_w) + min2 = 0 + len2 = len + weights2 = weights + dst[x] = 0; + while (--len2 > 0) + dst[x] += temp[i][min2++] * *weights2++ + +This means that len may be larger than it needs to be (due to the +possible inclusion of a zero weight row or two), but in practise this +is only an increase of 1 or 2 at worst. + +We implement this by generating the weights as normal (but ensuring we +leave enough space) and then reordering afterwards. + +*/ + +typedef struct fz_weights_s fz_weights; + +struct fz_weights_s +{ + int flip; + int count; + int max_len; + int n; + int new_line; + int index[1]; +}; + +static fz_weights * +new_weights(fz_context *ctx, fz_scale_filter *filter, int src_w, float dst_w, int dst_w_i, int n, int flip) +{ + int max_len; + fz_weights *weights; + + if (src_w > dst_w) + { + /* Scaling down, so there will be a maximum of + * 2*filterwidth*src_w/dst_w src pixels + * contributing to each dst pixel. */ + max_len = (int)ceilf((2 * filter->width * src_w)/dst_w); + if (max_len > src_w) + max_len = src_w; + } + else + { + /* Scaling up, so there will be a maximum of + * 2*filterwidth src pixels contributing to each dst pixel. + */ + max_len = 2 * filter->width; + } + /* We need the size of the struct, + * plus dst_w*sizeof(int) for the index + * plus (2+max_len)*sizeof(int) for the weights + * plus room for an extra set of weights for reordering. 
+ */ + weights = fz_malloc(ctx, sizeof(*weights)+(max_len+3)*(dst_w_i+1)*sizeof(int)); + if (weights == NULL) + return NULL; + weights->count = -1; + weights->max_len = max_len; + weights->index[0] = dst_w_i; + weights->n = n; + weights->flip = flip; + return weights; +} + +static void +init_weights(fz_weights *weights, int j) +{ + int index; + + assert(weights->count == j-1); + weights->count++; + weights->new_line = 1; + if (j == 0) + index = weights->index[0]; + else + { + index = weights->index[j-1]; + index += 2 + weights->index[index+1]; + } + weights->index[j] = index; /* row pointer */ + weights->index[index] = 0; /* min */ + weights->index[index+1] = 0; /* len */ +} + +static void +add_weight(fz_weights *weights, int j, int i, fz_scale_filter *filter, + float x, float F, float G, int src_w, float dst_w) +{ + float dist = j - x + 0.5f - ((i + 0.5f)*dst_w/src_w); + float f; + int min, len, index, weight; + + dist *= G; + if (dist < 0) + dist = -dist; + f = filter->fn(filter, dist)*F; + weight = (int)(256*f+0.5f); + if (weight == 0) + return; + + /* Ensure i is in range */ + if (i < 0) + { + i = 0; + weight = 0; + } + else if (i >= src_w) + { + i = src_w-1; + weight = 0; + } + if (weight == 0) + return; + + DBUG(("add_weight[%d][%d] = %d(%g) dist=%g\n",j,i,weight,f,dist)); + + if (weights->new_line) + { + /* New line */ + weights->new_line = 0; + index = weights->index[j]; /* row pointer */ + weights->index[index] = i; /* min */ + weights->index[index+1] = 0; /* len */ + } + index = weights->index[j]; + min = weights->index[index++]; + len = weights->index[index++]; + while (i < min) + { + /* This only happens in rare cases, but we need to insert + * one earlier. In exceedingly rare cases we may need to + * insert more than one earlier. */ + int k; + + for (k = len; k > 0; k--) + { + weights->index[index+k] = weights->index[index+k-1]; + } + weights->index[index] = 0; + min--; + len++; + weights->index[index-2] = min; + weights->index[index-1] = len; + } + if (i-min >= len) + { + /* The usual case */ + while (i-min >= ++len) + { + weights->index[index+len-1] = 0; + } + assert(len-1 == i-min); + weights->index[index+i-min] = weight; + weights->index[index-1] = len; + assert(len <= weights->max_len); + } + else + { + /* Infrequent case */ + weights->index[index+i-min] += weight; + } +} + +static void +reorder_weights(fz_weights *weights, int j, int src_w) +{ + int idx = weights->index[j]; + int min = weights->index[idx++]; + int len = weights->index[idx++]; + int max = weights->max_len; + int tmp = idx+max; + int i, off; + + /* Copy into the temporary area */ + memcpy(&weights->index[tmp], &weights->index[idx], sizeof(int)*len); + + /* Pad out if required */ + assert(len <= max); + assert(min+len <= src_w); + off = 0; + if (len < max) + { + memset(&weights->index[tmp+len], 0, sizeof(int)*(max-len)); + len = max; + if (min + len > src_w) + { + off = min + len - src_w; + min = src_w - len; + weights->index[idx-2] = min; + } + weights->index[idx-1] = len; + } + + /* Copy back into the proper places */ + for (i = 0; i < len; i++) + { + weights->index[idx+((min+i+off) % max)] = weights->index[tmp+i]; + } +} + +/* Due to rounding and edge effects, the sums for the weights sometimes don't + * add up to 256. This causes visible rendering effects. Therefore, we take + * pains to ensure that they 1) never exceed 256, and 2) add up to exactly + * 256 for all pixels that are completely covered. See bug #691629. 
*/ +static void +check_weights(fz_weights *weights, int j, int w, float x, float wf) +{ + int idx, len; + int sum = 0; + int max = -256; + int maxidx = 0; + int i; + + idx = weights->index[j]; + idx++; /* min */ + len = weights->index[idx++]; + + for(i=0; i < len; i++) + { + int v = weights->index[idx++]; + sum += v; + if (v > max) + { + max = v; + maxidx = idx; + } + } + /* If we aren't the first or last pixel, OR if the sum is too big + * then adjust it. */ + if (((j != 0) && (j != w-1)) || (sum > 256)) + weights->index[maxidx-1] += 256-sum; + /* Otherwise, if we are the first pixel, and it's fully covered, then + * adjust it. */ + else if ((j == 0) && (x < 0.0001F) && (sum != 256)) + weights->index[maxidx-1] += 256-sum; + /* Finally, if we are the last pixel, and it's fully covered, then + * adjust it. */ + else if ((j == w-1) && ((float)w-wf < 0.0001F) && (sum != 256)) + weights->index[maxidx-1] += 256-sum; + DBUG(("total weight %d = %d\n", j, sum)); +} + +static fz_weights * +make_weights(fz_context *ctx, int src_w, float x, float dst_w, fz_scale_filter *filter, int vertical, int dst_w_int, int n, int flip) +{ + fz_weights *weights; + float F, G; + float window; + int j; + + if (dst_w < src_w) + { + /* Scaling down */ + F = dst_w / src_w; + G = 1; + } + else + { + /* Scaling up */ + F = 1; + G = src_w / dst_w; + } + window = filter->width / F; + DBUG(("make_weights src_w=%d x=%g dst_w=%g dst_w_int=%d F=%g window=%g\n", src_w, x, dst_w, dst_w_int, F, window)); + weights = new_weights(ctx, filter, src_w, dst_w, dst_w_int, n, flip); + if (weights == NULL) + return NULL; + for (j = 0; j < dst_w_int; j++) + { + /* find the position of the centre of dst[j] in src space */ + float centre = (j - x + 0.5f)*src_w/dst_w - 0.5f; + int l, r; + l = ceilf(centre - window); + r = floorf(centre + window); + DBUG(("%d: centre=%g l=%d r=%d\n", j, centre, l, r)); + init_weights(weights, j); + for (; l <= r; l++) + { + add_weight(weights, j, l, filter, x, F, G, src_w, dst_w); + } + check_weights(weights, j, dst_w_int, x, dst_w); + if (vertical) + { + reorder_weights(weights, j, src_w); + } + } + weights->count++; /* weights->count = dst_w_int now */ + return weights; +} + +static void +scale_row_to_temp(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + int *contrib = &weights->index[weights->index[0]]; + int len, i, j, n; + unsigned char *min; + int tmp[FZ_MAX_COLORS]; + int *t = tmp; + + n = weights->n; + for (j = 0; j < n; j++) + tmp[j] = 128; + if (weights->flip) + { + dst += (weights->count-1)*n; + for (i=weights->count; i > 0; i--) + { + min = &src[n * *contrib++]; + len = *contrib++; + while (len-- > 0) + { + for (j = n; j > 0; j--) + *t++ += *min++ * *contrib; + t -= n; + contrib++; + } + for (j = n; j > 0; j--) + { + *dst++ = (unsigned char)(*t>>8); + *t++ = 128; + } + t -= n; + dst -= n*2; + } + } + else + { + for (i=weights->count; i > 0; i--) + { + min = &src[n * *contrib++]; + len = *contrib++; + while (len-- > 0) + { + for (j = n; j > 0; j--) + *t++ += *min++ * *contrib; + t -= n; + contrib++; + } + for (j = n; j > 0; j--) + { + *dst++ = (unsigned char)(*t>>8); + *t++ = 128; + } + t -= n; + } + } +} + +#ifdef ARCH_ARM + +static void +scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights) +__attribute__((naked)); + +static void +scale_row_to_temp2(unsigned char *dst, unsigned char *src, fz_weights *weights) +__attribute__((naked)); + +static void +scale_row_to_temp4(unsigned char *dst, unsigned char *src, fz_weights *weights) +__attribute__((naked)); + 
+static void +scale_row_from_temp(unsigned char *dst, unsigned char *src, fz_weights *weights, int width, int row) +__attribute__((naked)); + +static void +scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + /* possible optimisation in here; unroll inner loops to avoid stall. */ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r5,r9,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = weights \n" + "ldr r12,[r2],#4 @ r12= flip \n" + "ldr r3, [r2],#16 @ r3 = count r2 = &index\n" + "ldr r4, [r2] @ r4 = index[0] \n" + "cmp r12,#0 @ if (flip) \n" + "beq 4f @ { \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "add r0, r0, r3 @ dst += count \n" + "1: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #128 @ r5 = a = 128 \n" + "add r4, r1, r4 @ r4 = min = &src[r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 3f @ { \n" + "2: \n" + "ldr r12,[r2], #4 @ r12 = *contrib++ \n" + "ldrb r14,[r4], #1 @ r14 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "@stall on r14 \n" + "mla r5, r12,r14,r5 @ g += r14 * r12 \n" + "bgt 2b @ } \n" + "3: \n" + "mov r5, r5, lsr #8 @ g >>= 8 \n" + "strb r5,[r0, #-1]! @ *--dst=a \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n" + "4:" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "5:" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #128 @ r5 = a = 128 \n" + "add r4, r1, r4 @ r4 = min = &src[r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 7f @ { \n" + "6: \n" + "ldr r12,[r2], #4 @ r12 = *contrib++ \n" + "ldrb r14,[r4], #1 @ r14 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "@stall on r14 \n" + "mla r5, r12,r14,r5 @ a += r14 * r12 \n" + "bgt 6b @ } \n" + "7: \n" + "mov r5, r5, LSR #8 @ a >>= 8 \n" + "strb r5, [r0], #1 @ *dst++=a \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 5b @ \n" + "ldmfd r13!,{r4-r5,r9,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +static void +scale_row_to_temp2(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r6,r9-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = weights \n" + "ldr r12,[r2],#4 @ r12= flip \n" + "ldr r3, [r2],#16 @ r3 = count r2 = &index\n" + "ldr r4, [r2] @ r4 = index[0] \n" + "cmp r12,#0 @ if (flip) \n" + "beq 4f @ { \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "add r0, r0, r3, LSL #1 @ dst += 2*count \n" + "1: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #128 @ r5 = g = 128 \n" + "mov r6, #128 @ r6 = a = 128 \n" + "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 3f @ { \n" + "2: \n" + "ldr r14,[r2], #4 @ r14 = *contrib++ \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "ldrb r12,[r4], #1 @ r12 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "mla r5, r14,r11,r5 @ g += r11 * r14 \n" + "mla r6, r14,r12,r6 @ a += r12 * r14 \n" + "bgt 2b @ } \n" + "3: \n" + "mov r5, r5, lsr #8 @ g >>= 8 \n" + "mov r6, r6, lsr #8 @ a >>= 8 \n" + "strb r5, [r0, #-2]! 
@ *--dst=a \n" + "strb r6, [r0, #1] @ *--dst=g \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n" + "4:" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "5:" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r5, #128 @ r5 = g = 128 \n" + "mov r6, #128 @ r6 = a = 128 \n" + "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 7f @ { \n" + "6: \n" + "ldr r14,[r2], #4 @ r10 = *contrib++ \n" + "ldrb r11,[r4], #1 @ r11 = *min++ \n" + "ldrb r12,[r4], #1 @ r12 = *min++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "mla r5, r14,r11,r5 @ g += r11 * r14 \n" + "mla r6, r14,r12,r6 @ a += r12 * r14 \n" + "bgt 6b @ } \n" + "7: \n" + "mov r5, r5, lsr #8 @ g >>= 8 \n" + "mov r6, r6, lsr #8 @ a >>= 8 \n" + "strb r5, [r0], #1 @ *dst++=g \n" + "strb r6, [r0], #1 @ *dst++=a \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 5b @ \n" + "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +static void +scale_row_to_temp4(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + asm volatile( + ENTER_ARM + "stmfd r13!,{r4-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = weights \n" + "ldr r12,[r2],#4 @ r12= flip \n" + "ldr r3, [r2],#16 @ r3 = count r2 = &index\n" + "ldr r4, [r2] @ r4 = index[0] \n" + "ldr r5,=0x00800080 @ r5 = rounding \n" + "ldr r6,=0x00FF00FF @ r7 = 0x00FF00FF \n" + "cmp r12,#0 @ if (flip) \n" + "beq 4f @ { \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "add r0, r0, r3, LSL #2 @ dst += 4*count \n" + "1: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r7, r5 @ r7 = b = rounding \n" + "mov r8, r5 @ r8 = a = rounding \n" + "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 3f @ { \n" + "2: \n" + "ldr r11,[r4], #4 @ r11 = *min++ \n" + "ldr r10,[r2], #4 @ r10 = *contrib++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "and r12,r6, r11 @ r12 = __22__00 \n" + "and r11,r6, r11,LSR #8 @ r11 = __33__11 \n" + "mla r7, r10,r12,r7 @ b += r14 * r10 \n" + "mla r8, r10,r11,r8 @ a += r11 * r10 \n" + "bgt 2b @ } \n" + "3: \n" + "and r7, r6, r7, lsr #8 @ r7 = __22__00 \n" + "bic r8, r8, r6 @ r8 = 33__11__ \n" + "orr r7, r7, r8 @ r7 = 33221100 \n" + "str r7, [r0, #-4]! 
@ *--dst=r \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 1b @ \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" + "4: \n" + "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n" + "5: \n" + "ldr r4, [r2], #4 @ r4 = *contrib++ \n" + "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n" + "mov r7, r5 @ r7 = b = rounding \n" + "mov r8, r5 @ r8 = a = rounding \n" + "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n" + "cmp r9, #0 @ while (len-- > 0) \n" + "beq 7f @ { \n" + "6: \n" + "ldr r11,[r4], #4 @ r11 = *min++ \n" + "ldr r10,[r2], #4 @ r10 = *contrib++ \n" + "subs r9, r9, #1 @ r9 = len-- \n" + "and r12,r6, r11 @ r12 = __22__00 \n" + "and r11,r6, r11,LSR #8 @ r11 = __33__11 \n" + "mla r7, r10,r12,r7 @ b += r14 * r10 \n" + "mla r8, r10,r11,r8 @ a += r11 * r10 \n" + "bgt 6b @ } \n" + "7: \n" + "and r7, r6, r7, lsr #8 @ r7 = __22__00 \n" + "bic r8, r8, r6 @ r8 = 33__11__ \n" + "orr r7, r7, r8 @ r7 = 33221100 \n" + "str r7, [r0], #4 @ *dst++=r \n" + "subs r3, r3, #1 @ i-- \n" + "bgt 5b @ \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} + +static void +scale_row_from_temp(unsigned char *dst, unsigned char *src, fz_weights *weights, int width, int row) +{ + asm volatile( + ENTER_ARM + "ldr r12,[r13] @ r12= row \n" + "add r2, r2, #20 @ r2 = weights->index \n" + "stmfd r13!,{r4-r11,r14} \n" + "@ r0 = dst \n" + "@ r1 = src \n" + "@ r2 = &weights->index[0] \n" + "@ r3 = width \n" + "@ r12= row \n" + "ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n" + "add r2, r2, #4 @ r2 = &index[1] \n" + "subs r6, r3, #4 @ r6 = x = width-4 \n" + "ldr r14,[r2, r4, LSL #2]! @ r2 = contrib = index[index[row]+1]\n" + " @ r14= len = *contrib \n" + "blt 4f @ while (x >= 0) { \n" +#ifndef ARCH_ARM_CAN_LOAD_UNALIGNED + "tst r3, #3 @ if (r3 & 3) \n" + "blt 4f @ can't do fast code\n" +#endif + "ldr r9, =0x00FF00FF @ r9 = 0x00FF00FF \n" + "1: \n" + "ldr r5, =0x00800080 @ r5 = val0 = round \n" + "stmfd r13!,{r1,r2} @ stash r1,r2,r14 \n" + " @ r1 = min = src \n" + " @ r2 = contrib2-4 \n" + "movs r8, r14 @ r8 = len2 = len \n" + "mov r7, r5 @ r7 = val1 = round \n" + "ble 3f @ while (len2-- > 0) { \n" + "2: \n" + "ldr r12,[r1], r3 @ r12 = *min r5 = min += width\n" + "ldr r10,[r2, #4]! 
@ r10 = *contrib2++ \n" + "subs r8, r8, #1 @ len2-- \n" + "and r11,r9, r12 @ r11= __22__00 \n" + "and r12,r9, r12,LSR #8 @ r12= __33__11 \n" + "mla r5, r10,r11,r5 @ r5 = val0 += r11 * r10\n" + "mla r7, r10,r12,r7 @ r7 = val1 += r12 * r10\n" + "bgt 2b @ } \n" + "3: \n" + "ldmfd r13!,{r1,r2} @ restore r1,r2,r14 \n" + "and r5, r9, r5, LSR #8 @ r5 = __22__00 \n" + "and r7, r7, r9, LSL #8 @ r7 = 33__11__ \n" + "orr r5, r5, r7 @ r5 = 33221100 \n" + "subs r6, r6, #4 @ x-- \n" + "add r1, r1, #4 @ src++ \n" + "str r5, [r0], #4 @ *dst++ = val \n" + "bge 1b @ \n" + "4: @ } (Less than 4 to go) \n" + "adds r6, r6, #4 @ r6 = x += 4 \n" + "beq 8f @ if (x == 0) done \n" + "5: \n" + "mov r5, r1 @ r5 = min = src \n" + "mov r7, #128 @ r7 = val = 128 \n" + "movs r8, r14 @ r8 = len2 = len \n" + "add r9, r2, #4 @ r9 = contrib2 \n" + "ble 7f @ while (len2-- > 0) { \n" + "6: \n" + "ldr r10,[r9], #4 @ r10 = *contrib2++ \n" + "ldrb r12,[r5], r3 @ r12 = *min r5 = min += width\n" + "subs r8, r8, #1 @ len2-- \n" + "@ stall r12 \n" + "mla r7, r10,r12,r7 @ val += r12 * r10 \n" + "bgt 6b @ } \n" + "7: \n" + "mov r7, r7, asr #8 @ r7 = val >>= 8 \n" + "subs r6, r6, #1 @ x-- \n" + "add r1, r1, #1 @ src++ \n" + "strb r7, [r0], #1 @ *dst++ = val \n" + "bgt 5b @ \n" + "8: \n" + "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n" + ENTER_THUMB + ); +} +#else + +static void +scale_row_to_temp1(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + int *contrib = &weights->index[weights->index[0]]; + int len, i; + unsigned char *min; + + assert(weights->n == 1); + if (weights->flip) + { + dst += weights->count; + for (i=weights->count; i > 0; i--) + { + int val = 128; + min = &src[*contrib++]; + len = *contrib++; + while (len-- > 0) + { + val += *min++ * *contrib++; + } + *--dst = (unsigned char)(val>>8); + } + } + else + { + for (i=weights->count; i > 0; i--) + { + int val = 128; + min = &src[*contrib++]; + len = *contrib++; + while (len-- > 0) + { + val += *min++ * *contrib++; + } + *dst++ = (unsigned char)(val>>8); + } + } +} + +static void +scale_row_to_temp2(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + int *contrib = &weights->index[weights->index[0]]; + int len, i; + unsigned char *min; + + assert(weights->n == 2); + if (weights->flip) + { + dst += 2*weights->count; + for (i=weights->count; i > 0; i--) + { + int c1 = 128; + int c2 = 128; + min = &src[2 * *contrib++]; + len = *contrib++; + while (len-- > 0) + { + c1 += *min++ * *contrib; + c2 += *min++ * *contrib++; + } + *--dst = (unsigned char)(c2>>8); + *--dst = (unsigned char)(c1>>8); + } + } + else + { + for (i=weights->count; i > 0; i--) + { + int c1 = 128; + int c2 = 128; + min = &src[2 * *contrib++]; + len = *contrib++; + while (len-- > 0) + { + c1 += *min++ * *contrib; + c2 += *min++ * *contrib++; + } + *dst++ = (unsigned char)(c1>>8); + *dst++ = (unsigned char)(c2>>8); + } + } +} + +static void +scale_row_to_temp4(unsigned char *dst, unsigned char *src, fz_weights *weights) +{ + int *contrib = &weights->index[weights->index[0]]; + int len, i; + unsigned char *min; + + assert(weights->n == 4); + if (weights->flip) + { + dst += 4*weights->count; + for (i=weights->count; i > 0; i--) + { + int r = 128; + int g = 128; + int b = 128; + int a = 128; + min = &src[4 * *contrib++]; + len = *contrib++; + while (len-- > 0) + { + r += *min++ * *contrib; + g += *min++ * *contrib; + b += *min++ * *contrib; + a += *min++ * *contrib++; + } + *--dst = (unsigned char)(a>>8); + *--dst = (unsigned char)(b>>8); + *--dst = (unsigned char)(g>>8); + *--dst = 
(unsigned char)(r>>8); + } + } + else + { + for (i=weights->count; i > 0; i--) + { + int r = 128; + int g = 128; + int b = 128; + int a = 128; + min = &src[4 * *contrib++]; + len = *contrib++; + while (len-- > 0) + { + r += *min++ * *contrib; + g += *min++ * *contrib; + b += *min++ * *contrib; + a += *min++ * *contrib++; + } + *dst++ = (unsigned char)(r>>8); + *dst++ = (unsigned char)(g>>8); + *dst++ = (unsigned char)(b>>8); + *dst++ = (unsigned char)(a>>8); + } + } +} + +static void +scale_row_from_temp(unsigned char *dst, unsigned char *src, fz_weights *weights, int width, int row) +{ + int *contrib = &weights->index[weights->index[row]]; + int len, x; + + contrib++; /* Skip min */ + len = *contrib++; + for (x=width; x > 0; x--) + { + unsigned char *min = src; + int val = 128; + int len2 = len; + int *contrib2 = contrib; + + while (len2-- > 0) + { + val += *min * *contrib2++; + min += width; + } + *dst++ = (unsigned char)(val>>8); + src++; + } +} +#endif + +#ifdef SINGLE_PIXEL_SPECIALS +static void +duplicate_single_pixel(unsigned char *dst, unsigned char *src, int n, int w, int h) +{ + int i; + + for (i = n; i > 0; i--) + *dst++ = *src++; + for (i = (w*h-1)*n; i > 0; i--) + { + *dst = dst[-n]; + dst++; + } +} + +static void +scale_single_row(unsigned char *dst, unsigned char *src, fz_weights *weights, int src_w, int h) +{ + int *contrib = &weights->index[weights->index[0]]; + int min, len, i, j, n; + int tmp[FZ_MAX_COLORS]; + + n = weights->n; + /* Scale a single row */ + for (j = 0; j < n; j++) + tmp[j] = 128; + if (weights->flip) + { + dst += (weights->count-1)*n; + for (i=weights->count; i > 0; i--) + { + min = *contrib++; + len = *contrib++; + min *= n; + while (len-- > 0) + { + for (j = 0; j < n; j++) + tmp[j] += src[min++] * *contrib; + contrib++; + } + for (j = 0; j < n; j++) + { + *dst++ = (unsigned char)(tmp[j]>>8); + tmp[j] = 128; + } + dst -= 2*n; + } + dst += n * (weights->count+1); + } + else + { + for (i=weights->count; i > 0; i--) + { + min = *contrib++; + len = *contrib++; + min *= n; + while (len-- > 0) + { + for (j = 0; j < n; j++) + tmp[j] += src[min++] * *contrib; + contrib++; + } + for (j = 0; j < n; j++) + { + *dst++ = (unsigned char)(tmp[j]>>8); + tmp[j] = 128; + } + } + } + /* And then duplicate it h times */ + n *= weights->count; + while (--h > 0) + { + memcpy(dst, dst-n, n); + dst += n; + } +} + +static void +scale_single_col(unsigned char *dst, unsigned char *src, fz_weights *weights, int src_w, int n, int w, int flip_y) +{ + int *contrib = &weights->index[weights->index[0]]; + int min, len, i, j; + int tmp[FZ_MAX_COLORS]; + + for (j = 0; j < n; j++) + tmp[j] = 128; + if (flip_y) + { + src_w = (src_w-1)*n; + w = (w-1)*n; + for (i=weights->count; i > 0; i--) + { + /* Scale the next pixel in the column */ + min = *contrib++; + len = *contrib++; + min = src_w-min*n; + while (len-- > 0) + { + for (j = 0; j < n; j++) + tmp[j] += src[src_w-min+j] * *contrib; + contrib++; + } + for (j = 0; j < n; j++) + { + *dst++ = (unsigned char)(tmp[j]>>8); + tmp[j] = 128; + } + /* And then duplicate it across the row */ + for (j = w; j > 0; j--) + { + *dst = dst[-n]; + dst++; + } + } + } + else + { + w = (w-1)*n; + for (i=weights->count; i > 0; i--) + { + /* Scale the next pixel in the column */ + min = *contrib++; + len = *contrib++; + min *= n; + while (len-- > 0) + { + for (j = 0; j < n; j++) + tmp[j] += src[min++] * *contrib; + contrib++; + } + for (j = 0; j < n; j++) + { + *dst++ = (unsigned char)(tmp[j]>>8); + tmp[j] = 128; + } + /* And then duplicate it across the row */ 
+ for (j = w; j > 0; j--) + { + *dst = dst[-n]; + dst++; + } + } + } +} +#endif /* SINGLE_PIXEL_SPECIALS */ + +fz_pixmap * +fz_scale_pixmap(fz_context *ctx, fz_pixmap *src, float x, float y, float w, float h) +{ + fz_scale_filter *filter = &fz_scale_filter_simple; + fz_weights *contrib_rows = NULL; + fz_weights *contrib_cols = NULL; + fz_pixmap *output = NULL; + unsigned char *temp = NULL; + int max_row, temp_span, temp_rows, row; + int dst_w_int, dst_h_int, dst_x_int, dst_y_int; + int flip_x, flip_y; + + DBUG(("Scale: (%d,%d) to (%g,%g) at (%g,%g)\n",src->w,src->h,w,h,x,y)); + + /* Find the destination bbox, width/height, and sub pixel offset, + * allowing for whether we're flipping or not. */ + /* Note that the x and y sub pixel offsets here are different. + * The (x,y) position given describes where the bottom left corner + * of the source image should be mapped to (i.e. where (0,h) in image + * space ends up, not the more logical and sane (0,0)). Also there + * are differences in the way we scale horizontally and vertically. + * When scaling rows horizontally, we always read forwards through + * the source, and store either forwards or in reverse as required. + * When scaling vertically, we always store out forwards, but may + * feed source rows in in a different order. + * + * Consider the image rectange 'r' to which the image is mapped, + * and the (possibly) larger rectangle 'R', given by expanding 'r' to + * complete pixels. + * + * x can either be r.xmin-R.xmin or R.xmax-r.xmax depending on whether + * the image is x flipped or not. Whatever happens 0 <= x < 1. + * y is always R.ymax - r.ymax. + */ + /* dst_x_int is calculated to be the left of the scaled image, and + * x (the sub_pixel_offset) is the distance in from either the left + * or right pixel expanded edge. */ + flip_x = (w < 0); + if (flip_x) + { + float tmp; + w = -w; + dst_x_int = floor(x-w); + tmp = ceilf(x); + dst_w_int = (int)tmp; + x = tmp - x; + dst_w_int -= dst_x_int; + } + else + { + dst_x_int = floor(x); + x -= (float)dst_x_int; + dst_w_int = (int)ceilf(x + w); + } + flip_y = (h < 0); + /* dst_y_int is calculated to be the bottom of the scaled image, but + * y (the sub pixel offset) has to end up being the value at the top. + */ + if (flip_y) + { + h = -h; + dst_y_int = floor(y-h); + dst_h_int = (int)ceilf(y) - dst_y_int; + } else { + dst_y_int = floor(y); + y += h; + dst_h_int = (int)ceilf(y) - dst_y_int; + } + /* y is the top edge position in floats. We want it to be the + * distance down from the next pixel boundary. 
*/ + y = ceilf(y) - y; + + DBUG(("Result image: (%d,%d) at (%d,%d) (subpix=%g,%g)\n", dst_w_int, dst_h_int, dst_x_int, dst_y_int, x, y)); + + /* Step 1: Calculate the weights for columns and rows */ +#ifdef SINGLE_PIXEL_SPECIALS + if (src->w == 1) + { + contrib_cols = NULL; + } + else +#endif /* SINGLE_PIXEL_SPECIALS */ + { + contrib_cols = make_weights(ctx, src->w, x, w, filter, 0, dst_w_int, src->n, flip_x); + if (contrib_cols == NULL) + goto cleanup; + } +#ifdef SINGLE_PIXEL_SPECIALS + if (src->h == 1) + { + contrib_rows = NULL; + } + else +#endif /* SINGLE_PIXEL_SPECIALS */ + { + contrib_rows = make_weights(ctx, src->h, y, h, filter, 1, dst_h_int, src->n, flip_y); + if (contrib_rows == NULL) + goto cleanup; + } + + assert(contrib_cols == NULL || contrib_cols->count == dst_w_int); + assert(contrib_rows == NULL || contrib_rows->count == dst_h_int); + output = fz_new_pixmap(ctx, src->colorspace, dst_w_int, dst_h_int); + output->x = dst_x_int; + output->y = dst_y_int; + + /* Step 2: Apply the weights */ +#ifdef SINGLE_PIXEL_SPECIALS + if (contrib_rows == NULL) + { + /* Only 1 source pixel high. */ + if (contrib_cols == NULL) + { + /* Only 1 pixel in the entire image! */ + duplicate_single_pixel(output->samples, src->samples, src->n, dst_w_int, dst_h_int); + } + else + { + /* Scale the row once, then copy it. */ + scale_single_row(output->samples, src->samples, contrib_cols, src->w, dst_h_int); + } + } + else if (contrib_cols == NULL) + { + /* Only 1 source pixel wide. Scale the col and duplicate. */ + scale_single_col(output->samples, src->samples, contrib_rows, src->h, src->n, dst_w_int, flip_y); + } + else +#endif /* SINGLE_PIXEL_SPECIALS */ + { + void (*row_scale)(unsigned char *dst, unsigned char *src, fz_weights *weights); + + temp_span = contrib_cols->count * src->n; + temp_rows = contrib_rows->max_len; + if (temp_span <= 0 || temp_rows > INT_MAX / temp_span) + goto cleanup; + temp = fz_calloc(ctx, temp_span*temp_rows, sizeof(unsigned char)); + if (temp == NULL) + goto cleanup; + switch (src->n) + { + default: + row_scale = scale_row_to_temp; + break; + case 1: /* Image mask case */ + row_scale = scale_row_to_temp1; + break; + case 2: /* Greyscale with alpha case */ + row_scale = scale_row_to_temp2; + break; + case 4: /* RGBA */ + row_scale = scale_row_to_temp4; + break; + } + max_row = 0; + for (row = 0; row < contrib_rows->count; row++) + { + /* + Which source rows do we need to have scaled into the + temporary buffer in order to be able to do the final + scale? + */ + int row_index = contrib_rows->index[row]; + int row_min = contrib_rows->index[row_index++]; + int row_len = contrib_rows->index[row_index++]; + while (max_row < row_min+row_len) + { + /* Scale another row */ + assert(max_row < src->h); + DBUG(("scaling row %d to temp\n", max_row)); + (*row_scale)(&temp[temp_span*(max_row % temp_rows)], &src->samples[(flip_y ? (src->h-1-max_row): max_row)*src->w*src->n], contrib_cols); + max_row++; + } + + DBUG(("scaling row %d from temp\n", row)); + scale_row_from_temp(&output->samples[row*output->w*output->n], temp, contrib_rows, temp_span, row); + } + fz_free(ctx, temp); + } + +cleanup: + fz_free(ctx, contrib_rows); + fz_free(ctx, contrib_cols); + return output; +} |
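For reference, the weights table built by make_weights() is consumed per destination pixel exactly as the comment block in draw_simple_scale.c above describes: index[j] points at a record of (min, len, weights...), each weight is an 8.8 fixed-point contribution summing to 256, and the byte-precision scaler starts each accumulator at 128 and shifts right by 8 at the end. A minimal sketch of that inner loop for a single-component row (the function name scale_one_pixel is illustrative only; the clamp mirrors the int-precision draw_scale.c path and is redundant for the simple path, whose filters stay in 0..1):

```c
/* Illustrative only: compute destination sample j of a 1-component row
 * from a weights table laid out as described in the comment above:
 *   index[j]           -> offset of this pixel's record
 *   record[0]          -> min: first contributing source sample
 *   record[1]          -> len: number of weights that follow
 *   record[2..2+len-1] -> 8.8 fixed-point weights summing to 256
 */
unsigned char
scale_one_pixel(const unsigned char *src, const int *index, int j)
{
	const int *rec = &index[index[j]];
	int min = *rec++;
	int len = *rec++;
	int val = 128;			/* rounding, as in the byte-precision scaler */

	while (len-- > 0)
		val += src[min++] * *rec++;
	val >>= 8;
	if (val < 0) val = 0;		/* clamp, needed only for filters that overshoot */
	if (val > 255) val = 255;
	return (unsigned char)val;
}
```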