From c8d226b5bfb5dab2db10ea5175966de7bac9640e Mon Sep 17 00:00:00 2001
From: Tor Andersson <tor.andersson@artifex.com>
Date: Mon, 4 Apr 2011 18:08:53 +0200
Subject: draw: Rename files in draw directory.

---
 Makefile           |   21 +-
 draw/arch_arm.c    |  232 +++++++++++
 draw/arch_port.c   |  486 ++++++++++++++++++++++
 draw/archarm.c     |  232 -----------
 draw/archport.c    |  486 ----------------------
 draw/blendmodes.c  |  370 -----------------
 draw/draw_affine.c |  372 +++++++++++++++++
 draw/draw_blend.c  |  370 +++++++++++++++++
 draw/draw_edge.c   |  559 +++++++++++++++++++++++++
 draw/draw_glyph.c  |  134 ++++++
 draw/draw_mesh.c   |  579 ++++++++++++++++++++++++++
 draw/draw_paint.c  |  443 ++++++++++++++++++++
 draw/draw_path.c   |  773 ++++++++++++++++++++++++++++++++++
 draw/draw_scale.c  | 1175 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 draw/draw_unpack.c |  235 +++++++++++
 draw/glyphcache.c  |  134 ------
 draw/imagedraw.c   |  372 -----------------
 draw/imagesmooth.c | 1175 ----------------------------------------------------
 draw/imageunpack.c |  235 -----------
 draw/meshdraw.c    |  579 --------------------------
 draw/pathscan.c    |  559 -------------------------
 draw/pathstroke.c  |  773 ----------------------------------
 draw/porterduff.c  |  443 --------------------
 23 files changed, 5368 insertions(+), 5369 deletions(-)
 create mode 100644 draw/arch_arm.c
 create mode 100644 draw/arch_port.c
 delete mode 100644 draw/archarm.c
 delete mode 100644 draw/archport.c
 delete mode 100644 draw/blendmodes.c
 create mode 100644 draw/draw_affine.c
 create mode 100644 draw/draw_blend.c
 create mode 100644 draw/draw_edge.c
 create mode 100644 draw/draw_glyph.c
 create mode 100644 draw/draw_mesh.c
 create mode 100644 draw/draw_paint.c
 create mode 100644 draw/draw_path.c
 create mode 100644 draw/draw_scale.c
 create mode 100644 draw/draw_unpack.c
 delete mode 100644 draw/glyphcache.c
 delete mode 100644 draw/imagedraw.c
 delete mode 100644 draw/imagesmooth.c
 delete mode 100644 draw/imageunpack.c
 delete mode 100644 draw/meshdraw.c
 delete mode 100644 draw/pathscan.c
 delete mode 100644 draw/pathstroke.c
 delete mode 100644 draw/porterduff.c

diff --git a/Makefile b/Makefile
index b40e9af7..4098e0e3 100644
--- a/Makefile
+++ b/Makefile
@@ -117,17 +117,16 @@ FITZ_OBJ := $(FITZ_SRC:fitz/%.c=$(OBJDIR)/%.o)
 $(FITZ_OBJ): $(FITZ_HDR)
 
 DRAW_SRC := $(DRAW_ARCH_SRC) \
-	draw/archport.c \
-	draw/blendmodes.c \
-	draw/glyphcache.c \
-	draw/imagedraw.c \
-	draw/imagesmooth.c \
-	draw/imageunpack.c \
-	draw/meshdraw.c \
-	draw/pathfill.c \
-	draw/pathscan.c \
-	draw/pathstroke.c \
-	draw/porterduff.c
+	draw/arch_port.c \
+	draw/draw_affine.c \
+	draw/draw_blend.c \
+	draw/draw_edge.c \
+	draw/draw_glyph.c \
+	draw/draw_mesh.c \
+	draw/draw_paint.c \
+	draw/draw_path.c \
+	draw/draw_scale.c \
+	draw/draw_unpack.c
 DRAW_OBJ := $(DRAW_SRC:draw/%.c=$(OBJDIR)/%.o)
 DRAW_OBJ := $(DRAW_OBJ:draw/%.s=$(OBJDIR)/%.o)
 $(DRAW_OBJ): $(FITZ_HDR)
diff --git a/draw/arch_arm.c b/draw/arch_arm.c
new file mode 100644
index 00000000..c601a70c
--- /dev/null
+++ b/draw/arch_arm.c
@@ -0,0 +1,232 @@
+/*
+ * ARM specific render optims live here
+ */
+
+#include "fitz.h"
+
+typedef unsigned char byte;
+
+/* always surround cpu specific code with an architecture guard (ARCH_ARM in this file) */
+#ifdef ARCH_ARM
+
+/* from imagescalearm.s */
+extern void fz_srow4_arm(byte *src, byte *dst, int w, int denom);
+extern void fz_scol4_arm(byte *src, byte *dst, int w, int denom);
+
+static void
+path_w4i1o4_arm(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst)
+{
+	/* The ARM code here is a hand coded implementation of the optimized C version: for each of len pixels it adds the next src byte to the running coverage cov, zeroes that src byte, and (unless the coverage is 0) blends the colour word loaded from rgba into the 32-bit word at dst. */
+
+	if (len <= 0)
+		return;
+
+	asm volatile(
+	"ldr	%0, [%0]		@ %0 = rgba			\n"
+	"mov	r11,#0							\n"
+	"mov	r8, #0xFF00						\n"
+	"mov	r14,%0,lsr #24		@ r14= alpha			\n"
+	"orr	%0, %0, #0xFF000000	@ %0 = rgba |= 0xFF000000	\n"
+	"orr	r8, r8, r8, LSL #16	@ r8 = 0xFF00FF00		\n"
+	"adds	r14,r14,r14,LSR #7	@ r14 = alpha += alpha>>7	\n"
+	"beq	9f			@ if (alpha == 0) bale		\n" /* i.e. bail out to the end label 9 */
+	"and	r6, %0, r8		@ r6 = ga<<8			\n"
+	"bic	%0, %0, r8		@ %0 = rb			\n"
+	"mov	r6, r6, LSR #8		@ r6 = ga			\n"
+	"cmp	r14,#256		@ if (alpha == 256)		\n"
+	"beq	4f			@	no-alpha loop		\n"
+	"B	2f			@ enter the loop		\n"
+	"1:	@ Loop used for when coverage*alpha == 0		\n"
+	"subs	%3, %3, #1		@ len--				\n"
+	"ble	9f							\n"
+	"2:								\n"
+	"ldrb	r12,[%1]		@ r12= *src			\n"
+	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
+	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
+	"add	%2, r12, %2		@ %2 = cov += r12		\n"
+	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
+	"beq	1b			@ if coverage == 0 loop back	\n"
+	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
+	"mul	r10,r14,r10		@ r10= ca *= alpha		\n"
+	"and	r7, r8, r9		@ r7 = dga = drb & MASK		\n"
+	"mov	r10,r10,LSR #8		@ r10= ca >>= 8			\n"
+	"and	r9, r8, r9, LSL #8	@ r9 = drb = (drb<<8) & MASK	\n"
+	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
+	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
+	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
+	"subs	%3, %3, #1		@ len--				\n"
+	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
+	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
+	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
+	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
+	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
+	"bgt	2b							\n"
+	"b	9f							\n"
+	"@ --- Solid alpha loop	---------------------------------------	\n"
+	"3:	@ Loop used when coverage == 256			\n" /* reached from "beq 3b" below, which is taken when cov == 255 */
+	"orr	r9, %0, r6, LSL #8	@ r9 = rgba			\n"
+	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
+	"4:	@ Loop used for when coverage*alpha == 0		\n"
+	"subs	%3, %3, #1		@ len--				\n"
+	"ble	9f							\n"
+	"5:								\n"
+	"ldrb	r12,[%1]		@ r12= *src			\n"
+	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
+	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
+	"add	%2, r12, %2		@ %2 = cov += r12		\n"
+	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
+	"beq	4b			@ if coverage == 0 loop back	\n"
+	"cmp	%2, #255		@ if coverage == solid		\n"
+	"beq	3b			@	loop back		\n"
+	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
+	"and	r7, r8, r9		@ r7 = dga = drb & MASK		\n"
+	"and	r9, r8, r9, LSL #8	@ r9 = dga = (drb<<8) & MASK	\n" /* r9 holds drb here, exactly as in the first loop */
+	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
+	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
+	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
+	"subs	%3, %3, #1		@ len--				\n"
+	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
+	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
+	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
+	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
+	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
+	"bgt	5b							\n"
+	"9:				@ End				\n"
+	:
+	"+r" (rgba), /* %0: pointer on entry, replaced by the loaded colour word and then by rb */
+	"+r" (src), /* %1: advanced one byte per pixel */
+	"+r" (cov), /* %2: running coverage, kept masked to 0..255 */
+	"+r" (len), /* %3: remaining pixel count */
+	"+r" (dst) /* %4: advanced four bytes per pixel */
+	:
+	:
+	"r5","r6","r7","r8","r9","r10","r11","r12","r14","memory","cc" /* fixed scratch registers named in the code above */
+	);
+}
+
+static void loadtile8_arm(byte * restrict src, int sw, byte * restrict dst, int dw, int w, int h, int pad)
+{ /* Copies h rows of w source bytes from src to dst (stepping by sw / dw per row); pad selects how often a 255 byte is inserted. */
+	if ((h == 0) || (w == 0))
+		return;
+
+	switch (pad)
+	{
+	case 0: /* no 255 bytes inserted: plain row-by-row copy */
+		while (h--)
+		{
+			memcpy(dst, src, w);
+			src += sw;
+			dst += dw;
+		}
+		break;
+
+	case 1: /* every source byte is followed by one 255 byte */
+		sw -= w; /* the inner loop already advances src by w */
+		dw -= w<<1; /* the inner loop already advances dst by 2*w */
+		asm volatile(
+			"MOV	r11,#255				\n"
+			"1:						\n"
+			"MOV	r5, %[w]		@ r5 = x = w	\n"
+			"2:						\n"
+			"LDRB	r4, [%[src]], #1	@ r4 = *src++	\n"
+			"SUBS	r5, r5, #1				\n"
+			"STRB	r4, [%[dst]], #1	@ *dst++ = r4	\n"
+			"STRB	r11,[%[dst]], #1	@ *dst++ = 255	\n"
+			"BGT	2b					\n"
+			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
+			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
+			"SUBS	%[h],%[h],#1				\n"
+			"BGT	1b					\n"
+			:
+			[src]	"+r" (src),
+			[sw]	"+r" (sw),
+			[dst]	"+r" (dst),
+			[dw]	"+r" (dw),
+			[h]	"+r" (h),
+			[w]	"+r" (w)
+			:
+			:
+			"r4","r5","r11","memory","cc"
+			);
+		break;
+
+	case 3: /* every three source bytes are followed by one 255 byte */
+		sw -= w; /* dw is left alone: writes go through the r8 copy, so dst only moves by dw per row */
+		asm volatile(
+			"MOV	r11,#255				\n"
+			"1:						\n"
+			"MOV	r5, %[w]		@ r5 = x = w	\n"
+			"MOV	r8, %[dst]		@ r8 = dp = dst	\n"
+			"2:						\n"
+			"LDRB	r4, [%[src]], #1	@ r4 = *src++	\n"
+			"LDRB	r6, [%[src]], #1	@ r6 = *src++	\n"
+			"LDRB	r7, [%[src]], #1	@ r7 = *src++	\n"
+			"SUBS	r5, r5, #3				\n" /* NOTE(review): presumably w is a multiple of 3; otherwise the last group reads past w bytes - confirm with callers */
+			"STRB	r4, [r8], #1		@ *dp++ = r4	\n"
+			"STRB	r6, [r8], #1		@ *dp++ = r6	\n"
+			"STRB	r7, [r8], #1		@ *dp++ = r7	\n"
+			"STRB	r11,[r8], #1		@ *dp++ = 255	\n"
+			"BGT	2b					\n"
+			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
+			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
+			"SUBS	%[h],%[h],#1				\n"
+			"BGT	1b					\n"
+			:
+			[src]	"+r" (src),
+			[sw]	"+r" (sw),
+			[dst]	"+r" (dst),
+			[dw]	"+r" (dw),
+			[h]	"+r" (h),
+			[w]	"+r" (w)
+			:
+			:
+			"r4","r5","r6","r7","r8","r11","memory","cc"
+			);
+		break;
+
+	default: /* generic pad: a 255 byte is written BEFORE each group of pad source bytes (cases 1 and 3 write it after) */
+		sw -= w; /* dw is left alone: writes go through the r7 copy of dst */
+		asm volatile(
+			"mov	r9,#255					\n"
+			"1:						\n"
+			"mov	r7, %[dst]	@ r7 = dp = dst		\n"
+			"mov	r8, #1		@ r8 = tpad = 1		\n"
+			"mov	r14,%[w]	@ r11= x = w		\n" /* the column counter is actually r14, not r11 */
+			"2:						\n"
+			"ldrb	r10,[%[src]],#1				\n"
+			"subs	r8, r8, #1				\n"
+			"moveq	r8, %[pad]				\n" /* tpad hit zero: reload it and emit the 255 byte first */
+			"streqb	r9, [r7], #1				\n"
+			"strb	r10,[r7], #1				\n"
+			"subs	r14,r14, #1				\n"
+			"bgt	2b					\n"
+			"add	%[src],%[src],%[sw]			\n"
+			"add	%[dst],%[dst],%[dw]			\n"
+			"subs	%[h], %[h], #1				\n"
+			"bgt	1b					\n"
+			:
+			[src]	"+r" (src),
+			[sw]	"+r" (sw),
+			[dst]	"+r" (dst),
+			[dw]	"+r" (dw),
+			[h]	"+r" (h),
+			[w]	"+r" (w),
+			[pad]	"+r" (pad)
+			:
+			:
+			"r7","r8","r9","r10","r14","memory","cc"
+			);
+		break;
+	}
+}
+
+void
+fz_acceleratearch(void)
+{ /* Assign the ARM implementations to the fz_* render hooks. */
+	fz_path_w4i1o4 = path_w4i1o4_arm; /* inline-asm routine defined in this file */
+	fz_loadtile8 = loadtile8_arm; /* inline-asm routine defined in this file */
+	fz_srow4 = fz_srow4_arm; /* extern, declared near the top of this file */
+	fz_scol4 = fz_scol4_arm; /* extern, declared near the top of this file */
+}
+
+#endif
diff --git a/draw/arch_port.c b/draw/arch_port.c
new file mode 100644
index 00000000..c7be977a
--- /dev/null
+++ b/draw/arch_port.c
@@ -0,0 +1,486 @@
+#include "fitz.h"
+
+typedef unsigned char byte;
+
+/* These C implementations use SWAR (SIMD-within-a-register) techniques. */
+
+#if 0 /* TODO: move into porterduff.c functions (presumably draw_paint.c after this rename - confirm) */
+
+#define MASK 0xFF00FF00; /* NOTE(review): the ';' is part of the expansion; this would only compile because every use below ends a statement */
+
+static void
+path_w4i1o4_32bit(byte *rgba,
+	byte * restrict src, byte cov, int len, byte * restrict dst)
+{
+	/* COLOR * coverage + DST * (256-coverage) = (COLOR - DST)*coverage + DST*256 */
+	unsigned int *dst32 = (unsigned int *)(void *)dst; /* dst is processed one 32-bit word per pixel */
+	int alpha = rgba[3];
+	unsigned int rb = rgba[0] | (rgba[2] << 16); /* bytes 0 and 2 of rgba, 16 bits apart */
+	unsigned int ga = rgba[1] | 0xFF0000; /* byte 1 of rgba, with a constant 0xFF in the upper half */
+
+	if (alpha == 0)
+		return;
+
+	if (alpha != 255)
+	{
+		alpha += alpha>>7; /* alpha is now in the 0...256 range */
+		while (len--)
+		{
+			unsigned int ca, drb, dga, crb, cga;
+			cov += *src; *src++ = 0; /* add the source byte to the running coverage, then clear it */
+			ca = cov + (cov>>7); /* ca is in 0...256 range */
+			ca = (ca*alpha)>>8; /* ca is in 0...256 range */
+			drb = *dst32++;
+			if (ca != 0)
+			{
+				dga = drb & MASK; /* split dst into two byte pairs so both halves blend in one multiply */
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
+				crb = rb - (drb>>8);
+				dga += cga * ca;
+				drb += crb * ca;
+				dga &= MASK;
+				drb &= MASK;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb; /* dst32 was already advanced, so [-1] is the pixel just read */
+			}
+		}
+	}
+	else
+	{
+		while (len--)
+		{
+			unsigned int ca, drb, dga, crb, cga;
+			cov += *src; *src++ = 0; /* add the source byte to the running coverage, then clear it */
+			ca = cov + (cov>>7); /* ca is in 0...256 range */
+			drb = *dst32++;
+			if (ca == 0)
+				continue;
+			if (ca == 255) /* NOTE(review): ca is 256 for cov == 255, so this only matches cov == 254 - confirm 256 was not intended (img_w4i1o4_32bit tests 256) */
+			{
+				drb = (ga<<8) | rb;
+			}
+			else
+			{
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
+				crb = rb - (drb>>8);
+				dga += cga * ca;
+				drb += crb * ca;
+				dga &= MASK;
+				drb &= MASK;
+				drb = dga |(drb>>8);
+			}
+			dst32[-1] = drb;
+		}
+	}
+}
+
+static void
+text_w4i1o4_32bit(byte *rgba,
+	byte * restrict src, int srcw,
+	byte * restrict dst, int dstw, int w0, int h)
+{ /* Blends the rgba colour into a w0 x h block of 32-bit dst pixels, using one src byte per pixel as the blend weight. */
+	unsigned int *dst32 = (unsigned int *)(void *)dst;
+	unsigned int alpha = rgba[3];
+	unsigned int rb = rgba[0] | (rgba[2] << 16); /* bytes 0 and 2 of rgba, 16 bits apart */
+	unsigned int ga = rgba[1] | 0xFF0000; /* byte 1 of rgba, with a constant 0xFF in the upper half */
+
+	if (alpha == 0)
+		return;
+
+	srcw -= w0; /* the inner loop already advances src by w0 bytes per row */
+	dstw = (dstw>>2)-w0; /* dst32 steps in 32-bit words, so dstw is divided by 4 before removing the w0 words already consumed */
+
+	if (alpha != 255)
+	{
+		alpha += alpha>>7; /* alpha is now in the 0...256 range */
+		while (h--)
+		{
+			int w = w0;
+			while (w--)
+			{
+				unsigned int ca, drb, dga, crb, cga;
+				ca = *src++;
+				drb = *dst32++;
+				ca += ca>>7; /* 0...255 -> 0...256 */
+				ca = (ca*alpha)>>8;
+				if (ca == 0)
+					continue; /* pointers were already advanced above, so skipping is safe */
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
+				crb = rb - (drb>>8);
+				dga += cga * ca;
+				drb += crb * ca;
+				dga &= MASK;
+				drb &= MASK;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
+			}
+			src += srcw;
+			dst32 += dstw;
+		}
+	}
+	else
+	{
+		while (h--)
+		{
+			int w = w0;
+			while (w--)
+			{
+				unsigned int ca, drb, dga, crb, cga;
+				ca = *src++;
+				drb = *dst32++;
+				ca += ca>>7; /* 0...255 -> 0...256 */
+				if (ca == 0)
+					continue;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
+				crb = rb - (drb>>8);
+				dga += cga * ca;
+				drb += crb * ca;
+				dga &= MASK;
+				drb &= MASK;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
+			}
+			src += srcw;
+			dst32 += dstw;
+		}
+	}
+}
+
+static void
+img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
+	fz_pixmap *image, int u, int v, int fa, int fb)
+{ /* For each of len pixels: accumulate coverage from src, blend four neighbouring 32-bit image samples around (u,v), then blend that result into dst. */
+	unsigned int *dst32 = (unsigned int *)(void *)dst;
+	unsigned int *samples = (unsigned int *)(void *)image->samples; /* image samples are read one 32-bit word at a time */
+	int w = image->w;
+	int h = image->h-1; /* note: h is the last row index, not the height */
+
+	while (len--)
+	{
+		unsigned int a, a1, d, d1;
+		int sa;
+		cov += *src; *src = 0; src++; /* add the source byte to the running coverage, then clear it */
+		/* (a,a1) = sampleargb(samples, w, h, u, v, argb); */
+		{
+			int ui, ui1, vi, vi1, ud, vd;
+			unsigned int b, b1, c, c1;
+			ui1 = 1; /* offset to the horizontal neighbour; zeroed when clamped */
+			ui = u >> 16; /* bits 16 and up of u select the column */
+			if (ui < 0)
+			{
+				ui = 0;
+				ui1 = 0;
+			}
+			else if (ui >= w-1)
+			{
+				ui = w-1;
+				ui1 = 0;
+			}
+			vi1 = w; /* offset to the sample one row further on; zeroed when clamped */
+			vi = v >> 16; /* bits 16 and up of v select the row */
+			if (vi < 0)
+			{
+				vi = 0;
+				vi1 = 0;
+			}
+			else if (vi >= h)
+			{
+				vi = h;
+				vi1 = 0;
+			}
+			ui += vi*w; /* ui is now the linear sample index */
+			a = samples[ui];
+			b = samples[ui + ui1];
+			c = samples[ui + vi1];
+			d = samples[ui + ui1 + vi1];
+			ud = (u>>8) & 0xFF; /* bits 8..15 of u/v are the blend weights */
+			vd = (v>>8) & 0xFF;
+			ud = FZ_EXPAND(ud);
+			vd = FZ_EXPAND(vd);
+			/* (a,a1) = blend(a,b,ud) */
+			a1 = a & MASK;
+			a = (a<<8) & MASK;
+			b1 = (b>>8) & ~MASK;
+			b = b & ~MASK;
+			a = ((b -(a >>8)) * ud + a ) & MASK;
+			a1 = ((b1-(a1>>8)) * ud + a1) & MASK;
+			/* (c,c1) = blend(c,d,ud) */
+			c1 = c & MASK;
+			c = (c<<8) & MASK;
+			d1 = (d>>8) & ~MASK;
+			d = d & ~MASK;
+			c = ((d -(c >>8)) * ud + c ) & MASK;
+			c1 = ((d1-(c1>>8)) * ud + c1) & MASK;
+			/* (a,a1) = blend((a,a1),(c,c1),vd) */
+			a = (((c >>8)-(a >>8)) * vd + a ) & MASK;
+			a1 = (((c1>>8)-(a1>>8)) * vd + a1) & MASK;
+		}
+		sa = (a1>>24); /* top byte of the blended sample */
+		sa = FZ_COMBINE(FZ_EXPAND(sa), FZ_EXPAND(cov));
+		a1 |= 0xFF000000;
+		d = *dst32++;
+		d1 = d & MASK;
+		d = (d<<8) & MASK;
+		a = (((a >>8)-(d >>8)) * sa + d ) & MASK;
+		a1 = (((a1>>8)-(d1>>8)) * sa + d1) & MASK;
+		dst32[-1] = (a>>8) | a1; /* dst32 was already advanced, so [-1] is the pixel just read */
+		u += fa; /* step the sample position for the next pixel */
+		v += fb;
+	}
+}
+
+static void
+img_w4i1o4_32bit(byte *rgba, byte * restrict src, byte cov, int len,
+	byte * restrict dst, fz_pixmap *image, int u, int v, int fa, int fb)
+{ /* For each of len pixels: accumulate coverage from src, blend four neighbouring byte samples of image around (u,v) into one value, and use it (with coverage and alpha) to blend the rgba colour into dst. */
+	byte *samples = image->samples; /* image samples are read one byte at a time */
+	int w = image->w;
+	int h = image->h-1; /* note: h is the last row index, not the height */
+	int alpha = FZ_EXPAND(rgba[3]);
+	unsigned int rb = rgba[0] | (rgba[2] << 16); /* bytes 0 and 2 of rgba, 16 bits apart */
+	unsigned int ga = rgba[1] | 0xFF0000; /* byte 1 of rgba, with a constant 0xFF in the upper half */
+	unsigned int *dst32 = (unsigned int *)(void *)dst;
+
+	if (alpha == 0)
+		return;
+	if (alpha != 256)
+	{
+		while (len--)
+		{
+			unsigned int ca, drb, dga, crb, cga;
+			unsigned int a, b;
+			cov += *src; *src = 0; src++; /* add the source byte to the running coverage, then clear it */
+			drb = *dst32++;
+			ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
+			if (ca != 0)
+			{
+				int ui, ui1, vi, vi1, ud, vd;
+				/* a = samplemask(samples, w, h, u, v); */
+				ui1 = 1; /* offset to the horizontal neighbour; zeroed when clamped */
+				ui = u >> 16;
+				if (ui < 0)
+				{
+					ui = 0;
+					ui1 = 0;
+				}
+				else if (ui >= w-1)
+				{
+					ui = w-1;
+					ui1 = 0;
+				}
+				vi1 = w; /* offset to the sample one row further on; zeroed when clamped */
+				vi = v >> 16;
+				if (vi < 0)
+				{
+					vi = 0;
+					vi1 = 0;
+				}
+				else if (vi >= h)
+				{
+					vi = h;
+					vi1 = 0;
+				}
+				ui += vi*w; /* ui is now the linear sample index */
+				a = samples[ui];
+				b = samples[ui + ui1];
+				a |= samples[ui + vi1]<<16; /* pack the sample one row on into the upper half so both rows blend together */
+				b |= samples[ui + ui1 + vi1]<<16;
+				ud = (u>>8) & 0xFF; /* bits 8..15 of u/v are the blend weights */
+				vd = (v>>8) & 0xFF;
+				ud = FZ_EXPAND(ud);
+				vd = FZ_EXPAND(vd);
+				/* a = blend(a,b,ud) */
+				a = ((b-a) * ud + (a<<8)) & MASK;
+				/* a = blend(a,a>>16,vd) */
+				a = (((a>>24)-(a>>8)) * vd + a);
+				a = (a>>8) & 0xFF;
+				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
+			}
+			if (ca != 0) /* re-tested: the sampled value may have reduced ca to 0 */
+			{
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
+				crb = rb - (drb>>8);
+				dga += cga * ca;
+				drb += crb * ca;
+				dga &= MASK;
+				drb &= MASK;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
+			}
+			u += fa; /* step the sample position for the next pixel */
+			v += fb;
+		}
+	}
+	else
+	{
+		while (len--)
+		{
+			unsigned int ca, drb, dga, crb, cga;
+			unsigned int a, b;
+			cov += *src; *src = 0; src++; /* add the source byte to the running coverage, then clear it */
+			drb = *dst32++;
+			if (cov != 0)
+			{
+				int ui, ui1, vi, vi1, ud, vd;
+				/* a = samplemask(samples, w, h, u, v); */
+				ui1 = 1; /* offset to the horizontal neighbour; zeroed when clamped */
+				ui = u >> 16;
+				if (ui < 0)
+				{
+					ui = 0;
+					ui1 = 0;
+				}
+				else if (ui >= w-1)
+				{
+					ui = w-1;
+					ui1 = 0;
+				}
+				vi1 = w; /* offset to the sample one row further on; zeroed when clamped */
+				vi = v >> 16;
+				if (vi < 0)
+				{
+					vi = 0;
+					vi1 = 0;
+				}
+				else if (vi >= h)
+				{
+					vi = h;
+					vi1 = 0;
+				}
+				ui += vi*w; /* ui is now the linear sample index */
+				a = samples[ui];
+				b = samples[ui + ui1];
+				a |= samples[ui + vi1]<<16; /* pack the sample one row on into the upper half so both rows blend together */
+				b |= samples[ui + ui1 + vi1]<<16;
+				ud = (u>>8) & 0xFF; /* bits 8..15 of u/v are the blend weights */
+				vd = (v>>8) & 0xFF;
+				ud = FZ_EXPAND(ud);
+				vd = FZ_EXPAND(vd);
+				/* a = blend(a,b,ud) */
+				a = ((b-a) * ud + (a<<8)) & MASK;
+				/* a = blend(a,a>>16,vd) */
+				a = (((a>>24)-(a>>8)) * vd + a);
+				a = (a>>8) & 0xFF;
+				ca = FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(a));
+				if (ca != 0)
+				{
+					if (ca == 256) /* full weight: store the colour directly */
+					{
+						drb = (ga<<8) | rb;
+					}
+					else
+					{
+						dga = drb & MASK;
+						drb = (drb<<8) & MASK;
+						cga = ga - (dga>>8);
+						crb = rb - (drb>>8);
+						dga += cga * ca;
+						drb += crb * ca;
+						dga &= MASK;
+						drb &= MASK;
+						drb = dga | (drb>>8);
+					}
+					dst32[-1] = drb;
+				}
+			}
+			u += fa; /* step the sample position for the next pixel */
+			v += fb;
+		}
+	}
+}
+
+static void
+img_1o1_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
+	fz_pixmap *image, int u, int v, int fa, int fb)
+{ /* For each of len pixels: accumulate coverage from src, blend four neighbouring byte samples of image around (u,v), combine with coverage, and update one dst byte. */
+	byte *samples = image->samples; /* image samples are read one byte at a time */
+	int w = image->w;
+	int h = image->h-1; /* note: h is the last row index, not the height */
+
+	while (len--)
+	{
+		unsigned int a, b;
+		cov += *src; *src = 0; src++; /* add the source byte to the running coverage, then clear it */
+		if (cov != 0)
+		{
+			int ui, ui1, vi, vi1, ud, vd;
+			/* sa = samplemask(samples, w, h, u, v); */
+			ui1 = 1; /* offset to the horizontal neighbour; zeroed when clamped */
+			ui = u >> 16;
+			if (ui < 0)
+			{
+				ui = 0;
+				ui1 = 0;
+			}
+			else if (ui >= w-1)
+			{
+				ui = w-1;
+				ui1 = 0;
+			}
+			vi1 = w; /* offset to the sample one row further on; zeroed when clamped */
+			vi = v >> 16;
+			if (vi < 0)
+			{
+				vi = 0;
+				vi1 = 0;
+			}
+			else if (vi >= h)
+			{
+				vi = h;
+				vi1 = 0;
+			}
+			ui += vi*w; /* ui is now the linear sample index */
+			a = samples[ui];
+			b = samples[ui + ui1];
+			a |= samples[ui + vi1]<<16; /* pack the sample one row on into the upper half so both rows blend together */
+			b |= samples[ui + ui1 + vi1]<<16;
+			ud = (u>>8) & 0xFF; /* bits 8..15 of u/v are the blend weights */
+			vd = (v>>8) & 0xFF;
+			ud = FZ_EXPAND(ud);
+			vd = FZ_EXPAND(vd);
+			/* a = blend(a,b,ud) */
+			a = ((b-a) * ud + (a<<8)) & MASK;
+			/* a = blend(a,a>>16,vd) */
+			a = (((a>>24)-(a>>8)) * vd + a);
+			a = (a>>8) & 0xFF;
+			a = FZ_COMBINE(FZ_EXPAND(a), FZ_EXPAND(cov));
+			if (a != 0)
+			{
+				if (a == 256) /* full weight: store 255 directly */
+					dst[0] = 255;
+				else
+					dst[0] = FZ_BLEND(255, dst[0], a);
+			}
+		}
+		dst++; /* dst advances even when nothing was written */
+		u += fa; /* step the sample position for the next pixel */
+		v += fb;
+	}
+}
+
+#endif
+
+void fz_accelerate(void)
+{ /* Installs accelerated routines; at present the only live action is the fz_acceleratearch() call under HAVE_CPUDEP. */
+	if (sizeof(int) == 4 && sizeof(unsigned int) == 4 && !fz_isbigendian())
+	{ /* assignments are commented out: the *_32bit routines sit inside the #if 0 block above */
+//		fz_path_w4i1o4 = path_w4i1o4_32bit;
+//		fz_text_w4i1o4 = text_w4i1o4_32bit;
+//		fz_img_4o4 = img_4o4_32bit;
+//		fz_img_w4i1o4 = img_w4i1o4_32bit;
+//		fz_img_1o1 = img_1o1_32bit;
+	}
+
+#ifdef HAVE_CPUDEP
+	fz_acceleratearch();
+#endif /* HAVE_CPUDEP */
+}
diff --git a/draw/archarm.c b/draw/archarm.c
deleted file mode 100644
index c601a70c..00000000
--- a/draw/archarm.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * ARM specific render optims live here
- */
-
-#include "fitz.h"
-
-typedef unsigned char byte;
-
-/* always surround cpu specific code with HAVE_XXX */
-#ifdef ARCH_ARM
-
-/* from imagescalearm.s */
-extern void fz_srow4_arm(byte *src, byte *dst, int w, int denom);
-extern void fz_scol4_arm(byte *src, byte *dst, int w, int denom);
-
-static void
-path_w4i1o4_arm(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst)
-{
-	/* The ARM code here is a hand coded implementation of the optimized C version. */
-
-	if (len <= 0)
-		return;
-
-	asm volatile(
-	"ldr	%0, [%0]		@ %0 = rgba			\n"
-	"mov	r11,#0							\n"
-	"mov	r8, #0xFF00						\n"
-	"mov	r14,%0,lsr #24		@ r14= alpha			\n"
-	"orr	%0, %0, #0xFF000000	@ %0 = rgba |= 0xFF000000	\n"
-	"orr	r8, r8, r8, LSL #16	@ r8 = 0xFF00FF00		\n"
-	"adds	r14,r14,r14,LSR #7	@ r14 = alpha += alpha>>7	\n"
-	"beq	9f			@ if (alpha == 0) bale		\n"
-	"and	r6, %0, r8		@ r6 = ga<<8			\n"
-	"bic	%0, %0, r8		@ %0 = rb			\n"
-	"mov	r6, r6, LSR #8		@ r6 = ga			\n"
-	"cmp	r14,#256		@ if (alpha == 256)		\n"
-	"beq	4f			@	no-alpha loop		\n"
-	"B	2f			@ enter the loop		\n"
-	"1:	@ Loop used for when coverage*alpha == 0		\n"
-	"subs	%3, %3, #1		@ len--				\n"
-	"ble	9f							\n"
-	"2:								\n"
-	"ldrb	r12,[%1]		@ r12= *src			\n"
-	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
-	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
-	"add	%2, r12, %2		@ %2 = cov += r12		\n"
-	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
-	"beq	1b			@ if coverage == 0 loop back	\n"
-	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
-	"mul	r10,r14,r10		@ r10= ca *= alpha		\n"
-	"and	r7, r8, r9		@ r7 = dga = drb & MASK		\n"
-	"mov	r10,r10,LSR #8		@ r10= ca >>= 8			\n"
-	"and	r9, r8, r9, LSL #8	@ r9 = drb = (drb<<8) & MASK	\n"
-	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
-	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
-	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
-	"subs	%3, %3, #1		@ len--				\n"
-	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
-	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
-	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
-	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
-	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
-	"bgt	2b							\n"
-	"b	9f							\n"
-	"@ --- Solid alpha loop	---------------------------------------	\n"
-	"3:	@ Loop used when coverage == 256			\n"
-	"orr	r9, %0, r6, LSL #8	@ r9 = rgba			\n"
-	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
-	"4:	@ Loop used for when coverage*alpha == 0		\n"
-	"subs	%3, %3, #1		@ len--				\n"
-	"ble	9f							\n"
-	"5:								\n"
-	"ldrb	r12,[%1]		@ r12= *src			\n"
-	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
-	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
-	"add	%2, r12, %2		@ %2 = cov += r12		\n"
-	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
-	"beq	4b			@ if coverage == 0 loop back	\n"
-	"cmp	%2, #255		@ if coverage == solid		\n"
-	"beq	3b			@	loop back		\n"
-	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
-	"and	r7, r8, r9		@ r7 = dga = drb & MASK		\n"
-	"and	r9, r8, r9, LSL #8	@ r9 = dga = (drb<<8) & MASK	\n"
-	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
-	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
-	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
-	"subs	%3, %3, #1		@ len--				\n"
-	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
-	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
-	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
-	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
-	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
-	"bgt	5b							\n"
-	"9:				@ End				\n"
-	:
-	"+r" (rgba),
-	"+r" (src),
-	"+r" (cov),
-	"+r" (len),
-	"+r" (dst)
-	:
-	:
-	"r5","r6","r7","r8","r9","r10","r11","r12","r14","memory","cc"
-	);
-}
-
-static void loadtile8_arm(byte * restrict src, int sw, byte * restrict dst, int dw, int w, int h, int pad)
-{
-	if ((h == 0) || (w == 0))
-		return;
-
-	switch (pad)
-	{
-	case 0:
-		while (h--)
-		{
-			memcpy(dst, src, w);
-			src += sw;
-			dst += dw;
-		}
-		break;
-
-	case 1:
-		sw -= w;
-		dw -= w<<1;
-		asm volatile(
-			"MOV	r11,#255				\n"
-			"1:						\n"
-			"MOV	r5, %[w]		@ r5 = x = w	\n"
-			"2:						\n"
-			"LDRB	r4, [%[src]], #1	@ r4 = *src++	\n"
-			"SUBS	r5, r5, #1				\n"
-			"STRB	r4, [%[dst]], #1	@ *dst++ = r4	\n"
-			"STRB	r11,[%[dst]], #1	@ *dst++ = 255	\n"
-			"BGT	2b					\n"
-			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
-			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
-			"SUBS	%[h],%[h],#1				\n"
-			"BGT	1b					\n"
-			:
-			[src]	"+r" (src),
-			[sw]	"+r" (sw),
-			[dst]	"+r" (dst),
-			[dw]	"+r" (dw),
-			[h]	"+r" (h),
-			[w]	"+r" (w)
-			:
-			:
-			"r4","r5","r11","memory","cc"
-			);
-		break;
-
-	case 3:
-		sw -= w;
-		asm volatile(
-			"MOV	r11,#255				\n"
-			"1:						\n"
-			"MOV	r5, %[w]		@ r5 = x = w	\n"
-			"MOV	r8, %[dst]		@ r8 = dp = dst	\n"
-			"2:						\n"
-			"LDRB	r4, [%[src]], #1	@ r4 = *src++	\n"
-			"LDRB	r6, [%[src]], #1	@ r6 = *src++	\n"
-			"LDRB	r7, [%[src]], #1	@ r7 = *src++	\n"
-			"SUBS	r5, r5, #3				\n"
-			"STRB	r4, [r8], #1		@ *dp++ = r4	\n"
-			"STRB	r6, [r8], #1		@ *dp++ = r6	\n"
-			"STRB	r7, [r8], #1		@ *dp++ = r7	\n"
-			"STRB	r11,[r8], #1		@ *dp++ = 255	\n"
-			"BGT	2b					\n"
-			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
-			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
-			"SUBS	%[h],%[h],#1				\n"
-			"BGT	1b					\n"
-			:
-			[src]	"+r" (src),
-			[sw]	"+r" (sw),
-			[dst]	"+r" (dst),
-			[dw]	"+r" (dw),
-			[h]	"+r" (h),
-			[w]	"+r" (w)
-			:
-			:
-			"r4","r5","r6","r7","r8","r11","memory","cc"
-			);
-		break;
-
-	default:
-		sw -= w;
-		asm volatile(
-			"mov	r9,#255					\n"
-			"1:						\n"
-			"mov	r7, %[dst]	@ r7 = dp = dst		\n"
-			"mov	r8, #1		@ r8 = tpad = 1		\n"
-			"mov	r14,%[w]	@ r11= x = w		\n"
-			"2:						\n"
-			"ldrb	r10,[%[src]],#1				\n"
-			"subs	r8, r8, #1				\n"
-			"moveq	r8, %[pad]				\n"
-			"streqb	r9, [r7], #1				\n"
-			"strb	r10,[r7], #1				\n"
-			"subs	r14,r14, #1				\n"
-			"bgt	2b					\n"
-			"add	%[src],%[src],%[sw]			\n"
-			"add	%[dst],%[dst],%[dw]			\n"
-			"subs	%[h], %[h], #1				\n"
-			"bgt	1b					\n"
-			:
-			[src]	"+r" (src),
-			[sw]	"+r" (sw),
-			[dst]	"+r" (dst),
-			[dw]	"+r" (dw),
-			[h]	"+r" (h),
-			[w]	"+r" (w),
-			[pad]	"+r" (pad)
-			:
-			:
-			"r7","r8","r9","r10","r14","memory","cc"
-			);
-		break;
-	}
-}
-
-void
-fz_acceleratearch(void)
-{
-	fz_path_w4i1o4 = path_w4i1o4_arm;
-	fz_loadtile8 = loadtile8_arm;
-	fz_srow4 = fz_srow4_arm;
-	fz_scol4 = fz_scol4_arm;
-}
-
-#endif
diff --git a/draw/archport.c b/draw/archport.c
deleted file mode 100644
index c7be977a..00000000
--- a/draw/archport.c
+++ /dev/null
@@ -1,486 +0,0 @@
-#include "fitz.h"
-
-typedef unsigned char byte;
-
-/* These C implementations use SWAR (SIMD-within-a-register) techniques. */
-
-#if 0 /* TODO: move into porterduff.c functions */
-
-#define MASK 0xFF00FF00;
-
-static void
-path_w4i1o4_32bit(byte *rgba,
-	byte * restrict src, byte cov, int len, byte * restrict dst)
-{
-	/* COLOR * coverage + DST * (256-coverage) = (COLOR - DST)*coverage + DST*256 */
-	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	int alpha = rgba[3];
-	unsigned int rb = rgba[0] | (rgba[2] << 16);
-	unsigned int ga = rgba[1] | 0xFF0000;
-
-	if (alpha == 0)
-		return;
-
-	if (alpha != 255)
-	{
-		alpha += alpha>>7; /* alpha is now in the 0...256 range */
-		while (len--)
-		{
-			unsigned int ca, drb, dga, crb, cga;
-			cov += *src; *src++ = 0;
-			ca = cov + (cov>>7); /* ca is in 0...256 range */
-			ca = (ca*alpha)>>8; /* ca is is in 0...256 range */
-			drb = *dst32++;
-			if (ca != 0)
-			{
-				dga = drb & MASK;
-				drb = (drb<<8) & MASK;
-				cga = ga - (dga>>8);
-				crb = rb - (drb>>8);
-				dga += cga * ca;
-				drb += crb * ca;
-				dga &= MASK;
-				drb &= MASK;
-				drb = dga | (drb>>8);
-				dst32[-1] = drb;
-			}
-		}
-	}
-	else
-	{
-		while (len--)
-		{
-			unsigned int ca, drb, dga, crb, cga;
-			cov += *src; *src++ = 0;
-			ca = cov + (cov>>7); /* ca is in 0...256 range */
-			drb = *dst32++;
-			if (ca == 0)
-				continue;
-			if (ca == 255)
-			{
-				drb = (ga<<8) | rb;
-			}
-			else
-			{
-				dga = drb & MASK;
-				drb = (drb<<8) & MASK;
-				cga = ga - (dga>>8);
-				crb = rb - (drb>>8);
-				dga += cga * ca;
-				drb += crb * ca;
-				dga &= MASK;
-				drb &= MASK;
-				drb = dga |(drb>>8);
-			}
-			dst32[-1] = drb;
-		}
-	}
-}
-
-static void
-text_w4i1o4_32bit(byte *rgba,
-	byte * restrict src, int srcw,
-	byte * restrict dst, int dstw, int w0, int h)
-{
-	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	unsigned int alpha = rgba[3];
-	unsigned int rb = rgba[0] | (rgba[2] << 16);
-	unsigned int ga = rgba[1] | 0xFF0000;
-
-	if (alpha == 0)
-		return;
-
-	srcw -= w0;
-	dstw = (dstw>>2)-w0;
-
-	if (alpha != 255)
-	{
-		alpha += alpha>>7; /* alpha is now in the 0...256 range */
-		while (h--)
-		{
-			int w = w0;
-			while (w--)
-			{
-				unsigned int ca, drb, dga, crb, cga;
-				ca = *src++;
-				drb = *dst32++;
-				ca += ca>>7;
-				ca = (ca*alpha)>>8;
-				if (ca == 0)
-					continue;
-				dga = drb & MASK;
-				drb = (drb<<8) & MASK;
-				cga = ga - (dga>>8);
-				crb = rb - (drb>>8);
-				dga += cga * ca;
-				drb += crb * ca;
-				dga &= MASK;
-				drb &= MASK;
-				drb = dga | (drb>>8);
-				dst32[-1] = drb;
-			}
-			src += srcw;
-			dst32 += dstw;
-		}
-	}
-	else
-	{
-		while (h--)
-		{
-			int w = w0;
-			while (w--)
-			{
-				unsigned int ca, drb, dga, crb, cga;
-				ca = *src++;
-				drb = *dst32++;
-				ca += ca>>7;
-				if (ca == 0)
-					continue;
-				dga = drb & MASK;
-				drb = (drb<<8) & MASK;
-				cga = ga - (dga>>8);
-				crb = rb - (drb>>8);
-				dga += cga * ca;
-				drb += crb * ca;
-				dga &= MASK;
-				drb &= MASK;
-				drb = dga | (drb>>8);
-				dst32[-1] = drb;
-			}
-			src += srcw;
-			dst32 += dstw;
-		}
-	}
-}
-
-static void
-img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
-	fz_pixmap *image, int u, int v, int fa, int fb)
-{
-	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	unsigned int *samples = (unsigned int *)(void *)image->samples;
-	int w = image->w;
-	int h = image->h-1;
-
-	while (len--)
-	{
-		unsigned int a, a1, d, d1;
-		int sa;
-		cov += *src; *src = 0; src++;
-		/* (a,a1) = sampleargb(samples, w, h, u, v, argb); */
-		{
-			int ui, ui1, vi, vi1, ud, vd;
-			unsigned int b, b1, c, c1;
-			ui1 = 1;
-			ui = u >> 16;
-			if (ui < 0)
-			{
-				ui = 0;
-				ui1 = 0;
-			}
-			else if (ui >= w-1)
-			{
-				ui = w-1;
-				ui1 = 0;
-			}
-			vi1 = w;
-			vi = v >> 16;
-			if (vi < 0)
-			{
-				vi = 0;
-				vi1 = 0;
-			}
-			else if (vi >= h)
-			{
-				vi = h;
-				vi1 = 0;
-			}
-			ui += vi*w;
-			a = samples[ui];
-			b = samples[ui + ui1];
-			c = samples[ui + vi1];
-			d = samples[ui + ui1 + vi1];
-			ud = (u>>8) & 0xFF;
-			vd = (v>>8) & 0xFF;
-			ud = FZ_EXPAND(ud);
-			vd = FZ_EXPAND(vd);
-			/* (a,a1) = blend(a,b,ud) */
-			a1 = a & MASK;
-			a = (a<<8) & MASK;
-			b1 = (b>>8) & ~MASK;
-			b = b & ~MASK;
-			a = ((b -(a >>8)) * ud + a ) & MASK;
-			a1 = ((b1-(a1>>8)) * ud + a1) & MASK;
-			/* (c,c1) = blend(c,d,ud) */
-			c1 = c & MASK;
-			c = (c<<8) & MASK;
-			d1 = (d>>8) & ~MASK;
-			d = d & ~MASK;
-			c = ((d -(c >>8)) * ud + c ) & MASK;
-			c1 = ((d1-(c1>>8)) * ud + c1) & MASK;
-			/* (a,a1) = blend((a,a1),(c,c1),vd) */
-			a = (((c >>8)-(a >>8)) * vd + a ) & MASK;
-			a1 = (((c1>>8)-(a1>>8)) * vd + a1) & MASK;
-		}
-		sa = (a1>>24);
-		sa = FZ_COMBINE(FZ_EXPAND(sa), FZ_EXPAND(cov));
-		a1 |= 0xFF000000;
-		d = *dst32++;
-		d1 = d & MASK;
-		d = (d<<8) & MASK;
-		a = (((a >>8)-(d >>8)) * sa + d ) & MASK;
-		a1 = (((a1>>8)-(d1>>8)) * sa + d1) & MASK;
-		dst32[-1] = (a>>8) | a1;
-		u += fa;
-		v += fb;
-	}
-}
-
-static void
-img_w4i1o4_32bit(byte *rgba, byte * restrict src, byte cov, int len,
-	byte * restrict dst, fz_pixmap *image, int u, int v, int fa, int fb)
-{
-	byte *samples = image->samples;
-	int w = image->w;
-	int h = image->h-1;
-	int alpha = FZ_EXPAND(rgba[3]);
-	unsigned int rb = rgba[0] | (rgba[2] << 16);
-	unsigned int ga = rgba[1] | 0xFF0000;
-	unsigned int *dst32 = (unsigned int *)(void *)dst;
-
-	if (alpha == 0)
-		return;
-	if (alpha != 256)
-	{
-		while (len--)
-		{
-			unsigned int ca, drb, dga, crb, cga;
-			unsigned int a, b;
-			cov += *src; *src = 0; src++;
-			drb = *dst32++;
-			ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
-			if (ca != 0)
-			{
-				int ui, ui1, vi, vi1, ud, vd;
-				/* a = samplemask(samples, w, h, u, v); */
-				ui1 = 1;
-				ui = u >> 16;
-				if (ui < 0)
-				{
-					ui = 0;
-					ui1 = 0;
-				}
-				else if (ui >= w-1)
-				{
-					ui = w-1;
-					ui1 = 0;
-				}
-				vi1 = w;
-				vi = v >> 16;
-				if (vi < 0)
-				{
-					vi = 0;
-					vi1 = 0;
-				}
-				else if (vi >= h)
-				{
-					vi = h;
-					vi1 = 0;
-				}
-				ui += vi*w;
-				a = samples[ui];
-				b = samples[ui + ui1];
-				a |= samples[ui + vi1]<<16;
-				b |= samples[ui + ui1 + vi1]<<16;
-				ud = (u>>8) & 0xFF;
-				vd = (v>>8) & 0xFF;
-				ud = FZ_EXPAND(ud);
-				vd = FZ_EXPAND(vd);
-				/* a = blend(a,b,ud) */
-				a = ((b-a) * ud + (a<<8)) & MASK;
-				/* a = blend(a,a>>16,vd) */
-				a = (((a>>24)-(a>>8)) * vd + a);
-				a = (a>>8) & 0xFF;
-				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
-			}
-			if (ca != 0)
-			{
-				dga = drb & MASK;
-				drb = (drb<<8) & MASK;
-				cga = ga - (dga>>8);
-				crb = rb - (drb>>8);
-				dga += cga * ca;
-				drb += crb * ca;
-				dga &= MASK;
-				drb &= MASK;
-				drb = dga | (drb>>8);
-				dst32[-1] = drb;
-			}
-			u += fa;
-			v += fb;
-		}
-	}
-	else
-	{
-		while (len--)
-		{
-			unsigned int ca, drb, dga, crb, cga;
-			unsigned int a, b;
-			cov += *src; *src = 0; src++;
-			drb = *dst32++;
-			if (cov != 0)
-			{
-				int ui, ui1, vi, vi1, ud, vd;
-				/* a = samplemask(samples, w, h, u, v); */
-				ui1 = 1;
-				ui = u >> 16;
-				if (ui < 0)
-				{
-					ui = 0;
-					ui1 = 0;
-				}
-				else if (ui >= w-1)
-				{
-					ui = w-1;
-					ui1 = 0;
-				}
-				vi1 = w;
-				vi = v >> 16;
-				if (vi < 0)
-				{
-					vi = 0;
-					vi1 = 0;
-				}
-				else if (vi >= h)
-				{
-					vi = h;
-					vi1 = 0;
-				}
-				ui += vi*w;
-				a = samples[ui];
-				b = samples[ui + ui1];
-				a |= samples[ui + vi1]<<16;
-				b |= samples[ui + ui1 + vi1]<<16;
-				ud = (u>>8) & 0xFF;
-				vd = (v>>8) & 0xFF;
-				ud = FZ_EXPAND(ud);
-				vd = FZ_EXPAND(vd);
-				/* a = blend(a,b,ud) */
-				a = ((b-a) * ud + (a<<8)) & MASK;
-				/* a = blend(a,a>>16,vd) */
-				a = (((a>>24)-(a>>8)) * vd + a);
-				a = (a>>8) & 0xFF;
-				ca = FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(a));
-				if (ca != 0)
-				{
-					if (ca == 256)
-					{
-						drb = (ga<<8) | rb;
-					}
-					else
-					{
-						dga = drb & MASK;
-						drb = (drb<<8) & MASK;
-						cga = ga - (dga>>8);
-						crb = rb - (drb>>8);
-						dga += cga * ca;
-						drb += crb * ca;
-						dga &= MASK;
-						drb &= MASK;
-						drb = dga | (drb>>8);
-					}
-					dst32[-1] = drb;
-				}
-			}
-			u += fa;
-			v += fb;
-		}
-	}
-}
-
-static void
-img_1o1_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
-	fz_pixmap *image, int u, int v, int fa, int fb)
-{
-	byte *samples = image->samples;
-	int w = image->w;
-	int h = image->h-1;
-
-	while (len--)
-	{
-		unsigned int a, b;
-		cov += *src; *src = 0; src++;
-		if (cov != 0)
-		{
-			int ui, ui1, vi, vi1, ud, vd;
-			/* sa = samplemask(samples, w, h, u, v); */
-			ui1 = 1;
-			ui = u >> 16;
-			if (ui < 0)
-			{
-				ui = 0;
-				ui1 = 0;
-			}
-			else if (ui >= w-1)
-			{
-				ui = w-1;
-				ui1 = 0;
-			}
-			vi1 = w;
-			vi = v >> 16;
-			if (vi < 0)
-			{
-				vi = 0;
-				vi1 = 0;
-			}
-			else if (vi >= h)
-			{
-				vi = h;
-				vi1 = 0;
-			}
-			ui += vi*w;
-			a = samples[ui];
-			b = samples[ui + ui1];
-			a |= samples[ui + vi1]<<16;
-			b |= samples[ui + ui1 + vi1]<<16;
-			ud = (u>>8) & 0xFF;
-			vd = (v>>8) & 0xFF;
-			ud = FZ_EXPAND(ud);
-			vd = FZ_EXPAND(vd);
-			/* a = blend(a,b,ud) */
-			a = ((b-a) * ud + (a<<8)) & MASK;
-			/* a = blend(a,a>>16,vd) */
-			a = (((a>>24)-(a>>8)) * vd + a);
-			a = (a>>8) & 0xFF;
-			a = FZ_COMBINE(FZ_EXPAND(a), FZ_EXPAND(cov));
-			if (a != 0)
-			{
-				if (a == 256)
-					dst[0] = 255;
-				else
-					dst[0] = FZ_BLEND(255, dst[0], a);
-			}
-		}
-		dst++;
-		u += fa;
-		v += fb;
-	}
-}
-
-#endif
-
-void fz_accelerate(void)
-{
-	if (sizeof(int) == 4 && sizeof(unsigned int) == 4 && !fz_isbigendian())
-	{
-//		fz_path_w4i1o4 = path_w4i1o4_32bit;
-//		fz_text_w4i1o4 = text_w4i1o4_32bit;
-//		fz_img_4o4 = img_4o4_32bit;
-//		fz_img_w4i1o4 = img_w4i1o4_32bit;
-//		fz_img_1o1 = img_1o1_32bit;
-	}
-
-#ifdef HAVE_CPUDEP
-	fz_acceleratearch();
-#endif
-}
diff --git a/draw/blendmodes.c b/draw/blendmodes.c
deleted file mode 100644
index ac2e6c27..00000000
--- a/draw/blendmodes.c
+++ /dev/null
@@ -1,370 +0,0 @@
-#include "fitz.h"
-
-/* PDF 1.4 blend modes. These are slow. */
-
-typedef unsigned char byte;
-
-const char *fz_blendnames[] =
-{
-	"Normal",
-	"Multiply",
-	"Screen",
-	"Overlay",
-	"Darken",
-	"Lighten",
-	"ColorDodge",
-	"ColorBurn",
-	"HardLight",
-	"SoftLight",
-	"Difference",
-	"Exclusion",
-	"Hue",
-	"Saturation",
-	"Color",
-	"Luminosity",
-	nil
-};
-
-/* Separable blend modes */
-
-static inline int
-fz_screen_byte(int b, int s)
-{
-	return b + s - fz_mul255(b, s);
-}
-
-static inline int
-fz_hardlight_byte(int b, int s)
-{
-	int s2 = s << 1;
-	if (s <= 127)
-		return fz_mul255(b, s2);
-	else
-		return fz_screen_byte(b, s2 - 255);
-}
-
-static inline int
-fz_overlay_byte(int b, int s)
-{
-	return fz_hardlight_byte(s, b); /* note swapped order */
-}
-
-static inline int
-fz_darken_byte(int b, int s)
-{
-	return MIN(b, s);
-}
-
-static inline int
-fz_lighten_byte(int b, int s)
-{
-	return MAX(b, s);
-}
-
-static inline int
-fz_colordodge_byte(int b, int s)
-{
-	s = 255 - s;
-	if (b == 0)
-		return 0;
-	else if (b >= s)
-		return 255;
-	else
-		return (0x1fe * b + s) / (s << 1);
-}
-
-static inline int
-fz_colorburn_byte(int b, int s)
-{
-	b = 255 - b;
-	if (b == 0)
-		return 255;
-	else if (b >= s)
-		return 0;
-	else
-		return 0xff - (0x1fe * b + s) / (s << 1);
-}
-
-static inline int
-fz_softlight_byte(int b, int s)
-{
-	/* review this */
-	if (s < 128) {
-		return b - fz_mul255(fz_mul255((255 - (s<<1)), b), 255 - b);
-	}
-	else {
-		int dbd;
-		if (b < 64)
-			dbd = fz_mul255(fz_mul255((b << 4) - 12, b) + 4, b);
-		else
-			dbd = (int)sqrtf(255.0f * b);
-		return b + fz_mul255(((s<<1) - 255), (dbd - b));
-	}
-}
-
-static inline int
-fz_difference_byte(int b, int s)
-{
-	return ABS(b - s);
-}
-
-static inline int
-fz_exclusion_byte(int b, int s)
-{
-	return b + s - (fz_mul255(b, s)<<1);
-}
-
-/* Non-separable blend modes */
-
-static inline void
-fz_luminosity_rgb(int *rd, int *gd, int *bd, int rb, int gb, int bb, int rs, int gs, int bs)
-{
-	int delta, scale;
-	int r, g, b, y;
-
-	/* 0.3, 0.59, 0.11 in fixed point */
-	delta = ((rs - rb) * 77 + (gs - gb) * 151 + (bs - bb) * 28 + 0x80) >> 8;
-	r = rb + delta;
-	g = gb + delta;
-	b = bb + delta;
-
-	if ((r | g | b) & 0x100)
-	{
-		y = (rs * 77 + gs * 151 + bs * 28 + 0x80) >> 8;
-		if (delta > 0)
-		{
-			int max;
-			max = MAX(r, MAX(g, b));
-			scale = ((255 - y) << 16) / (max - y);
-		}
-		else
-		{
-			int min;
-			min = MIN(r, MIN(g, b));
-			scale = (y << 16) / (y - min);
-		}
-		r = y + (((r - y) * scale + 0x8000) >> 16);
-		g = y + (((g - y) * scale + 0x8000) >> 16);
-		b = y + (((b - y) * scale + 0x8000) >> 16);
-	}
-
-	*rd = r;
-	*gd = g;
-	*bd = b;
-}
-
-static void
-fz_saturation_rgb(int *rd, int *gd, int *bd, int rb, int gb, int bb, int rs, int gs, int bs)
-{
-	int minb, maxb;
-	int mins, maxs;
-	int y;
-	int scale;
-	int r, g, b;
-
-	minb = MIN(rb, MIN(gb, bb));
-	maxb = MAX(rb, MAX(gb, bb));
-	if (minb == maxb)
-	{
-		/* backdrop has zero saturation, avoid divide by 0 */
-		*rd = gb;
-		*gd = gb;
-		*bd = gb;
-		return;
-	}
-
-	mins = MIN(rs, MIN(gs, bs));
-	maxs = MAX(rs, MAX(gs, bs));
-
-	scale = ((maxs - mins) << 16) / (maxb - minb);
-	y = (rb * 77 + gb * 151 + bb * 28 + 0x80) >> 8;
-	r = y + ((((rb - y) * scale) + 0x8000) >> 16);
-	g = y + ((((gb - y) * scale) + 0x8000) >> 16);
-	b = y + ((((bb - y) * scale) + 0x8000) >> 16);
-
-	if ((r | g | b) & 0x100)
-	{
-		int scalemin, scalemax;
-		int min, max;
-
-		min = MIN(r, MIN(g, b));
-		max = MAX(r, MAX(g, b));
-
-		if (min < 0)
-			scalemin = (y << 16) / (y - min);
-		else
-			scalemin = 0x10000;
-
-		if (max > 255)
-			scalemax = ((255 - y) << 16) / (max - y);
-		else
-			scalemax = 0x10000;
-
-		scale = MIN(scalemin, scalemax);
-		r = y + (((r - y) * scale + 0x8000) >> 16);
-		g = y + (((g - y) * scale + 0x8000) >> 16);
-		b = y + (((b - y) * scale + 0x8000) >> 16);
-	}
-
-	*rd = r;
-	*gd = g;
-	*bd = b;
-}
-
-static void
-fz_color_rgb(int *rr, int *rg, int *rb, int br, int bg, int bb, int sr, int sg, int sb)
-{
-	fz_luminosity_rgb(rr, rg, rb, sr, sg, sb, br, bg, bb);
-}
-
-static void
-fz_hue_rgb(int *rr, int *rg, int *rb, int br, int bg, int bb, int sr, int sg, int sb)
-{
-	int tr, tg, tb;
-	fz_luminosity_rgb(&tr, &tg, &tb, sr, sg, sb, br, bg, bb);
-	fz_saturation_rgb(rr, rg, rb, tr, tg, tb, br, bg, bb);
-}
-
-/* Blending loops */
-
-void
-fz_blendseparable(byte * restrict bp, byte * restrict sp, int n, int w, fz_blendmode blendmode)
-{
-	int k;
-	int n1 = n - 1;
-	while (w--)
-	{
-		int sa = sp[n1];
-		int ba = bp[n1];
-		int saba = fz_mul255(sa, ba);
-
-		/* ugh, division to get non-premul components */
-		int invsa = sa ? 255 * 256 / sa : 0;
-		int invba = ba ? 255 * 256 / ba : 0;
-
-		for (k = 0; k < n1; k++)
-		{
-			int sc = (sp[k] * invsa) >> 8;
-			int bc = (bp[k] * invba) >> 8;
-			int rc;
-
-			switch (blendmode)
-			{
-			default:
-			case FZ_BNORMAL: rc = sc; break;
-			case FZ_BMULTIPLY: rc = fz_mul255(bc, sc); break;
-			case FZ_BSCREEN: rc = fz_screen_byte(bc, sc); break;
-			case FZ_BOVERLAY: rc = fz_overlay_byte(bc, sc); break;
-			case FZ_BDARKEN: rc = fz_darken_byte(bc, sc); break;
-			case FZ_BLIGHTEN: rc = fz_lighten_byte(bc, sc); break;
-			case FZ_BCOLORDODGE: rc = fz_colordodge_byte(bc, sc); break;
-			case FZ_BCOLORBURN: rc = fz_colorburn_byte(bc, sc); break;
-			case FZ_BHARDLIGHT: rc = fz_hardlight_byte(bc, sc); break;
-			case FZ_BSOFTLIGHT: rc = fz_softlight_byte(bc, sc); break;
-			case FZ_BDIFFERENCE: rc = fz_difference_byte(bc, sc); break;
-			case FZ_BEXCLUSION: rc = fz_exclusion_byte(bc, sc); break;
-			}
-
-			bp[k] = fz_mul255(255 - sa, bp[k]) + fz_mul255(255 - ba, sp[k]) + fz_mul255(saba, rc);
-		}
-
-		bp[k] = ba + sa - saba;
-
-		sp += n;
-		bp += n;
-	}
-}
-
-void
-fz_blendnonseparable(byte * restrict bp, byte * restrict sp, int w, fz_blendmode blendmode)
-{
-	while (w--)
-	{
-		int rr, rg, rb;
-
-		int sa = sp[3];
-		int ba = bp[3];
-		int saba = fz_mul255(sa, ba);
-
-		/* ugh, division to get non-premul components */
-		int invsa = sa ? 255 * 256 / sa : 0;
-		int invba = ba ? 255 * 256 / ba : 0;
-
-		int sr = (sp[0] * invsa) >> 8;
-		int sg = (sp[1] * invsa) >> 8;
-		int sb = (sp[2] * invsa) >> 8;
-
-		int br = (bp[0] * invba) >> 8;
-		int bg = (bp[1] * invba) >> 8;
-		int bb = (bp[2] * invba) >> 8;
-
-		switch (blendmode)
-		{
-		default:
-		case FZ_BHUE:
-			fz_hue_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
-			break;
-		case FZ_BSATURATION:
-			fz_saturation_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
-			break;
-		case FZ_BCOLOR:
-			fz_color_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
-			break;
-		case FZ_BLUMINOSITY:
-			fz_luminosity_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
-			break;
-		}
-
-		bp[0] = fz_mul255(255 - sa, bp[0]) + fz_mul255(255 - ba, sp[0]) + fz_mul255(saba, rr);
-		bp[1] = fz_mul255(255 - sa, bp[1]) + fz_mul255(255 - ba, sp[1]) + fz_mul255(saba, rg);
-		bp[2] = fz_mul255(255 - sa, bp[2]) + fz_mul255(255 - ba, sp[2]) + fz_mul255(saba, rb);
-		bp[3] = ba + sa - saba;
-
-		sp += 4;
-		bp += 4;
-	}
-}
-
-void
-fz_blendpixmap(fz_pixmap *dst, fz_pixmap *src, int alpha, fz_blendmode blendmode)
-{
-	unsigned char *sp, *dp;
-	fz_bbox bbox;
-	int x, y, w, h, n;
-
-	/* TODO: fix this hack! */
-	if (alpha < 255)
-	{
-		sp = src->samples;
-		n = src->w * src->h * src->n;
-		while (n--)
-		{
-			*sp = fz_mul255(*sp, alpha);
-			sp++;
-		}
-	}
-
-	bbox = fz_boundpixmap(dst);
-	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
-
-	x = bbox.x0;
-	y = bbox.y0;
-	w = bbox.x1 - bbox.x0;
-	h = bbox.y1 - bbox.y0;
-
-	n = src->n;
-	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * n;
-	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * n;
-
-	assert(src->n == dst->n);
-
-	while (h--)
-	{
-		if (n == 4 && blendmode >= FZ_BHUE)
-			fz_blendnonseparable(dp, sp, w, blendmode);
-		else
-			fz_blendseparable(dp, sp, n, w, blendmode);
-		sp += src->w * n;
-		dp += dst->w * n;
-	}
-}
diff --git a/draw/draw_affine.c b/draw/draw_affine.c
new file mode 100644
index 00000000..044b2938
--- /dev/null
+++ b/draw/draw_affine.c
@@ -0,0 +1,372 @@
+#include "fitz.h"
+
+typedef unsigned char byte;
+
+static inline float roundup(float x)	/* round x to a whole number, away from zero */
+{
+	return (x < 0) ? floorf(x) : ceilf(x);	/* negatives round down, everything else rounds up */
+}
+
+static inline int lerp(int a, int b, int t)	/* linear interpolation from a towards b; callers in this file pass t as a 16-bit fraction (0..0xffff) */
+{
+	return a + (((b - a) * t) >> 16);	/* t == 0 yields a; the >> 16 divides the weighted difference by 65536 */
+}
+
+static inline int bilerp(int a, int b, int c, int d, int u, int v)	/* bilinear interpolation of four samples with weights u and v */
+{
+	return lerp(lerp(a, b, u), lerp(c, d, u), v);	/* blend a->b and c->d by u, then blend those two results by v */
+}
+
+static inline byte *samplenearest(byte *s, int w, int h, int n, int u, int v)	/* pointer to pixel (u,v) of a w-by-h image with n bytes per pixel */
+{
+	if (u < 0) u = 0;	/* clamp both coordinates into the image so the returned pointer stays inside s */
+	if (v < 0) v = 0;
+	if (u >= w) u = w - 1;
+	if (v >= h) v = h - 1;
+	return s + (v * w + u) * n;	/* rows are addressed as exactly w*n bytes apart */
+}
+
+/* Blend premultiplied source image in constant alpha over destination */
+
+static inline void
+fz_paintaffinealphaNlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)	/* paint a span of w pixels: bilinear source sampling, scaled by constant alpha */
+{
+	int k;
+
+	while (w--)
+	{
+		int ui = u >> 16;	/* integer part of the 16.16 fixed-point source position */
+		int vi = v >> 16;
+		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)	/* positions outside the sw-by-sh source leave dp untouched */
+		{
+			int uf = u & 0xffff;	/* fractional part, used as the interpolation weight */
+			int vf = v & 0xffff;
+			byte *a = samplenearest(sp, sw, sh, n, ui, vi);	/* 2x2 neighbourhood; samplenearest clamps ui+1 / vi+1 at the edges */
+			byte *b = samplenearest(sp, sw, sh, n, ui+1, vi);
+			byte *c = samplenearest(sp, sw, sh, n, ui, vi+1);
+			byte *d = samplenearest(sp, sw, sh, n, ui+1, vi+1);
+			int x = bilerp(a[n-1], b[n-1], c[n-1], d[n-1], uf, vf);	/* interpolated last component, which this code treats as source alpha */
+			int t = 255 - fz_mul255(x, alpha);	/* share of the destination that is kept */
+			for (k = 0; k < n; k++)
+			{
+				x = bilerp(a[k], b[k], c[k], d[k], uf, vf);
+				dp[k] = fz_mul255(x, alpha) + fz_mul255(dp[k], t);	/* scaled source plus remaining destination */
+			}
+		}
+		dp += n;
+		u += fa;	/* advance the source position by (fa, fb) per destination pixel */
+		v += fb;
+	}
+}
+
+static inline void
+fz_paintaffinealphaNnear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)	/* paint a span of w pixels: nearest source sample, scaled by constant alpha */
+{
+	int k;
+
+	while (w--)
+	{
+		int ui = u >> 16;	/* integer part of the 16.16 fixed-point source position */
+		int vi = v >> 16;
+		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)	/* positions outside the source leave dp untouched */
+		{
+			byte *sample = sp + ((vi * sw + ui) * n);	/* source pixel containing (u,v) */
+			int t = 255 - fz_mul255(sample[n-1], alpha);	/* share of the destination that is kept */
+			for (k = 0; k < n; k++)
+				dp[k] = fz_mul255(sample[k], alpha) + fz_mul255(dp[k], t);
+		}
+		dp += n;
+		u += fa;	/* advance the source position by (fa, fb) per destination pixel */
+		v += fb;
+	}
+}
+
+/* Blend premultiplied source image over destination */
+
+static inline void
+fz_paintaffineNlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n)	/* paint a span of w pixels: bilinear source sampling, no extra alpha */
+{
+	int k;
+
+	while (w--)
+	{
+		int ui = u >> 16;	/* integer part of the 16.16 fixed-point source position */
+		int vi = v >> 16;
+		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)	/* positions outside the source leave dp untouched */
+		{
+			int uf = u & 0xffff;	/* fractional part, used as the interpolation weight */
+			int vf = v & 0xffff;
+			byte *a = samplenearest(sp, sw, sh, n, ui, vi);	/* 2x2 neighbourhood; samplenearest clamps ui+1 / vi+1 at the edges */
+			byte *b = samplenearest(sp, sw, sh, n, ui+1, vi);
+			byte *c = samplenearest(sp, sw, sh, n, ui, vi+1);
+			byte *d = samplenearest(sp, sw, sh, n, ui+1, vi+1);
+			int t = 255 - bilerp(a[n-1], b[n-1], c[n-1], d[n-1], uf, vf);	/* 255 minus interpolated last component: share of destination kept */
+			for (k = 0; k < n; k++)
+			{
+				int x = bilerp(a[k], b[k], c[k], d[k], uf, vf);
+				dp[k] = x + fz_mul255(dp[k], t);	/* source is added as-is (premultiplied per the section comment) */
+			}
+		}
+		dp += n;
+		u += fa;	/* advance the source position by (fa, fb) per destination pixel */
+		v += fb;
+	}
+}
+
+static inline void
+fz_paintaffineNnear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n)	/* paint a span of w pixels: nearest source sample, no extra alpha */
+{
+	int k;
+
+	while (w--)
+	{
+		int ui = u >> 16;	/* integer part of the 16.16 fixed-point source position */
+		int vi = v >> 16;
+		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)	/* positions outside the source leave dp untouched */
+		{
+			byte *sample = sp + ((vi * sw + ui) * n);	/* source pixel containing (u,v) */
+			int t = 255 - sample[n-1];	/* share of the destination that is kept */
+			for (k = 0; k < n; k++)
+				dp[k] = sample[k] + fz_mul255(dp[k], t);
+		}
+		dp += n;
+		u += fa;	/* advance the source position by (fa, fb) per destination pixel */
+		v += fb;
+	}
+}
+
+/* Blend non-premultiplied color in source image mask over destination */
+
+static inline void
+fz_paintaffinecolorNlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)	/* paint a span of w pixels in a solid color through a 1-byte-per-pixel mask, bilinear sampling */
+{
+	int sa = color[n-1];	/* last entry of color is its alpha. NOTE(review): used unexpanded while ma goes through FZ_EXPAND -- confirm intended */
+	int k;
+
+	while (w--)
+	{
+		int ui = u >> 16;	/* integer part of the 16.16 fixed-point mask position */
+		int vi = v >> 16;
+		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)	/* positions outside the mask leave dp untouched */
+		{
+			int uf = u & 0xffff;	/* fractional part, used as the interpolation weight */
+			int vf = v & 0xffff;
+			byte *a = samplenearest(sp, sw, sh, 1, ui, vi);	/* the mask is sampled with a stride of one byte per pixel */
+			byte *b = samplenearest(sp, sw, sh, 1, ui+1, vi);
+			byte *c = samplenearest(sp, sw, sh, 1, ui, vi+1);
+			byte *d = samplenearest(sp, sw, sh, 1, ui+1, vi+1);
+			int ma = bilerp(a[0], b[0], c[0], d[0], uf, vf);	/* interpolated mask coverage */
+			int masa = FZ_COMBINE(FZ_EXPAND(ma), sa);	/* mask coverage combined with the color's alpha */
+			for (k = 0; k < n - 1; k++)
+				dp[k] = FZ_BLEND(color[k], dp[k], masa);
+			dp[k] = FZ_BLEND(255, dp[k], masa);	/* k == n-1 here: destination alpha is blended towards 255 */
+		}
+		dp += n;
+		u += fa;	/* advance the mask position by (fa, fb) per destination pixel */
+		v += fb;
+	}
+}
+
+static inline void
+fz_paintaffinecolorNnear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)	/* paint a span of w pixels in a solid color through a 1-byte-per-pixel mask, nearest sampling */
+{
+	int sa = color[n-1];	/* last entry of color is its alpha. NOTE(review): used unexpanded while ma goes through FZ_EXPAND -- confirm intended */
+	int k;
+
+	while (w--)
+	{
+		int ui = u >> 16;	/* integer part of the 16.16 fixed-point mask position */
+		int vi = v >> 16;
+		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)	/* positions outside the mask leave dp untouched */
+		{
+			int ma = sp[vi * sw + ui];	/* mask coverage at the containing pixel */
+			int masa = FZ_COMBINE(FZ_EXPAND(ma), sa);	/* mask coverage combined with the color's alpha */
+			for (k = 0; k < n - 1; k++)
+				dp[k] = FZ_BLEND(color[k], dp[k], masa);
+			dp[k] = FZ_BLEND(255, dp[k], masa);	/* k == n-1 here: destination alpha is blended towards 255 */
+		}
+		dp += n;
+		u += fa;	/* advance the mask position by (fa, fb) per destination pixel */
+		v += fb;
+	}
+}
+
+static void
+fz_paintaffinelerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)	/* dispatch one bilinear span to the worker matching alpha and n */
+{
+	if (alpha == 255)	/* full alpha: use the worker without the per-pixel alpha multiply */
+	{
+		switch (n)	/* literal n for 1, 2 and 4 -- presumably so the inline worker can be specialised */
+		{
+		case 1: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 1); break;
+		case 2: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 2); break;
+		case 4: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 4); break;
+		default: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, n); break;
+		}
+	}
+	else if (alpha > 0)	/* alpha <= 0 paints nothing */
+	{
+		switch (n)
+		{
+		case 1: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 1, alpha); break;
+		case 2: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 2, alpha); break;
+		case 4: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 4, alpha); break;
+		default: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha); break;
+		}
+	}
+}
+
+static void
+fz_paintaffinenear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)	/* dispatch one nearest-sample span to the worker matching alpha and n */
+{
+	if (alpha == 255)	/* full alpha: use the worker without the per-pixel alpha multiply */
+	{
+		switch (n)	/* literal n for 1, 2 and 4 -- presumably so the inline worker can be specialised */
+		{
+		case 1: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, 1); break;
+		case 2: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, 2); break;
+		case 4: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, 4); break;
+		default: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, n); break;
+		}
+	}
+	else if (alpha > 0)	/* alpha <= 0 paints nothing */
+	{
+		switch (n)
+		{
+		case 1: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, 1, alpha); break;
+		case 2: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, 2, alpha); break;
+		case 4: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, 4, alpha); break;
+		default: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha); break;
+		}
+	}
+}
+
+static void
+fz_paintaffinecolorlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)	/* dispatch one bilinear color-through-mask span by destination component count n */
+{
+	switch (n)	/* literal n for 2 and 4 -- presumably so the inline worker can be specialised */
+	{
+	case 2: fz_paintaffinecolorNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 2, color); break;
+	case 4: fz_paintaffinecolorNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 4, color); break;
+	default: fz_paintaffinecolorNlerp(dp, sp, sw, sh, u, v, fa, fb, w, n, color); break;
+	}
+}
+
+static void
+fz_paintaffinecolornear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)	/* dispatch one nearest-sample color-through-mask span by destination component count n */
+{
+	switch (n)	/* literal n for 2 and 4 -- presumably so the inline worker can be specialised */
+	{
+	case 2: fz_paintaffinecolorNnear(dp, sp, sw, sh, u, v, fa, fb, w, 2, color); break;
+	case 4: fz_paintaffinecolorNnear(dp, sp, sw, sh, u, v, fa, fb, w, 4, color); break;
+	default: fz_paintaffinecolorNnear(dp, sp, sw, sh, u, v, fa, fb, w, n, color); break;
+	}
+}
+
+/* Draw an image with an affine transform on destination */
+
+static void
+fz_paintimageimp(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *img, fz_matrix ctm, byte *color, int alpha)	/* shared worker: color != nil paints img as a mask in that color, otherwise img is painted with alpha */
+{
+	byte *dp, *sp;
+	int u, v, fa, fb, fc, fd;	/* 16.16 fixed-point image position and its per-pixel (fa,fb) / per-row (fc,fd) steps */
+	int x, y, w, h;
+	int sw, sh, n;
+	fz_matrix inv;
+	fz_bbox bbox;
+	int dolerp;
+
+	/* grid fit the image: snap a rectilinear transform to whole units */
+	if (fz_isrectilinear(ctm))
+	{
+		ctm.a = roundup(ctm.a);	/* scale terms grow away from zero */
+		ctm.b = roundup(ctm.b);
+		ctm.c = roundup(ctm.c);
+		ctm.d = roundup(ctm.d);
+		ctm.e = floorf(ctm.e);	/* translation snaps down */
+		ctm.f = floorf(ctm.f);
+	}
+
+	/* turn on interpolation for upscaled and non-rectilinear transforms */
+	dolerp = 0;
+	if (!fz_isrectilinear(ctm))
+		dolerp = 1;
+	if (sqrtf(ctm.a * ctm.a + ctm.b * ctm.b) > img->w)	/* length of the transformed x axis exceeds the image width */
+		dolerp = 1;
+	if (sqrtf(ctm.c * ctm.c + ctm.d * ctm.d) > img->h)	/* length of the transformed y axis exceeds the image height */
+		dolerp = 1;
+
+	/* except when we shouldn't, at large magnifications (more than 2x, and only if img->interpolate is unset) */
+	if (!img->interpolate)
+	{
+		if (sqrtf(ctm.a * ctm.a + ctm.b * ctm.b) > img->w * 2)
+			dolerp = 0;
+		if (sqrtf(ctm.c * ctm.c + ctm.d * ctm.d) > img->h * 2)
+			dolerp = 0;
+	}
+
+	bbox = fz_roundrect(fz_transformrect(ctm, fz_unitrect));	/* area covered by the transformed unit square */
+	bbox = fz_intersectbbox(bbox, scissor);
+	x = bbox.x0;
+	y = bbox.y0;
+	w = bbox.x1 - bbox.x0;	/* NOTE(review): the loops below assume w and h are not negative -- confirm fz_intersectbbox guarantees this */
+	h = bbox.y1 - bbox.y0;
+
+	/* map from screen space (x,y) to image space (u,v) */
+	inv = fz_scale(1.0f / img->w, -1.0f / img->h);	/* negative y scale plus the (0,1) translate flips the vertical axis */
+	inv = fz_concat(inv, fz_translate(0, 1));
+	inv = fz_concat(inv, ctm);
+	inv = fz_invertmatrix(inv);
+
+	fa = inv.a * 65536;	/* convert the matrix terms to 16.16 fixed point */
+	fb = inv.b * 65536;
+	fc = inv.c * 65536;
+	fd = inv.d * 65536;
+
+	/* Calculate initial texture positions. Do a half step to start. */
+	u = (fa * x) + (fc * y) + inv.e * 65536 + ((fa+fc)>>1);
+	v = (fb * x) + (fd * y) + inv.f * 65536 + ((fb+fd)>>1);
+
+	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;	/* first destination pixel inside bbox */
+	n = dst->n;
+	sp = img->samples;
+	sw = img->w;
+	sh = img->h;
+
+	/* TODO: if (fb == 0 && fa == 1) call fz_paintspan */
+
+	while (h--)	/* one span per destination row */
+	{
+		if (dolerp)
+		{
+			if (color)
+				fz_paintaffinecolorlerp(dp, sp, sw, sh, u, v, fa, fb, w, n, color);
+			else
+				fz_paintaffinelerp(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha);
+		}
+		else
+		{
+			if (color)
+				fz_paintaffinecolornear(dp, sp, sw, sh, u, v, fa, fb, w, n, color);
+			else
+				fz_paintaffinenear(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha);
+		}
+		dp += dst->w * n;	/* next destination row */
+		u += fc;	/* step the row start position by (fc, fd) */
+		v += fd;
+	}
+}
+
+void
+fz_paintimagecolor(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *img, fz_matrix ctm, byte *color)	/* paint the one-component image img as a mask filled with color, transformed by ctm and clipped to scissor */
+{
+	assert(img->n == 1);	/* the color workers read the mask one byte per pixel */
+	fz_paintimageimp(dst, scissor, img, ctm, color, 255);	/* the alpha argument is not used on the color path */
+}
+
+void
+fz_paintimage(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *img, fz_matrix ctm, int alpha)	/* paint img over dst, transformed by ctm, clipped to scissor and scaled by alpha (0..255) */
+{
+	assert(dst->n == img->n);	/* the span workers index source and destination with the same component count */
+	fz_paintimageimp(dst, scissor, img, ctm, nil, alpha);	/* nil color selects the image path rather than the mask path */
+}
diff --git a/draw/draw_blend.c b/draw/draw_blend.c
new file mode 100644
index 00000000..ac2e6c27
--- /dev/null
+++ b/draw/draw_blend.c
@@ -0,0 +1,370 @@
+#include "fitz.h"
+
+/* PDF 1.4 blend modes. These are slow. */
+
+typedef unsigned char byte;
+
+const char *fz_blendnames[] =	/* blend mode names, nil-terminated; presumably indexed by fz_blendmode value -- verify against the enum */
+{
+	"Normal",
+	"Multiply",
+	"Screen",
+	"Overlay",
+	"Darken",
+	"Lighten",
+	"ColorDodge",
+	"ColorBurn",
+	"HardLight",
+	"SoftLight",
+	"Difference",
+	"Exclusion",
+	"Hue",	/* from here on: the modes handled by fz_blendnonseparable */
+	"Saturation",
+	"Color",
+	"Luminosity",
+	nil	/* end-of-list marker */
+};
+
+/* Separable blend modes */
+
+static inline int
+fz_screen_byte(int b, int s)	/* Screen blend of backdrop b and source s */
+{
+	return b + s - fz_mul255(b, s);	/* sum minus product */
+}
+
+static inline int
+fz_hardlight_byte(int b, int s)	/* HardLight blend: multiply for a dark source, screen for a light one */
+{
+	int s2 = s << 1;	/* twice the source value */
+	if (s <= 127)
+		return fz_mul255(b, s2);
+	else
+		return fz_screen_byte(b, s2 - 255);
+}
+
+static inline int
+fz_overlay_byte(int b, int s)	/* Overlay blend: HardLight with backdrop and source exchanged */
+{
+	return fz_hardlight_byte(s, b); /* note swapped order */
+}
+
+static inline int
+fz_darken_byte(int b, int s)	/* Darken blend: the smaller of backdrop and source */
+{
+	return MIN(b, s);
+}
+
+static inline int
+fz_lighten_byte(int b, int s)	/* Lighten blend: the larger of backdrop and source */
+{
+	return MAX(b, s);
+}
+
+static inline int
+fz_colordodge_byte(int b, int s)	/* ColorDodge blend: backdrop divided by the inverted source */
+{
+	s = 255 - s;	/* work with the inverted source from here on */
+	if (b == 0)
+		return 0;
+	else if (b >= s)	/* also covers s == 0, so the division below never sees a zero divisor */
+		return 255;
+	else
+		return (0x1fe * b + s) / (s << 1);	/* 255 * b / s, rounded to nearest */
+}
+
+static inline int
+fz_colorburn_byte(int b, int s)	/* ColorBurn blend: inverted backdrop divided by source, then inverted again */
+{
+	b = 255 - b;	/* work with the inverted backdrop from here on */
+	if (b == 0)
+		return 255;
+	else if (b >= s)	/* also covers s == 0, so the division below never sees a zero divisor */
+		return 0;
+	else
+		return 0xff - (0x1fe * b + s) / (s << 1);	/* 255 - (255 * b / s, rounded to nearest) */
+}
+
+static inline int
+fz_softlight_byte(int b, int s)	/* SoftLight blend of backdrop b and source s */
+{
+	/* review this. NOTE(review): the constants 12 and 4 below look unscaled for 0..255 operands -- confirm against the blend mode spec */
+	if (s < 128) {
+		return b - fz_mul255(fz_mul255((255 - (s<<1)), b), 255 - b);	/* dark source: darken the backdrop */
+	}
+	else {
+		int dbd;	/* helper term, chosen by how dark the backdrop is */
+		if (b < 64)
+			dbd = fz_mul255(fz_mul255((b << 4) - 12, b) + 4, b);	/* polynomial branch for dark backdrops */
+		else
+			dbd = (int)sqrtf(255.0f * b);	/* square-root branch, scaled to stay in 0..255 */
+		return b + fz_mul255(((s<<1) - 255), (dbd - b));	/* light source: move the backdrop towards dbd */
+	}
+}
+
+static inline int
+fz_difference_byte(int b, int s)	/* Difference blend: absolute difference of backdrop and source */
+{
+	return ABS(b - s);
+}
+
+static inline int
+fz_exclusion_byte(int b, int s)	/* Exclusion blend of backdrop b and source s */
+{
+	return b + s - (fz_mul255(b, s)<<1);	/* sum minus twice the product */
+}
+
+/* Non-separable blend modes */
+
+static inline void
+fz_luminosity_rgb(int *rd, int *gd, int *bd, int rb, int gb, int bb, int rs, int gs, int bs)	/* write to (rd,gd,bd) the color (rb,gb,bb) shifted to the luminosity of (rs,gs,bs) */
+{
+	int delta, scale;
+	int r, g, b, y;
+
+	/* 0.3, 0.59, 0.11 in fixed point */
+	delta = ((rs - rb) * 77 + (gs - gb) * 151 + (bs - bb) * 28 + 0x80) >> 8;	/* luminosity difference between the two colors, rounded */
+	r = rb + delta;	/* shift every channel by the same amount */
+	g = gb + delta;
+	b = bb + delta;
+
+	if ((r | g | b) & 0x100)	/* bit 8 set: some channel left 0..255 (too large, or negative in two's complement) */
+	{
+		y = (rs * 77 + gs * 151 + bs * 28 + 0x80) >> 8;	/* target luminosity */
+		if (delta > 0)
+		{
+			int max;
+			max = MAX(r, MAX(g, b));
+			scale = ((255 - y) << 16) / (max - y);	/* 16.16 factor that brings the largest channel back to 255 */
+		}
+		else
+		{
+			int min;
+			min = MIN(r, MIN(g, b));
+			scale = (y << 16) / (y - min);	/* 16.16 factor that brings the smallest channel back to 0 */
+		}
+		r = y + (((r - y) * scale + 0x8000) >> 16);	/* scale each channel's distance from y, rounding */
+		g = y + (((g - y) * scale + 0x8000) >> 16);
+		b = y + (((b - y) * scale + 0x8000) >> 16);
+	}
+
+	*rd = r;
+	*gd = g;
+	*bd = b;
+}
+
+static void
+fz_saturation_rgb(int *rd, int *gd, int *bd, int rb, int gb, int bb, int rs, int gs, int bs)	/* write to (rd,gd,bd) the color (rb,gb,bb) rescaled to the channel spread of (rs,gs,bs) */
+{
+	int minb, maxb;
+	int mins, maxs;
+	int y;
+	int scale;
+	int r, g, b;
+
+	minb = MIN(rb, MIN(gb, bb));
+	maxb = MAX(rb, MAX(gb, bb));
+	if (minb == maxb)
+	{
+		/* backdrop has zero saturation, avoid divide by 0 */
+		*rd = gb;	/* all three inputs are equal here, so gb stands for any of them */
+		*gd = gb;
+		*bd = gb;
+		return;
+	}
+
+	mins = MIN(rs, MIN(gs, bs));
+	maxs = MAX(rs, MAX(gs, bs));
+
+	scale = ((maxs - mins) << 16) / (maxb - minb);	/* 16.16 ratio of the second color's spread to the first's */
+	y = (rb * 77 + gb * 151 + bb * 28 + 0x80) >> 8;	/* luminosity of (rb,gb,bb), same weights as fz_luminosity_rgb */
+	r = y + ((((rb - y) * scale) + 0x8000) >> 16);	/* stretch each channel's distance from y by scale */
+	g = y + ((((gb - y) * scale) + 0x8000) >> 16);
+	b = y + ((((bb - y) * scale) + 0x8000) >> 16);
+
+	if ((r | g | b) & 0x100)	/* bit 8 set: some channel left 0..255 (too large, or negative in two's complement) */
+	{
+		int scalemin, scalemax;
+		int min, max;
+
+		min = MIN(r, MIN(g, b));
+		max = MAX(r, MAX(g, b));
+
+		if (min < 0)
+			scalemin = (y << 16) / (y - min);	/* factor that brings the smallest channel back to 0 */
+		else
+			scalemin = 0x10000;	/* 1.0 in 16.16: no correction needed on the low side */
+
+		if (max > 255)
+			scalemax = ((255 - y) << 16) / (max - y);	/* factor that brings the largest channel back to 255 */
+		else
+			scalemax = 0x10000;	/* 1.0 in 16.16: no correction needed on the high side */
+
+		scale = MIN(scalemin, scalemax);	/* the stronger reduction satisfies both limits */
+		r = y + (((r - y) * scale + 0x8000) >> 16);
+		g = y + (((g - y) * scale + 0x8000) >> 16);
+		b = y + (((b - y) * scale + 0x8000) >> 16);
+	}
+
+	*rd = r;
+	*gd = g;
+	*bd = b;
+}
+
+static void
+fz_color_rgb(int *rr, int *rg, int *rb, int br, int bg, int bb, int sr, int sg, int sb)	/* Color blend: source color shifted to the backdrop's luminosity */
+{
+	fz_luminosity_rgb(rr, rg, rb, sr, sg, sb, br, bg, bb);	/* same routine as Luminosity, with source and backdrop exchanged */
+}
+
+static void
+fz_hue_rgb(int *rr, int *rg, int *rb, int br, int bg, int bb, int sr, int sg, int sb)	/* Hue blend, built from the luminosity and saturation helpers */
+{
+	int tr, tg, tb;	/* intermediate color */
+	fz_luminosity_rgb(&tr, &tg, &tb, sr, sg, sb, br, bg, bb);	/* source color shifted to the backdrop's luminosity */
+	fz_saturation_rgb(rr, rg, rb, tr, tg, tb, br, bg, bb);	/* then rescaled to the backdrop's channel spread */
+}
+
+/* Blending loops */
+
+void
+fz_blendseparable(byte * restrict bp, byte * restrict sp, int n, int w, fz_blendmode blendmode)	/* blend w source pixels sp into backdrop pixels bp in place; n components per pixel, alpha last */
+{
+	int k;
+	int n1 = n - 1;	/* index of the alpha component */
+	while (w--)
+	{
+		int sa = sp[n1];	/* source alpha */
+		int ba = bp[n1];	/* backdrop alpha */
+		int saba = fz_mul255(sa, ba);	/* product of the two alphas */
+
+		/* ugh, division to get non-premul components */
+		int invsa = sa ? 255 * 256 / sa : 0;	/* 8.8 fixed-point 255/alpha; 0 when alpha is 0 to avoid dividing by zero */
+		int invba = ba ? 255 * 256 / ba : 0;
+
+		for (k = 0; k < n1; k++)
+		{
+			int sc = (sp[k] * invsa) >> 8;	/* source component with its alpha divided out */
+			int bc = (bp[k] * invba) >> 8;	/* backdrop component with its alpha divided out */
+			int rc;	/* result of the blend function for this component */
+
+			switch (blendmode)
+			{
+			default:	/* modes not listed here fall through to Normal */
+			case FZ_BNORMAL: rc = sc; break;
+			case FZ_BMULTIPLY: rc = fz_mul255(bc, sc); break;
+			case FZ_BSCREEN: rc = fz_screen_byte(bc, sc); break;
+			case FZ_BOVERLAY: rc = fz_overlay_byte(bc, sc); break;
+			case FZ_BDARKEN: rc = fz_darken_byte(bc, sc); break;
+			case FZ_BLIGHTEN: rc = fz_lighten_byte(bc, sc); break;
+			case FZ_BCOLORDODGE: rc = fz_colordodge_byte(bc, sc); break;
+			case FZ_BCOLORBURN: rc = fz_colorburn_byte(bc, sc); break;
+			case FZ_BHARDLIGHT: rc = fz_hardlight_byte(bc, sc); break;
+			case FZ_BSOFTLIGHT: rc = fz_softlight_byte(bc, sc); break;
+			case FZ_BDIFFERENCE: rc = fz_difference_byte(bc, sc); break;
+			case FZ_BEXCLUSION: rc = fz_exclusion_byte(bc, sc); break;
+			}
+
+			bp[k] = fz_mul255(255 - sa, bp[k]) + fz_mul255(255 - ba, sp[k]) + fz_mul255(saba, rc);	/* backdrop outside source + source outside backdrop + blended overlap */
+		}
+
+		bp[k] = ba + sa - saba;	/* k == n1 here: combined alpha of the two layers */
+
+		sp += n;
+		bp += n;
+	}
+}
+
+void
+fz_blendnonseparable(byte * restrict bp, byte * restrict sp, int w, fz_blendmode blendmode)	/* blend w four-byte source pixels sp into backdrop pixels bp in place using a Hue/Saturation/Color/Luminosity mode */
+{
+	while (w--)
+	{
+		int rr, rg, rb;	/* blended red, green, blue */
+
+		int sa = sp[3];	/* source alpha */
+		int ba = bp[3];	/* backdrop alpha */
+		int saba = fz_mul255(sa, ba);	/* product of the two alphas */
+
+		/* ugh, division to get non-premul components */
+		int invsa = sa ? 255 * 256 / sa : 0;	/* 8.8 fixed-point 255/alpha; 0 when alpha is 0 to avoid dividing by zero */
+		int invba = ba ? 255 * 256 / ba : 0;
+
+		int sr = (sp[0] * invsa) >> 8;	/* source components with alpha divided out */
+		int sg = (sp[1] * invsa) >> 8;
+		int sb = (sp[2] * invsa) >> 8;
+
+		int br = (bp[0] * invba) >> 8;	/* backdrop components with alpha divided out */
+		int bg = (bp[1] * invba) >> 8;
+		int bb = (bp[2] * invba) >> 8;
+
+		switch (blendmode)
+		{
+		default:	/* modes not listed here fall through to Hue */
+		case FZ_BHUE:
+			fz_hue_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
+			break;
+		case FZ_BSATURATION:
+			fz_saturation_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
+			break;
+		case FZ_BCOLOR:
+			fz_color_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
+			break;
+		case FZ_BLUMINOSITY:
+			fz_luminosity_rgb(&rr, &rg, &rb, br, bg, bb, sr, sg, sb);
+			break;
+		}
+
+		bp[0] = fz_mul255(255 - sa, bp[0]) + fz_mul255(255 - ba, sp[0]) + fz_mul255(saba, rr);	/* backdrop outside source + source outside backdrop + blended overlap */
+		bp[1] = fz_mul255(255 - sa, bp[1]) + fz_mul255(255 - ba, sp[1]) + fz_mul255(saba, rg);
+		bp[2] = fz_mul255(255 - sa, bp[2]) + fz_mul255(255 - ba, sp[2]) + fz_mul255(saba, rb);
+		bp[3] = ba + sa - saba;	/* combined alpha of the two layers */
+
+		sp += 4;
+		bp += 4;
+	}
+}
+
+void
+fz_blendpixmap(fz_pixmap *dst, fz_pixmap *src, int alpha, fz_blendmode blendmode)	/* blend src into dst over their common area with the given mode; alpha below 255 first scales src in place */
+{
+	unsigned char *sp, *dp;
+	fz_bbox bbox;
+	int x, y, w, h, n;
+
+	/* TODO: fix this hack! It multiplies every sample of src by alpha, so the caller's src pixmap is modified */
+	if (alpha < 255)
+	{
+		sp = src->samples;
+		n = src->w * src->h * src->n;	/* total number of bytes in src */
+		while (n--)
+		{
+			*sp = fz_mul255(*sp, alpha);
+			sp++;
+		}
+	}
+
+	bbox = fz_boundpixmap(dst);
+	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));	/* only the overlap of the two pixmaps is blended */
+
+	x = bbox.x0;
+	y = bbox.y0;
+	w = bbox.x1 - bbox.x0;
+	h = bbox.y1 - bbox.y0;
+
+	n = src->n;
+	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * n;	/* first overlapping pixel in each pixmap */
+	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * n;
+
+	assert(src->n == dst->n);	/* both pointers above were computed with src's component count */
+
+	while (h--)	/* one row of the overlap per iteration */
+	{
+		if (n == 4 && blendmode >= FZ_BHUE)	/* modes from FZ_BHUE upward use the RGB-only routine, and only for 4-component pixels */
+			fz_blendnonseparable(dp, sp, w, blendmode);
+		else
+			fz_blendseparable(dp, sp, n, w, blendmode);
+		sp += src->w * n;	/* advance each pointer by its own pixmap's row length */
+		dp += dst->w * n;
+	}
+}
diff --git a/draw/draw_edge.c b/draw/draw_edge.c
new file mode 100644
index 00000000..aa956077
--- /dev/null
+++ b/draw/draw_edge.c
@@ -0,0 +1,559 @@
+#include "fitz.h"
+
+#define BBOX_MIN -(1<<20)
+#define BBOX_MAX (1<<20)
+
+/* integer division rounding the quotient towards -inf (the adjustment assumes b > 0) */
+static inline int fz_idiv(int a, int b)
+{
+	return (a >= 0 ? a : a - b + 1) / b;
+}
+
+enum { HSCALE = 17, VSCALE = 15, SF = 1 };
+
+/*
+ * Global Edge List -- list of straight path segments for scan conversion
+ *
+ * Stepping along the edges is with bresenham's line algorithm.
+ *
+ * See Mike Abrash -- Graphics Programming Black Book (notably chapter 40)
+ */
+
+fz_gel *
+fz_newgel(void)
+{
+	fz_gel *gel;
+	/* allocate an empty edge list with room for 512 edges */
+	gel = fz_malloc(sizeof(fz_gel));
+	gel->cap = 512;
+	gel->len = 0;
+	gel->edges = fz_calloc(gel->cap, sizeof(fz_edge));
+	/* default clip covers the whole clamped range, so fz_insertgel clips nothing */
+	gel->clip.x0 = gel->clip.y0 = BBOX_MIN;
+	gel->clip.x1 = gel->clip.y1 = BBOX_MAX;
+	/* the bbox starts inverted (empty) and grows as edges are inserted */
+	gel->bbox.x0 = gel->bbox.y0 = BBOX_MAX;
+	gel->bbox.x1 = gel->bbox.y1 = BBOX_MIN;
+
+	return gel;
+}
+
+void
+fz_resetgel(fz_gel *gel, fz_bbox clip)
+{
+	if (fz_isinfiniterect(clip))
+	{
+		/* no clipping wanted: widest range, min below max, so no edge tests as outside */
+		gel->clip.x0 = gel->clip.y0 = BBOX_MIN;
+		gel->clip.x1 = gel->clip.y1 = BBOX_MAX;
+	}
+	else { /* keep the clip in the same subpixel units as the edges */
+		gel->clip.x0 = clip.x0 * HSCALE;
+		gel->clip.x1 = clip.x1 * HSCALE;
+		gel->clip.y0 = clip.y0 * VSCALE;
+		gel->clip.y1 = clip.y1 * VSCALE;
+	}
+	/* empty (inverted) bbox; fz_insertgelraw grows it edge by edge */
+	gel->bbox.x0 = gel->bbox.y0 = BBOX_MAX;
+	gel->bbox.x1 = gel->bbox.y1 = BBOX_MIN;
+	gel->len = 0; /* forget the old edges but keep the allocated array */
+}
+
+void
+fz_freegel(fz_gel *gel)
+{
+	fz_free(gel->edges); /* the edge array first, then the struct that points to it */
+	fz_free(gel);
+}
+
+fz_bbox
+fz_boundgel(fz_gel *gel)
+{
+	fz_bbox bbox;
+	if (gel->len == 0) /* no edges inserted since the last reset */
+		return fz_emptybbox;
+	bbox.x0 = fz_idiv(gel->bbox.x0, HSCALE); /* subpixel units -> pixels, floored */
+	bbox.y0 = fz_idiv(gel->bbox.y0, VSCALE);
+	bbox.x1 = fz_idiv(gel->bbox.x1, HSCALE) + 1; /* +1: one past the pixel holding the max */
+	bbox.y1 = fz_idiv(gel->bbox.y1, VSCALE) + 1;
+	return bbox;
+}
+
+enum { INSIDE, OUTSIDE, LEAVE, ENTER };
+/* cliplerpy clips against a horizontal line by swapping the roles of x and y */
+#define cliplerpy(v,m,x0,y0,x1,y1,t) cliplerpx(v,m,y0,x0,y1,x1,t)
+/* classify a segment against the line x = val; outside is x < val (m == 0) or x > val (m != 0) */
+static int
+cliplerpx(int val, int m, int x0, int y0, int x1, int y1, int *out)
+{
+	int v0out = m ? x0 > val : x0 < val;
+	int v1out = m ? x1 > val : x1 < val;
+
+	if (v0out + v1out == 0)
+		return INSIDE;
+
+	if (v0out + v1out == 2)
+		return OUTSIDE;
+	/* crossing: *out gets y where the segment meets x = val (64-bit product, int would overflow) */
+	if (v1out)
+	{
+		*out = y0 + (int)((long long)(y1 - y0) * (val - x0) / (x1 - x0));
+		return LEAVE;
+	}
+
+	else
+	{
+		*out = y1 + (int)((long long)(y0 - y1) * (val - x1) / (x0 - x1));
+		return ENTER;
+	}
+}
+
+static void
+fz_insertgelraw(fz_gel *gel, int x0, int y0, int x1, int y1)
+{
+	fz_edge *edge;
+	int dx, dy;
+	int winding;
+	int width;
+	int tmp;
+	/* append one edge, given in subpixel units; horizontal edges are dropped */
+	if (y0 == y1)
+		return;
+	/* orient the edge so that y0 < y1; winding remembers the original direction */
+	if (y0 > y1) {
+		winding = -1;
+		tmp = x0; x0 = x1; x1 = tmp;
+		tmp = y0; y0 = y1; y1 = tmp;
+	}
+	else
+		winding = 1;
+	/* grow the bounding box to take in both endpoints */
+	if (x0 < gel->bbox.x0) gel->bbox.x0 = x0;
+	if (x0 > gel->bbox.x1) gel->bbox.x1 = x0;
+	if (x1 < gel->bbox.x0) gel->bbox.x0 = x1;
+	if (x1 > gel->bbox.x1) gel->bbox.x1 = x1;
+
+	if (y0 < gel->bbox.y0) gel->bbox.y0 = y0;
+	if (y1 > gel->bbox.y1) gel->bbox.y1 = y1;
+	/* add room for 512 more edges once only one free slot is left */
+	if (gel->len + 1 == gel->cap) {
+		gel->cap = gel->cap + 512;
+		gel->edges = fz_realloc(gel->edges, gel->cap, sizeof(fz_edge));
+	}
+
+	edge = &gel->edges[gel->len++];
+
+	dy = y1 - y0;
+	dx = x1 - x0;
+	width = ABS(dx);
+
+	edge->xdir = dx > 0 ? 1 : -1;
+	edge->ydir = winding;
+	edge->x = x0;
+	edge->y = y0;
+	edge->h = dy; /* scanlines remaining; advanceael counts this down to 0 */
+	edge->adjdown = dy; /* subtracted from e each time the error term triggers a step */
+
+	/* initial error term going l->r and r->l */
+	if (dx >= 0)
+		edge->e = 0;
+	else
+		edge->e = -dy + 1;
+
+	/* y-major edge */
+	if (dy >= width) {
+		edge->xmove = 0; /* x only moves when the error term overflows */
+		edge->adjup = width;
+	}
+
+	/* x-major edge */
+	else {
+		edge->xmove = (width / dy) * edge->xdir; /* whole subpixels moved on every scanline */
+		edge->adjup = width % dy; /* remainder, accumulated in e */
+	}
+}
+
+void
+fz_insertgel(fz_gel *gel, float fx0, float fy0, float fx1, float fy1)
+{
+	int x0, y0, x1, y1;
+	int d, v;
+	/* scale to subpixel units: HSCALE steps per pixel in x, VSCALE in y */
+	fx0 = floorf(fx0 * HSCALE);
+	fx1 = floorf(fx1 * HSCALE);
+	fy0 = floorf(fy0 * VSCALE);
+	fy1 = floorf(fy1 * VSCALE);
+	/* clamp so the integer coordinates stay within BBOX_MIN..BBOX_MAX */
+	x0 = CLAMP(fx0, BBOX_MIN, BBOX_MAX);
+	y0 = CLAMP(fy0, BBOX_MIN, BBOX_MAX);
+	x1 = CLAMP(fx1, BBOX_MIN, BBOX_MAX);
+	y1 = CLAMP(fy1, BBOX_MIN, BBOX_MAX);
+	/* top and bottom of the clip box: whatever lies outside is simply dropped */
+	d = cliplerpy(gel->clip.y0, 0, x0, y0, x1, y1, &v);
+	if (d == OUTSIDE) return;
+	if (d == LEAVE) { y1 = gel->clip.y0; x1 = v; }
+	if (d == ENTER) { y0 = gel->clip.y0; x0 = v; }
+
+	d = cliplerpy(gel->clip.y1, 1, x0, y0, x1, y1, &v);
+	if (d == OUTSIDE) return;
+	if (d == LEAVE) { y1 = gel->clip.y1; x1 = v; }
+	if (d == ENTER) { y0 = gel->clip.y1; x0 = v; }
+	/* left and right: the outside part is not dropped but moved onto the clip edge as a vertical edge */
+	d = cliplerpx(gel->clip.x0, 0, x0, y0, x1, y1, &v);
+	if (d == OUTSIDE) {
+		x0 = x1 = gel->clip.x0;
+	}
+	if (d == LEAVE) {
+		fz_insertgelraw(gel, gel->clip.x0, v, gel->clip.x0, y1);
+		x1 = gel->clip.x0;
+		y1 = v;
+	}
+	if (d == ENTER) {
+		fz_insertgelraw(gel, gel->clip.x0, y0, gel->clip.x0, v);
+		x0 = gel->clip.x0;
+		y0 = v;
+	}
+
+	d = cliplerpx(gel->clip.x1, 1, x0, y0, x1, y1, &v);
+	if (d == OUTSIDE) {
+		x0 = x1 = gel->clip.x1;
+	}
+	if (d == LEAVE) {
+		fz_insertgelraw(gel, gel->clip.x1, v, gel->clip.x1, y1);
+		x1 = gel->clip.x1;
+		y1 = v;
+	}
+	if (d == ENTER) {
+		fz_insertgelraw(gel, gel->clip.x1, y0, gel->clip.x1, v);
+		x0 = gel->clip.x1;
+		y0 = v;
+	}
+	/* what remains lies within the clip box (or was flattened onto its left/right edge) */
+	fz_insertgelraw(gel, x0, y0, x1, y1);
+}
+
+void
+fz_sortgel(fz_gel *gel)
+{
+	fz_edge *a = gel->edges;
+	int n = gel->len;
+	/* shell-sort the edges in place by increasing starting y */
+	int h, i, k;
+	fz_edge t;
+	/* starting gap: 1 for short lists, otherwise taken from the 1, 4, 13, 40, ... (3h+1) series */
+	h = 1;
+	if (n < 14) {
+		h = 1;
+	}
+	else {
+		while (h < n)
+			h = 3 * h + 1;
+		h /= 3;
+		h /= 3;
+	}
+	/* one gapped insertion-sort pass per gap; the gap is divided by 3 until it reaches 0 */
+	while (h > 0)
+	{
+		for (i = 0; i < n; i++) {
+			t = a[i];
+			k = i - h;
+			/* TODO: sort on y major, x minor */
+			while (k >= 0 && a[k].y > t.y) {
+				a[k + h] = a[k];
+				k -= h;
+			}
+			a[k + h] = t;
+		}
+
+		h /= 3;
+	}
+}
+
+int
+fz_isrectgel(fz_gel *gel)
+{
+	fz_edge *a, *b;
+	/* a rectangular path is converted into two vertical edges of identical height */
+	if (gel->len != 2)
+		return 0;
+	a = &gel->edges[0];
+	b = &gel->edges[1];
+	if (a->y != b->y || a->h != b->h)
+		return 0;
+	return a->xmove == 0 && a->adjup == 0 &&
+		b->xmove == 0 && b->adjup == 0;
+}
+
+/*
+ * Active Edge List -- keep track of active edges while sweeping
+ */
+
+fz_ael *
+fz_newael(void)
+{
+	fz_ael *ael;
+	ael = fz_malloc(sizeof(fz_ael));
+	ael->cap = 64; /* initial capacity, counted in edge pointers */
+	ael->len = 0;
+	ael->edges = fz_calloc(ael->cap, sizeof(fz_edge*)); /* pointers only; insertael points them into the gel */
+	return ael;
+}
+
+void
+fz_freeael(fz_ael *ael)
+{
+	fz_free(ael->edges); /* frees the pointer array only, not the edges it points at */
+	fz_free(ael);
+}
+
+static inline void
+sortael(fz_edge **a, int n)
+{
+	int h, i, k;
+	fz_edge *t;
+	/* shell-sort n edge pointers by increasing current x; same gap scheme as fz_sortgel */
+	h = 1;
+	if (n < 14) {
+		h = 1;
+	}
+	else {
+		while (h < n)
+			h = 3 * h + 1;
+		h /= 3;
+		h /= 3;
+	}
+	/* gapped insertion-sort passes; with h == 1 this is a plain insertion sort */
+	while (h > 0)
+	{
+		for (i = 0; i < n; i++) {
+			t = a[i];
+			k = i - h;
+			while (k >= 0 && a[k]->x > t->x) {
+				a[k + h] = a[k];
+				k -= h;
+			}
+			a[k + h] = t;
+		}
+
+		h /= 3;
+	}
+}
+
+static fz_error
+insertael(fz_ael *ael, fz_gel *gel, int y, int *e)
+{
+	/* insert edges that start here */
+	while (*e < gel->len && gel->edges[*e].y == y) { /* *e is the next unconsumed gel edge; assumes the gel is sorted by y */
+		if (ael->len + 1 == ael->cap) {
+			int newcap = ael->cap + 64;
+			fz_edge **newedges = fz_realloc(ael->edges, newcap, sizeof(fz_edge*));
+			ael->edges = newedges;
+			ael->cap = newcap;
+		}
+		ael->edges[ael->len++] = &gel->edges[(*e)++];
+	}
+
+	/* shell-sort the edges by increasing x */
+	sortael(ael->edges, ael->len);
+
+	return fz_okay; /* no failure path in this function; always fz_okay */
+}
+
+static void
+advanceael(fz_ael *ael)
+{
+	fz_edge *edge;
+	int i = 0;
+	/* step every active edge down one subpixel scanline, dropping those that end */
+	while (i < ael->len)
+	{
+		edge = ael->edges[i];
+
+		edge->h --;
+
+		/* terminator! */
+		if (edge->h == 0) {
+			ael->edges[i] = ael->edges[--ael->len]; /* last entry fills the hole; i stays so it gets stepped too */
+		}
+
+		else {
+			edge->x += edge->xmove;
+			edge->e += edge->adjup;
+			if (edge->e > 0) { /* accumulated remainder is worth one more step in x */
+				edge->x += edge->xdir;
+				edge->e -= edge->adjdown;
+			}
+			i ++;
+		}
+	}
+}
+
+/*
+ * Scan convert
+ */
+
+static inline void
+addspan(unsigned char *list, int x0, int x1, int xofs)
+{
+	int x0pix, x0sub;
+	int x1pix, x1sub;
+	/* record the span x0..x1 (subpixel units) as coverage deltas in list; undelta sums them later */
+	if (x0 == x1)
+		return;
+
+	/* x between 0 and width of bbox */
+	x0 -= xofs;
+	x1 -= xofs;
+	/* split each end into a pixel index and a subpixel offset within that pixel */
+	x0pix = x0 / HSCALE;
+	x0sub = x0 % HSCALE;
+	x1pix = x1 / HSCALE;
+	x1sub = x1 % HSCALE;
+	/* span inside one pixel: add its width there and cancel it in the next entry */
+	if (x0pix == x1pix)
+	{
+		list[x0pix] += x1sub - x0sub;
+		list[x0pix+1] += x0sub - x1sub;
+	}
+	/* otherwise: partial first pixel, running sum of HSCALE in between, partial last pixel, then back to 0 */
+	else
+	{
+		list[x0pix] += HSCALE - x0sub;
+		list[x0pix+1] += x0sub;
+		list[x1pix] += x1sub - HSCALE;
+		list[x1pix+1] += -x1sub;
+	}
+}
+
+static inline void
+nonzerowinding(fz_ael *ael, unsigned char *list, int xofs)
+{
+	int winding = 0; /* sum of ydir over the edges crossed so far */
+	int x = 0;
+	int i;
+	for (i = 0; i < ael->len; i++)
+	{
+		if (!winding && (winding + ael->edges[i]->ydir)) /* winding leaves zero: a span starts here */
+			x = ael->edges[i]->x;
+		if (winding && !(winding + ael->edges[i]->ydir)) /* winding returns to zero: the span ends here */
+			addspan(list, x, ael->edges[i]->x, xofs);
+		winding += ael->edges[i]->ydir;
+	}
+}
+
+static inline void
+evenodd(fz_ael *ael, unsigned char *list, int xofs)
+{
+	int even = 0; /* toggles at every edge: 0 before a span start, 1 inside a span */
+	int x = 0;
+	int i;
+	for (i = 0; i < ael->len; i++)
+	{
+		if (!even)
+			x = ael->edges[i]->x; /* span starts at this edge */
+		else
+			addspan(list, x, ael->edges[i]->x, xofs); /* span ends at this edge */
+		even = !even;
+	}
+}
+
+static inline void
+undelta(unsigned char *list, int n)
+{
+	int sum = 0; /* running total of the deltas; only the stored byte is truncated */
+	int i;
+	for (i = 0; i < n; i++) {
+		sum += list[i];
+		list[i] = sum;
+	}
+}
+
+static inline void
+blit(fz_pixmap *dest, int x, int y, unsigned char *mp, int w, unsigned char *color)
+{
+	unsigned char *dp;
+	/* address of pixel (x, y) in dest, whose origin is (dest->x, dest->y) */
+	dp = dest->samples + ( (y - dest->y) * dest->w + (x - dest->x) ) * dest->n;
+
+	if (color) /* paint color through the w coverage bytes at mp */
+		fz_paintspancolor(dp, mp, dest->n, w, color);
+	else /* no color: paint the coverage bytes themselves as a 1-component span */
+		fz_paintspan(dp, mp, 1, w, 255);
+}
+
+fz_error
+fz_scanconvert(fz_gel *gel, fz_ael *ael, int eofill, fz_bbox clip,
+	fz_pixmap *dest, unsigned char *color)
+{
+	fz_error error;
+	unsigned char *deltas;
+	int y, e;
+	int yd, yc;
+	/* horizontal extent of the edges in whole pixels; xmax is one past the last pixel */
+	int xmin = fz_idiv(gel->bbox.x0, HSCALE);
+	int xmax = fz_idiv(gel->bbox.x1, HSCALE) + 1;
+	/* subpixel x of pixel xmin; addspan subtracts it so deltas[0] is pixel xmin */
+	int xofs = xmin * HSCALE;
+	/* pixels to skip at the left of deltas, and how many to blit per row */
+	int skipx = clip.x0 - xmin;
+	int clipn = clip.x1 - clip.x0;
+
+	if (gel->len == 0)
+		return fz_okay;
+
+	assert(clip.x0 >= xmin);
+	assert(clip.x1 <= xmax);
+	/* one delta per pixel plus one, because addspan also writes the entry after a span's end */
+	deltas = fz_malloc(xmax - xmin + 1);
+	memset(deltas, 0, xmax - xmin + 1);
+	/* y is the current subpixel scanline; starts at the first edge (assumes the gel is sorted by y) */
+	e = 0;
+	y = gel->edges[0].y;
+	yc = fz_idiv(y, VSCALE);
+	yd = yc;
+
+	while (ael->len > 0 || e < gel->len)
+	{
+		yc = fz_idiv(y, VSCALE); /* pixel row containing subpixel scanline y */
+		if (yc != yd) /* moved on to another pixel row: flush the row accumulated in deltas */
+		{
+			if (yd >= clip.y0 && yd < clip.y1)
+			{
+				undelta(deltas, skipx + clipn);
+				blit(dest, xmin + skipx, yd, deltas + skipx, clipn, color);
+				memset(deltas, 0, skipx + clipn);
+			}
+		}
+		yd = yc;
+
+		error = insertael(ael, gel, y, &e);
+		if (error) {
+			fz_free(deltas);
+			return error;
+		}
+		/* accumulate this subpixel scanline's coverage, but only for rows inside the clip */
+		if (yd >= clip.y0 && yd < clip.y1)
+		{
+			if (eofill)
+				evenodd(ael, deltas, xofs);
+			else
+				nonzerowinding(ael, deltas, xofs);
+		}
+
+		advanceael(ael);
+		/* next scanline, or jump straight to the next edge's start when nothing is active */
+		if (ael->len > 0)
+			y ++;
+		else if (e < gel->len)
+			y = gel->edges[e].y;
+	}
+	/* flush the last pixel row */
+	if (yd >= clip.y0 && yd < clip.y1)
+	{
+		undelta(deltas, skipx + clipn);
+		blit(dest, xmin + skipx, yd, deltas + skipx, clipn, color);
+	}
+
+	fz_free(deltas);
+	return fz_okay;
+}
diff --git a/draw/draw_glyph.c b/draw/draw_glyph.c
new file mode 100644
index 00000000..15bb7cae
--- /dev/null
+++ b/draw/draw_glyph.c
@@ -0,0 +1,134 @@
+#include "fitz.h"
+
+#define MAXFONTSIZE 1000
+#define MAXGLYPHSIZE 256
+#define MAXCACHESIZE (1024*1024)
+
+typedef struct fz_glyphkey_s fz_glyphkey;
+
+struct fz_glyphcache_s
+{
+	fz_hashtable *hash;
+	int total;
+};
+
+struct fz_glyphkey_s
+{
+	fz_font *font;
+	int a, b;
+	int c, d;
+	unsigned short cid;
+	unsigned char e, f;
+};
+
+fz_glyphcache *
+fz_newglyphcache(void)
+{
+	fz_glyphcache *cache;
+	/* empty cache: a hash table keyed by whole fz_glyphkey structs, total starting at 0 */
+	cache = fz_malloc(sizeof(fz_glyphcache));
+	cache->hash = fz_newhash(509, sizeof(fz_glyphkey));
+	cache->total = 0;
+
+	return cache;
+}
+
+static void
+fz_evictglyphcache(fz_glyphcache *cache)
+{
+	fz_glyphkey *key;
+	fz_pixmap *pixmap;
+	int i;
+	/* empty the whole cache: visit every slot and drop the font and pixmap it holds */
+	for (i = 0; i < fz_hashlen(cache->hash); i++)
+	{
+		key = fz_hashgetkey(cache->hash, i);
+		if (key->font) /* drops the reference fz_renderglyph took with fz_keepfont */
+			fz_dropfont(key->font);
+		pixmap = fz_hashgetval(cache->hash, i);
+		if (pixmap)
+			fz_droppixmap(pixmap);
+	}
+
+	cache->total = 0;
+
+	fz_emptyhash(cache->hash);
+}
+
+void
+fz_freeglyphcache(fz_glyphcache *cache)
+{
+	fz_evictglyphcache(cache); /* drop the cached fonts and pixmaps before freeing the table */
+	fz_freehash(cache->hash);
+	fz_free(cache);
+}
+
+fz_pixmap *
+fz_renderstrokedglyph(fz_glyphcache *cache, fz_font *font, int cid, fz_matrix trm, fz_matrix ctm, fz_strokestate *stroke)
+{
+	if (font->ftface) /* stroked rendering is only done for fonts with an ftface, and bypasses the cache */
+		return fz_renderftstrokedglyph(font, cid, trm, ctm, stroke);
+	return fz_renderglyph(cache, font, cid, trm); /* other fonts: the ordinary, unstroked glyph */
+}
+
+fz_pixmap *
+fz_renderglyph(fz_glyphcache *cache, fz_font *font, int cid, fz_matrix ctm)
+{
+	fz_glyphkey key;
+	fz_pixmap *val;
+	float size = fz_matrixexpansion(ctm);
+	/* render glyph cid of font under ctm, going through the cache; returns nil if it cannot be rendered */
+	if (size > MAXFONTSIZE)
+	{
+		/* TODO: this case should be handled by rendering glyph as a path fill */
+		fz_warn("font size too large (%g), not rendering glyph", size);
+		return nil;
+	}
+	/* key: font, cid, the a..d part of ctm scaled by 65536, and the fractional offset in 1/256ths */
+	memset(&key, 0, sizeof key); /* clears padding too; presumably the table compares the raw key bytes */
+	key.font = font;
+	key.cid = cid;
+	key.a = ctm.a * 65536;
+	key.b = ctm.b * 65536;
+	key.c = ctm.c * 65536;
+	key.d = ctm.d * 65536;
+	key.e = (ctm.e - floorf(ctm.e)) * 256;
+	key.f = (ctm.f - floorf(ctm.f)) * 256;
+
+	val = fz_hashfind(cache->hash, &key);
+	if (val)
+		return fz_keeppixmap(val);
+	/* render at the quantised offset stored in the key, so the pixmap matches what the key describes */
+	ctm.e = floorf(ctm.e) + key.e / 256.0f;
+	ctm.f = floorf(ctm.f) + key.f / 256.0f;
+
+	if (font->ftface)
+	{
+		val = fz_renderftglyph(font, cid, ctm);
+	}
+	else if (font->t3procs)
+	{
+		val = fz_rendert3glyph(font, cid, ctm);
+	}
+	else
+	{
+		fz_warn("assert: uninitialized font structure");
+		return nil;
+	}
+	/* only glyphs under MAXGLYPHSIZE in both dimensions are cached; larger ones go straight to the caller */
+	if (val)
+	{
+		if (val->w < MAXGLYPHSIZE && val->h < MAXGLYPHSIZE)
+		{
+			if (cache->total + val->w * val->h > MAXCACHESIZE) /* over budget: empty the whole cache */
+				fz_evictglyphcache(cache);
+			fz_keepfont(key.font); /* the stored key refers to the font; dropped again on eviction */
+			fz_hashinsert(cache->hash, &key, val);
+			cache->total += val->w * val->h;
+			return fz_keeppixmap(val); /* the table keeps val; the caller gets what fz_keeppixmap returns */
+		}
+		return val;
+	}
+
+	return nil;
+}
diff --git a/draw/draw_mesh.c b/draw/draw_mesh.c
new file mode 100644
index 00000000..79437505
--- /dev/null
+++ b/draw/draw_mesh.c
@@ -0,0 +1,579 @@
+#include "fitz.h"
+
+/*
+ * polygon clipping
+ */
+
+enum { IN, OUT, ENTER, LEAVE };
+enum { MAXV = 3 + 4 };
+enum { MAXN = 2 + FZ_MAXCOLORS };
+
+static int clipx(float val, int ismax, float *v1, float *v2, int n)
+{
+	float t;
+	int i;
+	int v1o = ismax ? v1[0] > val : v1[0] < val;
+	int v2o = ismax ? v2[0] > val : v2[0] < val;
+	if (v1o + v2o == 0)
+		return IN;
+	if (v1o + v2o == 2)
+		return OUT;
+	if (v2o)
+	{
+		t = (val - v1[0]) / (v2[0] - v1[0]);
+		v2[0] = val;
+		v2[1] = v1[1] + t * (v2[1] - v1[1]);
+		for (i = 2; i < n; i++)
+			v2[i] = v1[i] + t * (v2[i] - v1[i]);
+		return LEAVE;
+	}
+	else
+	{
+		t = (val - v2[0]) / (v1[0] - v2[0]);
+		v1[0] = val;
+		v1[1] = v2[1] + t * (v1[1] - v2[1]);
+		for (i = 2; i < n; i++)
+			v1[i] = v2[i] + t * (v1[i] - v2[i]);
+		return ENTER;
+	}
+}
+
+static int clipy(float val, int ismax, float *v1, float *v2, int n)
+{
+	float t;
+	int i;
+	int v1o = ismax ? v1[1] > val : v1[1] < val;
+	int v2o = ismax ? v2[1] > val : v2[1] < val;
+	if (v1o + v2o == 0)
+		return IN;
+	if (v1o + v2o == 2)
+		return OUT;
+	if (v2o)
+	{
+		t = (val - v1[1]) / (v2[1] - v1[1]);
+		v2[0] = v1[0] + t * (v2[0] - v1[0]);
+		v2[1] = val;
+		for (i = 2; i < n; i++)
+			v2[i] = v1[i] + t * (v2[i] - v1[i]);
+		return LEAVE;
+	}
+	else
+	{
+		t = (val - v2[1]) / (v1[1] - v2[1]);
+		v1[0] = v2[0] + t * (v1[0] - v2[0]);
+		v1[1] = val;
+		for (i = 2; i < n; i++)
+			v1[i] = v2[i] + t * (v1[i] - v2[i]);
+		return ENTER;
+	}
+}
+
+static inline void copyvert(float *dst, float *src, int n)
+{
+	while (n--) /* copy the n floats of one vertex from src to dst */
+		*dst++ = *src++;
+}
+
+static int clippoly(float src[MAXV][MAXN],
+	float dst[MAXV][MAXN], int len, int n,
+	float val, int isy, int ismax)
+{
+	float cv1[MAXN];
+	float cv2[MAXN];
+	int v1, v2, cp;
+	int r;
+	/* clip the len-vertex polygon src against one line (x or y = val) into dst; returns dst's vertex count */
+	v1 = len - 1;
+	cp = 0;
+	/* walk every edge v1 -> v2, beginning with the closing edge from the last vertex to the first */
+	for (v2 = 0; v2 < len; v2++)
+	{
+		copyvert(cv1, src[v1], n);
+		copyvert(cv2, src[v2], n);
+		/* clip on copies: clipx/clipy overwrite the outside endpoint with the crossing point */
+		if (isy)
+			r = clipy(val, ismax, cv1, cv2, n);
+		else
+			r = clipx(val, ismax, cv1, cv2, n);
+
+		switch (r)
+		{
+		case IN:
+			copyvert(dst[cp++], cv2, n);
+			break;
+		case OUT:
+			break;
+		case LEAVE: /* cv2 now holds the crossing point */
+			copyvert(dst[cp++], cv2, n);
+			break;
+		case ENTER: /* cv1 now holds the crossing point; emit it, then the inside endpoint */
+			copyvert(dst[cp++], cv1, n);
+			copyvert(dst[cp++], cv2, n);
+			break;
+		}
+		v1 = v2;
+	}
+
+	return cp;
+}
+
+/*
+ * gouraud shaded polygon scan conversion
+ */
+
+static inline void
+paintscan(fz_pixmap *pix, int y, int x1, int x2, int *v1, int *v2, int n)
+{
+	unsigned char *p = pix->samples + ((y - pix->y) * pix->w + (x1 - pix->x)) * pix->n;
+	int v[FZ_MAXCOLORS];
+	int dv[FZ_MAXCOLORS];
+	int w = x2 - x1;
+	int k;
+	/* fill pixels x1..x2-1 of row y, interpolating n values from v1 (at x1) towards v2 (at x2) */
+	assert(w >= 0);
+	assert(y >= pix->y);
+	assert(y < pix->y + pix->h);
+	assert(x1 >= pix->x);
+	assert(x2 <= pix->x + pix->w);
+
+	if (w == 0)
+		return;
+	/* per-pixel increment of each value; the values carry 16 fractional bits */
+	for (k = 0; k < n; k++)
+	{
+		v[k] = v1[k];
+		dv[k] = (v2[k] - v1[k]) / w;
+	}
+	/* each pixel gets its n value bytes followed by one byte of 255 */
+	while (w--)
+	{
+		for (k = 0; k < n; k++)
+		{
+			*p++ = v[k] >> 16;
+			v[k] += dv[k];
+		}
+		*p++ = 255;
+	}
+}
+
+static inline int
+findnext(int gel[MAXV][MAXN], int len, int a, int *s, int *e, int d)
+{
+	int b;
+	/* from vertex a, step round the polygon in direction d (+1 or -1) to find the next edge going down */
+	while (1)
+	{
+		b = a + d; /* neighbouring vertex, wrapping at both ends of the array */
+		if (b == len)
+			b = 0;
+		if (b == -1)
+			b = len - 1;
+		/* same y: horizontal edge, skip over it */
+		if (gel[b][1] == gel[a][1])
+		{
+			a = b;
+			continue;
+		}
+		/* b has the larger y: report the edge a -> b through *s and *e */
+		if (gel[b][1] > gel[a][1])
+		{
+			*s = a;
+			*e = b;
+			return 0;
+		}
+		/* b has the smaller y: no further downward edge on this side */
+		return 1;
+	}
+}
+
+static inline void
+loadedge(int gel[MAXV][MAXN], int s, int e, int *ael, int *del, int n)
+{
+	int swp, k, dy;
+	/* make s the endpoint with the smaller y */
+	if (gel[s][1] > gel[e][1])
+	{
+		swp = s; s = e; e = swp;
+	}
+	/* dy must be non-zero; findnext never reports an edge whose endpoints share a y */
+	dy = gel[e][1] - gel[s][1];
+	/* ael gets the values at s, del their change per scanline; index 1 (y) is not used */
+	ael[0] = gel[s][0];
+	del[0] = (gel[e][0] - gel[s][0]) / dy;
+	for (k = 2; k < n; k++)
+	{
+		ael[k] = gel[s][k];
+		del[k] = (gel[e][k] - gel[s][k]) / dy;
+	}
+}
+
+static inline void
+stepedge(int *ael, int *del, int n)
+{
+	int k;
+	ael[0] += del[0]; /* advance x by one scanline's worth */
+	for (k = 2; k < n; k++) /* and each interpolated value; index 1 is skipped, as in loadedge */
+		ael[k] += del[k];
+}
+
+static void
+fz_painttriangle(fz_pixmap *pix, float *av, float *bv, float *cv, int n, fz_bbox bbox)
+{
+	float poly[MAXV][MAXN];
+	float temp[MAXV][MAXN];
+	float cx0 = bbox.x0;
+	float cy0 = bbox.y0;
+	float cx1 = bbox.x1;
+	float cy1 = bbox.y1;
+
+	int gel[MAXV][MAXN];
+	int ael[2][MAXN];
+	int del[2][MAXN];
+	int y, s0, s1, e0, e1;
+	int top, bot, len;
+
+	int i, k;
+
+	copyvert(poly[0], av, n);
+	copyvert(poly[1], bv, n);
+	copyvert(poly[2], cv, n);
+
+	len = clippoly(poly, temp, 3, n, cx0, 0, 0);
+	len = clippoly(temp, poly, len, n, cx1, 0, 1);
+	len = clippoly(poly, temp, len, n, cy0, 1, 0);
+	len = clippoly(temp, poly, len, n, cy1, 1, 1);
+
+	if (len < 3)
+		return;
+
+	for (i = 0; i < len; i++)
+	{
+		gel[i][0] = floorf(poly[i][0] + 0.5f) * 65536; /* trunc and fix */
+		gel[i][1] = floorf(poly[i][1] + 0.5f);	/* y is not fixpoint */
+		for (k = 2; k < n; k++)
+			gel[i][k] = poly[i][k] * 65536;	/* fix with precision */
+	}
+
+	top = bot = 0;
+	for (i = 0; i < len; i++)
+	{
+		if (gel[i][1] < gel[top][1])
+			top = i;
+		if (gel[i][1] > gel[bot][1])
+			bot = i;
+	}
+
+	if (gel[bot][1] - gel[top][1] == 0)
+		return;
+
+	y = gel[top][1];
+
+	if (findnext(gel, len, top, &s0, &e0, 1))
+		return;
+	if (findnext(gel, len, top, &s1, &e1, -1))
+		return;
+
+	loadedge(gel, s0, e0, ael[0], del[0], n);
+	loadedge(gel, s1, e1, ael[1], del[1], n);
+
+	while (1)
+	{
+		int x0 = ael[0][0] >> 16;
+		int x1 = ael[1][0] >> 16;
+
+		if (ael[0][0] < ael[1][0])
+			paintscan(pix, y, x0, x1, ael[0]+2, ael[1]+2, n-2);
+		else
+			paintscan(pix, y, x1, x0, ael[1]+2, ael[0]+2, n-2);
+
+		stepedge(ael[0], del[0], n);
+		stepedge(ael[1], del[1], n);
+		y ++;
+
+		if (y >= gel[e0][1])
+		{
+			if (findnext(gel, len, e0, &s0, &e0, 1))
+				return;
+			loadedge(gel, s0, e0, ael[0], del[0], n);
+		}
+
+		if (y >= gel[e1][1])
+		{
+			if (findnext(gel, len, e1, &s1, &e1, -1))
+				return;
+			loadedge(gel, s1, e1, ael[1], del[1], n);
+		}
+	}
+}
+
+static void
+fz_paintquad(fz_pixmap *pix,
+		fz_point p0, fz_point p1, fz_point p2, fz_point p3,
+		float c0, float c1, float c2, float c3,
+		int n, fz_bbox bbox)
+{
+	float v[4][3]; /* x, y and one value per corner */
+	/* NOTE(review): corners hold 3 floats, so n must not exceed 3; every call in this file passes 3 */
+	v[0][0] = p0.x;
+	v[0][1] = p0.y;
+	v[0][2] = c0;
+
+	v[1][0] = p1.x;
+	v[1][1] = p1.y;
+	v[1][2] = c1;
+
+	v[2][0] = p2.x;
+	v[2][1] = p2.y;
+	v[2][2] = c2;
+
+	v[3][0] = p3.x;
+	v[3][1] = p3.y;
+	v[3][2] = c3;
+	/* painted as two triangles that share the p0-p3 diagonal */
+	fz_painttriangle(pix, v[0], v[2], v[3], n, bbox);
+	fz_painttriangle(pix, v[0], v[3], v[1], n, bbox);
+}
+
+/*
+ * linear, radial and mesh painting
+ */
+
+#define HUGENUM 32000 /* how far to extend axial/radial shadings */
+#define RADSEGS 32 /* how many segments to generate for radial meshes */
+
+static fz_point
+fz_pointoncircle(fz_point p, float r, float theta)
+{
+	p.x = p.x + cosf(theta) * r; /* point at angle theta (radians) on the circle of radius r around p */
+	p.y = p.y + sinf(theta) * r;
+
+	return p;
+}
+
+static void
+fz_paintlinear(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
+{
+	fz_point p0, p1;
+	fz_point v0, v1, v2, v3;
+	fz_point e0, e1;
+	float theta;
+
+	p0.x = shade->mesh[0];
+	p0.y = shade->mesh[1];
+	p0 = fz_transformpoint(ctm, p0);
+
+	p1.x = shade->mesh[3];
+	p1.y = shade->mesh[4];
+	p1 = fz_transformpoint(ctm, p1);
+
+	theta = atan2f(p1.y - p0.y, p1.x - p0.x);
+	theta += (float)M_PI * 0.5f;
+
+	v0 = fz_pointoncircle(p0, HUGENUM, theta);
+	v1 = fz_pointoncircle(p1, HUGENUM, theta);
+	v2 = fz_pointoncircle(p0, -HUGENUM, theta);
+	v3 = fz_pointoncircle(p1, -HUGENUM, theta);
+
+	fz_paintquad(dest, v0, v1, v2, v3, 0, 255, 0, 255, 3, bbox);
+
+	if (shade->extend[0])
+	{
+		e0.x = v0.x - (p1.x - p0.x) * HUGENUM;
+		e0.y = v0.y - (p1.y - p0.y) * HUGENUM;
+
+		e1.x = v2.x - (p1.x - p0.x) * HUGENUM;
+		e1.y = v2.y - (p1.y - p0.y) * HUGENUM;
+
+		fz_paintquad(dest, e0, e1, v0, v2, 0, 0, 0, 0, 3, bbox);
+	}
+
+	if (shade->extend[1])
+	{
+		e0.x = v1.x + (p1.x - p0.x) * HUGENUM;
+		e0.y = v1.y + (p1.y - p0.y) * HUGENUM;
+
+		e1.x = v3.x + (p1.x - p0.x) * HUGENUM;
+		e1.y = v3.y + (p1.y - p0.y) * HUGENUM;
+
+		fz_paintquad(dest, e0, e1, v1, v3, 255, 255, 255, 255, 3, bbox);
+	}
+}
+
+static void
+fz_paintannulus(fz_matrix ctm,
+		fz_point p0, float r0, float c0,
+		fz_point p1, float r1, float c1,
+		fz_pixmap *dest, fz_bbox bbox)
+{
+	fz_point t0, t1, t2, t3, b0, b1, b2, b3;
+	float theta, step;
+	int i;
+
+	theta = atan2f(p1.y - p0.y, p1.x - p0.x);
+	step = (float)M_PI * 2 / RADSEGS;
+
+	for (i = 0; i < RADSEGS / 2; i++)
+	{
+		t0 = fz_pointoncircle(p0, r0, theta + i * step);
+		t1 = fz_pointoncircle(p0, r0, theta + i * step + step);
+		t2 = fz_pointoncircle(p1, r1, theta + i * step);
+		t3 = fz_pointoncircle(p1, r1, theta + i * step + step);
+		b0 = fz_pointoncircle(p0, r0, theta - i * step);
+		b1 = fz_pointoncircle(p0, r0, theta - i * step - step);
+		b2 = fz_pointoncircle(p1, r1, theta - i * step);
+		b3 = fz_pointoncircle(p1, r1, theta - i * step - step);
+
+		t0 = fz_transformpoint(ctm, t0);
+		t1 = fz_transformpoint(ctm, t1);
+		t2 = fz_transformpoint(ctm, t2);
+		t3 = fz_transformpoint(ctm, t3);
+		b0 = fz_transformpoint(ctm, b0);
+		b1 = fz_transformpoint(ctm, b1);
+		b2 = fz_transformpoint(ctm, b2);
+		b3 = fz_transformpoint(ctm, b3);
+
+		fz_paintquad(dest, t0, t1, t2, t3, c0, c0, c1, c1, 3, bbox);
+		fz_paintquad(dest, b0, b1, b2, b3, c0, c0, c1, c1, 3, bbox);
+	}
+}
+
+static void
+fz_paintradial(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
+{
+	fz_point p0, p1;
+	float r0, r1;
+	fz_point e;
+	float er, rs;
+
+	p0.x = shade->mesh[0];
+	p0.y = shade->mesh[1];
+	r0 = shade->mesh[2];
+
+	p1.x = shade->mesh[3];
+	p1.y = shade->mesh[4];
+	r1 = shade->mesh[5];
+
+	if (shade->extend[0])
+	{
+		if (r0 < r1)
+			rs = r0 / (r0 - r1);
+		else
+			rs = -HUGENUM;
+
+		e.x = p0.x + (p1.x - p0.x) * rs;
+		e.y = p0.y + (p1.y - p0.y) * rs;
+		er = r0 + (r1 - r0) * rs;
+
+		fz_paintannulus(ctm, e, er, 0, p0, r0, 0, dest, bbox);
+	}
+
+	fz_paintannulus(ctm, p0, r0, 0, p1, r1, 255, dest, bbox);
+
+	if (shade->extend[1])
+	{
+		if (r0 > r1)
+			rs = r1 / (r1 - r0);
+		else
+			rs = -HUGENUM;
+
+		e.x = p1.x + (p0.x - p1.x) * rs;
+		e.y = p1.y + (p0.y - p1.y) * rs;
+		er = r1 + (r0 - r1) * rs;
+
+		fz_paintannulus(ctm, p1, r1, 255, e, er, 255, dest, bbox);
+	}
+}
+
+static void
+fz_paintmesh(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
+{
+	float tri[3][MAXN];
+	fz_point p;
+	float *mesh;
+	int ntris;
+	int i, k;
+
+	mesh = shade->mesh;
+
+	if (shade->usefunction)
+		ntris = shade->meshlen / 9;
+	else
+		ntris = shade->meshlen / ((2 + shade->colorspace->n) * 3);
+
+	while (ntris--)
+	{
+		for (k = 0; k < 3; k++)
+		{
+			p.x = *mesh++;
+			p.y = *mesh++;
+			p = fz_transformpoint(ctm, p);
+			tri[k][0] = p.x;
+			tri[k][1] = p.y;
+			if (shade->usefunction)
+				tri[k][2] = *mesh++ * 255;
+			else
+			{
+				fz_convertcolor(shade->colorspace, mesh, dest->colorspace, tri[k] + 2);
+				for (i = 0; i < dest->colorspace->n; i++)
+					tri[k][i + 2] *= 255;
+				mesh += shade->colorspace->n;
+			}
+		}
+		fz_painttriangle(dest, tri[0], tri[1], tri[2], 2 + dest->colorspace->n, bbox);
+	}
+}
+
+void
+fz_paintshade(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
+{
+	unsigned char clut[256][FZ_MAXCOLORS];
+	fz_pixmap *temp, *conv;
+	float color[FZ_MAXCOLORS];
+	int i, k;
+
+	ctm = fz_concat(shade->matrix, ctm);
+
+	if (shade->usefunction)
+	{
+		for (i = 0; i < 256; i++)
+		{
+			fz_convertcolor(shade->colorspace, shade->function[i], dest->colorspace, color);
+			for (k = 0; k < dest->colorspace->n; k++)
+				clut[i][k] = color[k] * 255;
+			clut[i][k] = shade->function[i][shade->colorspace->n] * 255;
+		}
+		conv = fz_newpixmapwithrect(dest->colorspace, bbox);
+		temp = fz_newpixmapwithrect(fz_devicegray, bbox);
+		fz_clearpixmap(temp);
+	}
+	else
+	{
+		temp = dest;
+	}
+
+	switch (shade->type)
+	{
+	case FZ_LINEAR: fz_paintlinear(shade, ctm, temp, bbox); break;
+	case FZ_RADIAL: fz_paintradial(shade, ctm, temp, bbox); break;
+	case FZ_MESH: fz_paintmesh(shade, ctm, temp, bbox); break;
+	}
+
+	if (shade->usefunction)
+	{
+		unsigned char *s = temp->samples;
+		unsigned char *d = conv->samples;
+		int len = temp->w * temp->h;
+		while (len--)
+		{
+			int v = *s++;
+			int a = fz_mul255(*s++, clut[v][conv->n - 1]);
+			for (k = 0; k < conv->n - 1; k++)
+				*d++ = fz_mul255(clut[v][k], a);
+			*d++ = a;
+		}
+		fz_paintpixmap(dest, conv, 255);
+		fz_droppixmap(conv);
+		fz_droppixmap(temp);
+	}
+}
+
diff --git a/draw/draw_paint.c b/draw/draw_paint.c
new file mode 100644
index 00000000..69df467a
--- /dev/null
+++ b/draw/draw_paint.c
@@ -0,0 +1,443 @@
+#include "fitz.h"
+
+/*
+
+The functions in this file implement various flavours of Porter-Duff blending.
+
+We take the following as definitions:
+
+	Cx = Color (from plane x)
+	ax = Alpha (from plane x)
+	cx = Cx.ax = Premultiplied color (from plane x)
+
+The general PorterDuff blending equation is:
+
+	Blend Z = X op Y	cz = Fx.cx + Fy. cy	where Fx and Fy depend on op
+
+The two operations we use in this file are: '(X in Y) over Z' and
+'S over Z'. The definitions of the 'over' and 'in' operations are as
+follows:
+
+	For S over Z,	Fs = 1, Fz = 1-as
+	For X in Y,	Fx = ay, Fy = 0
+
+We have 2 choices; we can either work with premultiplied data, or
+non-premultiplied data. Both forms are derived below.
+
+First the premultiplied case:
+
+	Let S = (X in Y)
+	Let R = (X in Y) over Z = S over Z
+
+	cs	= cx.Fx + cy.Fy	(where Fx = ay, Fy = 0)
+		= cx.ay
+	as	= ax.Fx + ay.Fy
+		= ax.ay
+
+	cr	= cs.Fs + cz.Fz	(where Fs = 1, Fz = 1-as)
+		= cs + cz.(1-as)
+		= cx.ay + cz.(1-ax.ay)
+	ar	= as.Fs + az.Fz
+		= as + az.(1-as)
+		= ax.ay + az.(1-ax.ay)
+
+This has various nice properties, like not needing any divisions, and
+being symmetric in color and alpha, so this is what we use. Because we
+went through the pain of deriving the non premultiplied forms, we list
+them here too, though they are not used.
+
+Non Pre-multiplied case:
+
+	Cs.as	= Fx.Cx.ax + Fy.Cy.ay	(where Fx = ay, Fy = 0)
+		= Cx.ay.ax
+	Cs	= (Cx.ay.ax)/(ay.ax)
+		= Cx
+	Cr.ar	= Fs.Cs.as + Fz.Cz.az	(where Fs = 1, Fz = 1-as)
+		= Cs.as	+ (1-as).Cz.az
+		= Cx.ax.ay + Cz.az.(1-ax.ay)
+	Cr	= (Cx.ax.ay + Cz.az.(1-ax.ay))/(ax.ay + az.(1-ax.ay))
+
+Much more complex, it seems. However, if we could restrict ourselves to
+the case where we were always plotting onto an opaque background (i.e.
+az = 1), then:
+
+	Cr	= Cx.(ax.ay) + Cz.(1-ax.ay)
+		= (Cx-Cz)*(ax.ay) + Cz	(a single MLA operation)
+	ar	= 1
+
+Sadly, this is not true in the general case, so we abandon this effort
+and stick to using the premultiplied form.
+
+*/
+
+typedef unsigned char byte;
+
+/* Blend a non-premultiplied color in mask over destination */
+
+static inline void
+fz_paintspancolor2(byte * restrict dp, byte * restrict mp, int w, byte *color)
+{
+	int sa = FZ_EXPAND(color[1]);
+	int g = color[0];
+	while (w--)
+	{
+		int ma = *mp++;
+		ma = FZ_COMBINE(FZ_EXPAND(ma), sa);
+		dp[0] = FZ_BLEND(g, dp[0], ma);
+		dp[1] = FZ_BLEND(255, dp[1], ma);
+		dp += 2;
+	}
+}
+
+static inline void
+fz_paintspancolor4(byte * restrict dp, byte * restrict mp, int w, byte *color)
+{
+	int sa = FZ_EXPAND(color[3]);
+	int r = color[0];
+	int g = color[1];
+	int b = color[2];
+	while (w--)
+	{
+		int ma = *mp++;
+		ma = FZ_COMBINE(FZ_EXPAND(ma), sa);
+		dp[0] = FZ_BLEND(r, dp[0], ma);
+		dp[1] = FZ_BLEND(g, dp[1], ma);
+		dp[2] = FZ_BLEND(b, dp[2], ma);
+		dp[3] = FZ_BLEND(255, dp[3], ma);
+		dp += 4;
+	}
+}
+
+static inline void
+fz_paintspancolorN(byte * restrict dp, byte * restrict mp, int n, int w, byte *color)
+{
+	int sa = FZ_EXPAND(color[n-1]); /* alpha of the color: the last of its n components */
+	int k;
+	n--; /* n now counts the colour components only; destination alpha sits at index n */
+	while (w--)
+	{
+		int ma = *mp++;
+		ma = FZ_COMBINE(FZ_EXPAND(ma), sa);
+		for (k = 0; k < n; k++)
+			dp[k] = FZ_BLEND(color[k], dp[k], ma);
+		dp[k] = FZ_BLEND(255, dp[k], ma);
+		dp += n + 1; /* a whole pixel: n colour bytes plus alpha (n was decremented above) */
+	}
+}
+
+void
+fz_paintspancolor(byte * restrict dp, byte * restrict mp, int n, int w, byte *color)
+{
+	switch (n) /* n: components per destination pixel, alpha included */
+	{
+	case 2: fz_paintspancolor2(dp, mp, w, color); break;
+	case 4: fz_paintspancolor4(dp, mp, w, color); break;
+	default: fz_paintspancolorN(dp, mp, n, w, color); break; /* any other component count */
+	}
+}
+
+/* Blend source in mask over destination */
+
+static inline void
+fz_paintspanmask2(byte * restrict dp, byte * restrict sp, byte * restrict mp, int w)
+{
+	while (w--)
+	{
+		int masa;
+		int ma = *mp++;
+		ma = FZ_EXPAND(ma);
+		masa = FZ_COMBINE(sp[1], ma);
+		masa = 255 - masa;
+		masa = FZ_EXPAND(masa);
+		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+		sp++; dp++;
+	}
+}
+
+static inline void
+fz_paintspanmask4(byte * restrict dp, byte * restrict sp, byte * restrict mp, int w)
+{
+	while (w--)
+	{
+		int masa;
+		int ma = *mp++;
+		ma = FZ_EXPAND(ma);
+		masa = FZ_COMBINE(sp[3], ma);
+		masa = 255 - masa;
+		masa = FZ_EXPAND(masa);
+		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+		sp++; dp++;
+	}
+}
+
+static inline void
+fz_paintspanmaskN(byte * restrict dp, byte * restrict sp, byte * restrict mp, int n, int w)
+{
+	/* n counts every component of a pixel; the source alpha is the last one, sp[n-1] */
+	while (w--)
+	{
+		int k = n;
+		int masa;
+		int ma = *mp++;
+		ma = FZ_EXPAND(ma);
+		masa = FZ_COMBINE(sp[n-1], ma);
+		masa = 255-masa;
+		masa = FZ_EXPAND(masa);
+		while (k--) /* all n components, alpha included, as in the 2 and 4 component versions */
+		{
+			*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
+			sp++; dp++;
+		}
+	}
+}
+
+static void
+fz_paintspanmask(byte * restrict dp, byte * restrict sp, byte * restrict mp, int n, int w)
+{
+	switch (n) /* n: components per pixel of sp and dp; mp has one byte per pixel */
+	{
+	case 2: fz_paintspanmask2(dp, sp, mp, w); break;
+	case 4: fz_paintspanmask4(dp, sp, mp, w); break;
+	default: fz_paintspanmaskN(dp, sp, mp, n, w); break;
+	}
+}
+
+/* Blend source in constant alpha over destination */
+
+static inline void
+fz_paintspan2alpha(byte * restrict dp, byte * restrict sp, int w, int alpha)
+{
+	alpha = FZ_EXPAND(alpha);
+	while (w--)
+	{
+		int masa = FZ_COMBINE(sp[1], alpha);
+		*dp = FZ_BLEND(*sp, *dp, masa);
+		dp++; sp++;
+		*dp = FZ_BLEND(*sp, *dp, masa);
+		dp++; sp++;
+	}
+}
+
+static inline void
+fz_paintspan4alpha(byte * restrict dp, byte * restrict sp, int w, int alpha)
+{
+	alpha = FZ_EXPAND(alpha);
+	while (w--)
+	{
+		int masa = FZ_COMBINE(sp[3], alpha);
+		*dp = FZ_BLEND(*sp, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_BLEND(*sp, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_BLEND(*sp, *dp, masa);
+		sp++; dp++;
+		*dp = FZ_BLEND(*sp, *dp, masa);
+		sp++; dp++;
+	}
+}
+
+static inline void
+fz_paintspanNalpha(byte * restrict dp, byte * restrict sp, int n, int w, int alpha)
+{
+	alpha = FZ_EXPAND(alpha);
+	while (w--)
+	{
+		int masa = FZ_COMBINE(sp[n-1], alpha);
+		int k = n;
+		while (k--)
+		{
+			*dp = FZ_BLEND(*sp++, *dp, masa);
+			dp++;
+		}
+	}
+}
+
+/* Blend source over destination */
+
+static inline void
+fz_paintspan1(byte * restrict dp, byte * restrict sp, int w)
+{
+	while (w--)
+	{
+		int t = FZ_EXPAND(255 - sp[0]);
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp ++;
+	}
+}
+
+static inline void
+fz_paintspan2(byte * restrict dp, byte * restrict sp, int w)
+{
+	while (w--)
+	{
+		int t = FZ_EXPAND(255 - sp[1]);
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp++;
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp++;
+	}
+}
+
+static inline void
+fz_paintspan4(byte * restrict dp, byte * restrict sp, int w)
+{
+	while (w--)
+	{
+		int t = FZ_EXPAND(255 - sp[3]);
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp++;
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp++;
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp++;
+		*dp = *sp++ + FZ_COMBINE(*dp, t);
+		dp++;
+	}
+}
+
+static inline void
+fz_paintspanN(byte * restrict dp, byte * restrict sp, int n, int w)
+{
+	while (w--)
+	{
+		int k = n;
+		int t = FZ_EXPAND(255 - sp[n-1]);
+		while (k--)
+		{
+			*dp = *sp++ + FZ_COMBINE(*dp, t);
+			dp++;
+		}
+	}
+}
+
+void
+fz_paintspan(byte * restrict dp, byte * restrict sp, int n, int w, int alpha)
+{
+	if (alpha == 255) /* plain "source over destination" */
+	{
+		switch (n)
+		{
+		case 1: fz_paintspan1(dp, sp, w); break;
+		case 2: fz_paintspan2(dp, sp, w); break;
+		case 4: fz_paintspan4(dp, sp, w); break;
+		default: fz_paintspanN(dp, sp, n, w); break;
+		}
+	}
+	else if (alpha > 0) /* source scaled by a constant alpha; any other alpha value paints nothing */
+	{
+		switch (n)
+		{
+		case 2: fz_paintspan2alpha(dp, sp, w, alpha); break;
+		case 4: fz_paintspan4alpha(dp, sp, w, alpha); break;
+		default: fz_paintspanNalpha(dp, sp, n, w, alpha); break; /* also handles n == 1 */
+		}
+	}
+}
+
+/*
+ * Pixmap blending functions
+ */
+
+void
+fz_paintpixmapbbox(fz_pixmap *dst, fz_pixmap *src, int alpha, fz_bbox bbox)
+{
+	unsigned char *sp, *dp;
+	int x, y, w, h, n;
+	/* paint src over dst with constant alpha, restricted to bbox */
+	assert(dst->n == src->n);
+	/* only the area common to bbox, dst and src is touched */
+	bbox = fz_intersectbbox(bbox, fz_boundpixmap(dst));
+	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
+
+	x = bbox.x0;
+	y = bbox.y0;
+	w = bbox.x1 - bbox.x0;
+	h = bbox.y1 - bbox.y0;
+	if (w <= 0 || h <= 0) /* degenerate area: nothing to paint */
+		return;
+
+	n = src->n;
+	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * src->n;
+	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
+	/* one span per row; each pixmap advances by its own row length */
+	while (h--)
+	{
+		fz_paintspan(dp, sp, n, w, alpha);
+		sp += src->w * n;
+		dp += dst->w * n;
+	}
+}
+
+void
+fz_paintpixmap(fz_pixmap *dst, fz_pixmap *src, int alpha)
+{
+	unsigned char *sp, *dp;
+	fz_bbox bbox;
+	int x, y, w, h, n;
+	/* paint src over dst with constant alpha, wherever the two pixmaps overlap */
+	assert(dst->n == src->n);
+
+	bbox = fz_boundpixmap(dst);
+	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
+
+	x = bbox.x0;
+	y = bbox.y0;
+	w = bbox.x1 - bbox.x0;
+	h = bbox.y1 - bbox.y0;
+	if (w <= 0 || h <= 0) /* degenerate overlap: nothing to paint */
+		return;
+
+	n = src->n;
+	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * src->n;
+	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
+	/* one span per row; each pixmap advances by its own row length */
+	while (h--)
+	{
+		fz_paintspan(dp, sp, n, w, alpha);
+		sp += src->w * n;
+		dp += dst->w * n;
+	}
+}
+
+void
+fz_paintpixmapmask(fz_pixmap *dst, fz_pixmap *src, fz_pixmap *msk)
+{
+	unsigned char *sp, *dp, *mp;
+	fz_bbox bbox;
+	int x, y, w, h, n;
+	/* paint src through the one-component mask msk onto dst, where all three overlap */
+	assert(dst->n == src->n);
+	assert(msk->n == 1);
+
+	bbox = fz_boundpixmap(dst);
+	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
+	bbox = fz_intersectbbox(bbox, fz_boundpixmap(msk));
+
+	x = bbox.x0;
+	y = bbox.y0;
+	w = bbox.x1 - bbox.x0;
+	h = bbox.y1 - bbox.y0;
+	if (w <= 0 || h <= 0) /* degenerate overlap: nothing to paint */
+		return;
+
+	n = src->n;
+	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * src->n;
+	mp = msk->samples + ((y - msk->y) * msk->w + (x - msk->x)) * msk->n;
+	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
+	/* one span per row; the mask row is msk->w bytes because msk->n is 1 */
+	while (h--)
+	{
+		fz_paintspanmask(dp, sp, mp, n, w);
+		sp += src->w * n;
+		dp += dst->w * n;
+		mp += msk->w;
+	}
+}
diff --git a/draw/draw_path.c b/draw/draw_path.c
new file mode 100644
index 00000000..23ea3515
--- /dev/null
+++ b/draw/draw_path.c
@@ -0,0 +1,773 @@
+#include "fitz.h"
+
+/* Maximum recursion depth when subdividing a bezier curve into lines. */
+#define MAXDEPTH 8
+
+/*
+ * Line cap codes (BUTT, ROUND, SQUARE) and line join codes (MITER, ROUND,
+ * BEVEL). Both sets live in one enum, so some of the values coincide.
+ */
+enum { BUTT = 0, ROUND = 1, SQUARE = 2, MITER = 0, BEVEL = 2 };
+
+/*
+ * Transform the segment (x0,y0)-(x1,y1) by ctm and pass the result to
+ * fz_insertgel.
+ */
+static void
+line(fz_gel *gel, fz_matrix *ctm, float x0, float y0, float x1, float y1)
+{
+	float ax, ay, bx, by;
+
+	ax = ctm->a * x0 + ctm->c * y0 + ctm->e;
+	ay = ctm->b * x0 + ctm->d * y0 + ctm->f;
+	bx = ctm->a * x1 + ctm->c * y1 + ctm->e;
+	by = ctm->b * x1 + ctm->d * y1 + ctm->f;
+
+	fz_insertgel(gel, ax, ay, bx, by);
+}
+
+/*
+ * Flatten the cubic bezier a-b-c-d into line segments by recursive midpoint
+ * subdivision, handing each segment to line(). Subdivision stops once both
+ * end legs of the control polygon are shorter than flatness on each axis,
+ * or when MAXDEPTH levels have been reached.
+ */
+static void
+bezier(fz_gel *gel, fz_matrix *ctm, float flatness,
+	float xa, float ya,
+	float xb, float yb,
+	float xc, float yc,
+	float xd, float yd, int depth)
+{
+	float span;
+
+	/* largest axis distance between an end point and its control point */
+	span = ABS(xa - xb);
+	span = MAX(span, ABS(ya - yb));
+	span = MAX(span, ABS(xd - xc));
+	span = MAX(span, ABS(yd - yc));
+
+	if (span < flatness || depth >= MAXDEPTH)
+	{
+		/* flat enough, or too deep: emit the chord */
+		line(gel, ctm, xa, ya, xd, yd);
+	}
+	else
+	{
+		/* build plain sums first; they are scaled to averages below */
+		float abx = xa + xb, aby = ya + yb;
+		float bcx = xb + xc, bcy = yb + yc;
+		float cdx = xc + xd, cdy = yc + yd;
+		float abcx = abx + bcx, abcy = aby + bcy;
+		float bcdx = bcx + cdx, bcdy = bcy + cdy;
+		float midx = abcx + bcdx;
+		float midy = abcy + bcdy;
+
+		abx *= 0.5f; aby *= 0.5f;
+		bcx *= 0.5f; bcy *= 0.5f;
+		cdx *= 0.5f; cdy *= 0.5f;
+
+		abcx *= 0.25f; abcy *= 0.25f;
+		bcdx *= 0.25f; bcdy *= 0.25f;
+
+		midx *= 0.125f; midy *= 0.125f;
+
+		/* first half, then second half, split at the curve midpoint */
+		bezier(gel, ctm, flatness, xa, ya, abx, aby, abcx, abcy, midx, midy, depth + 1);
+		bezier(gel, ctm, flatness, midx, midy, bcdx, bcdy, cdx, cdy, xd, yd, depth + 1);
+	}
+}
+
+/*
+ * Walk the elements of path and feed its outline, transformed by ctm, into
+ * gel. Curves are flattened with the given flatness. Subpaths left open
+ * are closed implicitly with a line back to their starting point.
+ */
+void
+fz_fillpath(fz_gel *gel, fz_path *path, fz_matrix ctm, float flatness)
+{
+	float curx = 0, cury = 0;	/* current point */
+	float begx = 0, begy = 0;	/* start of the current subpath */
+	float ax, ay, bx2, by2, ex, ey;
+	int pos;
+
+	for (pos = 0; pos < path->len; )
+	{
+		switch (path->els[pos++].k)
+		{
+		case FZ_MOVETO:
+			/* close the previous subpath if it was left open */
+			if (pos && (curx != begx || cury != begy))
+				line(gel, &ctm, curx, cury, begx, begy);
+			ax = path->els[pos++].v;
+			ay = path->els[pos++].v;
+			begx = ax;
+			curx = ax;
+			begy = ay;
+			cury = ay;
+			break;
+
+		case FZ_LINETO:
+			ax = path->els[pos++].v;
+			ay = path->els[pos++].v;
+			line(gel, &ctm, curx, cury, ax, ay);
+			curx = ax;
+			cury = ay;
+			break;
+
+		case FZ_CURVETO:
+			ax = path->els[pos++].v;
+			ay = path->els[pos++].v;
+			bx2 = path->els[pos++].v;
+			by2 = path->els[pos++].v;
+			ex = path->els[pos++].v;
+			ey = path->els[pos++].v;
+			bezier(gel, &ctm, flatness, curx, cury, ax, ay, bx2, by2, ex, ey, 0);
+			curx = ex;
+			cury = ey;
+			break;
+
+		case FZ_CLOSEPATH:
+			line(gel, &ctm, curx, cury, begx, begy);
+			curx = begx;
+			cury = begy;
+			break;
+		}
+	}
+
+	/* close the final subpath if it was left open */
+	if (pos && (curx != begx || cury != begy))
+		line(gel, &ctm, curx, cury, begx, begy);
+}
+
+/* Working state shared by the stroking and dashing routines below. */
+struct sctx
+{
+	fz_gel *gel;		/* receives the generated edges */
+	fz_matrix *ctm;		/* applied to every edge before insertion */
+	float flatness;		/* tolerance used for curves and arcs */
+
+	int linecap;		/* BUTT, ROUND or SQUARE */
+	int linejoin;		/* MITER, ROUND or BEVEL */
+	float linewidth;	/* set to half the stroke width by the entry points */
+	float miterlimit;
+	fz_point beg[2];	/* first points of the current subpath */
+	fz_point seg[2];	/* most recent points of the current subpath */
+	int sn, bn;		/* number of valid entries in seg[] and beg[] */
+	int dot;		/* set when a zero length segment was seen */
+
+	/* dashing state, only used by the fz_dash* functions */
+	float *dashlist;	/* dash/gap lengths */
+	float dashphase;	/* starting phase into the pattern */
+	int dashlen;		/* number of entries in dashlist */
+	int toggle;		/* nonzero while inside a drawn dash */
+	int offset;		/* index of the current dashlist entry */
+	float phase;		/* distance already used of the current entry */
+	fz_point cur;		/* current point of the dashed path */
+};
+
+/*
+ * Transform the segment (x0,y0)-(x1,y1) by the stroker's matrix and pass
+ * the result to fz_insertgel.
+ */
+static void
+fz_addline(struct sctx *s, float x0, float y0, float x1, float y1)
+{
+	float ax, ay, bx, by;
+
+	ax = s->ctm->a * x0 + s->ctm->c * y0 + s->ctm->e;
+	ay = s->ctm->b * x0 + s->ctm->d * y0 + s->ctm->f;
+	bx = s->ctm->a * x1 + s->ctm->c * y1 + s->ctm->e;
+	by = s->ctm->b * x1 + s->ctm->d * y1 + s->ctm->f;
+
+	fz_insertgel(s->gel, ax, ay, bx, by);
+}
+
+/*
+ * Approximate an arc around the centre (xc, yc) with line segments.
+ * (x0, y0) and (x1, y1) are the start and end of the arc, given as offsets
+ * from the centre. Intermediate points are placed at radius |s->linewidth|
+ * and the angular step is derived from s->flatness.
+ */
+static void
+fz_addarc(struct sctx *s,
+	float xc, float yc,
+	float x0, float y0,
+	float x1, float y1)
+{
+	float th0, th1, r;
+	float theta;
+	float ox, oy, nx, ny;
+	int n, i;
+
+	r = fabsf(s->linewidth);
+	/* NOTE(review): divides by r before r is tested; r == 0 is not guarded */
+	theta = 2 * (float)M_SQRT2 * sqrtf(s->flatness / r);
+	th0 = atan2f(y0, x0);
+	th1 = atan2f(y1, x1);
+
+	/* NOTE(review): r comes from fabsf() so it is never negative; the else
+	 * branch looks reachable only for r == 0 -- confirm the intent. */
+	if (r > 0)
+	{
+		if (th0 < th1)
+			th0 += (float)M_PI * 2;
+		n = ceilf((th0 - th1) / theta);
+	}
+	else
+	{
+		if (th1 < th0)
+			th1 += (float)M_PI * 2;
+		n = ceilf((th1 - th0) / theta);
+	}
+
+	/* emit the n-1 interior points, interpolating the angle linearly */
+	ox = x0;
+	oy = y0;
+	for (i = 1; i < n; i++)
+	{
+		theta = th0 + (th1 - th0) * i / n;
+		nx = cosf(theta) * r;
+		ny = sinf(theta) * r;
+		fz_addline(s, xc + ox, yc + oy, xc + nx, yc + ny);
+		ox = nx;
+		oy = ny;
+	}
+
+	/* the last segment ends exactly on the requested end point */
+	fz_addline(s, xc + ox, yc + oy, xc + x1, yc + y1);
+}
+
+/*
+ * Add the two long sides of the stroked segment a-b: one edge on each side
+ * of the segment, offset perpendicular to it by s->linewidth, running in
+ * opposite directions.
+ */
+static void
+fz_linestroke(struct sctx *s, fz_point a, fz_point b)
+{
+	float vx = b.x - a.x;
+	float vy = b.y - a.y;
+	float k = s->linewidth / sqrtf(vx * vx + vy * vy);
+	float px = vy * k;	/* perpendicular offset */
+	float py = -vx * k;
+
+	fz_addline(s, a.x - px, a.y - py, b.x - px, b.y - py);
+	fz_addline(s, b.x + px, b.y + py, a.x + px, a.y + py);
+}
+
+/*
+ * Add the join geometry at vertex b, between the incoming segment a-b and
+ * the outgoing segment b-c, according to s->linejoin. Degenerate or
+ * unsuitable configurations fall back to a bevel join.
+ */
+static void
+fz_linejoin(struct sctx *s, fz_point a, fz_point b, fz_point c)
+{
+	float miterlimit = s->miterlimit;
+	float linewidth = s->linewidth;
+	int linejoin = s->linejoin;
+	float dx0, dy0;
+	float dx1, dy1;
+	float dlx0, dly0;
+	float dlx1, dly1;
+	float dmx, dmy;
+	float dmr2;
+	float scale;
+	float cross;
+
+	/* direction of the incoming segment */
+	dx0 = b.x - a.x;
+	dy0 = b.y - a.y;
+
+	/* direction of the outgoing segment */
+	dx1 = c.x - b.x;
+	dy1 = c.y - b.y;
+
+	/* a (nearly) zero length segment forces a bevel */
+	if (dx0 * dx0 + dy0 * dy0 < FLT_EPSILON)
+		linejoin = BEVEL;
+	if (dx1 * dx1 + dy1 * dy1 < FLT_EPSILON)
+		linejoin = BEVEL;
+
+	/* perpendicular offsets of length linewidth for each segment */
+	/* NOTE(review): divides by the segment length, which may be zero in
+	 * the degenerate cases detected above -- confirm this is harmless. */
+	scale = linewidth / sqrtf(dx0 * dx0 + dy0 * dy0);
+	dlx0 = dy0 * scale;
+	dly0 = -dx0 * scale;
+
+	scale = linewidth / sqrtf(dx1 * dx1 + dy1 * dy1);
+	dlx1 = dy1 * scale;
+	dly1 = -dx1 * scale;
+
+	/* the sign of cross selects which side receives the miter or arc */
+	cross = dx1 * dy0 - dx0 * dy1;
+
+	/* average of the two offsets and its squared length */
+	dmx = (dlx0 + dlx1) * 0.5f;
+	dmy = (dly0 + dly1) * 0.5f;
+	dmr2 = dmx * dmx + dmy * dmy;
+
+	/* (nearly) parallel segments heading the same way: bevel */
+	if (cross * cross < FLT_EPSILON && dx0 * dx1 + dy0 * dy1 >= 0)
+		linejoin = BEVEL;
+
+	/* miter limit test: fall back to a bevel when it fails */
+	if (linejoin == MITER)
+		if (dmr2 * miterlimit * miterlimit < linewidth * linewidth)
+			linejoin = BEVEL;
+
+	if (linejoin == BEVEL)
+	{
+		fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dlx1, b.y - dly1);
+		fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dlx0, b.y + dly0);
+	}
+
+	if (linejoin == MITER)
+	{
+		/* stretch the averaged offset out to the miter tip */
+		scale = linewidth * linewidth / dmr2;
+		dmx *= scale;
+		dmy *= scale;
+
+		if (cross < 0)
+		{
+			fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dlx1, b.y - dly1);
+			fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dmx, b.y + dmy);
+			fz_addline(s, b.x + dmx, b.y + dmy, b.x + dlx0, b.y + dly0);
+		}
+		else
+		{
+			fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dlx0, b.y + dly0);
+			fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dmx, b.y - dmy);
+			fz_addline(s, b.x - dmx, b.y - dmy, b.x - dlx1, b.y - dly1);
+		}
+	}
+
+	if (linejoin == ROUND)
+	{
+		/* straight edge on one side, arc on the other */
+		if (cross < 0)
+		{
+			fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dlx1, b.y - dly1);
+			fz_addarc(s, b.x, b.y, dlx1, dly1, dlx0, dly0);
+		}
+		else
+		{
+			fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dlx0, b.y + dly0);
+			fz_addarc(s, b.x, b.y, -dlx0, -dly0, -dlx1, -dly1);
+		}
+	}
+}
+
+/*
+ * Add the cap geometry at point b for a segment arriving from a, according
+ * to s->linecap. The cap joins the two sides of the stroke at b.
+ */
+static void
+fz_linecap(struct sctx *s, fz_point a, fz_point b)
+{
+	float tol = s->flatness;
+	float hw = s->linewidth;
+
+	float vx = b.x - a.x;
+	float vy = b.y - a.y;
+
+	float k = hw / sqrtf(vx * vx + vy * vy);
+	float px = vy * k;	/* perpendicular offset */
+	float py = -vx * k;
+
+	switch (s->linecap)
+	{
+	case BUTT:
+		/* straight across the end */
+		fz_addline(s, b.x - px, b.y - py, b.x + px, b.y + py);
+		break;
+
+	case ROUND:
+	{
+		/* half circle around b, approximated with line segments */
+		int steps = ceilf((float)M_PI / (2.0f * (float)M_SQRT2 * sqrtf(tol / hw)));
+		float fromx = b.x - px;
+		float fromy = b.y - py;
+		int step;
+
+		for (step = 1; step < steps; step++)
+		{
+			float ang = (float)M_PI * step / steps;
+			float ca = cosf(ang);
+			float sa = sinf(ang);
+			float tox = b.x - px * ca - py * sa;
+			float toy = b.y - py * ca + px * sa;
+			fz_addline(s, fromx, fromy, tox, toy);
+			fromx = tox;
+			fromy = toy;
+		}
+		fz_addline(s, fromx, fromy, b.x + px, b.y + py);
+		break;
+	}
+
+	case SQUARE:
+		/* three sides of a box extending beyond b */
+		fz_addline(s,
+			b.x - px, b.y - py,
+			b.x - px - py, b.y - py + px);
+		fz_addline(s,
+			b.x - px - py, b.y - py + px,
+			b.x + px - py, b.y + py + px);
+		fz_addline(s,
+			b.x + px - py, b.y + py + px,
+			b.x + px, b.y + py);
+		break;
+	}
+}
+
+/*
+ * Add a closed polygon approximating a circle of radius s->linewidth
+ * around point a. The number of sides is derived from s->flatness.
+ */
+static void
+fz_linedot(struct sctx *s, fz_point a)
+{
+	float tol = s->flatness;
+	float radius = s->linewidth;
+	int sides = ceilf((float)M_PI / ((float)M_SQRT2 * sqrtf(tol / radius)));
+	float fromx = a.x - radius;
+	float fromy = a.y;
+	int k;
+
+	for (k = 1; k < sides; k++)
+	{
+		float ang = (float)M_PI * 2 * k / sides;
+		float ca = cosf(ang);
+		float sa = sinf(ang);
+		float tox = a.x - ca * radius;
+		float toy = a.y + sa * radius;
+		fz_addline(s, fromx, fromy, tox, toy);
+		fromx = tox;
+		fromy = toy;
+	}
+
+	/* close the polygon back at its starting point */
+	fz_addline(s, fromx, fromy, a.x - radius, a.y);
+}
+
+/*
+ * Finish the current open subpath: cap both of its ends if it has at least
+ * one segment, otherwise draw a dot if a zero length segment was seen.
+ */
+static void
+fz_strokeflush(struct sctx *s)
+{
+	if (s->sn == 2)
+	{
+		/* start cap points backwards, end cap forwards */
+		fz_linecap(s, s->beg[1], s->beg[0]);
+		fz_linecap(s, s->seg[0], s->seg[1]);
+		return;
+	}
+
+	if (s->dot)
+		fz_linedot(s, s->beg[0]);
+}
+
+/*
+ * Begin a new subpath at cur, after finishing whatever subpath was in
+ * progress.
+ */
+static void
+fz_strokemoveto(struct sctx *s, fz_point cur)
+{
+	fz_strokeflush(s);
+
+	/* cur is now both the first and the latest point seen */
+	s->beg[0] = cur;
+	s->bn = 1;
+	s->seg[0] = cur;
+	s->sn = 1;
+	s->dot = 0;
+}
+
+/*
+ * Extend the current subpath with a segment to cur: add the sides of the
+ * segment and, if a previous segment exists, the join between the two.
+ * A (nearly) zero length segment only sets the dot flag.
+ */
+static void
+fz_strokelineto(struct sctx *s, fz_point cur)
+{
+	fz_point prev = s->seg[s->sn-1];
+	float vx = cur.x - prev.x;
+	float vy = cur.y - prev.y;
+
+	if (vx * vx + vy * vy < FLT_EPSILON)
+	{
+		s->dot = 1;
+		return;
+	}
+
+	fz_linestroke(s, prev, cur);
+
+	if (s->sn == 2)
+	{
+		/* join with the previous segment, then slide the window */
+		fz_linejoin(s, s->seg[0], s->seg[1], cur);
+		s->seg[0] = s->seg[1];
+		s->seg[1] = cur;
+	}
+	else if (s->sn == 1)
+	{
+		s->seg[1] = cur;
+		s->sn = 2;
+	}
+
+	/* remember the second point of the subpath for caps and closing */
+	if (s->bn == 1)
+	{
+		s->beg[1] = cur;
+		s->bn = 2;
+	}
+}
+
+/*
+ * Close the current subpath: stroke back to its first point and join the
+ * closing segment with the first segment. A subpath without segments
+ * becomes a dot if a zero length segment was seen. The subpath state is
+ * then reset to start again from the first point.
+ */
+static void
+fz_strokeclosepath(struct sctx *s)
+{
+	if (s->sn == 2)
+	{
+		fz_point from;
+
+		fz_strokelineto(s, s->beg[0]);
+
+		/* pick the point preceding the start, depending on whether the
+		 * closing segment was actually added */
+		if (s->seg[1].x == s->beg[0].x && s->seg[1].y == s->beg[0].y)
+			from = s->seg[0];
+		else
+			from = s->seg[1];
+		fz_linejoin(s, from, s->beg[0], s->beg[1]);
+	}
+	else if (s->dot)
+	{
+		fz_linedot(s, s->beg[0]);
+	}
+
+	s->dot = 0;
+	s->sn = 1;
+	s->bn = 1;
+	s->seg[0] = s->beg[0];
+}
+
+/*
+ * Flatten the cubic bezier a-b-c-d by recursive midpoint subdivision and
+ * stroke each resulting segment with fz_strokelineto. Subdivision stops
+ * once both end legs of the control polygon are shorter than s->flatness
+ * on each axis, or when MAXDEPTH levels have been reached.
+ */
+static void
+fz_strokebezier(struct sctx *s,
+	float xa, float ya,
+	float xb, float yb,
+	float xc, float yc,
+	float xd, float yd, int depth)
+{
+	float span;
+
+	/* largest axis distance between an end point and its control point */
+	span = ABS(xa - xb);
+	span = MAX(span, ABS(ya - yb));
+	span = MAX(span, ABS(xd - xc));
+	span = MAX(span, ABS(yd - yc));
+
+	if (span < s->flatness || depth >= MAXDEPTH)
+	{
+		/* flat enough, or too deep: stroke straight to the end point */
+		fz_point end;
+		end.x = xd;
+		end.y = yd;
+		fz_strokelineto(s, end);
+	}
+	else
+	{
+		/* build plain sums first; they are scaled to averages below */
+		float abx = xa + xb, aby = ya + yb;
+		float bcx = xb + xc, bcy = yb + yc;
+		float cdx = xc + xd, cdy = yc + yd;
+		float abcx = abx + bcx, abcy = aby + bcy;
+		float bcdx = bcx + cdx, bcdy = bcy + cdy;
+		float midx = abcx + bcdx;
+		float midy = abcy + bcdy;
+
+		abx *= 0.5f; aby *= 0.5f;
+		bcx *= 0.5f; bcy *= 0.5f;
+		cdx *= 0.5f; cdy *= 0.5f;
+
+		abcx *= 0.25f; abcy *= 0.25f;
+		bcdx *= 0.25f; bcdy *= 0.25f;
+
+		midx *= 0.125f; midy *= 0.125f;
+
+		/* first half, then second half, split at the curve midpoint */
+		fz_strokebezier(s, xa, ya, abx, aby, abcx, abcy, midx, midy, depth + 1);
+		fz_strokebezier(s, midx, midy, bcdx, bcdy, cdx, cdy, xd, yd, depth + 1);
+	}
+}
+
+/*
+ * Generate the outline of a solid stroke of path into gel.
+ *
+ * stroke supplies the cap, join and miter limit settings; ctm is applied
+ * to every generated edge; flatness is the tolerance for curves and round
+ * caps/joins; linewidth is the full stroke width (half of it is used as
+ * the offset on each side of the path).
+ *
+ * The path must begin with a moveto; otherwise a warning is issued and
+ * nothing is generated.
+ */
+void
+fz_strokepath(fz_gel *gel, fz_path *path, fz_strokestate *stroke, fz_matrix ctm, float flatness, float linewidth)
+{
+	struct sctx s;
+	fz_point p0, p1, p2, p3, beg;
+	int i;
+
+	s.gel = gel;
+	s.ctm = &ctm;
+	s.flatness = flatness;
+
+	s.linecap = stroke->linecap;
+	s.linejoin = stroke->linejoin;
+	s.linewidth = linewidth * 0.5f; /* hairlines use a different value from the path value */
+	s.miterlimit = stroke->miterlimit;
+	s.sn = 0;
+	s.bn = 0;
+	s.dot = 0;
+
+	i = 0;
+
+	if (path->len > 0 && path->els[0].k != FZ_MOVETO)
+	{
+		fz_warn("assert: path must begin with moveto");
+		return;
+	}
+
+	/* p0 tracks the current point, beg the start of the current subpath */
+	p0.x = p0.y = 0;
+	beg = p0;
+
+	while (i < path->len)
+	{
+		switch (path->els[i++].k)
+		{
+		case FZ_MOVETO:
+			p1.x = path->els[i++].v;
+			p1.y = path->els[i++].v;
+			fz_strokemoveto(&s, p1);
+			beg = p0 = p1;
+			break;
+
+		case FZ_LINETO:
+			p1.x = path->els[i++].v;
+			p1.y = path->els[i++].v;
+			fz_strokelineto(&s, p1);
+			p0 = p1;
+			break;
+
+		case FZ_CURVETO:
+			p1.x = path->els[i++].v;
+			p1.y = path->els[i++].v;
+			p2.x = path->els[i++].v;
+			p2.y = path->els[i++].v;
+			p3.x = path->els[i++].v;
+			p3.y = path->els[i++].v;
+			fz_strokebezier(&s, p0.x, p0.y, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, 0);
+			p0 = p3;
+			break;
+
+		case FZ_CLOSEPATH:
+			fz_strokeclosepath(&s);
+			/* closing moves the current point back to the subpath
+			 * start, so a following curve must begin there */
+			p0 = beg;
+			break;
+		}
+	}
+
+	fz_strokeflush(&s);
+}
+
+/*
+ * Begin a new dashed subpath at a. The dash pattern is restarted and then
+ * advanced by s->dashphase; a stroke subpath is opened only if that leaves
+ * us inside a drawn dash.
+ */
+static void
+fz_dashmoveto(struct sctx *s, fz_point a)
+{
+	int on = 1;
+	int idx = 0;
+	float left = s->dashphase;
+
+	/* consume whole pattern entries covered by the starting phase */
+	while (left >= s->dashlist[idx])
+	{
+		on = !on;
+		left -= s->dashlist[idx];
+		idx ++;
+		if (idx == s->dashlen)
+			idx = 0;
+	}
+
+	s->toggle = on;
+	s->offset = idx;
+	s->phase = left;
+	s->cur = a;
+
+	if (on)
+		fz_strokemoveto(s, a);
+}
+
+/*
+ * Extend the dashed path with a straight segment from the current point to
+ * b, cutting it at every dash boundary: drawn pieces go to
+ * fz_strokelineto, and each gap ends with a fz_strokemoveto.
+ */
+static void
+fz_dashlineto(struct sctx *s, fz_point b)
+{
+	fz_point from = s->cur;
+	fz_point cut;
+	float vx = b.x - from.x;
+	float vy = b.y - from.y;
+	float length = sqrtf(vx * vx + vy * vy);
+	float done = 0;
+	float t;
+
+	for (;;)
+	{
+		/* what is left of the current pattern entry */
+		float step = s->dashlist[s->offset] - s->phase;
+
+		if (!(length - done > step))
+			break;
+
+		done += step;
+		t = done / length;
+		cut.x = from.x + t * vx;
+		cut.y = from.y + t * vy;
+
+		if (s->toggle)
+			fz_strokelineto(s, cut);
+		else
+			fz_strokemoveto(s, cut);
+
+		/* move on to the next pattern entry, wrapping around */
+		s->toggle = !s->toggle;
+		s->phase = 0;
+		s->offset ++;
+		if (s->offset == s->dashlen)
+			s->offset = 0;
+	}
+
+	/* the remainder of the segment lies within the current entry */
+	s->phase += length - done;
+
+	s->cur = b;
+
+	if (s->toggle)
+		fz_strokelineto(s, b);
+}
+
+/*
+ * Flatten the cubic bezier a-b-c-d by recursive midpoint subdivision and
+ * pass each resulting segment to fz_dashlineto. Subdivision stops once
+ * both end legs of the control polygon are shorter than s->flatness on
+ * each axis, or when MAXDEPTH levels have been reached.
+ */
+static void
+fz_dashbezier(struct sctx *s,
+	float xa, float ya,
+	float xb, float yb,
+	float xc, float yc,
+	float xd, float yd, int depth)
+{
+	float span;
+
+	/* largest axis distance between an end point and its control point */
+	span = ABS(xa - xb);
+	span = MAX(span, ABS(ya - yb));
+	span = MAX(span, ABS(xd - xc));
+	span = MAX(span, ABS(yd - yc));
+
+	if (span < s->flatness || depth >= MAXDEPTH)
+	{
+		/* flat enough, or too deep: dash straight to the end point */
+		fz_point end;
+		end.x = xd;
+		end.y = yd;
+		fz_dashlineto(s, end);
+	}
+	else
+	{
+		/* build plain sums first; they are scaled to averages below */
+		float abx = xa + xb, aby = ya + yb;
+		float bcx = xb + xc, bcy = yb + yc;
+		float cdx = xc + xd, cdy = yc + yd;
+		float abcx = abx + bcx, abcy = aby + bcy;
+		float bcdx = bcx + cdx, bcdy = bcy + cdy;
+		float midx = abcx + bcdx;
+		float midy = abcy + bcdy;
+
+		abx *= 0.5f; aby *= 0.5f;
+		bcx *= 0.5f; bcy *= 0.5f;
+		cdx *= 0.5f; cdy *= 0.5f;
+
+		abcx *= 0.25f; abcy *= 0.25f;
+		bcdx *= 0.25f; bcdy *= 0.25f;
+
+		midx *= 0.125f; midy *= 0.125f;
+
+		/* first half, then second half, split at the curve midpoint */
+		fz_dashbezier(s, xa, ya, abx, aby, abcx, abcy, midx, midy, depth + 1);
+		fz_dashbezier(s, midx, midy, bcdx, bcdy, cdx, cdy, xd, yd, depth + 1);
+	}
+}
+
+/*
+ * Generate the outline of a dashed stroke of path into gel.
+ *
+ * stroke supplies the cap, join, miter limit and dash pattern settings;
+ * ctm is applied to every generated edge; flatness is the tolerance for
+ * curves and round caps/joins; linewidth is the full stroke width.
+ *
+ * The path must begin with a moveto; otherwise a warning is issued and
+ * nothing is generated. A dash pattern that is empty or whose lengths do
+ * not add up to a positive value cannot be stepped through, so such paths
+ * are stroked solid instead.
+ */
+void
+fz_dashpath(fz_gel *gel, fz_path *path, fz_strokestate *stroke, fz_matrix ctm, float flatness, float linewidth)
+{
+	struct sctx s;
+	fz_point p0, p1, p2, p3, beg;
+	float dashtotal;
+	int i;
+
+	/* An all-zero pattern would make the dash stepping loops spin forever
+	 * because they would never consume any length. */
+	dashtotal = 0;
+	for (i = 0; i < stroke->dashlen; i++)
+		dashtotal += stroke->dashlist[i];
+	if (!(dashtotal > 0))
+	{
+		fz_strokepath(gel, path, stroke, ctm, flatness, linewidth);
+		return;
+	}
+
+	s.gel = gel;
+	s.ctm = &ctm;
+	s.flatness = flatness;
+
+	s.linecap = stroke->linecap;
+	s.linejoin = stroke->linejoin;
+	s.linewidth = linewidth * 0.5f;
+	s.miterlimit = stroke->miterlimit;
+	s.sn = 0;
+	s.bn = 0;
+	s.dot = 0;
+
+	s.dashlist = stroke->dashlist;
+	s.dashphase = stroke->dashphase;
+	s.dashlen = stroke->dashlen;
+	s.toggle = 0;
+	s.offset = 0;
+	s.phase = 0;
+
+	i = 0;
+
+	if (path->len > 0 && path->els[0].k != FZ_MOVETO)
+	{
+		fz_warn("assert: path must begin with moveto");
+		return;
+	}
+
+	/* p0 tracks the current point, beg the start of the current subpath */
+	p0.x = p0.y = 0;
+	beg = p0;
+
+	while (i < path->len)
+	{
+		switch (path->els[i++].k)
+		{
+		case FZ_MOVETO:
+			p1.x = path->els[i++].v;
+			p1.y = path->els[i++].v;
+			fz_dashmoveto(&s, p1);
+			beg = p0 = p1;
+			break;
+
+		case FZ_LINETO:
+			p1.x = path->els[i++].v;
+			p1.y = path->els[i++].v;
+			fz_dashlineto(&s, p1);
+			p0 = p1;
+			break;
+
+		case FZ_CURVETO:
+			p1.x = path->els[i++].v;
+			p1.y = path->els[i++].v;
+			p2.x = path->els[i++].v;
+			p2.y = path->els[i++].v;
+			p3.x = path->els[i++].v;
+			p3.y = path->els[i++].v;
+			fz_dashbezier(&s, p0.x, p0.y, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, 0);
+			p0 = p3;
+			break;
+
+		case FZ_CLOSEPATH:
+			/* a dashed close is just a dashed line back to the start */
+			fz_dashlineto(&s, beg);
+			p0 = p1 = beg;
+			break;
+		}
+	}
+
+	fz_strokeflush(&s);
+}
diff --git a/draw/draw_scale.c b/draw/draw_scale.c
new file mode 100644
index 00000000..6254e3e4
--- /dev/null
+++ b/draw/draw_scale.c
@@ -0,0 +1,1175 @@
+/*
+This code does smooth scaling of a pixmap.
+
+This function returns a new pixmap representing the area starting at (0,0)
+given by taking the source pixmap src, scaling it to width w, and height h,
+and then positioning it at (frac(x),frac(y)).
+*/
+
+#include "fitz.h"
+
+/* Do we special case handling of single pixel high/wide images? The
+ * 'purest' handling is given by not special casing them, but certain
+ * files that use such images 'stack' them to give full images. Not
+ * special casing them results in them being fainter and giving noticeable
+ * rounding errors.
+ */
+#define SINGLE_PIXEL_SPECIALS
+
+#ifdef DEBUG_SCALING
+#ifdef WIN32
+#include <windows.h>
+/* Format a debug message and send it to both the debugger and stdout. */
+static void debug_print(const char *fmt, ...)
+{
+	va_list args;
+	char text[256];
+	va_start(args, fmt);
+	/* Bounded formatting: _vsnprintf does not terminate the buffer when
+	 * the output is truncated, so terminate it explicitly. */
+	_vsnprintf(text, sizeof(text) - 1, fmt, args);
+	text[sizeof(text) - 1] = 0;
+	va_end(args);
+	OutputDebugStringA(text);
+	/* text is already formatted; it must not be used as a format string */
+	fputs(text, stdout);
+}
+#else
+/* Format a debug message and write it to stderr. */
+static void debug_print(const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+}
+#endif
+#endif
+/* DBUG((fmt, ...)) prints only when DEBUG_SCALING is defined; note the
+ * double parentheses. */
+#ifdef DEBUG_SCALING
+#define DBUG(A) debug_print A
+#else
+#define DBUG(A) do {} while(0==1)
+#endif
+
+/*
+Consider a row of source samples, src, of width src_w, positioned at x,
+scaled to width dst_w.
+
+src[i] is centred at: x + (i + 0.5)*dst_w/src_w
+
+Therefore the distance between the centre of the jth output pixel and
+the centre of the ith source sample is:
+
+dist[j,i] = j + 0.5 - (x + (i + 0.5)*dst_w/src_w)
+
+When scaling up, therefore:
+
+dst[j] = SUM(filter(dist[j,i]) * src[i])
+	(for all ints i)
+
+This can be simplified by noticing that filters are only non zero within
+a given filter width (henceforth called W). So:
+
+dst[j] = SUM(filter(dist[j,i]) * src[i])
+	(for ints i, s.t. (j*src_w/dst_w)-W < i < (j*src_w/dst_w)+W)
+
+When scaling down, each filtered source sample is stretched to be wider
+to avoid aliasing issues. This effectively reduces the distance between
+centres.
+
+dst[j] = SUM(filter(dist[j,i] * F) * F * src[i])
+	(where F = dst_w/src_w)
+	(for ints i, s.t. (j-W)/F < i < (j+W)/F)
+
+*/
+
+typedef struct fz_scalefilter_s fz_scalefilter;
+
+/* Description of a reconstruction filter used for scaling. */
+struct fz_scalefilter_s
+{
+	/* distance beyond which the filter is zero (W in the text above) */
+	int width;
+	/* filter value at a given distance; callers pass a non-negative one */
+	float (*fn)(fz_scalefilter *, float);
+};
+
+/* Image scale filters */
+
+/* Triangle (linear) filter: falls from 1 at distance 0 to 0 at distance 1. */
+static float
+triangle(fz_scalefilter *filter, float f)
+{
+	return (f >= 1) ? 0 : 1-f;
+}
+
+/* Box filter: 1 for distances below one half, 0 from there on. */
+static float
+box(fz_scalefilter *filter, float f)
+{
+	return (f >= 0.5f) ? 0 : 1;
+}
+
+/* Cubic filter 2x^3 - 3x^2 + 1 for distances below 1, 0 from there on. */
+static float
+simple(fz_scalefilter *filter, float x)
+{
+	if (x < 1)
+		return 1 + (2*x - 3)*x*x;
+	return 0;
+}
+
+/*
+ * Lanczos filter with two lobes: sinc(x) * sinc(x/2) for distances below
+ * 2, and 0 from there on. x is expected to be non-negative.
+ */
+static float
+lanczos2(fz_scalefilter *filter, float x)
+{
+	if (x >= 2)
+		return 0;
+	/* the kernel is 1 at the centre; the quotient below would be 0/0 */
+	if (x == 0)
+		return 1;
+	return sinf(M_PI*x) * sinf(M_PI*x/2) / (M_PI*x) / (M_PI*x/2);
+}
+
+/*
+ * Lanczos filter with three lobes: sinc(f) * sinc(f/3) for distances below
+ * 3, and 0 from there on. f is expected to be non-negative.
+ */
+static float
+lanczos3(fz_scalefilter *filter, float f)
+{
+	if (f >= 3)
+		return 0;
+	/* the kernel is 1 at the centre; the quotient below would be 0/0 */
+	if (f == 0)
+		return 1;
+	return sinf(M_PI*f) * sinf(M_PI*f/3) / (M_PI*f) / (M_PI*f/3);
+}
+
+/*
+The Mitchell family of filters is defined:
+
+	f(x) =	1 { (12-9B-6C)x^3 + (-18+12B+6C)x^2 + (6-2B)	for x < 1
+		- {
+		6 { (-B-6C)x^3+(6B+30C)x^2+(-12B-48C)x+(8B+24C)	for 1<=x<=2
+
+The 'best' ones lie along the line B+2C = 1.
+The literature suggests that B=1/3, C=1/3 is best.
+
+	f(x) =	1 { (12-3-2)x^3 + (-18+4+2)x^2 + (16/3)	for x < 1
+		- {
+		6 { (-7/3)x^3 + 12x^2 - 20x + (32/3)	for 1<=x<=2
+
+	f(x) =	1 { 21x^3 - 36x^2 + 16			for x < 1
+		- {
+		18{ -7x^3 + 36x^2 - 60x + 32		for 1<=x<=2
+*/
+
+/*
+ * Mitchell filter with B = C = 1/3, using the two cubic pieces derived in
+ * the comment above; 0 for distances of 2 or more.
+ */
+static float
+mitchell(fz_scalefilter *filter, float x)
+{
+	if (x < 1)
+		return (16 + x*x*(-36 + 21*x))/18;
+	if (x < 2)
+		return (32 + x*(-60 + x*(36 - 7*x)))/18;
+	return 0;
+}
+
+/* Available scaling filters: { filter width, filter function }. */
+fz_scalefilter fz_scalefilter_box = { 1, box };
+fz_scalefilter fz_scalefilter_triangle = { 1, triangle };
+fz_scalefilter fz_scalefilter_simple = { 1, simple };
+fz_scalefilter fz_scalefilter_lanczos2 = { 2, lanczos2 };
+fz_scalefilter fz_scalefilter_lanczos3 = { 3, lanczos3 };
+fz_scalefilter fz_scalefilter_mitchell = { 2, mitchell };
+
+/*
+We build ourselves a set of tables to contain the precalculated weights
+for a given set of scale settings.
+
+The first dst_w entries of index hold, for each destination pixel, the
+position within index at which that pixel's set of weights starts.
+
+Each of the sets of weights is a set of values consisting of:
+	the minimum source pixel index used for this destination pixel
+	the number of weights used for this destination pixel
+	the weights themselves
+
+So to calculate dst[i] we do the following:
+
+	weights = &index[index[i]];
+	min = *weights++;
+	len = *weights++;
+	dst[i] = 0;
+	while (len-- > 0)
+		dst[i] += src[min++] * *weights++
+
+In addition, we guarantee that at the end of this process weights will
+point to the set of weights for dst pixel i+1.
+
+In the simplest version of this algorithm, we would scale the whole image
+horizontally first into a temporary buffer, then scale that temporary
+buffer again vertically to give us our result. Using such a simple
+algorithm would mean that we could use the same style of weights for both
+horizontal and vertical scaling.
+
+Unfortunately, this would also require a large temporary buffer,
+particularly in the case where we are scaling up.
+
+We therefore modify the algorithm as follows; we scale scanlines from the
+source image horizontally into a temporary buffer, until we have all the
+contributors for a given output scanline. We then produce that output
+scanline from the temporary buffer. In this way we restrict the height
+of the temporary buffer to a small fraction of the final size.
+
+Unfortunately, this means that the pseudo code for recombining a
+scanline of fully scaled pixels is as follows:
+
+	weights = &index[index[y]];
+	min = *weights++;
+	len = *weights++;
+	for (x=0 to dst_w)
+		min2 = min
+		len2 = len
+		weights2 = weights
+		dst[x] = 0;
+		while (len2-- > 0)
+			dst[x] += temp[x][(min2++) % tmp_buf_height] * *weights2++
+
+i.e. it requires a % operation for every source pixel - this is typically
+expensive.
+
+To avoid this, we alter the order in which vertical weights are stored,
+so that they are ordered in the same order as the temporary buffer lines
+would appear. This simplifies the algorithm to:
+
+	weights = &index[index[y]];
+	min = *weights++;
+	len = *weights++;
+	for (x=0 to dst_w)
+		min2 = 0
+		len2 = len
+		weights2 = weights
+		dst[x] = 0;
+		while (len2-- > 0)
+			dst[x] += temp[x][min2++] * *weights2++
+
+This means that len may be larger than it needs to be (due to the
+possible inclusion of a zero weight row or two), but in practice this
+is only an increase of 1 or 2 at worst.
+
+We implement this by generating the weights as normal (but ensuring we
+leave enough space) and then reordering afterwards.
+
+*/
+
+typedef struct fz_weights_s fz_weights;
+
+/* Precalculated weight table for scaling along one axis (see above). */
+struct fz_weights_s
+{
+	int count;	/* number of destination rows set up so far */
+	int max_len;	/* upper bound on the weights per destination pixel */
+	int n;		/* components per pixel, as given to newweights */
+	int flip;	/* nonzero to write the output in reverse order */
+	int new_line;	/* set until the first weight of a row is added */
+	/* row offsets followed by the sets of weights; allocated larger
+	 * than declared by newweights */
+	int index[1];
+};
+
+/*
+ * Allocate an empty weight table for scaling src_w source samples to
+ * dst_w (dst_w_i whole) destination samples with the given filter.
+ * Returns NULL if the allocation fails.
+ */
+static fz_weights *
+newweights(fz_scalefilter *filter, int src_w, float dst_w, int dst_w_i, int n, int flip)
+{
+	fz_weights *table;
+	/* Scaling up: at most 2*filterwidth src pixels contribute to each
+	 * dst pixel. */
+	int most = 2 * filter->width;
+
+	if (src_w > dst_w)
+	{
+		/* Scaling down: at most 2*filterwidth*src_w/dst_w src pixels
+		 * contribute to each dst pixel, and never more than exist. */
+		most = (int)ceilf((2 * filter->width * src_w)/dst_w);
+		if (most > src_w)
+			most = src_w;
+	}
+
+	/* Room for the struct itself, one index entry per dst pixel, min and
+	 * len plus up to 'most' weights per dst pixel, and one spare set of
+	 * weights used while reordering. */
+	table = fz_malloc(sizeof(*table)+(most+3)*(dst_w_i+1)*sizeof(int));
+	if (table == NULL)
+		return NULL;
+
+	table->count = -1;
+	table->max_len = most;
+	table->n = n;
+	table->flip = flip;
+	/* the first set of weights starts right after the row offsets */
+	table->index[0] = dst_w_i;
+	return table;
+}
+
+/*
+ * Start the set of weights for destination pixel j. Rows must be started
+ * in order; the new set is placed directly after the previous one.
+ */
+static void
+init_weights(fz_weights *weights, int j)
+{
+	int *tab = weights->index;
+	int start;
+
+	assert(weights->count == j-1);
+	weights->count++;
+	weights->new_line = 1;
+
+	if (j != 0)
+	{
+		/* skip over min, len and the weights of the previous row */
+		start = tab[j-1];
+		start += 2 + tab[start+1];
+	}
+	else
+	{
+		start = tab[0];
+	}
+
+	tab[j] = start; /* row pointer */
+	tab[start] = 0; /* min */
+	tab[start+1] = 0; /* len */
+}
+
+/*
+ * Work out how much source sample i contributes to destination sample j
+ * and record that weight in row j of the table.
+ *
+ * x is the offset of the scaled image, F scales the filter result, G
+ * scales the distance handed to the filter; src_w and dst_w are the source
+ * and destination sizes. Weights are stored as integers, with 256
+ * standing for a filter value of 1.
+ */
+static void
+add_weight(fz_weights *weights, int j, int i, fz_scalefilter *filter,
+	float x, float F, float G, int src_w, float dst_w)
+{
+	/* distance between the centres of dst[j] and src[i] */
+	float dist = j - x + 0.5f - ((i + 0.5f)*dst_w/src_w);
+	float f;
+	int min, len, index, weight;
+
+	dist *= G;
+	if (dist < 0)
+		dist = -dist;
+	f = filter->fn(filter, dist)*F;
+	/* round to the nearest 1/256th; zero weights are not recorded */
+	weight = (int)(256*f+0.5f);
+	if (weight == 0)
+		return;
+
+	/* wrap i back into range */
+#ifdef MIRROR_WRAP
+	/* reflect out of range indices back into the source */
+	do
+	{
+		if (i < 0)
+			i = -1-i;
+		else if (i >= src_w)
+			i = 2*src_w-1-i;
+		else
+			break;
+	}
+	while (1);
+#elif defined(WRAP)
+	/* clamp out of range indices to the nearest edge sample */
+	if (i < 0)
+		i = 0;
+	else if (i >= src_w)
+		i = src_w-1;
+#else
+	/* clamp out of range indices to the edge, but give them no weight */
+	if (i < 0)
+	{
+		i = 0;
+		weight = 0;
+	}
+	else if (i >= src_w)
+	{
+		i = src_w-1;
+		weight = 0;
+	}
+#endif
+
+	DBUG(("add_weight[%d][%d] = %d(%g) dist=%g\n",j,i,weight,f,dist));
+
+	if (weights->new_line)
+	{
+		/* New line */
+		weights->new_line = 0;
+		index = weights->index[j]; /* row pointer */
+		weights->index[index] = i; /* min */
+		weights->index[index+1] = 0; /* len */
+	}
+	/* after these reads, index refers to the first weight of the row */
+	index = weights->index[j];
+	min = weights->index[index++];
+	len = weights->index[index++];
+	while (i < min)
+	{
+		/* This only happens in rare cases, but we need to insert
+		 * one earlier. In exceedingly rare cases we may need to
+		 * insert more than one earlier. */
+		int k;
+
+		/* shift the existing weights up by one to make room */
+		for (k = len; k > 0; k--)
+		{
+			weights->index[index+k] = weights->index[index+k-1];
+		}
+		weights->index[index] = 0;
+		min--;
+		len++;
+		weights->index[index-2] = min;
+		weights->index[index-1] = len;
+	}
+	if (i-min >= len)
+	{
+		/* The usual case */
+		/* extend the row up to i, zero filling any gap */
+		while (i-min >= ++len)
+		{
+			weights->index[index+len-1] = 0;
+		}
+		assert(len-1 == i-min);
+		weights->index[index+i-min] = weight;
+		weights->index[index-1] = len;
+		assert(len <= weights->max_len);
+	}
+	else
+	{
+		/* Infrequent case */
+		/* a slot for i already exists: accumulate into it */
+		weights->index[index+i-min] += weight;
+	}
+}
+
+/*
+ * Rearrange the weights of row j so that the weight for source line k is
+ * stored at position k % max_len, padding the row out to max_len entries
+ * with zero weights. The area directly after the row is used as scratch
+ * space.
+ */
+static void
+reorder_weights(fz_weights *weights, int j, int src_w)
+{
+	int idx = weights->index[j];
+	int min = weights->index[idx++];
+	int len = weights->index[idx++];
+	int max = weights->max_len;
+	int tmp = idx+max; /* scratch copy lives just past a full row */
+	int i, off;
+
+	/* Copy into the temporary area */
+	memcpy(&weights->index[tmp], &weights->index[idx], sizeof(int)*len);
+
+	/* Pad out if required */
+	assert(len <= max);
+	assert(min+len <= src_w);
+	off = 0;
+	if (len < max)
+	{
+		memset(&weights->index[tmp+len], 0, sizeof(int)*(max-len));
+		len = max;
+		if (min + len > src_w)
+		{
+			/* the padded row would run past the last source line:
+			 * pull min back and remember by how much in off */
+			off = min + len - src_w;
+			min = src_w - len;
+			weights->index[idx-2] = min;
+		}
+		weights->index[idx-1] = len;
+	}
+
+	/* Copy back into the proper places */
+	for (i = 0; i < len; i++)
+	{
+		weights->index[idx+((min+i+off) % max)] = weights->index[tmp+i];
+	}
+}
+
+/*
+ * Normalise the weights of row j so that they add up to 256, by adjusting
+ * the largest weight. The first and last rows (j == 0 and j == w-1) are
+ * only corrected when their sum exceeds 256.
+ */
+static void
+check_weights(fz_weights *weights, int j, int w)
+{
+	int idx, len;
+	int sum = 0;
+	int max = -256;
+	int maxidx = 0;
+	int i;
+
+	idx = weights->index[j];
+	idx++; /* min */
+	len = weights->index[idx++];
+
+	for(i=0; i < len; i++)
+	{
+		int v = weights->index[idx++];
+		sum += v;
+		if (v > max)
+		{
+			max = v;
+			maxidx = idx; /* one past the largest weight */
+		}
+	}
+	/* maxidx stays 0 when the row holds no usable weight (for example
+	 * when every contribution rounded to zero); there is then nothing
+	 * to adjust, and index[maxidx-1] would lie before the array. */
+	if (maxidx != 0 && (((j != 0) && (j != w-1)) || (sum > 256)))
+		weights->index[maxidx-1] += 256-sum;
+	DBUG(("total weight %d = %d\n", j, sum));
+}
+
+/*
+ * Build the complete weight table for scaling src_w source samples,
+ * positioned at offset x, to dst_w (dst_w_int whole) destination samples.
+ * When vertical is nonzero each row is reordered for use with the
+ * temporary line buffer. Returns NULL if the table cannot be allocated.
+ */
+static fz_weights *
+make_weights(int src_w, float x, float dst_w, fz_scalefilter *filter, int vertical, int dst_w_int, int n, int flip)
+{
+	fz_weights *table;
+	float F = 1;
+	float G = 1;
+	float window;
+	int j;
+
+	if (dst_w < src_w)
+	{
+		/* Scaling down: widen the filter and scale its result */
+		F = dst_w / src_w;
+	}
+	else
+	{
+		/* Scaling up: measure distances in source samples */
+		G = src_w / dst_w;
+	}
+	window = filter->width / F;
+	DBUG(("make_weights src_w=%d x=%g dst_w=%g dst_w_int=%d F=%g window=%g\n", src_w, x, dst_w, dst_w_int, F, window));
+
+	table = newweights(filter, src_w, dst_w, dst_w_int, n, flip);
+	if (table == NULL)
+		return NULL;
+
+	for (j = 0; j < dst_w_int; j++)
+	{
+		/* find the position of the centre of dst[j] in src space */
+		float centre = (j - x + 0.5f)*src_w/dst_w - 0.5f;
+		/* source samples within the filter window around the centre */
+		int first = ceilf(centre - window);
+		int last = floorf(centre + window);
+		int i;
+
+		DBUG(("%d: centre=%g l=%d r=%d\n", j, centre, first, last));
+		init_weights(table, j);
+		for (i = first; i <= last; i++)
+			add_weight(table, j, i, filter, x, F, G, src_w, dst_w);
+		check_weights(table, j, dst_w_int);
+		if (vertical)
+			reorder_weights(table, j, src_w);
+	}
+	table->count++; /* table->count = dst_w_int now */
+	return table;
+}
+
+/*
+ * Scale one row of source samples into dst using the given weights, for
+ * any number of components per pixel. Each output component is the
+ * weighted sum of the contributing source components (not yet divided by
+ * 256). With weights->flip set, the pixels are written in reverse order.
+ */
+static void
+scale_row_to_temp(int *dst, unsigned char *src, fz_weights *weights)
+{
+	int *contrib = &weights->index[weights->index[0]];
+	int comps = weights->n;
+	int advance = comps;
+	int left, taps, c;
+	unsigned char *sp;
+
+	if (weights->flip)
+	{
+		/* start at the last pixel and walk backwards */
+		dst += (weights->count-1)*comps;
+		advance = -comps;
+	}
+
+	for (left = weights->count; left > 0; left--)
+	{
+		sp = &src[comps * *contrib++];
+		taps = *contrib++;
+
+		for (c = 0; c < comps; c++)
+			dst[c] = 0;
+
+		for (; taps > 0; taps--)
+		{
+			int wt = *contrib++;
+			for (c = 0; c < comps; c++)
+				dst[c] += *sp++ * wt;
+		}
+
+		dst += advance;
+	}
+}
+
+/*
+ * Scale one row of single component source samples into dst using the
+ * given weights. With weights->flip set, the pixels are written in
+ * reverse order.
+ */
+static void
+scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights)
+{
+	int *contrib = &weights->index[weights->index[0]];
+	int total = weights->count;
+	int out, advance, left, taps;
+	unsigned char *sp;
+
+	assert(weights->n == 1);
+
+	/* output position of the first pixel produced, and the direction */
+	out = weights->flip ? total - 1 : 0;
+	advance = weights->flip ? -1 : 1;
+
+	for (left = total; left > 0; left--)
+	{
+		int acc = 0;
+
+		sp = &src[*contrib++];
+		taps = *contrib++;
+		for (; taps > 0; taps--)
+			acc += *sp++ * *contrib++;
+
+		dst[out] = acc;
+		out += advance;
+	}
+}
+
+/*
+ * Scale one row of two component source samples into dst using the given
+ * weights. With weights->flip set, the pixels are written in reverse
+ * order (the components within a pixel keep their order).
+ */
+static void
+scale_row_to_temp2(int *dst, unsigned char *src, fz_weights *weights)
+{
+	int *contrib = &weights->index[weights->index[0]];
+	int total = weights->count;
+	int out, advance, left, taps;
+	unsigned char *sp;
+
+	assert(weights->n == 2);
+
+	/* output pixel of the first result produced, and the direction */
+	out = weights->flip ? total - 1 : 0;
+	advance = weights->flip ? -1 : 1;
+
+	for (left = total; left > 0; left--)
+	{
+		int first = 0;
+		int second = 0;
+
+		sp = &src[2 * *contrib++];
+		taps = *contrib++;
+		for (; taps > 0; taps--)
+		{
+			int wt = *contrib++;
+			first += *sp++ * wt;
+			second += *sp++ * wt;
+		}
+
+		dst[2*out] = first;
+		dst[2*out+1] = second;
+		out += advance;
+	}
+}
+
+static void
+scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
+{
+	int *contrib = &weights->index[weights->index[0]];
+#ifndef ARCH_ARM
+	int len, i;
+	unsigned char *min;
+#endif
+
+	assert(weights->n == 4);
+	if (weights->flip)
+	{
+		dst += 4*weights->count;
+#ifdef ARCH_ARM
+		asm volatile(
+		"1:"
+		"ldr	r4, [%2], #4		@ r4 = *contrib++	\n"
+		"ldr	r9, [%2], #4		@ r9 = len = *contrib++	\n"
+		"mov	r5, #0			@ r5 = r = 0		\n"
+		"mov	r6, #0			@ r6 = g = 0		\n"
+		"mov	r7, #0			@ r7 = b = 0		\n"
+		"mov	r8, #0			@ r8 = a = 0		\n"
+		"add	r4, %1, r4, LSL #2	@ r4 = min = &src[4*r4]	\n"
+		"cmp	r9, #0			@ while (len-- > 0)	\n"
+		"beq	3f			@ {			\n"
+		"2:							\n"
+		"ldr	r10,[%2], #4		@ r10 = *contrib++	\n"
+		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
+		"ldrb	r12,[r4], #1		@ r12 = *min++		\n"
+		"ldrb	r14,[r4], #1		@ r14 = *min++		\n"
+		"mla	r5, r10,r11,r5		@ r += r11 * r10	\n"
+		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
+		"mla	r6, r10,r12,r6		@ g += r12 * r10	\n"
+		"mla	r7, r10,r14,r7		@ b += r14 * r10	\n"
+		"mla	r8, r10,r11,r8		@ a += r11 * r10	\n"
+		"subs	r9, r9, #1		@ r9 = len--		\n"
+		"bgt	2b			@ }			\n"
+		"stmdb	%0!,{r5,r6,r7,r8}	@ *--dst=a;*--dst=b;	\n"
+		"3:				@ *--dst=g;*--dst=r;	\n"
+		"subs	%3, %3, #1		@ i--			\n"
+		"bgt	1b			@ 			\n"
+		:
+		:
+		"r" (dst),
+		"r" (src),
+		"r" (contrib),
+		"r" (weights->count)
+		:
+		"r4","r5","r6","r7","r8","r9","r10","r11","r12","r14",
+		"memory","cc"
+		);
+#else
+		for (i=weights->count; i > 0; i--)
+		{
+			int r = 0;
+			int g = 0;
+			int b = 0;
+			int a = 0;
+			min = &src[4 * *contrib++];
+			len = *contrib++;
+			while (len-- > 0)
+			{
+				r += *min++ * *contrib;
+				g += *min++ * *contrib;
+				b += *min++ * *contrib;
+				a += *min++ * *contrib++;
+			}
+			*--dst = a;
+			*--dst = b;
+			*--dst = g;
+			*--dst = r;
+		}
+#endif
+	}
+	else
+	{
+#ifdef ARCH_ARM
+		asm volatile(
+		"1:"
+		"ldr	r4, [%2], #4		@ r4 = *contrib++	\n"
+		"ldr	r9, [%2], #4		@ r9 = len = *contrib++	\n"
+		"mov	r5, #0			@ r5 = r = 0		\n"
+		"mov	r6, #0			@ r6 = g = 0		\n"
+		"mov	r7, #0			@ r7 = b = 0		\n"
+		"mov	r8, #0			@ r8 = a = 0		\n"
+		"add	r4, %1, r4, LSL #2	@ r4 = min = &src[4*r4]	\n"
+		"cmp	r9, #0			@ while (len-- > 0)	\n"
+		"beq	3f			@ {			\n"
+		"2:							\n"
+		"ldr	r10,[%2], #4		@ r10 = *contrib++	\n"
+		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
+		"ldrb	r12,[r4], #1		@ r12 = *min++		\n"
+		"ldrb	r14,[r4], #1		@ r14 = *min++		\n"
+		"mla	r5, r10,r11,r5		@ r += r11 * r10	\n"
+		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
+		"mla	r6, r10,r12,r6		@ g += r12 * r10	\n"
+		"mla	r7, r10,r14,r7		@ b += r14 * r10	\n"
+		"mla	r8, r10,r11,r8		@ a += r11 * r10	\n"
+		"subs	r9, r9, #1		@ r9 = len--		\n"
+		"bgt	2b			@ }			\n"
+		"stmia	%0!,{r5,r6,r7,r8}	@ *dst++=r;*dst++=g;	\n"
+		"3:				@ *dst++=b;*dst++=a;	\n"
+		"subs	%3, %3, #1		@ i--			\n"
+		"bgt	1b			@ 			\n"
+		:
+		:
+		"r" (dst),
+		"r" (src),
+		"r" (contrib),
+		"r" (weights->count)
+		:
+		"r4","r5","r6","r7","r8","r9","r10","r11","r12","r14",
+		"memory","cc"
+		);
+#else
+		for (i=weights->count; i > 0; i--)
+		{
+			int r = 0;
+			int g = 0;
+			int b = 0;
+			int a = 0;
+			min = &src[4 * *contrib++];
+			len = *contrib++;
+			while (len-- > 0)
+			{
+				r += *min++ * *contrib;
+				g += *min++ * *contrib;
+				b += *min++ * *contrib;
+				a += *min++ * *contrib++;
+			}
+			*dst++ = r;
+			*dst++ = g;
+			*dst++ = b;
+			*dst++ = a;
+		}
+#endif
+	}
+}
+
+static void
+scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width, int row)
+{ /* Combine rows of the int temp buffer vertically into one row of 8-bit output samples. */
+	int *contrib = &weights->index[weights->index[row]]; /* weight record for output row 'row' */
+	int len, x;
+
+	contrib++; /* Skip min: reading always starts at the first row of src */
+	len = *contrib++;
+	for (x=width; x > 0; x--) /* width counts samples (ints) per temp row, not pixels */
+	{
+		int *min = src;
+		int val = 0;
+		int len2 = len;
+		int *contrib2 = contrib; /* every column reuses the same len weights */
+
+		while (len2-- > 0)
+		{
+			val += *min * *contrib2++;
+			min += width; /* same column, next temp row */
+		}
+		val = (val+(1<<15))>>16; /* round and drop 16 fraction bits */
+		if (val < 0)
+			val = 0;
+		else if (val > 255)
+			val = 255;
+		*dst++ = val;
+		src++; /* next column */
+	}
+}
+
+#ifdef SINGLE_PIXEL_SPECIALS
+static void
+duplicate_single_pixel(unsigned char *dst, unsigned char *src, int n, int w, int h)
+{ /* Fill a w*h block of n-byte pixels in dst with the single pixel at src. */
+	int i;
+
+	for (i = n; i > 0; i--) /* copy the first pixel */
+		*dst++ = *src++;
+	for (i = (w*h-1)*n; i > 0; i--) /* remaining bytes of the block */
+	{
+		*dst = dst[-n]; /* same component of the previous pixel */
+		dst++;
+	}
+}
+
+static void
+scale_single_row(unsigned char *dst, unsigned char *src, fz_weights *weights, int src_w, int h)
+{ /* Source is one pixel high: filter that row once, then copy it to the other h-1 output rows. */
+	int *contrib = &weights->index[weights->index[0]];
+	int min, len, i, j, val, n;
+	int tmp[FZ_MAXCOLORS];
+
+	n = weights->n;
+	/* Scale a single row */
+	if (weights->flip)
+	{
+		dst += (weights->count-1)*n; /* begin at the last output pixel */
+		for (i=weights->count; i > 0; i--)
+		{
+			min = *contrib++;
+			len = *contrib++;
+			min *= n; /* pixel index -> byte offset */
+			for (j = 0; j < n; j++)
+				tmp[j] = 0;
+			while (len-- > 0)
+			{
+				for (j = 0; j < n; j++)
+					tmp[j] += src[min++] * *contrib;
+				contrib++;
+			}
+			for (j = 0; j < n; j++)
+			{
+				val = (tmp[j]+(1<<7))>>8; /* round and drop 8 fraction bits */
+				if (val < 0)
+					val = 0;
+				else if (val > 255)
+					val = 255;
+				*dst++ = val;
+			}
+			dst -= 2*n; /* back over this pixel and on to the previous one */
+		}
+		dst += n * (weights->count+1); /* loop ends one pixel before the row; move to the start of the second row */
+	}
+	else
+	{
+		for (i=weights->count; i > 0; i--)
+		{
+			min = *contrib++;
+			len = *contrib++;
+			min *= n; /* pixel index -> byte offset */
+			for (j = 0; j < n; j++)
+				tmp[j] = 0;
+			while (len-- > 0)
+			{
+				for (j = 0; j < n; j++)
+					tmp[j] += src[min++] * *contrib;
+				contrib++;
+			}
+			for (j = 0; j < n; j++)
+			{
+				val = (tmp[j]+(1<<7))>>8; /* round and drop 8 fraction bits */
+				if (val < 0)
+					val = 0;
+				else if (val > 255)
+					val = 255;
+				*dst++ = val;
+			}
+		}
+	}
+	/* And then duplicate it h times */
+	n *= weights->count; /* n is now the byte length of one output row */
+	while (--h > 0)
+	{
+		memcpy(dst, dst-n, n); /* copy the row just above */
+		dst += n;
+	}
+}
+
+static void
+scale_single_col(unsigned char *dst, unsigned char *src, fz_weights *weights, int src_w, int n, int w, int flip_y)
+{ /* Source is one pixel wide: filter down the column and repeat each result across its output row. */
+	int *contrib = &weights->index[weights->index[0]];
+	int min, len, i, j, val;
+	int tmp[FZ_MAXCOLORS];
+
+	if (flip_y)
+	{
+		src_w = (src_w-1)*n; /* byte offset of the last pixel in the source column */
+		w = (w-1)*n; /* bytes left to fill after the first pixel of a row */
+		for (i=weights->count; i > 0; i--)
+		{
+			/* Scale the next pixel in the column */
+			min = *contrib++;
+			len = *contrib++;
+			min = src_w-min*n; /* pixel 'min' counted back from the end of the column */
+			for (j = 0; j < n; j++)
+				tmp[j] = 0;
+			for (; len > 0; len--, contrib++)
+			{
+				for (j = 0; j < n; j++)
+					tmp[j] += src[min+j] * *contrib;
+				min -= n; /* flipped: the next contribution is the pixel before this one */
+			}
+			for (j = 0; j < n; j++)
+			{
+				val = (tmp[j]+(1<<7))>>8; /* round and drop 8 fraction bits */
+				if (val < 0)
+					val = 0;
+				else if (val > 255)
+					val = 255;
+				*dst++ = val;
+			}
+			/* And then duplicate it across the row */
+			for (j = w; j > 0; j--)
+			{
+				*dst = dst[-n];
+				dst++;
+			}
+		}
+	}
+	else
+	{
+		w = (w-1)*n; /* bytes left to fill after the first pixel of a row */
+		for (i=weights->count; i > 0; i--)
+		{
+			/* Scale the next pixel in the column */
+			min = *contrib++;
+			len = *contrib++;
+			min *= n; /* pixel index -> byte offset */
+			for (j = 0; j < n; j++)
+				tmp[j] = 0;
+			while (len-- > 0)
+			{
+				for (j = 0; j < n; j++)
+					tmp[j] += src[min++] * *contrib;
+				contrib++;
+			}
+			for (j = 0; j < n; j++)
+			{
+				val = (tmp[j]+(1<<7))>>8; /* round and drop 8 fraction bits */
+				if (val < 0)
+					val = 0;
+				else if (val > 255)
+					val = 255;
+				*dst++ = val;
+			}
+			/* And then duplicate it across the row */
+			for (j = w; j > 0; j--)
+			{
+				*dst = dst[-n];
+				dst++;
+			}
+		}
+	}
+}
+#endif /* SINGLE_PIXEL_SPECIALS */
+
+fz_pixmap *
+fz_scalepixmap(fz_pixmap *src, float x, float y, float w, float h)
+{
+	fz_scalefilter *filter = &fz_scalefilter_simple;
+	fz_weights *contrib_rows = NULL;
+	fz_weights *contrib_cols = NULL;
+	fz_pixmap *output = NULL;
+	int *temp = NULL;
+	int max_row, temp_span, temp_rows, row;
+	int dst_w_int, dst_h_int, dst_x_int, dst_y_int;
+	int flip_x, flip_y;
+
+	DBUG(("Scale: (%d,%d) to (%g,%g) at (%g,%g)\n",src->w,src->h,w,h,x,y));
+
+	/* Find the destination bbox, width/height, and sub pixel offset,
+	 * allowing for whether we're flipping or not. */
+	/* Note that the x and y sub pixel offsets here are different.
+	 * The (x,y) position given describes where the bottom left corner
+	 * of the source image should be mapped to (i.e. where (0,h) in image
+	 * space ends up, not the more logical and sane (0,0)). Also there
+	 * are differences in the way we scale horizontally and vertically.
+	 * When scaling rows horizontally, we always read forwards through
+	 * the source, and store either forwards or in reverse as required.
+	 * When scaling vertically, we always store out forwards, but may
+	 * feed source rows in in a different order.
+	 *
+	 * Consider the image rectangle 'r' to which the image is mapped,
+	 * and the (possibly) larger rectangle 'R', given by expanding 'r' to
+	 * complete pixels.
+	 *
+	 * x can either be r.xmin-R.xmin or R.xmax-r.xmax depending on whether
+	 * the image is x flipped or not. Whatever happens 0 <= x < 1.
+	 * y is always R.ymax - r.ymax.
+	 */
+	/* dst_x_int is calculated to be the left of the scaled image, and
+	 * x (the sub_pixel_offset) is the distance in from either the left
+	 * or right pixel expanded edge. */
+	flip_x = (w < 0);
+	if (flip_x)
+	{
+		float tmp;
+		w = -w;
+		dst_x_int = floor(x-w);
+		tmp = ceilf(x);
+		dst_w_int = (int)tmp;
+		x = tmp - x;
+		dst_w_int -= dst_x_int;
+	}
+	else
+	{
+		dst_x_int = floor(x);
+		x -= (float)dst_x_int;
+		dst_w_int = (int)ceilf(x + w);
+	}
+	flip_y = (h < 0);
+	/* dst_y_int is calculated to be the bottom of the scaled image, but
+	 * y (the sub pixel offset) has to end up being the value at the top.
+	 */
+	if (flip_y)
+	{
+		h = -h;
+		dst_y_int = floor(y-h);
+		dst_h_int = (int)ceilf(y) - dst_y_int;
+	} else {
+		dst_y_int = floor(y);
+		y += h;
+		dst_h_int = (int)ceilf(y) - dst_y_int;
+	}
+	/* y is the top edge position in floats. We want it to be the
+	 * distance down from the next pixel boundary. */
+	y = ceilf(y) - y;
+
+	DBUG(("Result image: (%d,%d) at (%d,%d) (subpix=%g,%g)\n", dst_w_int, dst_h_int, dst_x_int, dst_y_int, x, y));
+
+	/* Step 1: Calculate the weights for columns and rows */
+#ifdef SINGLE_PIXEL_SPECIALS
+	if (src->w == 1)
+	{
+		contrib_cols = NULL;
+	}
+	else
+#endif /* SINGLE_PIXEL_SPECIALS */
+	{
+		contrib_cols = make_weights(src->w, x, w, filter, 0, dst_w_int, src->n, flip_x);
+		if (contrib_cols == NULL)
+			goto cleanup;
+	}
+#ifdef SINGLE_PIXEL_SPECIALS
+	if (src->h == 1)
+	{
+		contrib_rows = NULL;
+	}
+	else
+#endif /* SINGLE_PIXEL_SPECIALS */
+	{
+		contrib_rows = make_weights(src->h, y, h, filter, 1, dst_h_int, src->n, flip_y);
+		if (contrib_rows == NULL)
+			goto cleanup;
+	}
+
+	assert(contrib_cols == NULL || contrib_cols->count == dst_w_int);
+	assert(contrib_rows == NULL || contrib_rows->count == dst_h_int);
+	output = fz_newpixmap(src->colorspace, dst_x_int, dst_y_int, dst_w_int, dst_h_int);
+	if (output == NULL)
+		goto cleanup;
+
+	/* Step 2: Apply the weights */
+#ifdef SINGLE_PIXEL_SPECIALS
+	if (contrib_rows == NULL)
+	{
+		/* Only 1 source pixel high. */
+		if (contrib_cols == NULL)
+		{
+			/* Only 1 pixel in the entire image! */
+			duplicate_single_pixel(output->samples, src->samples, src->n, dst_w_int, dst_h_int);
+		}
+		else
+		{
+			/* Scale the row once, then copy it. */
+			scale_single_row(output->samples, src->samples, contrib_cols, src->w, dst_h_int);
+		}
+	}
+	else if (contrib_cols == NULL)
+	{
+		/* Only 1 source pixel wide. Scale the col and duplicate. */
+		scale_single_col(output->samples, src->samples, contrib_rows, src->h, src->n, dst_w_int, flip_y);
+	}
+	else
+#endif /* SINGLE_PIXEL_SPECIALS */
+	{
+		void (*row_scale)(int *dst, unsigned char *src, fz_weights *weights);
+
+		temp_span = contrib_cols->count * src->n;
+		temp_rows = contrib_rows->max_len;
+		if (temp_span <= 0 || temp_rows > INT_MAX / temp_span)
+			goto cleanup;
+		temp = fz_calloc(temp_span*temp_rows, sizeof(int));
+		if (temp == NULL)
+			goto cleanup;
+		switch (src->n)
+		{
+		default:
+			row_scale = scale_row_to_temp;
+			break;
+		case 1: /* Image mask case */
+			row_scale = scale_row_to_temp1;
+			break;
+		case 2: /* Greyscale with alpha case */
+			row_scale = scale_row_to_temp2;
+			break;
+		case 4: /* RGBA */
+			row_scale = scale_row_to_temp4;
+			break;
+		}
+		max_row = 0;
+		for (row = 0; row < contrib_rows->count; row++)
+		{
+			/*
+			Which source rows do we need to have scaled into the
+			temporary buffer in order to be able to do the final
+			scale?
+			*/
+			int row_index = contrib_rows->index[row];
+			int row_min = contrib_rows->index[row_index++];
+			int row_len = contrib_rows->index[row_index++];
+			while (max_row < row_min+row_len)
+			{
+				/* Scale another row */
+				assert(max_row < src->h);
+				DBUG(("scaling row %d to temp\n", max_row));
+				(*row_scale)(&temp[temp_span*(max_row % temp_rows)], &src->samples[(flip_y ? (src->h-1-max_row): max_row)*src->w*src->n], contrib_cols);
+				max_row++;
+			}
+
+			DBUG(("scaling row %d from temp\n", row));
+			scale_row_from_temp(&output->samples[row*output->w*output->n], temp, contrib_rows, temp_span, row);
+		}
+		fz_free(temp);
+	}
+
+cleanup:
+	fz_free(contrib_rows);
+	fz_free(contrib_cols);
+	return output;
+}
diff --git a/draw/draw_unpack.c b/draw/draw_unpack.c
new file mode 100644
index 00000000..2c7cb452
--- /dev/null
+++ b/draw/draw_unpack.c
@@ -0,0 +1,235 @@
+#include "fitz.h"
+
+/* Unpack image samples and optionally pad pixels with opaque alpha */
+
+#define get1(buf,x) ((buf[x >> 3] >> ( 7 - (x & 7) ) ) & 1 ) /* 1-bit sample x, most significant bit first */
+#define get2(buf,x) ((buf[x >> 2] >> ( ( 3 - (x & 3) ) << 1 ) ) & 3 ) /* 2-bit sample x, four per byte, high bits first */
+#define get4(buf,x) ((buf[x >> 1] >> ( ( 1 - (x & 1) ) << 2 ) ) & 15 ) /* 4-bit sample x, high nibble first */
+#define get8(buf,x) (buf[x]) /* 8-bit sample x */
+#define get16(buf,x) (buf[x << 1]) /* first byte of 2-byte sample x; the second byte is ignored */
+
+static unsigned char get1tab1[256][8];
+static unsigned char get1tab1p[256][16];
+static unsigned char get1tab255[256][8];
+static unsigned char get1tab255p[256][16];
+
+static void
+initget1tables(void)
+{ /* Build the byte -> 8 unpacked pixels lookup tables on first call. */
+	static int once = 0;
+	unsigned char bits[1];
+	int i, k, x;
+
+	/* TODO: mutex lock here */
+
+	if (once)
+		return;
+
+	for (i = 0; i < 256; i++) /* every possible source byte */
+	{
+		bits[0] = i;
+		for (k = 0; k < 8; k++)
+		{
+			x = get1(bits, k); /* bit k of i, most significant first */
+
+			get1tab1[i][k] = x; /* raw 0/1 */
+			get1tab1p[i][k * 2] = x; /* raw 0/1 followed by a 255 pad byte */
+			get1tab1p[i][k * 2 + 1] = 255;
+
+			get1tab255[i][k] = x * 255; /* 0/255 */
+			get1tab255p[i][k * 2] = x * 255; /* 0/255 followed by a 255 pad byte */
+			get1tab255p[i][k * 2 + 1] = 255;
+		}
+	}
+
+	once = 1;
+}
+
+void
+fz_unpacktile(fz_pixmap *dst, unsigned char * restrict src, int n, int depth, int stride, int scale)
+{ /* Expand packed samples of 'depth' bits from src into the 8-bit samples of dst. */
+	int pad, x, y, k;
+	int w = dst->w;
+
+	pad = 0;
+	if (dst->n > n) /* dst has more components than src: write 255 after each pixel */
+		pad = 255;
+
+	if (depth == 1)
+		initget1tables();
+
+	if (scale == 0) /* default: stretch the largest sample value to 255 */
+	{
+		switch (depth)
+		{
+		case 1: scale = 255; break;
+		case 2: scale = 85; break;
+		case 4: scale = 17; break;
+		}
+	}
+
+	for (y = 0; y < dst->h; y++)
+	{
+		unsigned char *sp = src + y * stride; /* source rows are stride bytes apart */
+		unsigned char *dp = dst->samples + y * (dst->w * dst->n);
+
+		/* Specialized loops */
+
+		if (n == 1 && depth == 1 && scale == 1 && !pad)
+		{
+			int w3 = w >> 3; /* number of whole source bytes (8 pixels each) */
+			for (x = 0; x < w3; x++)
+			{
+				memcpy(dp, get1tab1[*sp++], 8);
+				dp += 8;
+			}
+			x = x << 3;
+			if (x < w) /* leftover pixels from a partial last byte */
+				memcpy(dp, get1tab1[*sp], w - x);
+		}
+
+		else if (n == 1 && depth == 1 && scale == 255 && !pad)
+		{
+			int w3 = w >> 3; /* number of whole source bytes (8 pixels each) */
+			for (x = 0; x < w3; x++)
+			{
+				memcpy(dp, get1tab255[*sp++], 8);
+				dp += 8;
+			}
+			x = x << 3;
+			if (x < w) /* leftover pixels from a partial last byte */
+				memcpy(dp, get1tab255[*sp], w - x);
+		}
+
+		else if (n == 1 && depth == 1 && scale == 1 && pad)
+		{
+			int w3 = w >> 3; /* number of whole source bytes (8 pixels each) */
+			for (x = 0; x < w3; x++)
+			{
+				memcpy(dp, get1tab1p[*sp++], 16); /* 8 pixels, 2 bytes each with padding */
+				dp += 16;
+			}
+			x = x << 3;
+			if (x < w) /* leftover pixels from a partial last byte */
+				memcpy(dp, get1tab1p[*sp], (w - x) << 1);
+		}
+
+		else if (n == 1 && depth == 1 && scale == 255 && pad)
+		{
+			int w3 = w >> 3; /* number of whole source bytes (8 pixels each) */
+			for (x = 0; x < w3; x++)
+			{
+				memcpy(dp, get1tab255p[*sp++], 16); /* 8 pixels, 2 bytes each with padding */
+				dp += 16;
+			}
+			x = x << 3;
+			if (x < w) /* leftover pixels from a partial last byte */
+				memcpy(dp, get1tab255p[*sp], (w - x) << 1);
+		}
+
+		else if (depth == 8 && !pad)
+		{
+			int len = w * n; /* plain byte copy of the row */
+			while (len--)
+				*dp++ = *sp++;
+		}
+
+		else if (depth == 8 && pad)
+		{
+			for (x = 0; x < w; x++)
+			{
+				for (k = 0; k < n; k++)
+					*dp++ = *sp++;
+				*dp++ = 255;
+			}
+		}
+
+		else
+		{
+			int b = 0; /* index of the next sample within this source row */
+			for (x = 0; x < w; x++)
+			{
+				for (k = 0; k < n; k++)
+				{
+					switch (depth)
+					{
+					case 1: *dp++ = get1(sp, b) * scale; break;
+					case 2: *dp++ = get2(sp, b) * scale; break;
+					case 4: *dp++ = get4(sp, b) * scale; break;
+					case 8: *dp++ = get8(sp, b); break;
+					case 16: *dp++ = get16(sp, b); break; /* keeps only the first byte of each sample */
+					}
+					b++;
+				}
+				if (pad)
+					*dp++ = 255;
+			}
+		}
+	}
+}
+
+/* Apply decode array */
+
+void
+fz_decodeindexedtile(fz_pixmap *pix, float *decode, int maxval)
+{ /* Remap the samples of pix in place through the decode[] min/max pairs, using 8.8 fixed point. */
+	int add[FZ_MAXCOLORS];
+	int mul[FZ_MAXCOLORS];
+	unsigned char *p = pix->samples;
+	int len = pix->w * pix->h;
+	int n = pix->n - 1; /* the last component of each pixel is left untouched (presumably alpha - confirm) */
+	int needed;
+	int k;
+
+	needed = 0;
+	for (k = 0; k < n; k++)
+	{
+		int min = decode[k * 2] * 256;
+		int max = decode[k * 2 + 1] * 256;
+		add[k] = min;
+		mul[k] = (max - min) / maxval; /* NOTE(review): divides by maxval, so callers must pass a non-zero value */
+		needed |= min != 0 || max != maxval * 256; /* a pair of exactly [0 maxval] needs no work */
+	}
+
+	if (!needed)
+		return;
+
+	while (len--)
+	{
+		for (k = 0; k < n; k++)
+			p[k] = (add[k] + (((p[k] << 8) * mul[k]) >> 8)) >> 8;
+		p += n + 1; /* step over the untouched last component too */
+	}
+}
+
+void
+fz_decodetile(fz_pixmap *pix, float *decode)
+{ /* Remap the samples of pix in place through the decode[] min/max pairs, scaled to 0..255. */
+	int add[FZ_MAXCOLORS];
+	int mul[FZ_MAXCOLORS];
+	unsigned char *p = pix->samples;
+	int len = pix->w * pix->h;
+	int n = MAX(1, pix->n - 1); /* skip the last component, but always decode at least one */
+	int needed;
+	int k;
+
+	needed = 0;
+	for (k = 0; k < n; k++)
+	{
+		int min = decode[k * 2] * 255;
+		int max = decode[k * 2 + 1] * 255;
+		add[k] = min;
+		mul[k] = max - min;
+		needed |= min != 0 || max != 255; /* a pair that maps to exactly 0..255 needs no work */
+	}
+
+	if (!needed)
+		return;
+
+	while (len--)
+	{
+		for (k = 0; k < n; k++)
+			p[k] = add[k] + fz_mul255(p[k], mul[k]); /* fz_mul255 presumably computes a*b/255 - confirm */
+		p += pix->n; /* advance one whole pixel */
+	}
+}
diff --git a/draw/glyphcache.c b/draw/glyphcache.c
deleted file mode 100644
index 15bb7cae..00000000
--- a/draw/glyphcache.c
+++ /dev/null
@@ -1,134 +0,0 @@
-#include "fitz.h"
-
-#define MAXFONTSIZE 1000
-#define MAXGLYPHSIZE 256
-#define MAXCACHESIZE (1024*1024)
-
-typedef struct fz_glyphkey_s fz_glyphkey;
-
-struct fz_glyphcache_s
-{
-	fz_hashtable *hash;
-	int total;
-};
-
-struct fz_glyphkey_s
-{
-	fz_font *font;
-	int a, b;
-	int c, d;
-	unsigned short cid;
-	unsigned char e, f;
-};
-
-fz_glyphcache *
-fz_newglyphcache(void)
-{
-	fz_glyphcache *cache;
-
-	cache = fz_malloc(sizeof(fz_glyphcache));
-	cache->hash = fz_newhash(509, sizeof(fz_glyphkey));
-	cache->total = 0;
-
-	return cache;
-}
-
-static void
-fz_evictglyphcache(fz_glyphcache *cache)
-{
-	fz_glyphkey *key;
-	fz_pixmap *pixmap;
-	int i;
-
-	for (i = 0; i < fz_hashlen(cache->hash); i++)
-	{
-		key = fz_hashgetkey(cache->hash, i);
-		if (key->font)
-			fz_dropfont(key->font);
-		pixmap = fz_hashgetval(cache->hash, i);
-		if (pixmap)
-			fz_droppixmap(pixmap);
-	}
-
-	cache->total = 0;
-
-	fz_emptyhash(cache->hash);
-}
-
-void
-fz_freeglyphcache(fz_glyphcache *cache)
-{
-	fz_evictglyphcache(cache);
-	fz_freehash(cache->hash);
-	fz_free(cache);
-}
-
-fz_pixmap *
-fz_renderstrokedglyph(fz_glyphcache *cache, fz_font *font, int cid, fz_matrix trm, fz_matrix ctm, fz_strokestate *stroke)
-{
-	if (font->ftface)
-		return fz_renderftstrokedglyph(font, cid, trm, ctm, stroke);
-	return fz_renderglyph(cache, font, cid, trm);
-}
-
-fz_pixmap *
-fz_renderglyph(fz_glyphcache *cache, fz_font *font, int cid, fz_matrix ctm)
-{
-	fz_glyphkey key;
-	fz_pixmap *val;
-	float size = fz_matrixexpansion(ctm);
-
-	if (size > MAXFONTSIZE)
-	{
-		/* TODO: this case should be handled by rendering glyph as a path fill */
-		fz_warn("font size too large (%g), not rendering glyph", size);
-		return nil;
-	}
-
-	memset(&key, 0, sizeof key);
-	key.font = font;
-	key.cid = cid;
-	key.a = ctm.a * 65536;
-	key.b = ctm.b * 65536;
-	key.c = ctm.c * 65536;
-	key.d = ctm.d * 65536;
-	key.e = (ctm.e - floorf(ctm.e)) * 256;
-	key.f = (ctm.f - floorf(ctm.f)) * 256;
-
-	val = fz_hashfind(cache->hash, &key);
-	if (val)
-		return fz_keeppixmap(val);
-
-	ctm.e = floorf(ctm.e) + key.e / 256.0f;
-	ctm.f = floorf(ctm.f) + key.f / 256.0f;
-
-	if (font->ftface)
-	{
-		val = fz_renderftglyph(font, cid, ctm);
-	}
-	else if (font->t3procs)
-	{
-		val = fz_rendert3glyph(font, cid, ctm);
-	}
-	else
-	{
-		fz_warn("assert: uninitialized font structure");
-		return nil;
-	}
-
-	if (val)
-	{
-		if (val->w < MAXGLYPHSIZE && val->h < MAXGLYPHSIZE)
-		{
-			if (cache->total + val->w * val->h > MAXCACHESIZE)
-				fz_evictglyphcache(cache);
-			fz_keepfont(key.font);
-			fz_hashinsert(cache->hash, &key, val);
-			cache->total += val->w * val->h;
-			return fz_keeppixmap(val);
-		}
-		return val;
-	}
-
-	return nil;
-}
diff --git a/draw/imagedraw.c b/draw/imagedraw.c
deleted file mode 100644
index 044b2938..00000000
--- a/draw/imagedraw.c
+++ /dev/null
@@ -1,372 +0,0 @@
-#include "fitz.h"
-
-typedef unsigned char byte;
-
-static inline float roundup(float x)
-{
-	return (x < 0) ? floorf(x) : ceilf(x);
-}
-
-static inline int lerp(int a, int b, int t)
-{
-	return a + (((b - a) * t) >> 16);
-}
-
-static inline int bilerp(int a, int b, int c, int d, int u, int v)
-{
-	return lerp(lerp(a, b, u), lerp(c, d, u), v);
-}
-
-static inline byte *samplenearest(byte *s, int w, int h, int n, int u, int v)
-{
-	if (u < 0) u = 0;
-	if (v < 0) v = 0;
-	if (u >= w) u = w - 1;
-	if (v >= h) v = h - 1;
-	return s + (v * w + u) * n;
-}
-
-/* Blend premultiplied source image in constant alpha over destination */
-
-static inline void
-fz_paintaffinealphaNlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)
-{
-	int k;
-
-	while (w--)
-	{
-		int ui = u >> 16;
-		int vi = v >> 16;
-		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)
-		{
-			int uf = u & 0xffff;
-			int vf = v & 0xffff;
-			byte *a = samplenearest(sp, sw, sh, n, ui, vi);
-			byte *b = samplenearest(sp, sw, sh, n, ui+1, vi);
-			byte *c = samplenearest(sp, sw, sh, n, ui, vi+1);
-			byte *d = samplenearest(sp, sw, sh, n, ui+1, vi+1);
-			int x = bilerp(a[n-1], b[n-1], c[n-1], d[n-1], uf, vf);
-			int t = 255 - fz_mul255(x, alpha);
-			for (k = 0; k < n; k++)
-			{
-				x = bilerp(a[k], b[k], c[k], d[k], uf, vf);
-				dp[k] = fz_mul255(x, alpha) + fz_mul255(dp[k], t);
-			}
-		}
-		dp += n;
-		u += fa;
-		v += fb;
-	}
-}
-
-static inline void
-fz_paintaffinealphaNnear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)
-{
-	int k;
-
-	while (w--)
-	{
-		int ui = u >> 16;
-		int vi = v >> 16;
-		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)
-		{
-			byte *sample = sp + ((vi * sw + ui) * n);
-			int t = 255 - fz_mul255(sample[n-1], alpha);
-			for (k = 0; k < n; k++)
-				dp[k] = fz_mul255(sample[k], alpha) + fz_mul255(dp[k], t);
-		}
-		dp += n;
-		u += fa;
-		v += fb;
-	}
-}
-
-/* Blend premultiplied source image over destination */
-
-static inline void
-fz_paintaffineNlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n)
-{
-	int k;
-
-	while (w--)
-	{
-		int ui = u >> 16;
-		int vi = v >> 16;
-		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)
-		{
-			int uf = u & 0xffff;
-			int vf = v & 0xffff;
-			byte *a = samplenearest(sp, sw, sh, n, ui, vi);
-			byte *b = samplenearest(sp, sw, sh, n, ui+1, vi);
-			byte *c = samplenearest(sp, sw, sh, n, ui, vi+1);
-			byte *d = samplenearest(sp, sw, sh, n, ui+1, vi+1);
-			int t = 255 - bilerp(a[n-1], b[n-1], c[n-1], d[n-1], uf, vf);
-			for (k = 0; k < n; k++)
-			{
-				int x = bilerp(a[k], b[k], c[k], d[k], uf, vf);
-				dp[k] = x + fz_mul255(dp[k], t);
-			}
-		}
-		dp += n;
-		u += fa;
-		v += fb;
-	}
-}
-
-static inline void
-fz_paintaffineNnear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n)
-{
-	int k;
-
-	while (w--)
-	{
-		int ui = u >> 16;
-		int vi = v >> 16;
-		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)
-		{
-			byte *sample = sp + ((vi * sw + ui) * n);
-			int t = 255 - sample[n-1];
-			for (k = 0; k < n; k++)
-				dp[k] = sample[k] + fz_mul255(dp[k], t);
-		}
-		dp += n;
-		u += fa;
-		v += fb;
-	}
-}
-
-/* Blend non-premultiplied color in source image mask over destination */
-
-static inline void
-fz_paintaffinecolorNlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)
-{
-	int sa = color[n-1];
-	int k;
-
-	while (w--)
-	{
-		int ui = u >> 16;
-		int vi = v >> 16;
-		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)
-		{
-			int uf = u & 0xffff;
-			int vf = v & 0xffff;
-			byte *a = samplenearest(sp, sw, sh, 1, ui, vi);
-			byte *b = samplenearest(sp, sw, sh, 1, ui+1, vi);
-			byte *c = samplenearest(sp, sw, sh, 1, ui, vi+1);
-			byte *d = samplenearest(sp, sw, sh, 1, ui+1, vi+1);
-			int ma = bilerp(a[0], b[0], c[0], d[0], uf, vf);
-			int masa = FZ_COMBINE(FZ_EXPAND(ma), sa);
-			for (k = 0; k < n - 1; k++)
-				dp[k] = FZ_BLEND(color[k], dp[k], masa);
-			dp[k] = FZ_BLEND(255, dp[k], masa);
-		}
-		dp += n;
-		u += fa;
-		v += fb;
-	}
-}
-
-static inline void
-fz_paintaffinecolorNnear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)
-{
-	int sa = color[n-1];
-	int k;
-
-	while (w--)
-	{
-		int ui = u >> 16;
-		int vi = v >> 16;
-		if (ui >= 0 && ui < sw && vi >= 0 && vi < sh)
-		{
-			int ma = sp[vi * sw + ui];
-			int masa = FZ_COMBINE(FZ_EXPAND(ma), sa);
-			for (k = 0; k < n - 1; k++)
-				dp[k] = FZ_BLEND(color[k], dp[k], masa);
-			dp[k] = FZ_BLEND(255, dp[k], masa);
-		}
-		dp += n;
-		u += fa;
-		v += fb;
-	}
-}
-
-static void
-fz_paintaffinelerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)
-{
-	if (alpha == 255)
-	{
-		switch (n)
-		{
-		case 1: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 1); break;
-		case 2: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 2); break;
-		case 4: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 4); break;
-		default: fz_paintaffineNlerp(dp, sp, sw, sh, u, v, fa, fb, w, n); break;
-		}
-	}
-	else if (alpha > 0)
-	{
-		switch (n)
-		{
-		case 1: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 1, alpha); break;
-		case 2: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 2, alpha); break;
-		case 4: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 4, alpha); break;
-		default: fz_paintaffinealphaNlerp(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha); break;
-		}
-	}
-}
-
-static void
-fz_paintaffinenear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, int alpha)
-{
-	if (alpha == 255)
-	{
-		switch (n)
-		{
-		case 1: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, 1); break;
-		case 2: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, 2); break;
-		case 4: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, 4); break;
-		default: fz_paintaffineNnear(dp, sp, sw, sh, u, v, fa, fb, w, n); break;
-		}
-	}
-	else if (alpha > 0)
-	{
-		switch (n)
-		{
-		case 1: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, 1, alpha); break;
-		case 2: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, 2, alpha); break;
-		case 4: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, 4, alpha); break;
-		default: fz_paintaffinealphaNnear(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha); break;
-		}
-	}
-}
-
-static void
-fz_paintaffinecolorlerp(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)
-{
-	switch (n)
-	{
-	case 2: fz_paintaffinecolorNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 2, color); break;
-	case 4: fz_paintaffinecolorNlerp(dp, sp, sw, sh, u, v, fa, fb, w, 4, color); break;
-	default: fz_paintaffinecolorNlerp(dp, sp, sw, sh, u, v, fa, fb, w, n, color); break;
-	}
-}
-
-static void
-fz_paintaffinecolornear(byte *dp, byte *sp, int sw, int sh, int u, int v, int fa, int fb, int w, int n, byte *color)
-{
-	switch (n)
-	{
-	case 2: fz_paintaffinecolorNnear(dp, sp, sw, sh, u, v, fa, fb, w, 2, color); break;
-	case 4: fz_paintaffinecolorNnear(dp, sp, sw, sh, u, v, fa, fb, w, 4, color); break;
-	default: fz_paintaffinecolorNnear(dp, sp, sw, sh, u, v, fa, fb, w, n, color); break;
-	}
-}
-
-/* Draw an image with an affine transform on destination */
-
-static void
-fz_paintimageimp(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *img, fz_matrix ctm, byte *color, int alpha)
-{
-	byte *dp, *sp;
-	int u, v, fa, fb, fc, fd;
-	int x, y, w, h;
-	int sw, sh, n;
-	fz_matrix inv;
-	fz_bbox bbox;
-	int dolerp;
-
-	/* grid fit the image */
-	if (fz_isrectilinear(ctm))
-	{
-		ctm.a = roundup(ctm.a);
-		ctm.b = roundup(ctm.b);
-		ctm.c = roundup(ctm.c);
-		ctm.d = roundup(ctm.d);
-		ctm.e = floorf(ctm.e);
-		ctm.f = floorf(ctm.f);
-	}
-
-	/* turn on interpolation for upscaled and non-rectilinear transforms */
-	dolerp = 0;
-	if (!fz_isrectilinear(ctm))
-		dolerp = 1;
-	if (sqrtf(ctm.a * ctm.a + ctm.b * ctm.b) > img->w)
-		dolerp = 1;
-	if (sqrtf(ctm.c * ctm.c + ctm.d * ctm.d) > img->h)
-		dolerp = 1;
-
-	/* except when we shouldn't, at large magnifications */
-	if (!img->interpolate)
-	{
-		if (sqrtf(ctm.a * ctm.a + ctm.b * ctm.b) > img->w * 2)
-			dolerp = 0;
-		if (sqrtf(ctm.c * ctm.c + ctm.d * ctm.d) > img->h * 2)
-			dolerp = 0;
-	}
-
-	bbox = fz_roundrect(fz_transformrect(ctm, fz_unitrect));
-	bbox = fz_intersectbbox(bbox, scissor);
-	x = bbox.x0;
-	y = bbox.y0;
-	w = bbox.x1 - bbox.x0;
-	h = bbox.y1 - bbox.y0;
-
-	/* map from screen space (x,y) to image space (u,v) */
-	inv = fz_scale(1.0f / img->w, -1.0f / img->h);
-	inv = fz_concat(inv, fz_translate(0, 1));
-	inv = fz_concat(inv, ctm);
-	inv = fz_invertmatrix(inv);
-
-	fa = inv.a * 65536;
-	fb = inv.b * 65536;
-	fc = inv.c * 65536;
-	fd = inv.d * 65536;
-
-	/* Calculate initial texture positions. Do a half step to start. */
-	u = (fa * x) + (fc * y) + inv.e * 65536 + ((fa+fc)>>1);
-	v = (fb * x) + (fd * y) + inv.f * 65536 + ((fb+fd)>>1);
-
-	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
-	n = dst->n;
-	sp = img->samples;
-	sw = img->w;
-	sh = img->h;
-
-	/* TODO: if (fb == 0 && fa == 1) call fz_paintspan */
-
-	while (h--)
-	{
-		if (dolerp)
-		{
-			if (color)
-				fz_paintaffinecolorlerp(dp, sp, sw, sh, u, v, fa, fb, w, n, color);
-			else
-				fz_paintaffinelerp(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha);
-		}
-		else
-		{
-			if (color)
-				fz_paintaffinecolornear(dp, sp, sw, sh, u, v, fa, fb, w, n, color);
-			else
-				fz_paintaffinenear(dp, sp, sw, sh, u, v, fa, fb, w, n, alpha);
-		}
-		dp += dst->w * n;
-		u += fc;
-		v += fd;
-	}
-}
-
-void
-fz_paintimagecolor(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *img, fz_matrix ctm, byte *color)
-{
-	assert(img->n == 1);
-	fz_paintimageimp(dst, scissor, img, ctm, color, 255);
-}
-
-void
-fz_paintimage(fz_pixmap *dst, fz_bbox scissor, fz_pixmap *img, fz_matrix ctm, int alpha)
-{
-	assert(dst->n == img->n);
-	fz_paintimageimp(dst, scissor, img, ctm, nil, alpha);
-}
diff --git a/draw/imagesmooth.c b/draw/imagesmooth.c
deleted file mode 100644
index 6254e3e4..00000000
--- a/draw/imagesmooth.c
+++ /dev/null
@@ -1,1175 +0,0 @@
-/*
-This code does smooth scaling of a pixmap.
-
-This function returns a new pixmap representing the area starting at (0,0)
-given by taking the source pixmap src, scaling it to width w, and height h,
-and then positioning it at (frac(x),frac(y)).
-*/
-
-#include "fitz.h"
-
-/* Do we special case handling of single pixel high/wide images? The
- * 'purest' handling is given by not special casing them, but certain
- * files that use such images 'stack' them to give full images. Not
- * special casing them results in then being fainter and giving noticable
- * rounding errors.
- */
-#define SINGLE_PIXEL_SPECIALS
-
-#ifdef DEBUG_SCALING
-#ifdef WIN32
-#include <windows.h>
-static void debug_print(const char *fmt, ...)
-{
-	va_list args;
-	char text[256];
-	va_start(args, fmt);
-	vsprintf(text, fmt, args);
-	va_end(args);
-	OutputDebugStringA(text);
-	printf(text);
-}
-#else
-static void debug_print(const char *fmt, ...)
-{
-	va_list args;
-	va_start(args, fmt);
-	vfprintf(stderr, fmt, args);
-	va_end(args);
-}
-#endif
-#endif
-#ifdef DEBUG_SCALING
-#define DBUG(A) debug_print A
-#else
-#define DBUG(A) do {} while(0==1)
-#endif
-
-/*
-Consider a row of source samples, src, of width src_w, positioned at x,
-scaled to width dst_w.
-
-src[i] is centred at: x + (i + 0.5)*dst_w/src_w
-
-Therefore the distance between the centre of the jth output pixel and
-the centre of the ith source sample is:
-
-dist[j,i] = j + 0.5 - (x + (i + 0.5)*dst_w/src_w)
-
-When scaling up, therefore:
-
-dst[j] = SUM(filter(dist[j,i]) * src[i])
-	(for all ints i)
-
-This can be simplified by noticing that filters are only non zero within
-a given filter width (henceforth called W). So:
-
-dst[j] = SUM(filter(dist[j,i]) * src[i])
-	(for ints i, s.t. (j*src_w/dst_w)-W < i < (j*src_w/dst_w)+W)
-
-When scaling down, each filtered source sample is stretched to be wider
-to avoid aliasing issues. This effectively reduces the distance between
-centres.
-
-dst[j] = SUM(filter(dist[j,i] * F) * F * src[i])
-	(where F = dst_w/src_w)
-	(for ints i, s.t. (j-W)/F < i < (j+W)/F)
-
-*/
-
-typedef struct fz_scalefilter_s fz_scalefilter;
-
-struct fz_scalefilter_s
-{
-	int width;
-	float (*fn)(fz_scalefilter *, float);
-};
-
-/* Image scale filters */
-
-static float
-triangle(fz_scalefilter *filter, float f)
-{
-	if (f >= 1)
-		return 0;
-	return 1-f;
-}
-
-static float
-box(fz_scalefilter *filter, float f)
-{
-	if (f >= 0.5f)
-		return 0;
-	return 1;
-}
-
-static float
-simple(fz_scalefilter *filter, float x)
-{
-	if (x >= 1)
-		return 0;
-	return 1 + (2*x - 3)*x*x;
-}
-
-static float
-lanczos2(fz_scalefilter *filter, float x)
-{
-	if (x >= 2)
-		return 0;
-	return sinf(M_PI*x) * sinf(M_PI*x/2) / (M_PI*x) / (M_PI*x/2);
-}
-
-static float
-lanczos3(fz_scalefilter *filter, float f)
-{
-	if (f >= 3)
-		return 0;
-	return sinf(M_PI*f) * sinf(M_PI*f/3) / (M_PI*f) / (M_PI*f/3);
-}
-
-/*
-The Mitchell family of filters is defined:
-
-	f(x) =	1 { (12-9B-6C)x^3 + (-18+12B+6C)x^2 + (6-2B)	for x < 1
-		- {
-		6 { (-B-6C)x^3+(6B+30C)x^2+(-12B-48C)x+(8B+24C)	for 1<=x<=2
-
-The 'best' ones lie along the line B+2C = 1.
-The literature suggests that B=1/3, C=1/3 is best.
-
-	f(x) =	1 { (12-3-2)x^3 - (-18+4+2)x^2 + (16/3)	for x < 1
-		- {
-		6 { (-7/3)x^3 + 12x^2 - 20x + (32/3)	for 1<=x<=2
-
-	f(x) =	1 { 21x^3 - 36x^2 + 16			for x < 1
-		- {
-		18{ -7x^3 + 36x^2 - 60x + 32		for 1<=x<=2
-*/
-
-static float
-mitchell(fz_scalefilter *filter, float x)
-{
-	if (x >= 2)
-		return 0;
-	if (x >= 1)
-		return (32 + x*(-60 + x*(36 - 7*x)))/18;
-	return (16 + x*x*(-36 + 21*x))/18;
-}
-
-fz_scalefilter fz_scalefilter_box = { 1, box };
-fz_scalefilter fz_scalefilter_triangle = { 1, triangle };
-fz_scalefilter fz_scalefilter_simple = { 1, simple };
-fz_scalefilter fz_scalefilter_lanczos2 = { 2, lanczos2 };
-fz_scalefilter fz_scalefilter_lanczos3 = { 3, lanczos3 };
-fz_scalefilter fz_scalefilter_mitchell = { 2, mitchell };
-
-/*
-We build ourselves a set of tables to contain the precalculated weights
-for a given set of scale settings.
-
-The first dst_w entries in index are the index into index of the
-sets of weight for each destination pixel.
-
-Each of the sets of weights is a set of values consisting of:
-	the minimum source pixel index used for this destination pixel
-	the number of weights used for this destination pixel
-	the weights themselves
-
-So to calculate dst[i] we do the following:
-
-	weights = &index[index[i]];
-	min = *weights++;
-	len = *weights++;
-	dst[i] = 0;
-	while (--len > 0)
-		dst[i] += src[min++] * *weights++
-
-in addition, we guarantee that at the end of this process weights will now
-point to the weights value for dst pixel i+1.
-
-In the simplest version of this algorithm, we would scale the whole image
-horizontally first into a temporary buffer, then scale that temporary
-buffer again vertically to give us our result. Using such a simple
-algorithm would mean that could use the same style of weights for both
-horizontal and vertical scaling.
-
-Unfortunately, this would also require a large temporary buffer,
-particularly in the case where we are scaling up.
-
-We therefore modify the algorithm as follows; we scale scanlines from the
-source image horizontally into a temporary buffer, until we have all the
-contributors for a given output scanline. We then produce that output
-scanline from the temporary buffer. In this way we restrict the height
-of the temporary buffer to a small fraction of the final size.
-
-Unfortunately, this means that the pseudo code for recombining a
-scanline of fully scaled pixels is as follows:
-
-	weights = &index[index[y]];
-	min = *weights++;
-	len = *weights++;
-	for (x=0 to dst_w)
-		min2 = min
-		len2 = len
-		weights2 = weights
-		dst[x] = 0;
-		while (--len2 > 0)
-			dst[x] += temp[x][(min2++) % tmp_buf_height] * *weights2++
-
-i.e. it requires a % operation for every source pixel - this is typically
-expensive.
-
-To avoid this, we alter the order in which vertical weights are stored,
-so that they are ordered in the same order as the temporary buffer lines
-would appear. This simplifies the algorithm to:
-
-	weights = &index[index[y]];
-	min = *weights++;
-	len = *weights++;
-	for (x=0 to dst_w)
-		min2 = 0
-		len2 = len
-		weights2 = weights
-		dst[x] = 0;
-		while (--len2 > 0)
-			dst[x] += temp[i][min2++] * *weights2++
-
-This means that len may be larger than it needs to be (due to the
-possible inclusion of a zero weight row or two), but in practise this
-is only an increase of 1 or 2 at worst.
-
-We implement this by generating the weights as normal (but ensuring we
-leave enough space) and then reordering afterwards.
-
-*/
-
-typedef struct fz_weights_s fz_weights;
-
-struct fz_weights_s
-{
-	int count;
-	int max_len;
-	int n;
-	int flip;
-	int new_line;
-	int index[1];
-};
-
-static fz_weights *
-newweights(fz_scalefilter *filter, int src_w, float dst_w, int dst_w_i, int n, int flip)
-{
-	int max_len;
-	fz_weights *weights;
-
-	if (src_w > dst_w)
-	{
-		/* Scaling down, so there will be a maximum of
-		 * 2*filterwidth*src_w/dst_w src pixels
-		 * contributing to each dst pixel. */
-		max_len = (int)ceilf((2 * filter->width * src_w)/dst_w);
-		if (max_len > src_w)
-			max_len = src_w;
-	}
-	else
-	{
-		/* Scaling up, so there will be a maximum of
-		 * 2*filterwidth src pixels contributing to each dst pixel.
-		 */
-		max_len = 2 * filter->width;
-	}
-	/* We need the size of the struct,
-	 * plus dst_w*sizeof(int) for the index
-	 * plus (2+max_len)*sizeof(int) for the weights
-	 * plus room for an extra set of weights for reordering.
-	 */
-	weights = fz_malloc(sizeof(*weights)+(max_len+3)*(dst_w_i+1)*sizeof(int));
-	if (weights == NULL)
-		return NULL;
-	weights->count = -1;
-	weights->max_len = max_len;
-	weights->index[0] = dst_w_i;
-	weights->n = n;
-	weights->flip = flip;
-	return weights;
-}
-
-static void
-init_weights(fz_weights *weights, int j)
-{
-	int index;
-
-	assert(weights->count == j-1);
-	weights->count++;
-	weights->new_line = 1;
-	if (j == 0)
-		index = weights->index[0];
-	else
-	{
-		index = weights->index[j-1];
-		index += 2 + weights->index[index+1];
-	}
-	weights->index[j] = index; /* row pointer */
-	weights->index[index] = 0; /* min */
-	weights->index[index+1] = 0; /* len */
-}
-
-static void
-add_weight(fz_weights *weights, int j, int i, fz_scalefilter *filter,
-	float x, float F, float G, int src_w, float dst_w)
-{
-	float dist = j - x + 0.5f - ((i + 0.5f)*dst_w/src_w);
-	float f;
-	int min, len, index, weight;
-
-	dist *= G;
-	if (dist < 0)
-		dist = -dist;
-	f = filter->fn(filter, dist)*F;
-	weight = (int)(256*f+0.5f);
-	if (weight == 0)
-		return;
-
-	/* wrap i back into range */
-#ifdef MIRROR_WRAP
-	do
-	{
-		if (i < 0)
-			i = -1-i;
-		else if (i >= src_w)
-			i = 2*src_w-1-i;
-		else
-			break;
-	}
-	while (1);
-#elif defined(WRAP)
-	if (i < 0)
-		i = 0;
-	else if (i >= src_w)
-		i = src_w-1;
-#else
-	if (i < 0)
-	{
-		i = 0;
-		weight = 0;
-	}
-	else if (i >= src_w)
-	{
-		i = src_w-1;
-		weight = 0;
-	}
-#endif
-
-	DBUG(("add_weight[%d][%d] = %d(%g) dist=%g\n",j,i,weight,f,dist));
-
-	if (weights->new_line)
-	{
-		/* New line */
-		weights->new_line = 0;
-		index = weights->index[j]; /* row pointer */
-		weights->index[index] = i; /* min */
-		weights->index[index+1] = 0; /* len */
-	}
-	index = weights->index[j];
-	min = weights->index[index++];
-	len = weights->index[index++];
-	while (i < min)
-	{
-		/* This only happens in rare cases, but we need to insert
-		 * one earlier. In exceedingly rare cases we may need to
-		 * insert more than one earlier. */
-		int k;
-
-		for (k = len; k > 0; k--)
-		{
-			weights->index[index+k] = weights->index[index+k-1];
-		}
-		weights->index[index] = 0;
-		min--;
-		len++;
-		weights->index[index-2] = min;
-		weights->index[index-1] = len;
-	}
-	if (i-min >= len)
-	{
-		/* The usual case */
-		while (i-min >= ++len)
-		{
-			weights->index[index+len-1] = 0;
-		}
-		assert(len-1 == i-min);
-		weights->index[index+i-min] = weight;
-		weights->index[index-1] = len;
-		assert(len <= weights->max_len);
-	}
-	else
-	{
-		/* Infrequent case */
-		weights->index[index+i-min] += weight;
-	}
-}
-
-static void
-reorder_weights(fz_weights *weights, int j, int src_w)
-{
-	int idx = weights->index[j];
-	int min = weights->index[idx++];
-	int len = weights->index[idx++];
-	int max = weights->max_len;
-	int tmp = idx+max;
-	int i, off;
-
-	/* Copy into the temporary area */
-	memcpy(&weights->index[tmp], &weights->index[idx], sizeof(int)*len);
-
-	/* Pad out if required */
-	assert(len <= max);
-	assert(min+len <= src_w);
-	off = 0;
-	if (len < max)
-	{
-		memset(&weights->index[tmp+len], 0, sizeof(int)*(max-len));
-		len = max;
-		if (min + len > src_w)
-		{
-			off = min + len - src_w;
-			min = src_w - len;
-			weights->index[idx-2] = min;
-		}
-		weights->index[idx-1] = len;
-	}
-
-	/* Copy back into the proper places */
-	for (i = 0; i < len; i++)
-	{
-		weights->index[idx+((min+i+off) % max)] = weights->index[tmp+i];
-	}
-}
-
-static void
-check_weights(fz_weights *weights, int j, int w)
-{
-	int idx, len;
-	int sum = 0;
-	int max = -256;
-	int maxidx = 0;
-	int i;
-
-	idx = weights->index[j];
-	idx++; /* min */
-	len = weights->index[idx++];
-
-	for(i=0; i < len; i++)
-	{
-		int v = weights->index[idx++];
-		sum += v;
-		if (v > max)
-		{
-			max = v;
-			maxidx = idx;
-		}
-	}
-	if (((j != 0) && (j != w-1)) || (sum > 256))
-		weights->index[maxidx-1] += 256-sum;
-	DBUG(("total weight %d = %d\n", j, sum));
-}
-
-static fz_weights *
-make_weights(int src_w, float x, float dst_w, fz_scalefilter *filter, int vertical, int dst_w_int, int n, int flip)
-{
-	fz_weights *weights;
-	float F, G;
-	float window;
-	int j;
-
-	if (dst_w < src_w)
-	{
-		/* Scaling down */
-		F = dst_w / src_w;
-		G = 1;
-	}
-	else
-	{
-		/* Scaling up */
-		F = 1;
-		G = src_w / dst_w;
-	}
-	window = filter->width / F;
-	DBUG(("make_weights src_w=%d x=%g dst_w=%g dst_w_int=%d F=%g window=%g\n", src_w, x, dst_w, dst_w_int, F, window));
-	weights	= newweights(filter, src_w, dst_w, dst_w_int, n, flip);
-	if (weights == NULL)
-		return NULL;
-	for (j = 0; j < dst_w_int; j++)
-	{
-		/* find the position of the centre of dst[j] in src space */
-		float centre = (j - x + 0.5f)*src_w/dst_w - 0.5f;
-		int l, r;
-		l = ceilf(centre - window);
-		r = floorf(centre + window);
-		DBUG(("%d: centre=%g l=%d r=%d\n", j, centre, l, r));
-		init_weights(weights, j);
-		for (; l <= r; l++)
-		{
-			add_weight(weights, j, l, filter, x, F, G, src_w, dst_w);
-		}
-		check_weights(weights, j, dst_w_int);
-		if (vertical)
-		{
-			reorder_weights(weights, j, src_w);
-		}
-	}
-	weights->count++; /* weights->count = dst_w_int now */
-	return weights;
-}
-
-static void
-scale_row_to_temp(int *dst, unsigned char *src, fz_weights *weights)
-{
-	int *contrib = &weights->index[weights->index[0]];
-	int len, i, j, n;
-	unsigned char *min;
-
-	n = weights->n;
-	if (weights->flip)
-	{
-		dst += (weights->count-1)*n;
-		for (i=weights->count; i > 0; i--)
-		{
-			min = &src[n * *contrib++];
-			len = *contrib++;
-			for (j = 0; j < n; j++)
-				dst[j] = 0;
-			while (len-- > 0)
-			{
-				for (j = n; j > 0; j--)
-					*dst++ += *min++ * *contrib;
-				dst -= n;
-				contrib++;
-			}
-			dst -= n;
-		}
-	}
-	else
-	{
-		for (i=weights->count; i > 0; i--)
-		{
-			min = &src[n * *contrib++];
-			len = *contrib++;
-			for (j = 0; j < n; j++)
-				dst[j] = 0;
-			while (len-- > 0)
-			{
-				for (j = n; j > 0; j--)
-					*dst++ += *min++ * *contrib;
-				dst -= n;
-				contrib++;
-			}
-			dst += n;
-		}
-	}
-}
-
-static void
-scale_row_to_temp1(int *dst, unsigned char *src, fz_weights *weights)
-{
-	int *contrib = &weights->index[weights->index[0]];
-	int len, i;
-	unsigned char *min;
-
-	assert(weights->n == 1);
-	if (weights->flip)
-	{
-		dst += weights->count;
-		for (i=weights->count; i > 0; i--)
-		{
-			int val = 0;
-			min = &src[*contrib++];
-			len = *contrib++;
-			while (len-- > 0)
-			{
-				val += *min++ * *contrib++;
-			}
-			*--dst = val;
-		}
-	}
-	else
-	{
-		for (i=weights->count; i > 0; i--)
-		{
-			int val = 0;
-			min = &src[*contrib++];
-			len = *contrib++;
-			while (len-- > 0)
-			{
-				val += *min++ * *contrib++;
-			}
-			*dst++ = val;
-		}
-	}
-}
-
-static void
-scale_row_to_temp2(int *dst, unsigned char *src, fz_weights *weights)
-{
-	int *contrib = &weights->index[weights->index[0]];
-	int len, i;
-	unsigned char *min;
-
-	assert(weights->n == 2);
-	if (weights->flip)
-	{
-		dst += 2*weights->count;
-		for (i=weights->count; i > 0; i--)
-		{
-			int c1 = 0;
-			int c2 = 0;
-			min = &src[2 * *contrib++];
-			len = *contrib++;
-			while (len-- > 0)
-			{
-				c1 += *min++ * *contrib;
-				c2 += *min++ * *contrib++;
-			}
-			*--dst = c2;
-			*--dst = c1;
-		}
-	}
-	else
-	{
-		for (i=weights->count; i > 0; i--)
-		{
-			int c1 = 0;
-			int c2 = 0;
-			min = &src[2 * *contrib++];
-			len = *contrib++;
-			while (len-- > 0)
-			{
-				c1 += *min++ * *contrib;
-				c2 += *min++ * *contrib++;
-			}
-			*dst++ = c1;
-			*dst++ = c2;
-		}
-	}
-}
-
-static void
-scale_row_to_temp4(int *dst, unsigned char *src, fz_weights *weights)
-{
-	int *contrib = &weights->index[weights->index[0]];
-#ifndef ARCH_ARM
-	int len, i;
-	unsigned char *min;
-#endif
-
-	assert(weights->n == 4);
-	if (weights->flip)
-	{
-		dst += 4*weights->count;
-#ifdef ARCH_ARM
-		asm volatile(
-		"1:"
-		"ldr	r4, [%2], #4		@ r4 = *contrib++	\n"
-		"ldr	r9, [%2], #4		@ r9 = len = *contrib++	\n"
-		"mov	r5, #0			@ r5 = r = 0		\n"
-		"mov	r6, #0			@ r6 = g = 0		\n"
-		"mov	r7, #0			@ r7 = b = 0		\n"
-		"mov	r8, #0			@ r8 = a = 0		\n"
-		"add	r4, %1, r4, LSL #2	@ r4 = min = &src[4*r4]	\n"
-		"cmp	r9, #0			@ while (len-- > 0)	\n"
-		"beq	3f			@ {			\n"
-		"2:							\n"
-		"ldr	r10,[%2], #4		@ r10 = *contrib++	\n"
-		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
-		"ldrb	r12,[r4], #1		@ r12 = *min++		\n"
-		"ldrb	r14,[r4], #1		@ r14 = *min++		\n"
-		"mla	r5, r10,r11,r5		@ r += r11 * r10	\n"
-		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
-		"mla	r6, r10,r12,r6		@ g += r12 * r10	\n"
-		"mla	r7, r10,r14,r7		@ b += r14 * r10	\n"
-		"mla	r8, r10,r11,r8		@ a += r11 * r10	\n"
-		"subs	r9, r9, #1		@ r9 = len--		\n"
-		"bgt	2b			@ }			\n"
-		"stmdb	%0!,{r5,r6,r7,r8}	@ *--dst=a;*--dst=b;	\n"
-		"3:				@ *--dst=g;*--dst=r;	\n"
-		"subs	%3, %3, #1		@ i--			\n"
-		"bgt	1b			@ 			\n"
-		:
-		:
-		"r" (dst),
-		"r" (src),
-		"r" (contrib),
-		"r" (weights->count)
-		:
-		"r4","r5","r6","r7","r8","r9","r10","r11","r12","r14",
-		"memory","cc"
-		);
-#else
-		for (i=weights->count; i > 0; i--)
-		{
-			int r = 0;
-			int g = 0;
-			int b = 0;
-			int a = 0;
-			min = &src[4 * *contrib++];
-			len = *contrib++;
-			while (len-- > 0)
-			{
-				r += *min++ * *contrib;
-				g += *min++ * *contrib;
-				b += *min++ * *contrib;
-				a += *min++ * *contrib++;
-			}
-			*--dst = a;
-			*--dst = b;
-			*--dst = g;
-			*--dst = r;
-		}
-#endif
-	}
-	else
-	{
-#ifdef ARCH_ARM
-		asm volatile(
-		"1:"
-		"ldr	r4, [%2], #4		@ r4 = *contrib++	\n"
-		"ldr	r9, [%2], #4		@ r9 = len = *contrib++	\n"
-		"mov	r5, #0			@ r5 = r = 0		\n"
-		"mov	r6, #0			@ r6 = g = 0		\n"
-		"mov	r7, #0			@ r7 = b = 0		\n"
-		"mov	r8, #0			@ r8 = a = 0		\n"
-		"add	r4, %1, r4, LSL #2	@ r4 = min = &src[4*r4]	\n"
-		"cmp	r9, #0			@ while (len-- > 0)	\n"
-		"beq	3f			@ {			\n"
-		"2:							\n"
-		"ldr	r10,[%2], #4		@ r10 = *contrib++	\n"
-		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
-		"ldrb	r12,[r4], #1		@ r12 = *min++		\n"
-		"ldrb	r14,[r4], #1		@ r14 = *min++		\n"
-		"mla	r5, r10,r11,r5		@ r += r11 * r10	\n"
-		"ldrb	r11,[r4], #1		@ r11 = *min++		\n"
-		"mla	r6, r10,r12,r6		@ g += r12 * r10	\n"
-		"mla	r7, r10,r14,r7		@ b += r14 * r10	\n"
-		"mla	r8, r10,r11,r8		@ a += r11 * r10	\n"
-		"subs	r9, r9, #1		@ r9 = len--		\n"
-		"bgt	2b			@ }			\n"
-		"stmia	%0!,{r5,r6,r7,r8}	@ *dst++=r;*dst++=g;	\n"
-		"3:				@ *dst++=b;*dst++=a;	\n"
-		"subs	%3, %3, #1		@ i--			\n"
-		"bgt	1b			@ 			\n"
-		:
-		:
-		"r" (dst),
-		"r" (src),
-		"r" (contrib),
-		"r" (weights->count)
-		:
-		"r4","r5","r6","r7","r8","r9","r10","r11","r12","r14",
-		"memory","cc"
-		);
-#else
-		for (i=weights->count; i > 0; i--)
-		{
-			int r = 0;
-			int g = 0;
-			int b = 0;
-			int a = 0;
-			min = &src[4 * *contrib++];
-			len = *contrib++;
-			while (len-- > 0)
-			{
-				r += *min++ * *contrib;
-				g += *min++ * *contrib;
-				b += *min++ * *contrib;
-				a += *min++ * *contrib++;
-			}
-			*dst++ = r;
-			*dst++ = g;
-			*dst++ = b;
-			*dst++ = a;
-		}
-#endif
-	}
-}
-
-static void
-scale_row_from_temp(unsigned char *dst, int *src, fz_weights *weights, int width, int row)
-{
-	int *contrib = &weights->index[weights->index[row]];
-	int len, x;
-
-	contrib++; /* Skip min */
-	len = *contrib++;
-	for (x=width; x > 0; x--)
-	{
-		int *min = src;
-		int val = 0;
-		int len2 = len;
-		int *contrib2 = contrib;
-
-		while (len2-- > 0)
-		{
-			val += *min * *contrib2++;
-			min += width;
-		}
-		val = (val+(1<<15))>>16;
-		if (val < 0)
-			val = 0;
-		else if (val > 255)
-			val = 255;
-		*dst++ = val;
-		src++;
-	}
-}
-
-#ifdef SINGLE_PIXEL_SPECIALS
-static void
-duplicate_single_pixel(unsigned char *dst, unsigned char *src, int n, int w, int h)
-{
-	int i;
-
-	for (i = n; i > 0; i--)
-		*dst++ = *src++;
-	for (i = (w*h-1)*n; i > 0; i--)
-	{
-		*dst = dst[-n];
-		dst++;
-	}
-}
-
-static void
-scale_single_row(unsigned char *dst, unsigned char *src, fz_weights *weights, int src_w, int h)
-{
-	int *contrib = &weights->index[weights->index[0]];
-	int min, len, i, j, val, n;
-	int tmp[FZ_MAXCOLORS];
-
-	n = weights->n;
-	/* Scale a single row */
-	if (weights->flip)
-	{
-		dst += (weights->count-1)*n;
-		for (i=weights->count; i > 0; i--)
-		{
-			min = *contrib++;
-			len = *contrib++;
-			min *= n;
-			for (j = 0; j < n; j++)
-				tmp[j] = 0;
-			while (len-- > 0)
-			{
-				for (j = 0; j < n; j++)
-					tmp[j] += src[min++] * *contrib;
-				contrib++;
-			}
-			for (j = 0; j < n; j++)
-			{
-				val = (tmp[j]+(1<<7))>>8;
-				if (val < 0)
-					val = 0;
-				else if (val > 255)
-					val = 255;
-				*dst++ = val;
-			}
-			dst -= 2*n;
-		}
-		dst += n;
-	}
-	else
-	{
-		for (i=weights->count; i > 0; i--)
-		{
-			min = *contrib++;
-			len = *contrib++;
-			min *= n;
-			for (j = 0; j < n; j++)
-				tmp[j] = 0;
-			while (len-- > 0)
-			{
-				for (j = 0; j < n; j++)
-					tmp[j] += src[min++] * *contrib;
-				contrib++;
-			}
-			for (j = 0; j < n; j++)
-			{
-				val = (tmp[j]+(1<<7))>>8;
-				if (val < 0)
-					val = 0;
-				else if (val > 255)
-					val = 255;
-				*dst++ = val;
-			}
-		}
-	}
-	/* And then duplicate it h times */
-	n *= weights->count;
-	while (--h > 0)
-	{
-		memcpy(dst, dst-n, n);
-		dst += n;
-	}
-}
-
-static void
-scale_single_col(unsigned char *dst, unsigned char *src, fz_weights *weights, int src_w, int n, int w, int flip_y)
-{
-	int *contrib = &weights->index[weights->index[0]];
-	int min, len, i, j, val;
-	int tmp[FZ_MAXCOLORS];
-
-	if (flip_y)
-	{
-		src_w = (src_w-1)*n;
-		w = (w-1)*n;
-		for (i=weights->count; i > 0; i--)
-		{
-			/* Scale the next pixel in the column */
-			min = *contrib++;
-			len = *contrib++;
-			min = src_w-min*n;
-			for (j = 0; j < n; j++)
-				tmp[j] = 0;
-			while (len-- > 0)
-			{
-				for (j = 0; j < n; j++)
-					tmp[j] += src[src_w-min+j] * *contrib;
-				contrib++;
-			}
-			for (j = 0; j < n; j++)
-			{
-				val = (tmp[j]+(1<<7))>>8;
-				if (val < 0)
-					val = 0;
-				else if (val > 255)
-					val = 255;
-				*dst++ = val;
-			}
-			/* And then duplicate it across the row */
-			for (j = w; j > 0; j--)
-			{
-				*dst = dst[-n];
-				dst++;
-			}
-		}
-	}
-	else
-	{
-		w = (w-1)*n;
-		for (i=weights->count; i > 0; i--)
-		{
-			/* Scale the next pixel in the column */
-			min = *contrib++;
-			len = *contrib++;
-			min *= n;
-			for (j = 0; j < n; j++)
-				tmp[j] = 0;
-			while (len-- > 0)
-			{
-				for (j = 0; j < n; j++)
-					tmp[j] += src[min++] * *contrib;
-				contrib++;
-			}
-			for (j = 0; j < n; j++)
-			{
-				val = (tmp[j]+(1<<7))>>8;
-				if (val < 0)
-					val = 0;
-				else if (val > 255)
-					val = 255;
-				*dst++ = val;
-			}
-			/* And then duplicate it across the row */
-			for (j = w; j > 0; j--)
-			{
-				*dst = dst[-n];
-				dst++;
-			}
-		}
-	}
-}
-#endif /* SINGLE_PIXEL_SPECIALS */
-
-fz_pixmap *
-fz_scalepixmap(fz_pixmap *src, float x, float y, float w, float h)
-{
-	fz_scalefilter *filter = &fz_scalefilter_simple;
-	fz_weights *contrib_rows = NULL;
-	fz_weights *contrib_cols = NULL;
-	fz_pixmap *output = NULL;
-	int *temp = NULL;
-	int max_row, temp_span, temp_rows, row;
-	int dst_w_int, dst_h_int, dst_x_int, dst_y_int;
-	int flip_x, flip_y;
-
-	DBUG(("Scale: (%d,%d) to (%g,%g) at (%g,%g)\n",src->w,src->h,w,h,x,y));
-
-	/* Find the destination bbox, width/height, and sub pixel offset,
-	 * allowing for whether we're flipping or not. */
-	/* Note that the x and y sub pixel offsets here are different.
-	 * The (x,y) position given describes where the bottom left corner
-	 * of the source image should be mapped to (i.e. where (0,h) in image
-	 * space ends up, not the more logical and sane (0,0)). Also there
-	 * are differences in the way we scale horizontally and vertically.
-	 * When scaling rows horizontally, we always read forwards through
-	 * the source, and store either forwards or in reverse as required.
-	 * When scaling vertically, we always store out forwards, but may
-	 * feed source rows in in a different order.
-	 *
-	 * Consider the image rectange 'r' to which the image is mapped,
-	 * and the (possibly) larger rectangle 'R', given by expanding 'r' to
-	 * complete pixels.
-	 *
-	 * x can either be r.xmin-R.xmin or R.xmax-r.xmax depending on whether
-	 * the image is x flipped or not. Whatever happens 0 <= x < 1.
-	 * y is always R.ymax - r.ymax.
-	 */
-	/* dst_x_int is calculated to be the left of the scaled image, and
-	 * x (the sub_pixel_offset) is the distance in from either the left
-	 * or right pixel expanded edge. */
-	flip_x = (w < 0);
-	if (flip_x)
-	{
-		float tmp;
-		w = -w;
-		dst_x_int = floor(x-w);
-		tmp = ceilf(x);
-		dst_w_int = (int)tmp;
-		x = tmp - x;
-		dst_w_int -= dst_x_int;
-	}
-	else
-	{
-		dst_x_int = floor(x);
-		x -= (float)dst_x_int;
-		dst_w_int = (int)ceilf(x + w);
-	}
-	flip_y = (h < 0);
-	/* dst_y_int is calculated to be the bottom of the scaled image, but
-	 * y (the sub pixel offset) has to end up being the value at the top.
-	 */
-	if (flip_y)
-	{
-		h = -h;
-		dst_y_int = floor(y-h);
-		dst_h_int = (int)ceilf(y) - dst_y_int;
-	} else {
-		dst_y_int = floor(y);
-		y += h;
-		dst_h_int = (int)ceilf(y) - dst_y_int;
-	}
-	/* y is the top edge position in floats. We want it to be the
-	 * distance down from the next pixel boundary. */
-	y = ceilf(y) - y;
-
-	DBUG(("Result image: (%d,%d) at (%d,%d) (subpix=%g,%g)\n", dst_w_int, dst_h_int, dst_x_int, dst_y_int, x, y));
-
-	/* Step 1: Calculate the weights for columns and rows */
-#ifdef SINGLE_PIXEL_SPECIALS
-	if (src->w == 1)
-	{
-		contrib_cols = NULL;
-	}
-	else
-#endif /* SINGLE_PIXEL_SPECIALS */
-	{
-		contrib_cols = make_weights(src->w, x, w, filter, 0, dst_w_int, src->n, flip_x);
-		if (contrib_cols == NULL)
-			goto cleanup;
-	}
-#ifdef SINGLE_PIXEL_SPECIALS
-	if (src->h == 1)
-	{
-		contrib_rows = NULL;
-	}
-	else
-#endif /* SINGLE_PIXEL_SPECIALS */
-	{
-		contrib_rows = make_weights(src->h, y, h, filter, 1, dst_h_int, src->n, flip_y);
-		if (contrib_rows == NULL)
-			goto cleanup;
-	}
-
-	assert(contrib_cols == NULL || contrib_cols->count == dst_w_int);
-	assert(contrib_rows == NULL || contrib_rows->count == dst_h_int);
-	output = fz_newpixmap(src->colorspace, dst_x_int, dst_y_int, dst_w_int, dst_h_int);
-	if (output == NULL)
-		goto cleanup;
-
-	/* Step 2: Apply the weights */
-#ifdef SINGLE_PIXEL_SPECIALS
-	if (contrib_rows == NULL)
-	{
-		/* Only 1 source pixel high. */
-		if (contrib_cols == NULL)
-		{
-			/* Only 1 pixel in the entire image! */
-			duplicate_single_pixel(output->samples, src->samples, src->n, dst_w_int, dst_h_int);
-		}
-		else
-		{
-			/* Scale the row once, then copy it. */
-			scale_single_row(output->samples, src->samples, contrib_cols, src->w, dst_h_int);
-		}
-	}
-	else if (contrib_cols == NULL)
-	{
-		/* Only 1 source pixel wide. Scale the col and duplicate. */
-		scale_single_col(output->samples, src->samples, contrib_rows, src->h, src->n, dst_w_int, flip_y);
-	}
-	else
-#endif /* SINGLE_PIXEL_SPECIALS */
-	{
-		void (*row_scale)(int *dst, unsigned char *src, fz_weights *weights);
-
-		temp_span = contrib_cols->count * src->n;
-		temp_rows = contrib_rows->max_len;
-		if (temp_span <= 0 || temp_rows > INT_MAX / temp_span)
-			goto cleanup;
-		temp = fz_calloc(temp_span*temp_rows, sizeof(int));
-		if (temp == NULL)
-			goto cleanup;
-		switch (src->n)
-		{
-		default:
-			row_scale = scale_row_to_temp;
-			break;
-		case 1: /* Image mask case */
-			row_scale = scale_row_to_temp1;
-			break;
-		case 2: /* Greyscale with alpha case */
-			row_scale = scale_row_to_temp2;
-			break;
-		case 4: /* RGBA */
-			row_scale = scale_row_to_temp4;
-			break;
-		}
-		max_row = 0;
-		for (row = 0; row < contrib_rows->count; row++)
-		{
-			/*
-			Which source rows do we need to have scaled into the
-			temporary buffer in order to be able to do the final
-			scale?
-			*/
-			int row_index = contrib_rows->index[row];
-			int row_min = contrib_rows->index[row_index++];
-			int row_len = contrib_rows->index[row_index++];
-			while (max_row < row_min+row_len)
-			{
-				/* Scale another row */
-				assert(max_row < src->h);
-				DBUG(("scaling row %d to temp\n", max_row));
-				(*row_scale)(&temp[temp_span*(max_row % temp_rows)], &src->samples[(flip_y ? (src->h-1-max_row): max_row)*src->w*src->n], contrib_cols);
-				max_row++;
-			}
-
-			DBUG(("scaling row %d from temp\n", row));
-			scale_row_from_temp(&output->samples[row*output->w*output->n], temp, contrib_rows, temp_span, row);
-		}
-		fz_free(temp);
-	}
-
-cleanup:
-	fz_free(contrib_rows);
-	fz_free(contrib_cols);
-	return output;
-}
diff --git a/draw/imageunpack.c b/draw/imageunpack.c
deleted file mode 100644
index 2c7cb452..00000000
--- a/draw/imageunpack.c
+++ /dev/null
@@ -1,235 +0,0 @@
-#include "fitz.h"
-
-/* Unpack image samples and optionally pad pixels with opaque alpha */
-
-#define get1(buf,x) ((buf[x >> 3] >> ( 7 - (x & 7) ) ) & 1 )
-#define get2(buf,x) ((buf[x >> 2] >> ( ( 3 - (x & 3) ) << 1 ) ) & 3 )
-#define get4(buf,x) ((buf[x >> 1] >> ( ( 1 - (x & 1) ) << 2 ) ) & 15 )
-#define get8(buf,x) (buf[x])
-#define get16(buf,x) (buf[x << 1])
-
-static unsigned char get1tab1[256][8];
-static unsigned char get1tab1p[256][16];
-static unsigned char get1tab255[256][8];
-static unsigned char get1tab255p[256][16];
-
-static void
-initget1tables(void)
-{
-	static int once = 0;
-	unsigned char bits[1];
-	int i, k, x;
-
-	/* TODO: mutex lock here */
-
-	if (once)
-		return;
-
-	for (i = 0; i < 256; i++)
-	{
-		bits[0] = i;
-		for (k = 0; k < 8; k++)
-		{
-			x = get1(bits, k);
-
-			get1tab1[i][k] = x;
-			get1tab1p[i][k * 2] = x;
-			get1tab1p[i][k * 2 + 1] = 255;
-
-			get1tab255[i][k] = x * 255;
-			get1tab255p[i][k * 2] = x * 255;
-			get1tab255p[i][k * 2 + 1] = 255;
-		}
-	}
-
-	once = 1;
-}
-
-void
-fz_unpacktile(fz_pixmap *dst, unsigned char * restrict src, int n, int depth, int stride, int scale)
-{
-	int pad, x, y, k;
-	int w = dst->w;
-
-	pad = 0;
-	if (dst->n > n)
-		pad = 255;
-
-	if (depth == 1)
-		initget1tables();
-
-	if (scale == 0)
-	{
-		switch (depth)
-		{
-		case 1: scale = 255; break;
-		case 2: scale = 85; break;
-		case 4: scale = 17; break;
-		}
-	}
-
-	for (y = 0; y < dst->h; y++)
-	{
-		unsigned char *sp = src + y * stride;
-		unsigned char *dp = dst->samples + y * (dst->w * dst->n);
-
-		/* Specialized loops */
-
-		if (n == 1 && depth == 1 && scale == 1 && !pad)
-		{
-			int w3 = w >> 3;
-			for (x = 0; x < w3; x++)
-			{
-				memcpy(dp, get1tab1[*sp++], 8);
-				dp += 8;
-			}
-			x = x << 3;
-			if (x < w)
-				memcpy(dp, get1tab1[*sp], w - x);
-		}
-
-		else if (n == 1 && depth == 1 && scale == 255 && !pad)
-		{
-			int w3 = w >> 3;
-			for (x = 0; x < w3; x++)
-			{
-				memcpy(dp, get1tab255[*sp++], 8);
-				dp += 8;
-			}
-			x = x << 3;
-			if (x < w)
-				memcpy(dp, get1tab255[*sp], w - x);
-		}
-
-		else if (n == 1 && depth == 1 && scale == 1 && pad)
-		{
-			int w3 = w >> 3;
-			for (x = 0; x < w3; x++)
-			{
-				memcpy(dp, get1tab1p[*sp++], 16);
-				dp += 16;
-			}
-			x = x << 3;
-			if (x < w)
-				memcpy(dp, get1tab1p[*sp], (w - x) << 1);
-		}
-
-		else if (n == 1 && depth == 1 && scale == 255 && pad)
-		{
-			int w3 = w >> 3;
-			for (x = 0; x < w3; x++)
-			{
-				memcpy(dp, get1tab255p[*sp++], 16);
-				dp += 16;
-			}
-			x = x << 3;
-			if (x < w)
-				memcpy(dp, get1tab255p[*sp], (w - x) << 1);
-		}
-
-		else if (depth == 8 && !pad)
-		{
-			int len = w * n;
-			while (len--)
-				*dp++ = *sp++;
-		}
-
-		else if (depth == 8 && pad)
-		{
-			for (x = 0; x < w; x++)
-			{
-				for (k = 0; k < n; k++)
-					*dp++ = *sp++;
-				*dp++ = 255;
-			}
-		}
-
-		else
-		{
-			int b = 0;
-			for (x = 0; x < w; x++)
-			{
-				for (k = 0; k < n; k++)
-				{
-					switch (depth)
-					{
-					case 1: *dp++ = get1(sp, b) * scale; break;
-					case 2: *dp++ = get2(sp, b) * scale; break;
-					case 4: *dp++ = get4(sp, b) * scale; break;
-					case 8: *dp++ = get8(sp, b); break;
-					case 16: *dp++ = get16(sp, b); break;
-					}
-					b++;
-				}
-				if (pad)
-					*dp++ = 255;
-			}
-		}
-	}
-}
-
-/* Apply decode array */
-
-void
-fz_decodeindexedtile(fz_pixmap *pix, float *decode, int maxval)
-{
-	int add[FZ_MAXCOLORS];
-	int mul[FZ_MAXCOLORS];
-	unsigned char *p = pix->samples;
-	int len = pix->w * pix->h;
-	int n = pix->n - 1;
-	int needed;
-	int k;
-
-	needed = 0;
-	for (k = 0; k < n; k++)
-	{
-		int min = decode[k * 2] * 256;
-		int max = decode[k * 2 + 1] * 256;
-		add[k] = min;
-		mul[k] = (max - min) / maxval;
-		needed |= min != 0 || max != maxval * 256;
-	}
-
-	if (!needed)
-		return;
-
-	while (len--)
-	{
-		for (k = 0; k < n; k++)
-			p[k] = (add[k] + (((p[k] << 8) * mul[k]) >> 8)) >> 8;
-		p += n + 1;
-	}
-}
-
-void
-fz_decodetile(fz_pixmap *pix, float *decode)
-{
-	int add[FZ_MAXCOLORS];
-	int mul[FZ_MAXCOLORS];
-	unsigned char *p = pix->samples;
-	int len = pix->w * pix->h;
-	int n = MAX(1, pix->n - 1);
-	int needed;
-	int k;
-
-	needed = 0;
-	for (k = 0; k < n; k++)
-	{
-		int min = decode[k * 2] * 255;
-		int max = decode[k * 2 + 1] * 255;
-		add[k] = min;
-		mul[k] = max - min;
-		needed |= min != 0 || max != 255;
-	}
-
-	if (!needed)
-		return;
-
-	while (len--)
-	{
-		for (k = 0; k < n; k++)
-			p[k] = add[k] + fz_mul255(p[k], mul[k]);
-		p += pix->n;
-	}
-}
diff --git a/draw/meshdraw.c b/draw/meshdraw.c
deleted file mode 100644
index 79437505..00000000
--- a/draw/meshdraw.c
+++ /dev/null
@@ -1,579 +0,0 @@
-#include "fitz.h"
-
-/*
- * polygon clipping
- */
-
-enum { IN, OUT, ENTER, LEAVE };
-enum { MAXV = 3 + 4 };
-enum { MAXN = 2 + FZ_MAXCOLORS };
-
-static int clipx(float val, int ismax, float *v1, float *v2, int n)
-{
-	float t;
-	int i;
-	int v1o = ismax ? v1[0] > val : v1[0] < val;
-	int v2o = ismax ? v2[0] > val : v2[0] < val;
-	if (v1o + v2o == 0)
-		return IN;
-	if (v1o + v2o == 2)
-		return OUT;
-	if (v2o)
-	{
-		t = (val - v1[0]) / (v2[0] - v1[0]);
-		v2[0] = val;
-		v2[1] = v1[1] + t * (v2[1] - v1[1]);
-		for (i = 2; i < n; i++)
-			v2[i] = v1[i] + t * (v2[i] - v1[i]);
-		return LEAVE;
-	}
-	else
-	{
-		t = (val - v2[0]) / (v1[0] - v2[0]);
-		v1[0] = val;
-		v1[1] = v2[1] + t * (v1[1] - v2[1]);
-		for (i = 2; i < n; i++)
-			v1[i] = v2[i] + t * (v1[i] - v2[i]);
-		return ENTER;
-	}
-}
-
-static int clipy(float val, int ismax, float *v1, float *v2, int n)
-{
-	float t;
-	int i;
-	int v1o = ismax ? v1[1] > val : v1[1] < val;
-	int v2o = ismax ? v2[1] > val : v2[1] < val;
-	if (v1o + v2o == 0)
-		return IN;
-	if (v1o + v2o == 2)
-		return OUT;
-	if (v2o)
-	{
-		t = (val - v1[1]) / (v2[1] - v1[1]);
-		v2[0] = v1[0] + t * (v2[0] - v1[0]);
-		v2[1] = val;
-		for (i = 2; i < n; i++)
-			v2[i] = v1[i] + t * (v2[i] - v1[i]);
-		return LEAVE;
-	}
-	else
-	{
-		t = (val - v2[1]) / (v1[1] - v2[1]);
-		v1[0] = v2[0] + t * (v1[0] - v2[0]);
-		v1[1] = val;
-		for (i = 2; i < n; i++)
-			v1[i] = v2[i] + t * (v1[i] - v2[i]);
-		return ENTER;
-	}
-}
-
-static inline void copyvert(float *dst, float *src, int n)
-{
-	while (n--)
-		*dst++ = *src++;
-}
-
-static int clippoly(float src[MAXV][MAXN],
-	float dst[MAXV][MAXN], int len, int n,
-	float val, int isy, int ismax)
-{
-	float cv1[MAXN];
-	float cv2[MAXN];
-	int v1, v2, cp;
-	int r;
-
-	v1 = len - 1;
-	cp = 0;
-
-	for (v2 = 0; v2 < len; v2++)
-	{
-		copyvert(cv1, src[v1], n);
-		copyvert(cv2, src[v2], n);
-
-		if (isy)
-			r = clipy(val, ismax, cv1, cv2, n);
-		else
-			r = clipx(val, ismax, cv1, cv2, n);
-
-		switch (r)
-		{
-		case IN:
-			copyvert(dst[cp++], cv2, n);
-			break;
-		case OUT:
-			break;
-		case LEAVE:
-			copyvert(dst[cp++], cv2, n);
-			break;
-		case ENTER:
-			copyvert(dst[cp++], cv1, n);
-			copyvert(dst[cp++], cv2, n);
-			break;
-		}
-		v1 = v2;
-	}
-
-	return cp;
-}
-
-/*
- * gouraud shaded polygon scan conversion
- */
-
-static inline void
-paintscan(fz_pixmap *pix, int y, int x1, int x2, int *v1, int *v2, int n)
-{
-	unsigned char *p = pix->samples + ((y - pix->y) * pix->w + (x1 - pix->x)) * pix->n;
-	int v[FZ_MAXCOLORS];
-	int dv[FZ_MAXCOLORS];
-	int w = x2 - x1;
-	int k;
-
-	assert(w >= 0);
-	assert(y >= pix->y);
-	assert(y < pix->y + pix->h);
-	assert(x1 >= pix->x);
-	assert(x2 <= pix->x + pix->w);
-
-	if (w == 0)
-		return;
-
-	for (k = 0; k < n; k++)
-	{
-		v[k] = v1[k];
-		dv[k] = (v2[k] - v1[k]) / w;
-	}
-
-	while (w--)
-	{
-		for (k = 0; k < n; k++)
-		{
-			*p++ = v[k] >> 16;
-			v[k] += dv[k];
-		}
-		*p++ = 255;
-	}
-}
-
-static inline int
-findnext(int gel[MAXV][MAXN], int len, int a, int *s, int *e, int d)
-{
-	int b;
-
-	while (1)
-	{
-		b = a + d;
-		if (b == len)
-			b = 0;
-		if (b == -1)
-			b = len - 1;
-
-		if (gel[b][1] == gel[a][1])
-		{
-			a = b;
-			continue;
-		}
-
-		if (gel[b][1] > gel[a][1])
-		{
-			*s = a;
-			*e = b;
-			return 0;
-		}
-
-		return 1;
-	}
-}
-
-static inline void
-loadedge(int gel[MAXV][MAXN], int s, int e, int *ael, int *del, int n)
-{
-	int swp, k, dy;
-
-	if (gel[s][1] > gel[e][1])
-	{
-		swp = s; s = e; e = swp;
-	}
-
-	dy = gel[e][1] - gel[s][1];
-
-	ael[0] = gel[s][0];
-	del[0] = (gel[e][0] - gel[s][0]) / dy;
-	for (k = 2; k < n; k++)
-	{
-		ael[k] = gel[s][k];
-		del[k] = (gel[e][k] - gel[s][k]) / dy;
-	}
-}
-
-static inline void
-stepedge(int *ael, int *del, int n)
-{
-	int k;
-	ael[0] += del[0];
-	for (k = 2; k < n; k++)
-		ael[k] += del[k];
-}
-
-static void
-fz_painttriangle(fz_pixmap *pix, float *av, float *bv, float *cv, int n, fz_bbox bbox)
-{
-	float poly[MAXV][MAXN];
-	float temp[MAXV][MAXN];
-	float cx0 = bbox.x0;
-	float cy0 = bbox.y0;
-	float cx1 = bbox.x1;
-	float cy1 = bbox.y1;
-
-	int gel[MAXV][MAXN];
-	int ael[2][MAXN];
-	int del[2][MAXN];
-	int y, s0, s1, e0, e1;
-	int top, bot, len;
-
-	int i, k;
-
-	copyvert(poly[0], av, n);
-	copyvert(poly[1], bv, n);
-	copyvert(poly[2], cv, n);
-
-	len = clippoly(poly, temp, 3, n, cx0, 0, 0);
-	len = clippoly(temp, poly, len, n, cx1, 0, 1);
-	len = clippoly(poly, temp, len, n, cy0, 1, 0);
-	len = clippoly(temp, poly, len, n, cy1, 1, 1);
-
-	if (len < 3)
-		return;
-
-	for (i = 0; i < len; i++)
-	{
-		gel[i][0] = floorf(poly[i][0] + 0.5f) * 65536; /* trunc and fix */
-		gel[i][1] = floorf(poly[i][1] + 0.5f);	/* y is not fixpoint */
-		for (k = 2; k < n; k++)
-			gel[i][k] = poly[i][k] * 65536;	/* fix with precision */
-	}
-
-	top = bot = 0;
-	for (i = 0; i < len; i++)
-	{
-		if (gel[i][1] < gel[top][1])
-			top = i;
-		if (gel[i][1] > gel[bot][1])
-			bot = i;
-	}
-
-	if (gel[bot][1] - gel[top][1] == 0)
-		return;
-
-	y = gel[top][1];
-
-	if (findnext(gel, len, top, &s0, &e0, 1))
-		return;
-	if (findnext(gel, len, top, &s1, &e1, -1))
-		return;
-
-	loadedge(gel, s0, e0, ael[0], del[0], n);
-	loadedge(gel, s1, e1, ael[1], del[1], n);
-
-	while (1)
-	{
-		int x0 = ael[0][0] >> 16;
-		int x1 = ael[1][0] >> 16;
-
-		if (ael[0][0] < ael[1][0])
-			paintscan(pix, y, x0, x1, ael[0]+2, ael[1]+2, n-2);
-		else
-			paintscan(pix, y, x1, x0, ael[1]+2, ael[0]+2, n-2);
-
-		stepedge(ael[0], del[0], n);
-		stepedge(ael[1], del[1], n);
-		y ++;
-
-		if (y >= gel[e0][1])
-		{
-			if (findnext(gel, len, e0, &s0, &e0, 1))
-				return;
-			loadedge(gel, s0, e0, ael[0], del[0], n);
-		}
-
-		if (y >= gel[e1][1])
-		{
-			if (findnext(gel, len, e1, &s1, &e1, -1))
-				return;
-			loadedge(gel, s1, e1, ael[1], del[1], n);
-		}
-	}
-}
-
-static void
-fz_paintquad(fz_pixmap *pix,
-		fz_point p0, fz_point p1, fz_point p2, fz_point p3,
-		float c0, float c1, float c2, float c3,
-		int n, fz_bbox bbox)
-{
-	float v[4][3];
-
-	v[0][0] = p0.x;
-	v[0][1] = p0.y;
-	v[0][2] = c0;
-
-	v[1][0] = p1.x;
-	v[1][1] = p1.y;
-	v[1][2] = c1;
-
-	v[2][0] = p2.x;
-	v[2][1] = p2.y;
-	v[2][2] = c2;
-
-	v[3][0] = p3.x;
-	v[3][1] = p3.y;
-	v[3][2] = c3;
-
-	fz_painttriangle(pix, v[0], v[2], v[3], n, bbox);
-	fz_painttriangle(pix, v[0], v[3], v[1], n, bbox);
-}
-
-/*
- * linear, radial and mesh painting
- */
-
-#define HUGENUM 32000 /* how far to extend axial/radial shadings */
-#define RADSEGS 32 /* how many segments to generate for radial meshes */
-
-static fz_point
-fz_pointoncircle(fz_point p, float r, float theta)
-{
-	p.x = p.x + cosf(theta) * r;
-	p.y = p.y + sinf(theta) * r;
-
-	return p;
-}
-
-static void
-fz_paintlinear(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
-{
-	fz_point p0, p1;
-	fz_point v0, v1, v2, v3;
-	fz_point e0, e1;
-	float theta;
-
-	p0.x = shade->mesh[0];
-	p0.y = shade->mesh[1];
-	p0 = fz_transformpoint(ctm, p0);
-
-	p1.x = shade->mesh[3];
-	p1.y = shade->mesh[4];
-	p1 = fz_transformpoint(ctm, p1);
-
-	theta = atan2f(p1.y - p0.y, p1.x - p0.x);
-	theta += (float)M_PI * 0.5f;
-
-	v0 = fz_pointoncircle(p0, HUGENUM, theta);
-	v1 = fz_pointoncircle(p1, HUGENUM, theta);
-	v2 = fz_pointoncircle(p0, -HUGENUM, theta);
-	v3 = fz_pointoncircle(p1, -HUGENUM, theta);
-
-	fz_paintquad(dest, v0, v1, v2, v3, 0, 255, 0, 255, 3, bbox);
-
-	if (shade->extend[0])
-	{
-		e0.x = v0.x - (p1.x - p0.x) * HUGENUM;
-		e0.y = v0.y - (p1.y - p0.y) * HUGENUM;
-
-		e1.x = v2.x - (p1.x - p0.x) * HUGENUM;
-		e1.y = v2.y - (p1.y - p0.y) * HUGENUM;
-
-		fz_paintquad(dest, e0, e1, v0, v2, 0, 0, 0, 0, 3, bbox);
-	}
-
-	if (shade->extend[1])
-	{
-		e0.x = v1.x + (p1.x - p0.x) * HUGENUM;
-		e0.y = v1.y + (p1.y - p0.y) * HUGENUM;
-
-		e1.x = v3.x + (p1.x - p0.x) * HUGENUM;
-		e1.y = v3.y + (p1.y - p0.y) * HUGENUM;
-
-		fz_paintquad(dest, e0, e1, v1, v3, 255, 255, 255, 255, 3, bbox);
-	}
-}
-
-static void
-fz_paintannulus(fz_matrix ctm,
-		fz_point p0, float r0, float c0,
-		fz_point p1, float r1, float c1,
-		fz_pixmap *dest, fz_bbox bbox)
-{
-	fz_point t0, t1, t2, t3, b0, b1, b2, b3;
-	float theta, step;
-	int i;
-
-	theta = atan2f(p1.y - p0.y, p1.x - p0.x);
-	step = (float)M_PI * 2 / RADSEGS;
-
-	for (i = 0; i < RADSEGS / 2; i++)
-	{
-		t0 = fz_pointoncircle(p0, r0, theta + i * step);
-		t1 = fz_pointoncircle(p0, r0, theta + i * step + step);
-		t2 = fz_pointoncircle(p1, r1, theta + i * step);
-		t3 = fz_pointoncircle(p1, r1, theta + i * step + step);
-		b0 = fz_pointoncircle(p0, r0, theta - i * step);
-		b1 = fz_pointoncircle(p0, r0, theta - i * step - step);
-		b2 = fz_pointoncircle(p1, r1, theta - i * step);
-		b3 = fz_pointoncircle(p1, r1, theta - i * step - step);
-
-		t0 = fz_transformpoint(ctm, t0);
-		t1 = fz_transformpoint(ctm, t1);
-		t2 = fz_transformpoint(ctm, t2);
-		t3 = fz_transformpoint(ctm, t3);
-		b0 = fz_transformpoint(ctm, b0);
-		b1 = fz_transformpoint(ctm, b1);
-		b2 = fz_transformpoint(ctm, b2);
-		b3 = fz_transformpoint(ctm, b3);
-
-		fz_paintquad(dest, t0, t1, t2, t3, c0, c0, c1, c1, 3, bbox);
-		fz_paintquad(dest, b0, b1, b2, b3, c0, c0, c1, c1, 3, bbox);
-	}
-}
-
-static void
-fz_paintradial(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
-{
-	fz_point p0, p1;
-	float r0, r1;
-	fz_point e;
-	float er, rs;
-
-	p0.x = shade->mesh[0];
-	p0.y = shade->mesh[1];
-	r0 = shade->mesh[2];
-
-	p1.x = shade->mesh[3];
-	p1.y = shade->mesh[4];
-	r1 = shade->mesh[5];
-
-	if (shade->extend[0])
-	{
-		if (r0 < r1)
-			rs = r0 / (r0 - r1);
-		else
-			rs = -HUGENUM;
-
-		e.x = p0.x + (p1.x - p0.x) * rs;
-		e.y = p0.y + (p1.y - p0.y) * rs;
-		er = r0 + (r1 - r0) * rs;
-
-		fz_paintannulus(ctm, e, er, 0, p0, r0, 0, dest, bbox);
-	}
-
-	fz_paintannulus(ctm, p0, r0, 0, p1, r1, 255, dest, bbox);
-
-	if (shade->extend[1])
-	{
-		if (r0 > r1)
-			rs = r1 / (r1 - r0);
-		else
-			rs = -HUGENUM;
-
-		e.x = p1.x + (p0.x - p1.x) * rs;
-		e.y = p1.y + (p0.y - p1.y) * rs;
-		er = r1 + (r0 - r1) * rs;
-
-		fz_paintannulus(ctm, p1, r1, 255, e, er, 255, dest, bbox);
-	}
-}
-
-static void
-fz_paintmesh(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
-{
-	float tri[3][MAXN];
-	fz_point p;
-	float *mesh;
-	int ntris;
-	int i, k;
-
-	mesh = shade->mesh;
-
-	if (shade->usefunction)
-		ntris = shade->meshlen / 9;
-	else
-		ntris = shade->meshlen / ((2 + shade->colorspace->n) * 3);
-
-	while (ntris--)
-	{
-		for (k = 0; k < 3; k++)
-		{
-			p.x = *mesh++;
-			p.y = *mesh++;
-			p = fz_transformpoint(ctm, p);
-			tri[k][0] = p.x;
-			tri[k][1] = p.y;
-			if (shade->usefunction)
-				tri[k][2] = *mesh++ * 255;
-			else
-			{
-				fz_convertcolor(shade->colorspace, mesh, dest->colorspace, tri[k] + 2);
-				for (i = 0; i < dest->colorspace->n; i++)
-					tri[k][i + 2] *= 255;
-				mesh += shade->colorspace->n;
-			}
-		}
-		fz_painttriangle(dest, tri[0], tri[1], tri[2], 2 + dest->colorspace->n, bbox);
-	}
-}
-
-void
-fz_paintshade(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
-{
-	unsigned char clut[256][FZ_MAXCOLORS];
-	fz_pixmap *temp, *conv;
-	float color[FZ_MAXCOLORS];
-	int i, k;
-
-	ctm = fz_concat(shade->matrix, ctm);
-
-	if (shade->usefunction)
-	{
-		for (i = 0; i < 256; i++)
-		{
-			fz_convertcolor(shade->colorspace, shade->function[i], dest->colorspace, color);
-			for (k = 0; k < dest->colorspace->n; k++)
-				clut[i][k] = color[k] * 255;
-			clut[i][k] = shade->function[i][shade->colorspace->n] * 255;
-		}
-		conv = fz_newpixmapwithrect(dest->colorspace, bbox);
-		temp = fz_newpixmapwithrect(fz_devicegray, bbox);
-		fz_clearpixmap(temp);
-	}
-	else
-	{
-		temp = dest;
-	}
-
-	switch (shade->type)
-	{
-	case FZ_LINEAR: fz_paintlinear(shade, ctm, temp, bbox); break;
-	case FZ_RADIAL: fz_paintradial(shade, ctm, temp, bbox); break;
-	case FZ_MESH: fz_paintmesh(shade, ctm, temp, bbox); break;
-	}
-
-	if (shade->usefunction)
-	{
-		unsigned char *s = temp->samples;
-		unsigned char *d = conv->samples;
-		int len = temp->w * temp->h;
-		while (len--)
-		{
-			int v = *s++;
-			int a = fz_mul255(*s++, clut[v][conv->n - 1]);
-			for (k = 0; k < conv->n - 1; k++)
-				*d++ = fz_mul255(clut[v][k], a);
-			*d++ = a;
-		}
-		fz_paintpixmap(dest, conv, 255);
-		fz_droppixmap(conv);
-		fz_droppixmap(temp);
-	}
-}
-
diff --git a/draw/pathscan.c b/draw/pathscan.c
deleted file mode 100644
index aa956077..00000000
--- a/draw/pathscan.c
+++ /dev/null
@@ -1,559 +0,0 @@
-#include "fitz.h"
-
-#define BBOX_MIN -(1<<20)
-#define BBOX_MAX (1<<20)
-
-/* divide and floor towards -inf */
-static inline int fz_idiv(int a, int b)
-{
-	return a < 0 ? (a - b + 1) / b : a / b;
-}
-
-enum { HSCALE = 17, VSCALE = 15, SF = 1 };
-
-/*
- * Global Edge List -- list of straight path segments for scan conversion
- *
- * Stepping along the edges is with bresenham's line algorithm.
- *
- * See Mike Abrash -- Graphics Programming Black Book (notably chapter 40)
- */
-
-fz_gel *
-fz_newgel(void)
-{
-	fz_gel *gel;
-
-	gel = fz_malloc(sizeof(fz_gel));
-	gel->cap = 512;
-	gel->len = 0;
-	gel->edges = fz_calloc(gel->cap, sizeof(fz_edge));
-
-	gel->clip.x0 = gel->clip.y0 = BBOX_MAX;
-	gel->clip.x1 = gel->clip.y1 = BBOX_MIN;
-
-	gel->bbox.x0 = gel->bbox.y0 = BBOX_MAX;
-	gel->bbox.x1 = gel->bbox.y1 = BBOX_MIN;
-
-	return gel;
-}
-
-void
-fz_resetgel(fz_gel *gel, fz_bbox clip)
-{
-	if (fz_isinfiniterect(clip))
-	{
-		gel->clip.x0 = gel->clip.y0 = BBOX_MAX;
-		gel->clip.x1 = gel->clip.y1 = BBOX_MIN;
-	}
-	else {
-		gel->clip.x0 = clip.x0 * HSCALE;
-		gel->clip.x1 = clip.x1 * HSCALE;
-		gel->clip.y0 = clip.y0 * VSCALE;
-		gel->clip.y1 = clip.y1 * VSCALE;
-	}
-
-	gel->bbox.x0 = gel->bbox.y0 = BBOX_MAX;
-	gel->bbox.x1 = gel->bbox.y1 = BBOX_MIN;
-
-	gel->len = 0;
-}
-
-void
-fz_freegel(fz_gel *gel)
-{
-	fz_free(gel->edges);
-	fz_free(gel);
-}
-
-fz_bbox
-fz_boundgel(fz_gel *gel)
-{
-	fz_bbox bbox;
-	if (gel->len == 0)
-		return fz_emptybbox;
-	bbox.x0 = fz_idiv(gel->bbox.x0, HSCALE);
-	bbox.y0 = fz_idiv(gel->bbox.y0, VSCALE);
-	bbox.x1 = fz_idiv(gel->bbox.x1, HSCALE) + 1;
-	bbox.y1 = fz_idiv(gel->bbox.y1, VSCALE) + 1;
-	return bbox;
-}
-
-enum { INSIDE, OUTSIDE, LEAVE, ENTER };
-
-#define cliplerpy(v,m,x0,y0,x1,y1,t) cliplerpx(v,m,y0,x0,y1,x1,t)
-
-static int
-cliplerpx(int val, int m, int x0, int y0, int x1, int y1, int *out)
-{
-	int v0out = m ? x0 > val : x0 < val;
-	int v1out = m ? x1 > val : x1 < val;
-
-	if (v0out + v1out == 0)
-		return INSIDE;
-
-	if (v0out + v1out == 2)
-		return OUTSIDE;
-
-	if (v1out)
-	{
-		*out = y0 + (y1 - y0) * (val - x0) / (x1 - x0);
-		return LEAVE;
-	}
-
-	else
-	{
-		*out = y1 + (y0 - y1) * (val - x1) / (x0 - x1);
-		return ENTER;
-	}
-}
-
-static void
-fz_insertgelraw(fz_gel *gel, int x0, int y0, int x1, int y1)
-{
-	fz_edge *edge;
-	int dx, dy;
-	int winding;
-	int width;
-	int tmp;
-
-	if (y0 == y1)
-		return;
-
-	if (y0 > y1) {
-		winding = -1;
-		tmp = x0; x0 = x1; x1 = tmp;
-		tmp = y0; y0 = y1; y1 = tmp;
-	}
-	else
-		winding = 1;
-
-	if (x0 < gel->bbox.x0) gel->bbox.x0 = x0;
-	if (x0 > gel->bbox.x1) gel->bbox.x1 = x0;
-	if (x1 < gel->bbox.x0) gel->bbox.x0 = x1;
-	if (x1 > gel->bbox.x1) gel->bbox.x1 = x1;
-
-	if (y0 < gel->bbox.y0) gel->bbox.y0 = y0;
-	if (y1 > gel->bbox.y1) gel->bbox.y1 = y1;
-
-	if (gel->len + 1 == gel->cap) {
-		gel->cap = gel->cap + 512;
-		gel->edges = fz_realloc(gel->edges, gel->cap, sizeof(fz_edge));
-	}
-
-	edge = &gel->edges[gel->len++];
-
-	dy = y1 - y0;
-	dx = x1 - x0;
-	width = ABS(dx);
-
-	edge->xdir = dx > 0 ? 1 : -1;
-	edge->ydir = winding;
-	edge->x = x0;
-	edge->y = y0;
-	edge->h = dy;
-	edge->adjdown = dy;
-
-	/* initial error term going l->r and r->l */
-	if (dx >= 0)
-		edge->e = 0;
-	else
-		edge->e = -dy + 1;
-
-	/* y-major edge */
-	if (dy >= width) {
-		edge->xmove = 0;
-		edge->adjup = width;
-	}
-
-	/* x-major edge */
-	else {
-		edge->xmove = (width / dy) * edge->xdir;
-		edge->adjup = width % dy;
-	}
-}
-
-void
-fz_insertgel(fz_gel *gel, float fx0, float fy0, float fx1, float fy1)
-{
-	int x0, y0, x1, y1;
-	int d, v;
-
-	fx0 = floorf(fx0 * HSCALE);
-	fx1 = floorf(fx1 * HSCALE);
-	fy0 = floorf(fy0 * VSCALE);
-	fy1 = floorf(fy1 * VSCALE);
-
-	x0 = CLAMP(fx0, BBOX_MIN, BBOX_MAX);
-	y0 = CLAMP(fy0, BBOX_MIN, BBOX_MAX);
-	x1 = CLAMP(fx1, BBOX_MIN, BBOX_MAX);
-	y1 = CLAMP(fy1, BBOX_MIN, BBOX_MAX);
-
-	d = cliplerpy(gel->clip.y0, 0, x0, y0, x1, y1, &v);
-	if (d == OUTSIDE) return;
-	if (d == LEAVE) { y1 = gel->clip.y0; x1 = v; }
-	if (d == ENTER) { y0 = gel->clip.y0; x0 = v; }
-
-	d = cliplerpy(gel->clip.y1, 1, x0, y0, x1, y1, &v);
-	if (d == OUTSIDE) return;
-	if (d == LEAVE) { y1 = gel->clip.y1; x1 = v; }
-	if (d == ENTER) { y0 = gel->clip.y1; x0 = v; }
-
-	d = cliplerpx(gel->clip.x0, 0, x0, y0, x1, y1, &v);
-	if (d == OUTSIDE) {
-		x0 = x1 = gel->clip.x0;
-	}
-	if (d == LEAVE) {
-		fz_insertgelraw(gel, gel->clip.x0, v, gel->clip.x0, y1);
-		x1 = gel->clip.x0;
-		y1 = v;
-	}
-	if (d == ENTER) {
-		fz_insertgelraw(gel, gel->clip.x0, y0, gel->clip.x0, v);
-		x0 = gel->clip.x0;
-		y0 = v;
-	}
-
-	d = cliplerpx(gel->clip.x1, 1, x0, y0, x1, y1, &v);
-	if (d == OUTSIDE) {
-		x0 = x1 = gel->clip.x1;
-	}
-	if (d == LEAVE) {
-		fz_insertgelraw(gel, gel->clip.x1, v, gel->clip.x1, y1);
-		x1 = gel->clip.x1;
-		y1 = v;
-	}
-	if (d == ENTER) {
-		fz_insertgelraw(gel, gel->clip.x1, y0, gel->clip.x1, v);
-		x0 = gel->clip.x1;
-		y0 = v;
-	}
-
-	fz_insertgelraw(gel, x0, y0, x1, y1);
-}
-
-void
-fz_sortgel(fz_gel *gel)
-{
-	fz_edge *a = gel->edges;
-	int n = gel->len;
-
-	int h, i, k;
-	fz_edge t;
-
-	h = 1;
-	if (n < 14) {
-		h = 1;
-	}
-	else {
-		while (h < n)
-			h = 3 * h + 1;
-		h /= 3;
-		h /= 3;
-	}
-
-	while (h > 0)
-	{
-		for (i = 0; i < n; i++) {
-			t = a[i];
-			k = i - h;
-			/* TODO: sort on y major, x minor */
-			while (k >= 0 && a[k].y > t.y) {
-				a[k + h] = a[k];
-				k -= h;
-			}
-			a[k + h] = t;
-		}
-
-		h /= 3;
-	}
-}
-
-int
-fz_isrectgel(fz_gel *gel)
-{
-	/* a rectangular path is converted into two vertical edges of identical height */
-	if (gel->len == 2)
-	{
-		fz_edge *a = gel->edges + 0;
-		fz_edge *b = gel->edges + 1;
-		return a->y == b->y && a->h == b->h &&
-			a->xmove == 0 && a->adjup == 0 &&
-			b->xmove == 0 && b->adjup == 0;
-	}
-	return 0;
-}
-
-/*
- * Active Edge List -- keep track of active edges while sweeping
- */
-
-fz_ael *
-fz_newael(void)
-{
-	fz_ael *ael;
-	ael = fz_malloc(sizeof(fz_ael));
-	ael->cap = 64;
-	ael->len = 0;
-	ael->edges = fz_calloc(ael->cap, sizeof(fz_edge*));
-	return ael;
-}
-
-void
-fz_freeael(fz_ael *ael)
-{
-	fz_free(ael->edges);
-	fz_free(ael);
-}
-
-static inline void
-sortael(fz_edge **a, int n)
-{
-	int h, i, k;
-	fz_edge *t;
-
-	h = 1;
-	if (n < 14) {
-		h = 1;
-	}
-	else {
-		while (h < n)
-			h = 3 * h + 1;
-		h /= 3;
-		h /= 3;
-	}
-
-	while (h > 0)
-	{
-		for (i = 0; i < n; i++) {
-			t = a[i];
-			k = i - h;
-			while (k >= 0 && a[k]->x > t->x) {
-				a[k + h] = a[k];
-				k -= h;
-			}
-			a[k + h] = t;
-		}
-
-		h /= 3;
-	}
-}
-
-static fz_error
-insertael(fz_ael *ael, fz_gel *gel, int y, int *e)
-{
-	/* insert edges that start here */
-	while (*e < gel->len && gel->edges[*e].y == y) {
-		if (ael->len + 1 == ael->cap) {
-			int newcap = ael->cap + 64;
-			fz_edge **newedges = fz_realloc(ael->edges, newcap, sizeof(fz_edge*));
-			ael->edges = newedges;
-			ael->cap = newcap;
-		}
-		ael->edges[ael->len++] = &gel->edges[(*e)++];
-	}
-
-	/* shell-sort the edges by increasing x */
-	sortael(ael->edges, ael->len);
-
-	return fz_okay;
-}
-
-static void
-advanceael(fz_ael *ael)
-{
-	fz_edge *edge;
-	int i = 0;
-
-	while (i < ael->len)
-	{
-		edge = ael->edges[i];
-
-		edge->h --;
-
-		/* terminator! */
-		if (edge->h == 0) {
-			ael->edges[i] = ael->edges[--ael->len];
-		}
-
-		else {
-			edge->x += edge->xmove;
-			edge->e += edge->adjup;
-			if (edge->e > 0) {
-				edge->x += edge->xdir;
-				edge->e -= edge->adjdown;
-			}
-			i ++;
-		}
-	}
-}
-
-/*
- * Scan convert
- */
-
-static inline void
-addspan(unsigned char *list, int x0, int x1, int xofs)
-{
-	int x0pix, x0sub;
-	int x1pix, x1sub;
-
-	if (x0 == x1)
-		return;
-
-	/* x between 0 and width of bbox */
-	x0 -= xofs;
-	x1 -= xofs;
-
-	x0pix = x0 / HSCALE;
-	x0sub = x0 % HSCALE;
-	x1pix = x1 / HSCALE;
-	x1sub = x1 % HSCALE;
-
-	if (x0pix == x1pix)
-	{
-		list[x0pix] += x1sub - x0sub;
-		list[x0pix+1] += x0sub - x1sub;
-	}
-
-	else
-	{
-		list[x0pix] += HSCALE - x0sub;
-		list[x0pix+1] += x0sub;
-		list[x1pix] += x1sub - HSCALE;
-		list[x1pix+1] += -x1sub;
-	}
-}
-
-static inline void
-nonzerowinding(fz_ael *ael, unsigned char *list, int xofs)
-{
-	int winding = 0;
-	int x = 0;
-	int i;
-	for (i = 0; i < ael->len; i++)
-	{
-		if (!winding && (winding + ael->edges[i]->ydir))
-			x = ael->edges[i]->x;
-		if (winding && !(winding + ael->edges[i]->ydir))
-			addspan(list, x, ael->edges[i]->x, xofs);
-		winding += ael->edges[i]->ydir;
-	}
-}
-
-static inline void
-evenodd(fz_ael *ael, unsigned char *list, int xofs)
-{
-	int even = 0;
-	int x = 0;
-	int i;
-	for (i = 0; i < ael->len; i++)
-	{
-		if (!even)
-			x = ael->edges[i]->x;
-		else
-			addspan(list, x, ael->edges[i]->x, xofs);
-		even = !even;
-	}
-}
-
-static inline void
-undelta(unsigned char *list, int n)
-{
-	int d = 0;
-	while (n--)
-	{
-		d += *list;
-		*list++ = d;
-	}
-}
-
-static inline void
-blit(fz_pixmap *dest, int x, int y, unsigned char *mp, int w, unsigned char *color)
-{
-	unsigned char *dp;
-
-	dp = dest->samples + ( (y - dest->y) * dest->w + (x - dest->x) ) * dest->n;
-
-	if (color)
-		fz_paintspancolor(dp, mp, dest->n, w, color);
-	else
-		fz_paintspan(dp, mp, 1, w, 255);
-}
-
-fz_error
-fz_scanconvert(fz_gel *gel, fz_ael *ael, int eofill, fz_bbox clip,
-	fz_pixmap *dest, unsigned char *color)
-{
-	fz_error error;
-	unsigned char *deltas;
-	int y, e;
-	int yd, yc;
-
-	int xmin = fz_idiv(gel->bbox.x0, HSCALE);
-	int xmax = fz_idiv(gel->bbox.x1, HSCALE) + 1;
-
-	int xofs = xmin * HSCALE;
-
-	int skipx = clip.x0 - xmin;
-	int clipn = clip.x1 - clip.x0;
-
-	if (gel->len == 0)
-		return fz_okay;
-
-	assert(clip.x0 >= xmin);
-	assert(clip.x1 <= xmax);
-
-	deltas = fz_malloc(xmax - xmin + 1);
-	memset(deltas, 0, xmax - xmin + 1);
-
-	e = 0;
-	y = gel->edges[0].y;
-	yc = fz_idiv(y, VSCALE);
-	yd = yc;
-
-	while (ael->len > 0 || e < gel->len)
-	{
-		yc = fz_idiv(y, VSCALE);
-		if (yc != yd)
-		{
-			if (yd >= clip.y0 && yd < clip.y1)
-			{
-				undelta(deltas, skipx + clipn);
-				blit(dest, xmin + skipx, yd, deltas + skipx, clipn, color);
-				memset(deltas, 0, skipx + clipn);
-			}
-		}
-		yd = yc;
-
-		error = insertael(ael, gel, y, &e);
-		if (error) {
-			fz_free(deltas);
-			return error;
-		}
-
-		if (yd >= clip.y0 && yd < clip.y1)
-		{
-			if (eofill)
-				evenodd(ael, deltas, xofs);
-			else
-				nonzerowinding(ael, deltas, xofs);
-		}
-
-		advanceael(ael);
-
-		if (ael->len > 0)
-			y ++;
-		else if (e < gel->len)
-			y = gel->edges[e].y;
-	}
-
-	if (yd >= clip.y0 && yd < clip.y1)
-	{
-		undelta(deltas, skipx + clipn);
-		blit(dest, xmin + skipx, yd, deltas + skipx, clipn, color);
-	}
-
-	fz_free(deltas);
-	return fz_okay;
-}
diff --git a/draw/pathstroke.c b/draw/pathstroke.c
deleted file mode 100644
index 23ea3515..00000000
--- a/draw/pathstroke.c
+++ /dev/null
@@ -1,773 +0,0 @@
-#include "fitz.h"
-
-#define MAXDEPTH 8
-
-enum { BUTT = 0, ROUND = 1, SQUARE = 2, MITER = 0, BEVEL = 2 };
-
-static void
-line(fz_gel *gel, fz_matrix *ctm, float x0, float y0, float x1, float y1)
-{
-	float tx0 = ctm->a * x0 + ctm->c * y0 + ctm->e;
-	float ty0 = ctm->b * x0 + ctm->d * y0 + ctm->f;
-	float tx1 = ctm->a * x1 + ctm->c * y1 + ctm->e;
-	float ty1 = ctm->b * x1 + ctm->d * y1 + ctm->f;
-	fz_insertgel(gel, tx0, ty0, tx1, ty1);
-}
-
-static void
-bezier(fz_gel *gel, fz_matrix *ctm, float flatness,
-	float xa, float ya,
-	float xb, float yb,
-	float xc, float yc,
-	float xd, float yd, int depth)
-{
-	float dmax;
-	float xab, yab;
-	float xbc, ybc;
-	float xcd, ycd;
-	float xabc, yabc;
-	float xbcd, ybcd;
-	float xabcd, yabcd;
-
-	/* termination check */
-	dmax = ABS(xa - xb);
-	dmax = MAX(dmax, ABS(ya - yb));
-	dmax = MAX(dmax, ABS(xd - xc));
-	dmax = MAX(dmax, ABS(yd - yc));
-	if (dmax < flatness || depth >= MAXDEPTH)
-	{
-		line(gel, ctm, xa, ya, xd, yd);
-		return;
-	}
-
-	xab = xa + xb;
-	yab = ya + yb;
-	xbc = xb + xc;
-	ybc = yb + yc;
-	xcd = xc + xd;
-	ycd = yc + yd;
-
-	xabc = xab + xbc;
-	yabc = yab + ybc;
-	xbcd = xbc + xcd;
-	ybcd = ybc + ycd;
-
-	xabcd = xabc + xbcd;
-	yabcd = yabc + ybcd;
-
-	xab *= 0.5f; yab *= 0.5f;
-	xbc *= 0.5f; ybc *= 0.5f;
-	xcd *= 0.5f; ycd *= 0.5f;
-
-	xabc *= 0.25f; yabc *= 0.25f;
-	xbcd *= 0.25f; ybcd *= 0.25f;
-
-	xabcd *= 0.125f; yabcd *= 0.125f;
-
-	bezier(gel, ctm, flatness, xa, ya, xab, yab, xabc, yabc, xabcd, yabcd, depth + 1);
-	bezier(gel, ctm, flatness, xabcd, yabcd, xbcd, ybcd, xcd, ycd, xd, yd, depth + 1);
-}
-
-void
-fz_fillpath(fz_gel *gel, fz_path *path, fz_matrix ctm, float flatness)
-{
-	float x1, y1, x2, y2, x3, y3;
-	float cx = 0;
-	float cy = 0;
-	float bx = 0;
-	float by = 0;
-	int i = 0;
-
-	while (i < path->len)
-	{
-		switch (path->els[i++].k)
-		{
-		case FZ_MOVETO:
-			/* implicit closepath before moveto */
-			if (i && (cx != bx || cy != by))
-				line(gel, &ctm, cx, cy, bx, by);
-			x1 = path->els[i++].v;
-			y1 = path->els[i++].v;
-			cx = bx = x1;
-			cy = by = y1;
-			break;
-
-		case FZ_LINETO:
-			x1 = path->els[i++].v;
-			y1 = path->els[i++].v;
-			line(gel, &ctm, cx, cy, x1, y1);
-			cx = x1;
-			cy = y1;
-			break;
-
-		case FZ_CURVETO:
-			x1 = path->els[i++].v;
-			y1 = path->els[i++].v;
-			x2 = path->els[i++].v;
-			y2 = path->els[i++].v;
-			x3 = path->els[i++].v;
-			y3 = path->els[i++].v;
-			bezier(gel, &ctm, flatness, cx, cy, x1, y1, x2, y2, x3, y3, 0);
-			cx = x3;
-			cy = y3;
-			break;
-
-		case FZ_CLOSEPATH:
-			line(gel, &ctm, cx, cy, bx, by);
-			cx = bx;
-			cy = by;
-			break;
-		}
-	}
-
-	if (i && (cx != bx || cy != by))
-		line(gel, &ctm, cx, cy, bx, by);
-}
-
-struct sctx
-{
-	fz_gel *gel;
-	fz_matrix *ctm;
-	float flatness;
-
-	int linecap;
-	int linejoin;
-	float linewidth;
-	float miterlimit;
-	fz_point beg[2];
-	fz_point seg[2];
-	int sn, bn;
-	int dot;
-
-	float *dashlist;
-	float dashphase;
-	int dashlen;
-	int toggle;
-	int offset;
-	float phase;
-	fz_point cur;
-};
-
-static void
-fz_addline(struct sctx *s, float x0, float y0, float x1, float y1)
-{
-	float tx0 = s->ctm->a * x0 + s->ctm->c * y0 + s->ctm->e;
-	float ty0 = s->ctm->b * x0 + s->ctm->d * y0 + s->ctm->f;
-	float tx1 = s->ctm->a * x1 + s->ctm->c * y1 + s->ctm->e;
-	float ty1 = s->ctm->b * x1 + s->ctm->d * y1 + s->ctm->f;
-	fz_insertgel(s->gel, tx0, ty0, tx1, ty1);
-}
-
-static void
-fz_addarc(struct sctx *s,
-	float xc, float yc,
-	float x0, float y0,
-	float x1, float y1)
-{
-	float th0, th1, r;
-	float theta;
-	float ox, oy, nx, ny;
-	int n, i;
-
-	r = fabsf(s->linewidth);
-	theta = 2 * (float)M_SQRT2 * sqrtf(s->flatness / r);
-	th0 = atan2f(y0, x0);
-	th1 = atan2f(y1, x1);
-
-	if (r > 0)
-	{
-		if (th0 < th1)
-			th0 += (float)M_PI * 2;
-		n = ceilf((th0 - th1) / theta);
-	}
-	else
-	{
-		if (th1 < th0)
-			th1 += (float)M_PI * 2;
-		n = ceilf((th1 - th0) / theta);
-	}
-
-	ox = x0;
-	oy = y0;
-	for (i = 1; i < n; i++)
-	{
-		theta = th0 + (th1 - th0) * i / n;
-		nx = cosf(theta) * r;
-		ny = sinf(theta) * r;
-		fz_addline(s, xc + ox, yc + oy, xc + nx, yc + ny);
-		ox = nx;
-		oy = ny;
-	}
-
-	fz_addline(s, xc + ox, yc + oy, xc + x1, yc + y1);
-}
-
-static void
-fz_linestroke(struct sctx *s, fz_point a, fz_point b)
-{
-	float dx = b.x - a.x;
-	float dy = b.y - a.y;
-	float scale = s->linewidth / sqrtf(dx * dx + dy * dy);
-	float dlx = dy * scale;
-	float dly = -dx * scale;
-	fz_addline(s, a.x - dlx, a.y - dly, b.x - dlx, b.y - dly);
-	fz_addline(s, b.x + dlx, b.y + dly, a.x + dlx, a.y + dly);
-}
-
-static void
-fz_linejoin(struct sctx *s, fz_point a, fz_point b, fz_point c)
-{
-	float miterlimit = s->miterlimit;
-	float linewidth = s->linewidth;
-	int linejoin = s->linejoin;
-	float dx0, dy0;
-	float dx1, dy1;
-	float dlx0, dly0;
-	float dlx1, dly1;
-	float dmx, dmy;
-	float dmr2;
-	float scale;
-	float cross;
-
-	dx0 = b.x - a.x;
-	dy0 = b.y - a.y;
-
-	dx1 = c.x - b.x;
-	dy1 = c.y - b.y;
-
-	if (dx0 * dx0 + dy0 * dy0 < FLT_EPSILON)
-		linejoin = BEVEL;
-	if (dx1 * dx1 + dy1 * dy1 < FLT_EPSILON)
-		linejoin = BEVEL;
-
-	scale = linewidth / sqrtf(dx0 * dx0 + dy0 * dy0);
-	dlx0 = dy0 * scale;
-	dly0 = -dx0 * scale;
-
-	scale = linewidth / sqrtf(dx1 * dx1 + dy1 * dy1);
-	dlx1 = dy1 * scale;
-	dly1 = -dx1 * scale;
-
-	cross = dx1 * dy0 - dx0 * dy1;
-
-	dmx = (dlx0 + dlx1) * 0.5f;
-	dmy = (dly0 + dly1) * 0.5f;
-	dmr2 = dmx * dmx + dmy * dmy;
-
-	if (cross * cross < FLT_EPSILON && dx0 * dx1 + dy0 * dy1 >= 0)
-		linejoin = BEVEL;
-
-	if (linejoin == MITER)
-		if (dmr2 * miterlimit * miterlimit < linewidth * linewidth)
-			linejoin = BEVEL;
-
-	if (linejoin == BEVEL)
-	{
-		fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dlx1, b.y - dly1);
-		fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dlx0, b.y + dly0);
-	}
-
-	if (linejoin == MITER)
-	{
-		scale = linewidth * linewidth / dmr2;
-		dmx *= scale;
-		dmy *= scale;
-
-		if (cross < 0)
-		{
-			fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dlx1, b.y - dly1);
-			fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dmx, b.y + dmy);
-			fz_addline(s, b.x + dmx, b.y + dmy, b.x + dlx0, b.y + dly0);
-		}
-		else
-		{
-			fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dlx0, b.y + dly0);
-			fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dmx, b.y - dmy);
-			fz_addline(s, b.x - dmx, b.y - dmy, b.x - dlx1, b.y - dly1);
-		}
-	}
-
-	if (linejoin == ROUND)
-	{
-		if (cross < 0)
-		{
-			fz_addline(s, b.x - dlx0, b.y - dly0, b.x - dlx1, b.y - dly1);
-			fz_addarc(s, b.x, b.y, dlx1, dly1, dlx0, dly0);
-		}
-		else
-		{
-			fz_addline(s, b.x + dlx1, b.y + dly1, b.x + dlx0, b.y + dly0);
-			fz_addarc(s, b.x, b.y, -dlx0, -dly0, -dlx1, -dly1);
-		}
-	}
-}
-
-static void
-fz_linecap(struct sctx *s, fz_point a, fz_point b)
-{
-	float flatness = s->flatness;
-	float linewidth = s->linewidth;
-	int linecap = s->linecap;
-
-	float dx = b.x - a.x;
-	float dy = b.y - a.y;
-
-	float scale = linewidth / sqrtf(dx * dx + dy * dy);
-	float dlx = dy * scale;
-	float dly = -dx * scale;
-
-	if (linecap == BUTT)
-		fz_addline(s, b.x - dlx, b.y - dly, b.x + dlx, b.y + dly);
-
-	if (linecap == ROUND)
-	{
-		int i;
-		int n = ceilf((float)M_PI / (2.0f * (float)M_SQRT2 * sqrtf(flatness / linewidth)));
-		float ox = b.x - dlx;
-		float oy = b.y - dly;
-		for (i = 1; i < n; i++)
-		{
-			float theta = (float)M_PI * i / n;
-			float cth = cosf(theta);
-			float sth = sinf(theta);
-			float nx = b.x - dlx * cth - dly * sth;
-			float ny = b.y - dly * cth + dlx * sth;
-			fz_addline(s, ox, oy, nx, ny);
-			ox = nx;
-			oy = ny;
-		}
-		fz_addline(s, ox, oy, b.x + dlx, b.y + dly);
-	}
-
-	if (linecap == SQUARE)
-	{
-		fz_addline(s, b.x - dlx, b.y - dly,
-			b.x - dlx - dly,
-			b.y - dly + dlx);
-		fz_addline(s, b.x - dlx - dly,
-			b.y - dly + dlx,
-			b.x + dlx - dly,
-			b.y + dly + dlx);
-		fz_addline(s, b.x + dlx - dly,
-			b.y + dly + dlx,
-			b.x + dlx, b.y + dly);
-	}
-}
-
-static void
-fz_linedot(struct sctx *s, fz_point a)
-{
-	float flatness = s->flatness;
-	float linewidth = s->linewidth;
-	int n = ceilf((float)M_PI / ((float)M_SQRT2 * sqrtf(flatness / linewidth)));
-	float ox = a.x - linewidth;
-	float oy = a.y;
-	int i;
-
-	for (i = 1; i < n; i++)
-	{
-		float theta = (float)M_PI * 2 * i / n;
-		float cth = cosf(theta);
-		float sth = sinf(theta);
-		float nx = a.x - cth * linewidth;
-		float ny = a.y + sth * linewidth;
-		fz_addline(s, ox, oy, nx, ny);
-		ox = nx;
-		oy = ny;
-	}
-
-	fz_addline(s, ox, oy, a.x - linewidth, a.y);
-}
-
-static void
-fz_strokeflush(struct sctx *s)
-{
-	if (s->sn == 2)
-	{
-		fz_linecap(s, s->beg[1], s->beg[0]);
-		fz_linecap(s, s->seg[0], s->seg[1]);
-	}
-	else if (s->dot)
-	{
-		fz_linedot(s, s->beg[0]);
-	}
-}
-
-static void
-fz_strokemoveto(struct sctx *s, fz_point cur)
-{
-	fz_strokeflush(s);
-	s->seg[0] = cur;
-	s->beg[0] = cur;
-	s->sn = 1;
-	s->bn = 1;
-	s->dot = 0;
-}
-
-static void
-fz_strokelineto(struct sctx *s, fz_point cur)
-{
-	float dx = cur.x - s->seg[s->sn-1].x;
-	float dy = cur.y - s->seg[s->sn-1].y;
-
-	if (dx * dx + dy * dy < FLT_EPSILON)
-	{
-		s->dot = 1;
-		return;
-	}
-
-	fz_linestroke(s, s->seg[s->sn-1], cur);
-
-	if (s->sn == 2)
-	{
-		fz_linejoin(s, s->seg[0], s->seg[1], cur);
-		s->seg[0] = s->seg[1];
-		s->seg[1] = cur;
-	}
-
-	if (s->sn == 1)
-		s->seg[s->sn++] = cur;
-	if (s->bn == 1)
-		s->beg[s->bn++] = cur;
-}
-
-static void
-fz_strokeclosepath(struct sctx *s)
-{
-	if (s->sn == 2)
-	{
-		fz_strokelineto(s, s->beg[0]);
-		if (s->seg[1].x == s->beg[0].x && s->seg[1].y == s->beg[0].y)
-			fz_linejoin(s, s->seg[0], s->beg[0], s->beg[1]);
-		else
-			fz_linejoin(s, s->seg[1], s->beg[0], s->beg[1]);
-	}
-	else if (s->dot)
-	{
-		fz_linedot(s, s->beg[0]);
-	}
-
-	s->seg[0] = s->beg[0];
-	s->bn = 1;
-	s->sn = 1;
-	s->dot = 0;
-}
-
-static void
-fz_strokebezier(struct sctx *s,
-	float xa, float ya,
-	float xb, float yb,
-	float xc, float yc,
-	float xd, float yd, int depth)
-{
-	float dmax;
-	float xab, yab;
-	float xbc, ybc;
-	float xcd, ycd;
-	float xabc, yabc;
-	float xbcd, ybcd;
-	float xabcd, yabcd;
-
-	/* termination check */
-	dmax = ABS(xa - xb);
-	dmax = MAX(dmax, ABS(ya - yb));
-	dmax = MAX(dmax, ABS(xd - xc));
-	dmax = MAX(dmax, ABS(yd - yc));
-	if (dmax < s->flatness || depth >= MAXDEPTH)
-	{
-		fz_point p;
-		p.x = xd;
-		p.y = yd;
-		fz_strokelineto(s, p);
-		return;
-	}
-
-	xab = xa + xb;
-	yab = ya + yb;
-	xbc = xb + xc;
-	ybc = yb + yc;
-	xcd = xc + xd;
-	ycd = yc + yd;
-
-	xabc = xab + xbc;
-	yabc = yab + ybc;
-	xbcd = xbc + xcd;
-	ybcd = ybc + ycd;
-
-	xabcd = xabc + xbcd;
-	yabcd = yabc + ybcd;
-
-	xab *= 0.5f; yab *= 0.5f;
-	xbc *= 0.5f; ybc *= 0.5f;
-	xcd *= 0.5f; ycd *= 0.5f;
-
-	xabc *= 0.25f; yabc *= 0.25f;
-	xbcd *= 0.25f; ybcd *= 0.25f;
-
-	xabcd *= 0.125f; yabcd *= 0.125f;
-
-	fz_strokebezier(s, xa, ya, xab, yab, xabc, yabc, xabcd, yabcd, depth + 1);
-	fz_strokebezier(s, xabcd, yabcd, xbcd, ybcd, xcd, ycd, xd, yd, depth + 1);
-}
-
-void
-fz_strokepath(fz_gel *gel, fz_path *path, fz_strokestate *stroke, fz_matrix ctm, float flatness, float linewidth)
-{
-	struct sctx s;
-	fz_point p0, p1, p2, p3;
-	int i;
-
-	s.gel = gel;
-	s.ctm = &ctm;
-	s.flatness = flatness;
-
-	s.linecap = stroke->linecap;
-	s.linejoin = stroke->linejoin;
-	s.linewidth = linewidth * 0.5f; /* hairlines use a different value from the path value */
-	s.miterlimit = stroke->miterlimit;
-	s.sn = 0;
-	s.bn = 0;
-	s.dot = 0;
-
-	i = 0;
-
-	if (path->len > 0 && path->els[0].k != FZ_MOVETO)
-	{
-		fz_warn("assert: path must begin with moveto");
-		return;
-	}
-
-	p0.x = p0.y = 0;
-
-	while (i < path->len)
-	{
-		switch (path->els[i++].k)
-		{
-		case FZ_MOVETO:
-			p1.x = path->els[i++].v;
-			p1.y = path->els[i++].v;
-			fz_strokemoveto(&s, p1);
-			p0 = p1;
-			break;
-
-		case FZ_LINETO:
-			p1.x = path->els[i++].v;
-			p1.y = path->els[i++].v;
-			fz_strokelineto(&s, p1);
-			p0 = p1;
-			break;
-
-		case FZ_CURVETO:
-			p1.x = path->els[i++].v;
-			p1.y = path->els[i++].v;
-			p2.x = path->els[i++].v;
-			p2.y = path->els[i++].v;
-			p3.x = path->els[i++].v;
-			p3.y = path->els[i++].v;
-			fz_strokebezier(&s, p0.x, p0.y, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, 0);
-			p0 = p3;
-			break;
-
-		case FZ_CLOSEPATH:
-			fz_strokeclosepath(&s);
-			break;
-		}
-	}
-
-	fz_strokeflush(&s);
-}
-
-static void
-fz_dashmoveto(struct sctx *s, fz_point a)
-{
-	s->toggle = 1;
-	s->offset = 0;
-	s->phase = s->dashphase;
-
-	while (s->phase >= s->dashlist[s->offset])
-	{
-		s->toggle = !s->toggle;
-		s->phase -= s->dashlist[s->offset];
-		s->offset ++;
-		if (s->offset == s->dashlen)
-			s->offset = 0;
-	}
-
-	s->cur = a;
-
-	if (s->toggle)
-		fz_strokemoveto(s, a);
-}
-
-static void
-fz_dashlineto(struct sctx *s, fz_point b)
-{
-	float dx, dy;
-	float total, used, ratio;
-	fz_point a;
-	fz_point m;
-
-	a = s->cur;
-	dx = b.x - a.x;
-	dy = b.y - a.y;
-	total = sqrtf(dx * dx + dy * dy);
-	used = 0;
-
-	while (total - used > s->dashlist[s->offset] - s->phase)
-	{
-		used += s->dashlist[s->offset] - s->phase;
-		ratio = used / total;
-		m.x = a.x + ratio * dx;
-		m.y = a.y + ratio * dy;
-
-		if (s->toggle)
-			fz_strokelineto(s, m);
-		else
-			fz_strokemoveto(s, m);
-
-		s->toggle = !s->toggle;
-		s->phase = 0;
-		s->offset ++;
-		if (s->offset == s->dashlen)
-			s->offset = 0;
-	}
-
-	s->phase += total - used;
-
-	s->cur = b;
-
-	if (s->toggle)
-		fz_strokelineto(s, b);
-}
-
-static void
-fz_dashbezier(struct sctx *s,
-	float xa, float ya,
-	float xb, float yb,
-	float xc, float yc,
-	float xd, float yd, int depth)
-{
-	float dmax;
-	float xab, yab;
-	float xbc, ybc;
-	float xcd, ycd;
-	float xabc, yabc;
-	float xbcd, ybcd;
-	float xabcd, yabcd;
-
-	/* termination check */
-	dmax = ABS(xa - xb);
-	dmax = MAX(dmax, ABS(ya - yb));
-	dmax = MAX(dmax, ABS(xd - xc));
-	dmax = MAX(dmax, ABS(yd - yc));
-	if (dmax < s->flatness || depth >= MAXDEPTH)
-	{
-		fz_point p;
-		p.x = xd;
-		p.y = yd;
-		fz_dashlineto(s, p);
-		return;
-	}
-
-	xab = xa + xb;
-	yab = ya + yb;
-	xbc = xb + xc;
-	ybc = yb + yc;
-	xcd = xc + xd;
-	ycd = yc + yd;
-
-	xabc = xab + xbc;
-	yabc = yab + ybc;
-	xbcd = xbc + xcd;
-	ybcd = ybc + ycd;
-
-	xabcd = xabc + xbcd;
-	yabcd = yabc + ybcd;
-
-	xab *= 0.5f; yab *= 0.5f;
-	xbc *= 0.5f; ybc *= 0.5f;
-	xcd *= 0.5f; ycd *= 0.5f;
-
-	xabc *= 0.25f; yabc *= 0.25f;
-	xbcd *= 0.25f; ybcd *= 0.25f;
-
-	xabcd *= 0.125f; yabcd *= 0.125f;
-
-	fz_dashbezier(s, xa, ya, xab, yab, xabc, yabc, xabcd, yabcd, depth + 1);
-	fz_dashbezier(s, xabcd, yabcd, xbcd, ybcd, xcd, ycd, xd, yd, depth + 1);
-}
-
-void
-fz_dashpath(fz_gel *gel, fz_path *path, fz_strokestate *stroke, fz_matrix ctm, float flatness, float linewidth)
-{
-	struct sctx s;
-	fz_point p0, p1, p2, p3, beg;
-	int i;
-
-	s.gel = gel;
-	s.ctm = &ctm;
-	s.flatness = flatness;
-
-	s.linecap = stroke->linecap;
-	s.linejoin = stroke->linejoin;
-	s.linewidth = linewidth * 0.5f;
-	s.miterlimit = stroke->miterlimit;
-	s.sn = 0;
-	s.bn = 0;
-	s.dot = 0;
-
-	s.dashlist = stroke->dashlist;
-	s.dashphase = stroke->dashphase;
-	s.dashlen = stroke->dashlen;
-	s.toggle = 0;
-	s.offset = 0;
-	s.phase = 0;
-
-	i = 0;
-
-	if (path->len > 0 && path->els[0].k != FZ_MOVETO)
-	{
-		fz_warn("assert: path must begin with moveto");
-		return;
-	}
-
-	p0.x = p0.y = 0;
-
-	while (i < path->len)
-	{
-		switch (path->els[i++].k)
-		{
-		case FZ_MOVETO:
-			p1.x = path->els[i++].v;
-			p1.y = path->els[i++].v;
-			fz_dashmoveto(&s, p1);
-			beg = p0 = p1;
-			break;
-
-		case FZ_LINETO:
-			p1.x = path->els[i++].v;
-			p1.y = path->els[i++].v;
-			fz_dashlineto(&s, p1);
-			p0 = p1;
-			break;
-
-		case FZ_CURVETO:
-			p1.x = path->els[i++].v;
-			p1.y = path->els[i++].v;
-			p2.x = path->els[i++].v;
-			p2.y = path->els[i++].v;
-			p3.x = path->els[i++].v;
-			p3.y = path->els[i++].v;
-			fz_dashbezier(&s, p0.x, p0.y, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, 0);
-			p0 = p3;
-			break;
-
-		case FZ_CLOSEPATH:
-			fz_dashlineto(&s, beg);
-			p0 = p1 = beg;
-			break;
-		}
-	}
-
-	fz_strokeflush(&s);
-}
diff --git a/draw/porterduff.c b/draw/porterduff.c
deleted file mode 100644
index 69df467a..00000000
--- a/draw/porterduff.c
+++ /dev/null
@@ -1,443 +0,0 @@
-#include "fitz.h"
-
-/*
-
-The functions in this file implement various flavours of Porter-Duff blending.
-
-We take the following as definitions:
-
-	Cx = Color (from plane x)
-	ax = Alpha (from plane x)
-	cx = Cx.ax = Premultiplied color (from plane x)
-
-The general PorterDuff blending equation is:
-
-	Blend Z = X op Y	cz = Fx.cx + Fy. cy	where Fx and Fy depend on op
-
-The two operations we use in this file are: '(X in Y) over Z' and
-'S over Z'. The definitions of the 'over' and 'in' operations are as
-follows:
-
-	For S over Z,	Fs = 1, Fz = 1-as
-	For X in Y,	Fx = ay, Fy = 0
-
-We have 2 choices; we can either work with premultiplied data, or non
-premultiplied data. Our
-
-First the premultiplied case:
-
-	Let S = (X in Y)
-	Let R = (X in Y) over Z = S over Z
-
-	cs	= cx.Fx + cy.Fy	(where Fx = ay, Fy = 0)
-		= cx.ay
-	as	= ax.Fx + ay.Fy
-		= ax.ay
-
-	cr	= cs.Fs + cz.Fz	(where Fs = 1, Fz = 1-as)
-		= cs + cz.(1-as)
-		= cx.ay + cz.(1-ax.ay)
-	ar	= as.Fs + az.Fz
-		= as + az.(1-as)
-		= ax.ay + az.(1-ax.ay)
-
-This has various nice properties, like not needing any divisions, and
-being symmetric in color and alpha, so this is what we use. Because we
-went through the pain of deriving the non premultiplied forms, we list
-them here too, though they are not used.
-
-Non Pre-multiplied case:
-
-	Cs.as	= Fx.Cx.ax + Fy.Cy.ay	(where Fx = ay, Fy = 0)
-		= Cx.ay.ax
-	Cs	= (Cx.ay.ax)/(ay.ax)
-		= Cx
-	Cr.ar	= Fs.Cs.as + Fz.Cz.az	(where Fs = 1, Fz = 1-as)
-		= Cs.as	+ (1-as).Cz.az
-		= Cx.ax.ay + Cz.az.(1-ax.ay)
-	Cr	= (Cx.ax.ay + Cz.az.(1-ax.ay))/(ax.ay + az.(1-ax-ay))
-
-Much more complex, it seems. However, if we could restrict ourselves to
-the case where we were always plotting onto an opaque background (i.e.
-az = 1), then:
-
-	Cr	= Cx.(ax.ay) + Cz.(1-ax.ay)
-		= (Cx-Cz)*(1-ax.ay) + Cz	(a single MLA operation)
-	ar	= 1
-
-Sadly, this is not true in the general case, so we abandon this effort
-and stick to using the premultiplied form.
-
-*/
-
-typedef unsigned char byte;
-
-/* Blend a non-premultiplied color in mask over destination */
-
-static inline void
-fz_paintspancolor2(byte * restrict dp, byte * restrict mp, int w, byte *color)
-{
-	int sa = FZ_EXPAND(color[1]);
-	int g = color[0];
-	while (w--)
-	{
-		int ma = *mp++;
-		ma = FZ_COMBINE(FZ_EXPAND(ma), sa);
-		dp[0] = FZ_BLEND(g, dp[0], ma);
-		dp[1] = FZ_BLEND(255, dp[1], ma);
-		dp += 2;
-	}
-}
-
-static inline void
-fz_paintspancolor4(byte * restrict dp, byte * restrict mp, int w, byte *color)
-{
-	int sa = FZ_EXPAND(color[3]);
-	int r = color[0];
-	int g = color[1];
-	int b = color[2];
-	while (w--)
-	{
-		int ma = *mp++;
-		ma = FZ_COMBINE(FZ_EXPAND(ma), sa);
-		dp[0] = FZ_BLEND(r, dp[0], ma);
-		dp[1] = FZ_BLEND(g, dp[1], ma);
-		dp[2] = FZ_BLEND(b, dp[2], ma);
-		dp[3] = FZ_BLEND(255, dp[3], ma);
-		dp += 4;
-	}
-}
-
-static inline void
-fz_paintspancolorN(byte * restrict dp, byte * restrict mp, int n, int w, byte *color)
-{
-	int sa = FZ_EXPAND(color[n-1]);
-	int k;
-	n--;
-	while (w--)
-	{
-		int ma = *mp++;
-		ma = FZ_COMBINE(FZ_EXPAND(ma), sa);
-		for (k = 0; k < n; k++)
-			dp[k] = FZ_BLEND(color[k], dp[k], ma);
-		dp[k] = FZ_BLEND(255, dp[k], ma);
-		dp += n;
-	}
-}
-
-void
-fz_paintspancolor(byte * restrict dp, byte * restrict mp, int n, int w, byte *color)
-{
-	switch (n)
-	{
-	case 2: fz_paintspancolor2(dp, mp, w, color); break;
-	case 4: fz_paintspancolor4(dp, mp, w, color); break;
-	default: fz_paintspancolorN(dp, mp, n, w, color); break;
-	}
-}
-
-/* Blend source in mask over destination */
-
-static inline void
-fz_paintspanmask2(byte * restrict dp, byte * restrict sp, byte * restrict mp, int w)
-{
-	while (w--)
-	{
-		int masa;
-		int ma = *mp++;
-		ma = FZ_EXPAND(ma);
-		masa = FZ_COMBINE(sp[1], ma);
-		masa = 255 - masa;
-		masa = FZ_EXPAND(masa);
-		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-		sp++; dp++;
-	}
-}
-
-static inline void
-fz_paintspanmask4(byte * restrict dp, byte * restrict sp, byte * restrict mp, int w)
-{
-	while (w--)
-	{
-		int masa;
-		int ma = *mp++;
-		ma = FZ_EXPAND(ma);
-		masa = FZ_COMBINE(sp[3], ma);
-		masa = 255 - masa;
-		masa = FZ_EXPAND(masa);
-		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-		sp++; dp++;
-	}
-}
-
-static inline void
-fz_paintspanmaskN(byte * restrict dp, byte * restrict sp, byte * restrict mp, int n, int w)
-{
-	n--;
-	while (w--)
-	{
-		int k = n;
-		int masa;
-		int ma = *mp++;
-		ma = FZ_EXPAND(ma);
-		masa = FZ_COMBINE(sp[n-1], ma);
-		masa = 255-masa;
-		masa = FZ_EXPAND(masa);
-		while (k--)
-		{
-			*dp = FZ_COMBINE2(*sp, ma, *dp, masa);
-			sp++; dp++;
-		}
-	}
-}
-
-static void
-fz_paintspanmask(byte * restrict dp, byte * restrict sp, byte * restrict mp, int n, int w)
-{
-	switch (n)
-	{
-	case 2: fz_paintspanmask2(dp, sp, mp, w); break;
-	case 4: fz_paintspanmask4(dp, sp, mp, w); break;
-	default: fz_paintspanmaskN(dp, sp, mp, n, w); break;
-	}
-}
-
-/* Blend source in constant alpha over destination */
-
-static inline void
-fz_paintspan2alpha(byte * restrict dp, byte * restrict sp, int w, int alpha)
-{
-	alpha = FZ_EXPAND(alpha);
-	while (w--)
-	{
-		int masa = FZ_COMBINE(sp[1], alpha);
-		*dp = FZ_BLEND(*sp, *dp, masa);
-		dp++; sp++;
-		*dp = FZ_BLEND(*sp, *dp, masa);
-		dp++; sp++;
-	}
-}
-
-static inline void
-fz_paintspan4alpha(byte * restrict dp, byte * restrict sp, int w, int alpha)
-{
-	alpha = FZ_EXPAND(alpha);
-	while (w--)
-	{
-		int masa = FZ_COMBINE(sp[3], alpha);
-		*dp = FZ_BLEND(*sp, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_BLEND(*sp, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_BLEND(*sp, *dp, masa);
-		sp++; dp++;
-		*dp = FZ_BLEND(*sp, *dp, masa);
-		sp++; dp++;
-	}
-}
-
-static inline void
-fz_paintspanNalpha(byte * restrict dp, byte * restrict sp, int n, int w, int alpha)
-{
-	alpha = FZ_EXPAND(alpha);
-	while (w--)
-	{
-		int masa = FZ_COMBINE(sp[n-1], alpha);
-		int k = n;
-		while (k--)
-		{
-			*dp = FZ_BLEND(*sp++, *dp, masa);
-			dp++;
-		}
-	}
-}
-
-/* Blend source over destination */
-
-static inline void
-fz_paintspan1(byte * restrict dp, byte * restrict sp, int w)
-{
-	while (w--)
-	{
-		int t = FZ_EXPAND(255 - sp[0]);
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp ++;
-	}
-}
-
-static inline void
-fz_paintspan2(byte * restrict dp, byte * restrict sp, int w)
-{
-	while (w--)
-	{
-		int t = FZ_EXPAND(255 - sp[1]);
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp++;
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp++;
-	}
-}
-
-static inline void
-fz_paintspan4(byte * restrict dp, byte * restrict sp, int w)
-{
-	while (w--)
-	{
-		int t = FZ_EXPAND(255 - sp[3]);
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp++;
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp++;
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp++;
-		*dp = *sp++ + FZ_COMBINE(*dp, t);
-		dp++;
-	}
-}
-
-static inline void
-fz_paintspanN(byte * restrict dp, byte * restrict sp, int n, int w)
-{
-	while (w--)
-	{
-		int k = n;
-		int t = FZ_EXPAND(255 - sp[n-1]);
-		while (k--)
-		{
-			*dp = *sp++ + FZ_COMBINE(*dp, t);
-			dp++;
-		}
-	}
-}
-
-void
-fz_paintspan(byte * restrict dp, byte * restrict sp, int n, int w, int alpha)
-{
-	if (alpha == 255)
-	{
-		switch (n)
-		{
-		case 1: fz_paintspan1(dp, sp, w); break;
-		case 2: fz_paintspan2(dp, sp, w); break;
-		case 4: fz_paintspan4(dp, sp, w); break;
-		default: fz_paintspanN(dp, sp, n, w); break;
-		}
-	}
-	else if (alpha > 0)
-	{
-		switch (n)
-		{
-		case 2: fz_paintspan2alpha(dp, sp, w, alpha); break;
-		case 4: fz_paintspan4alpha(dp, sp, w, alpha); break;
-		default: fz_paintspanNalpha(dp, sp, n, w, alpha); break;
-		}
-	}
-}
-
-/*
- * Pixmap blending functions
- */
-
-void
-fz_paintpixmapbbox(fz_pixmap *dst, fz_pixmap *src, int alpha, fz_bbox bbox)
-{
-	unsigned char *sp, *dp;
-	int x, y, w, h, n;
-
-	assert(dst->n == src->n);
-
-	bbox = fz_intersectbbox(bbox, fz_boundpixmap(dst));
-	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
-
-	x = bbox.x0;
-	y = bbox.y0;
-	w = bbox.x1 - bbox.x0;
-	h = bbox.y1 - bbox.y0;
-	if ((w | h) == 0)
-		return;
-
-	n = src->n;
-	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * src->n;
-	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
-
-	while (h--)
-	{
-		fz_paintspan(dp, sp, n, w, alpha);
-		sp += src->w * n;
-		dp += dst->w * n;
-	}
-}
-
-void
-fz_paintpixmap(fz_pixmap *dst, fz_pixmap *src, int alpha)
-{
-	unsigned char *sp, *dp;
-	fz_bbox bbox;
-	int x, y, w, h, n;
-
-	assert(dst->n == src->n);
-
-	bbox = fz_boundpixmap(dst);
-	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
-
-	x = bbox.x0;
-	y = bbox.y0;
-	w = bbox.x1 - bbox.x0;
-	h = bbox.y1 - bbox.y0;
-	if ((w | h) == 0)
-		return;
-
-	n = src->n;
-	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * src->n;
-	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
-
-	while (h--)
-	{
-		fz_paintspan(dp, sp, n, w, alpha);
-		sp += src->w * n;
-		dp += dst->w * n;
-	}
-}
-
-void
-fz_paintpixmapmask(fz_pixmap *dst, fz_pixmap *src, fz_pixmap *msk)
-{
-	unsigned char *sp, *dp, *mp;
-	fz_bbox bbox;
-	int x, y, w, h, n;
-
-	assert(dst->n == src->n);
-	assert(msk->n == 1);
-
-	bbox = fz_boundpixmap(dst);
-	bbox = fz_intersectbbox(bbox, fz_boundpixmap(src));
-	bbox = fz_intersectbbox(bbox, fz_boundpixmap(msk));
-
-	x = bbox.x0;
-	y = bbox.y0;
-	w = bbox.x1 - bbox.x0;
-	h = bbox.y1 - bbox.y0;
-	if ((w | h) == 0)
-		return;
-
-	n = src->n;
-	sp = src->samples + ((y - src->y) * src->w + (x - src->x)) * src->n;
-	mp = msk->samples + ((y - msk->y) * msk->w + (x - msk->x)) * msk->n;
-	dp = dst->samples + ((y - dst->y) * dst->w + (x - dst->x)) * dst->n;
-
-	while (h--)
-	{
-		fz_paintspanmask(dp, sp, mp, n, w);
-		sp += src->w * n;
-		dp += dst->w * n;
-		mp += msk->w;
-	}
-}
-- 
cgit v1.2.3