18 files changed, 656 insertions, 511 deletions
diff --git a/apps/pdfdraw.c b/apps/pdfdraw.c
index 73170eb4..40faba0f 100644
--- a/apps/pdfdraw.c
+++ b/apps/pdfdraw.c
@@ -251,8 +251,8 @@ static void drawpnm(int pagenum, struct benchmark *loadtimes, struct benchmark *
 
 					for (x = pix->w; x > 0; x--)
 					{
-						src++;
 						*dst++ = *src++;
+						src++;
 					}
 					dst -= pix->w;
 
@@ -267,10 +267,10 @@ static void drawpnm(int pagenum, struct benchmark *loadtimes, struct benchmark *
 
 					for (x = pix->w; x > 0; x--)
 					{
-						src++;
 						*dst++ = *src++;
 						*dst++ = *src++;
 						*dst++ = *src++;
+						src++;
 					}
 					dst -= pix->w * 3;
 
diff --git a/apps/win_main.c b/apps/win_main.c
index e4c78983..ae12c673 100644
--- a/apps/win_main.c
+++ b/apps/win_main.c
@@ -396,9 +396,9 @@ void winconvert(fz_pixmap *image)
 			unsigned char *s = image->samples + y * image->w * 4;
 			for (x = 0; x < image->w; x++)
 			{
-				p[x * 3 + 0] = s[x * 4 + 3];
-				p[x * 3 + 1] = s[x * 4 + 2];
-				p[x * 3 + 2] = s[x * 4 + 1];
+				p[x * 3 + 0] = s[x * 4 + 2];
+				p[x * 3 + 1] = s[x * 4 + 1];
+				p[x * 3 + 2] = s[x * 4 + 0];
 			}
 		}
 	}
@@ -410,9 +410,9 @@ void winconvert(fz_pixmap *image)
 			unsigned char *s = image->samples + y * image->w * 2;
 			for (x = 0; x < image->w; x++)
 			{
-				p[x * 3 + 0] = s[x * 2 + 1];
-				p[x * 3 + 1] = s[x * 2 + 1];
-				p[x * 3 + 2] = s[x * 2 + 1];
+				p[x * 3 + 0] = s[x * 2];
+				p[x * 3 + 1] = s[x * 2];
+				p[x * 3 + 2] = s[x * 2];
 			}
 		}
 	}
diff --git a/apps/x11_image.c b/apps/x11_image.c
index f38de236..7de1476e 100644
--- a/apps/x11_image.c
+++ b/apps/x11_image.c
@@ -1,5 +1,5 @@
 /*
- * Blit ARGB images to X with X(Shm)Images
+ * Blit RGBA images to X with X(Shm)Images
  */
 
 #ifndef _XOPEN_SOURCE
@@ -243,13 +243,13 @@ select_mode(void)
 			info.mode = byteorder == MSBFirst ? BGR888 : RGB888;
 	}
 	else if (info.bitsperpixel == 32) {
-		if (rs ==  0 && gs ==  8 && bs == 16)
+		if (rs == 0 && gs == 8 && bs == 16)
 			info.mode = byteorder == MSBFirst ? ABGR8888 : RGBA8888;
-		if (rs ==  8 && gs == 16 && bs == 24)
+		if (rs == 8 && gs == 16 && bs == 24)
 			info.mode = byteorder == MSBFirst ? BGRA8888 : ARGB8888;
-		if (rs == 16 && gs ==  8 && bs ==  0)
+		if (rs == 16 && gs == 8 && bs == 0)
 			info.mode = byteorder == MSBFirst ? ARGB8888 : BGRA8888;
-		if (rs == 24 && gs == 16 && bs ==  8)
+		if (rs == 24 && gs == 16 && bs == 8)
 			info.mode = byteorder == MSBFirst ? RGBA8888 : ABGR8888;
 	}
 
@@ -461,29 +461,30 @@ ximage_blit(Drawable d, GC gc,
 #endif
 
 #define PARAMS \
-	 const unsigned char * restrict src, \
-	 int srcstride, \
-	 unsigned char * restrict dst, \
-	 int dststride, \
-	 int w, \
-	 int h
+	const unsigned char * restrict src, \
+	int srcstride, \
+	unsigned char * restrict dst, \
+	int dststride, \
+	int w, \
+	int h
 
 /*
- * Convert byte:ARGB8888 to various formats
+ * Convert byte:RGBA8888 to various formats
  */
 
 static void
 ximage_convert_argb8888(PARAMS)
 {
 	int x, y;
-	unsigned * restrict s = (unsigned *)src;
-	unsigned * restrict d = (unsigned *)dst;
 	for (y = 0; y < h; y++) {
-		for (x = 0; x < w; x++) {
-			d[x] = s[x];
+		for (x = 0; x < w; x ++) {
+			dst[x * 4 + 0] = src[x * 4 + 3]; /* a */
+			dst[x * 4 + 1] = src[x * 4 + 0]; /* r */
+			dst[x * 4 + 2] = src[x * 4 + 1]; /* g */
+			dst[x * 4 + 3] = src[x * 4 + 2]; /* b */
 		}
-		d += dststride>>2;
-		s += srcstride>>2;
+		dst += dststride;
+		src += srcstride;
 	}
 }
 
@@ -491,52 +492,31 @@ static void
 ximage_convert_bgra8888(PARAMS)
 {
 	int x, y;
-	unsigned * restrict s = (unsigned *)src;
-	unsigned * restrict d = (unsigned *)dst;
-	unsigned val;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			val = s[x];
-			d[x] =
-			(val >> 24) |
-			((val >> 8) & 0xff00) |
-			(val << 24) |
-			((val << 8) & 0xff0000);
-			/*
-			d[x] =
-			(((val >> 24) & 0xff) <<  0) |
-			(((val >> 16) & 0xff) <<  8) |
-			(((val >>  8) & 0xff) << 16) |
-			(((val >>  0) & 0xff) << 24);
-			*/
+			dst[x * 4 + 0] = src[x * 4 + 2]; /* b */
+			dst[x * 4 + 1] = src[x * 4 + 1]; /* g */
+			dst[x * 4 + 2] = src[x * 4 + 0]; /* r */
+			dst[x * 4 + 3] = src[x * 4 + 3]; /* a */
 		}
-		d += dststride>>2;
-		s += srcstride>>2;
+		dst += dststride;
+		src += srcstride;
 	}
 }
 
-/* following have yet to receive some MMX love ;-) */
-
 static void
 ximage_convert_abgr8888(PARAMS)
 {
 	int x, y;
-	unsigned * restrict s = (unsigned *)src;
-	unsigned * restrict d = (unsigned *)dst;
-	unsigned val;
-
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			val = s[x];
-#if 1 /* FZ_MSB */
-			d[x] = (val & 0xff00ff00) |
-			(((val << 16) | (val >> 16)) & 0x00ff00ff);
-#else /* FZ_LSB */
-			d[x] = (val << 24) | ((val >> 8) & 0xff);
-#endif
+			dst[x * 4 + 0] = src[x * 4 + 3]; /* a */
+			dst[x * 4 + 1] = src[x * 4 + 2]; /* b */
+			dst[x * 4 + 2] = src[x * 4 + 1]; /* g */
+			dst[x * 4 + 3] = src[x * 4 + 0]; /* r */
 		}
-		d += dststride>>2;
-		s += srcstride>>2;
+		dst += dststride;
+		src += srcstride;
 	}
 }
 
@@ -546,10 +526,7 @@ ximage_convert_rgba8888(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
-		for (x = 0; x < w; x++) {
-			dst[x * 4 + 0] = src[x * 4 + 1];
-			dst[x * 4 + 1] = src[x * 4 + 2];
-			dst[x * 4 + 2] = src[x * 4 + 3];
-			dst[x * 4 + 3] = src[x * 4 + 0];
+		for (x = 0; x < w * 4; x++) { /* x counts bytes here: 4 per RGBA pixel */
+			dst[x] = src[x]; /* source is already RGBA: straight byte copy */
 		}
 		dst += dststride;
 		src += srcstride;
@@ -562,9 +539,9 @@ ximage_convert_bgr888(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			dst[3*x + 0] = src[4*x + 3];
-			dst[3*x + 1] = src[4*x + 2];
-			dst[3*x + 2] = src[4*x + 1];
+			dst[3*x + 0] = src[4*x + 2];
+			dst[3*x + 1] = src[4*x + 1];
+			dst[3*x + 2] = src[4*x + 0];
 		}
 		src += srcstride;
 		dst += dststride;
@@ -577,9 +554,9 @@ ximage_convert_rgb888(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			dst[3*x + 0] = src[4*x + 1];
-			dst[3*x + 1] = src[4*x + 2];
-			dst[3*x + 2] = src[4*x + 3];
+			dst[3*x + 0] = src[4*x + 0];
+			dst[3*x + 1] = src[4*x + 1];
+			dst[3*x + 2] = src[4*x + 2];
 		}
 		src += srcstride;
 		dst += dststride;
@@ -593,9 +570,9 @@ ximage_convert_rgb565(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			r = src[4*x + 1];
-			g = src[4*x + 2];
-			b = src[4*x + 3];
+			r = src[4*x + 0];
+			g = src[4*x + 1];
+			b = src[4*x + 2];
 			((unsigned short *)dst)[x] =
 			((r & 0xF8) << 8) |
 			((g & 0xFC) << 3) |
@@ -613,11 +590,11 @@ ximage_convert_rgb565_br(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			r = src[4*x + 1];
-			g = src[4*x + 2];
-			b = src[4*x + 3];
+			r = src[4*x + 0];
+			g = src[4*x + 1];
+			b = src[4*x + 2];
 			/* final word is:
-			g4 g3 g2 b7 b6 b5 b4 b3  r7 r6 r5 r4 r3 g7 g6 g5
+			g4 g3 g2 b7 b6 b5 b4 b3 : r7 r6 r5 r4 r3 g7 g6 g5
 			*/
 			((unsigned short *)dst)[x] =
 			(r & 0xF8) |
@@ -637,9 +614,9 @@ ximage_convert_rgb555(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			r = src[4*x + 1];
-			g = src[4*x + 2];
-			b = src[4*x + 3];
+			r = src[4*x + 0];
+			g = src[4*x + 1];
+			b = src[4*x + 2];
 			((unsigned short *)dst)[x] =
 			((r & 0xF8) << 7) |
 			((g & 0xF8) << 2) |
@@ -657,11 +634,11 @@ ximage_convert_rgb555_br(PARAMS)
 	int x, y;
 	for (y = 0; y < h; y++) {
 		for (x = 0; x < w; x++) {
-			r = src[4*x + 1];
-			g = src[4*x + 2];
-			b = src[4*x + 3];
+			r = src[4*x + 0];
+			g = src[4*x + 1];
+			b = src[4*x + 2];
 			/* final word is:
-			g5 g4 g3 b7 b6 b5 b4 b3  0 r7 r6 r5 r4 r3 g7 g6
+			g5 g4 g3 b7 b6 b5 b4 b3 : 0 r7 r6 r5 r4 r3 g7 g6
 			*/
 			((unsigned short *)dst)[x] =
 			((r & 0xF8) >> 1) |
@@ -681,9 +658,9 @@ ximage_convert_bgr233(PARAMS)
 	int x,y;
 	for(y = 0; y < h; y++) {
 		for(x = 0; x < w; x++) {
-			r = src[4*x + 1];
-			g = src[4*x + 2];
-			b = src[4*x + 3];
+			r = src[4*x + 0];
+			g = src[4*x + 1];
+			b = src[4*x + 2];
 			/* format: b7 b6 g7 g6 g5 r7 r6 r5 */
 			dst[x] = (b&0xC0) | ((g>>2)&0x38) | ((r>>5)&0x7);
 		}
diff --git a/apps/x11_main.c b/apps/x11_main.c
index 70d331bf..6368daaf 100644
--- a/apps/x11_main.c
+++ b/apps/x11_main.c
@@ -301,8 +301,8 @@ static void winblit(pdfapp_t *app)
 			unsigned char *d = color;
 			for (; i > 0 ; i--)
 			{
-				d[0] = *s++;
-				d[3] = d[2] = d[1] = *s++;
+				d[2] = d[1] = d[0] = *s++;
+				d[3] = *s++;
 				d += 4;
 			}
 			ximage_blit(xwin, xgc,
diff --git a/draw/archarm.c b/draw/archarm.c
index c532a195..2904ce00 100644
--- a/draw/archarm.c
+++ b/draw/archarm.c
@@ -14,24 +14,25 @@ extern void fz_srow4_arm(byte *src, byte *dst, int w, int denom);
 extern void fz_scol4_arm(byte *src, byte *dst, int w, int denom);
 
 static void
-path_w4i1o4_arm(byte * restrict argb, byte * restrict src, byte cov, int len, byte * restrict dst)
+path_w4i1o4_arm(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst)
 {
 	/* The ARM code here is a hand coded implementation
 	 * of the optimized C version. */
+
 	if (len <= 0)
 		return;
 	asm volatile(
-	"ldr	%0, [%0]		@ %0 = argb			\n"
+	"ldr	%0, [%0]		@ %0 = rgba			\n"
 	"mov	r11,#0							\n"
 	"mov	r8, #0xFF00						\n"
-	"and	r14,%0,#255		@ r14= alpha			\n"
-	"orr	%0, %0, #255		@ %0 = argb |= 255		\n"
+	"mov	r14,%0,lsr #24		@ r14= alpha			\n"
+	"orr	%0, %0, #0xFF000000	@ %0 = rgba |= 0xFF000000	\n"
 	"orr	r8, r8, r8, LSL #16	@ r8 = 0xFF00FF00		\n"
 	"adds	r14,r14,r14,LSR #7	@ r14 = alpha += alpha>>7	\n"
 	"beq	9f			@ if (alpha == 0) bale		\n"
-	"and	r6, %0, r8		@ r6 = rb<<8			\n"
-	"bic	%0, %0, r8		@ %0 = ag			\n"
-	"mov	r6, r6, LSR #8		@ r6 = rb			\n"
+	"and	r6, %0, r8		@ r6 = ga<<8			\n"
+	"bic	%0, %0, r8		@ %0 = rb			\n"
+	"mov	r6, r6, LSR #8		@ r6 = ga			\n"
 	"cmp	r14,#256		@ if (alpha == 256)		\n"
 	"beq	4f			@     no-alpha loop		\n"
 	"B	2f			@ enter the loop		\n"
@@ -40,37 +41,37 @@ path_w4i1o4_arm(byte * restrict argb, byte * restrict src, byte cov, int len, by
 	"ble	9f							\n"
 	"2:								\n"
 	"ldrb	r12,[%1]		@ r12= *src			\n"
-	"ldr	r9, [%4], #4		@ r9 = dag = *dst32++		\n"
+	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
 	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
 	"add	%2, r12, %2		@ %2 = cov += r12		\n"
 	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
 	"beq	1b			@ if coverage == 0 loop back	\n"
 	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
 	"mul	r10,r14,r10		@ r10= ca *= alpha		\n"
-	"and	r7, r8, r9		@ r7 = drb =  dag     & MASK	\n"
+	"and	r7, r8, r9		@ r7 = dga =  drb     & MASK	\n"
 	"mov	r10,r10,LSR #8		@ r10= ca >>= 8			\n"
-	"and	r9, r8, r9, LSL #8	@ r9 = dag = (dag<<8) & MASK	\n"
-	"sub	r12,r6, r7, LSR #8	@ r12= crb = rb - (drb>>8)	\n"
-	"sub	r5, %0, r9, LSR #8	@ r5 = cag = ag - (dag>>8)	\n"
-	"mla	r7, r12,r10,r7		@ r7 = drb += crb * ca		\n"
+	"and	r9, r8, r9, LSL #8	@ r9 = drb = (drb<<8) & MASK	\n"
+	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
+	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
+	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
 	"subs	%3, %3, #1		@ len--				\n"
-	"mla	r9, r5, r10,r9		@ r9 = dag += cag * ca		\n"
-	"and	r7, r8, r7		@ r7 = drb &= MASK		\n"
-	"and	r9, r8, r9		@ r9 = dag &= MASK		\n"
-	"orr	r9, r7, r9, LSR #8	@ r9 = dag = drb | (dag>>8)	\n"
+	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
+	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
+	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
+	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
 	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
 	"bgt	2b							\n"
 	"b	9f							\n"
 	"@ --- Solid alpha loop	---------------------------------------	\n"
 	"3:	@ Loop used when coverage == 256			\n"
-	"orr	r9, %0, r6, LSL #8	@ r9 = argb			\n"
+	"orr	r9, %0, r6, LSL #8	@ r9 = rgba			\n"
 	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
 	"4:	@ Loop used for when coverage*alpha == 0		\n"
 	"subs	%3, %3, #1		@ len--				\n"
 	"ble	9f							\n"
 	"5:								\n"
 	"ldrb	r12,[%1]		@ r12= *src			\n"
-	"ldr	r9, [%4], #4		@ r9 = dag = *dst32++		\n"
+	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
 	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
 	"add	%2, r12, %2		@ %2 = cov += r12		\n"
 	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
@@ -78,21 +79,21 @@ path_w4i1o4_arm(byte * restrict argb, byte * restrict src, byte cov, int len, by
 	"cmp	%2, #255		@ if coverage == solid		\n"
 	"beq	3b			@    loop back			\n"
 	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
-	"and	r7, r8, r9		@ r7 = drb =  dag     & MASK	\n"
-	"and	r9, r8, r9, LSL #8	@ r9 = dag = (dag<<8) & MASK	\n"
-	"sub	r12,r6, r7, LSR #8	@ r12= crb = rb - (drb>>8)	\n"
-	"sub	r5, %0, r9, LSR #8	@ r5 = cag = ag - (dag>>8)	\n"
-	"mla	r7, r12,r10,r7		@ r7 = drb += crb * ca		\n"
+	"and	r7, r8, r9		@ r7 = dga =  drb     & MASK	\n"
+	"and	r9, r8, r9, LSL #8	@ r9 = drb = (drb<<8) & MASK	\n"
+	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
+	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
+	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
 	"subs	%3, %3, #1		@ len--				\n"
-	"mla	r9, r5, r10,r9		@ r9 = dag += cag * ca		\n"
-	"and	r7, r8, r7		@ r7 = drb &= MASK		\n"
-	"and	r9, r8, r9		@ r9 = dag &= MASK		\n"
-	"orr	r9, r7, r9, LSR #8	@ r9 = dag = drb | (dag>>8)	\n"
+	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
+	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
+	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
+	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
 	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
 	"bgt	5b							\n"
 	"9:				@ End				\n"
 	:
-	"+r" (argb),
+	"+r" (rgba),
 	"+r" (src),
 	"+r" (cov),
 	"+r" (len),
@@ -129,8 +130,8 @@ static void loadtile8_arm(byte * restrict src, int sw, byte * restrict dst, int
 			"2:						\n"
 			"LDRB	r4, [%[src]], #1	@ r4 = *src++	\n"
 			"SUBS	r5, r5, #1				\n"
-			"STRB	r11,[%[dst]], #1	@ *dst++ = 255	\n"
 			"STRB	r4, [%[dst]], #1	@ *dst++ = r4	\n"
+			"STRB	r11,[%[dst]], #1	@ *dst++ = 255	\n"
 			"BGT	2b					\n"
 			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
 			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
@@ -161,10 +162,10 @@ static void loadtile8_arm(byte * restrict src, int sw, byte * restrict dst, int
 			"LDRB	r6, [%[src]], #1	@ r6 = *src++	\n"
 			"LDRB	r7, [%[src]], #1	@ r7 = *src++	\n"
 			"SUBS	r5, r5, #3				\n"
-			"STRB	r11,[r8], #1		@ *dp++ = 255	\n"
 			"STRB	r4, [r8], #1		@ *dp++ = r4	\n"
 			"STRB	r6, [r8], #1		@ *dp++ = r6	\n"
 			"STRB	r7, [r8], #1		@ *dp++ = r7	\n"
+			"STRB	r11,[r8], #1		@ *dp++ = 255	\n"
 			"BGT	2b					\n"
 			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
 			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
diff --git a/draw/archport.c b/draw/archport.c
index f4fea5bc..337ad3c1 100644
--- a/draw/archport.c
+++ b/draw/archport.c
@@ -7,15 +7,15 @@ typedef unsigned char byte;
 #define MASK 0xFF00FF00;
 
 static void
-path_w4i1o4_32bit(byte * restrict argb,
+path_w4i1o4_32bit(byte * restrict rgba,
 	byte * restrict src, byte cov, int len,
 	byte * restrict dst)
 {
 	/* COLOR * coverage + DST * (256-coverage) = (COLOR - DST)*coverage + DST*256 */
 	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	int alpha = argb[0];
-	unsigned int rb = argb[1] | (argb[3] << 16);
-	unsigned int ag = 255 | (argb[2] << 16);
+	int alpha = rgba[3];
+	unsigned int rb = rgba[0] | (rgba[2] << 16);
+	unsigned int ga = rgba[1] | 0xFF0000;
 
 	if (alpha == 0)
 		return;
@@ -25,23 +25,23 @@ path_w4i1o4_32bit(byte * restrict argb,
 		alpha += alpha>>7; /* alpha is now in the 0...256 range */
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			cov += *src; *src++ = 0;
 			ca = cov + (cov>>7); /* ca is in 0...256 range */
 			ca = (ca*alpha)>>8; /* ca is is in 0...256 range */
-			dag = *dst32++;
+			drb = *dst32++;
 			if (ca != 0)
 			{
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 		}
 	}
@@ -49,133 +49,42 @@ path_w4i1o4_32bit(byte * restrict argb,
 	{
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			cov += *src; *src++ = 0;
 			ca = cov + (cov>>7); /* ca is in 0...256 range */
-			dag = *dst32++;
+			drb = *dst32++;
 			if (ca == 0)
 				continue;
 			if (ca == 255)
 			{
-				dag = (rb<<8) | ag;
+				drb = (ga<<8) | rb;
 			}
 			else
 			{
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
+				drb = dga |(drb>>8);
 			}
-			dst32[-1] = dag;
+			dst32[-1] = drb;
 		}
 	}
 }
 
 static void
-duff_4o4_32bit(byte * restrict sp, int sw, byte * restrict dp, int dw, int w0, int h)
-{
-	unsigned int *sp32 = (unsigned int *)(void *)sp;
-	unsigned int *dp32 = (unsigned int *)(void *)dp;
-
-	/* duff_non(sp0, sw, 4, dp0, dw, w0, h); */
-
-	sw = (sw>>2)-w0;
-	dw = (dw>>2)-w0;
-	while (h--)
-	{
-		int w = w0;
-		while (w--)
-		{
-			unsigned int sag = *sp32++;
-			unsigned int dag = *dp32++;
-			unsigned int srb, drb;
-			int alpha = sag & 255;
-			if (alpha == 0)
-				continue;
-			alpha += alpha>>7;
-			sag |= 0xFF;
-			drb = dag & MASK;
-			dag = (dag<<8) & MASK;
-			srb = (sag>>8) & ~MASK;
-			sag = sag & ~MASK;
-			srb -= (drb>>8);
-			sag -= (dag>>8);
-			drb += srb * alpha;
-			dag += sag * alpha;
-			drb &= MASK;
-			dag &= MASK;
-			dag = drb | (dag>>8);
-			dp32[-1] = dag;
-		}
-		sp32 += sw;
-		dp32 += dw;
-	}
-}
-
-static void
-duff_4i1o4_32bit(byte * restrict sp, int sw,
-	byte * restrict mp, int mw,
-	byte * restrict dp, int dw, int w0, int h)
-{
-	unsigned int *sp32 = (unsigned int *)(void *)sp;
-	unsigned int *dp32 = (unsigned int *)(void *)dp;
-
-	/* duff_nimon(sp, sw, 4, mp, mw, 1, dp, dw, w0, h); */
-
-	sw = (sw>>2)-w0;
-	dw = (dw>>2)-w0;
-	mw -= w0;
-	while (h--)
-	{
-		int w = w0;
-		while (w--)
-		{
-			unsigned int sag = *sp32++;
-			unsigned int dag = *dp32++;
-			unsigned int srb, drb, alpha, ma;
-			alpha = sag & 255;
-			ma = *mp++;
-			if (alpha == 0)
-				continue;
-			ma += ma>>7;
-			if (ma == 0)
-				continue;
-			alpha += alpha>>7;
-			alpha = (alpha*ma)>>8;
-			sag |= 0xFF;
-			drb = dag & MASK;
-			dag = (dag<<8) & MASK;
-			srb = (sag>>8) & ~MASK;
-			sag = sag & ~MASK;
-			srb -= (drb>>8);
-			sag -= (dag>>8);
-			drb += srb * alpha;
-			dag += sag * alpha;
-			drb &= MASK;
-			dag &= MASK;
-			dag = drb | (dag>>8);
-			dp32[-1] = dag;
-		}
-		sp32 += sw;
-		mp += mw;
-		dp32 += dw;
-	}
-}
-
-static void
-text_w4i1o4_32bit(byte * restrict argb,
+text_w4i1o4_32bit(byte * restrict rgba,
 	byte * restrict src, int srcw,
 	byte * restrict dst, int dstw, int w0, int h)
 {
 	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	unsigned int alpha = argb[0];
-	unsigned int rb = argb[1] | (argb[3] << 16);
-	unsigned int ag = 255 | (argb[2] << 16);
+	unsigned int alpha = rgba[3];
+	unsigned int rb = rgba[0] | (rgba[2] << 16); /* r in bits 0-7, b in bits 16-23 */
+	unsigned int ga = rgba[1] | 0xFF0000; /* g in bits 0-7, alpha forced to 255 in bits 16-23 */
 
 	if (alpha == 0)
 		return;
@@ -185,29 +94,29 @@ text_w4i1o4_32bit(byte * restrict argb,
 
 	if (alpha != 255)
 	{
-		alpha += alpha>>7;
+		alpha += alpha>>7; /* alpha is now in the 0...256 range */
 		while (h--)
 		{
 			int w = w0;
 			while (w--)
 			{
-				unsigned int ca, drb, dag, crb, cag;
+				unsigned int ca, drb, dga, crb, cga;
 				ca = *src++;
-				dag = *dst32++;
+				drb = *dst32++; /* whole dst pixel; split into dga/drb once ca is known to be non-zero */
 				ca += ca>>7;
 				ca = (ca*alpha)>>8;
 				if (ca == 0)
 					continue;
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 			src += srcw;
 			dst32 += dstw;
@@ -215,28 +124,27 @@ text_w4i1o4_32bit(byte * restrict argb,
 	}
 	else
 	{
-		alpha += alpha>>7;
 		while (h--)
 		{
 			int w = w0;
 			while (w--)
 			{
-				unsigned int ca, drb, dag, crb, cag;
+				unsigned int ca, drb, dga, crb, cga;
 				ca = *src++;
-				dag = *dst32++;
+				drb = *dst32++;
 				ca += ca>>7;
 				if (ca == 0)
 					continue;
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 			src += srcw;
 			dst32 += dstw;
@@ -313,9 +221,9 @@ img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
 			a = (((c >>8)-(a >>8)) * vd + a ) & MASK;
 			a1 = (((c1>>8)-(a1>>8)) * vd + a1) & MASK;
 		}
-		sa = (a>>8) & 0xFF;
+		sa = (a1>>24);
 		sa = FZ_COMBINE(FZ_EXPAND(sa), FZ_EXPAND(cov));
-		a |= 0xFF00;
+		a1 |= 0xFF000000;
 		d = *dst32++;
 		d1 = d & MASK;
 		d = (d<<8) & MASK;
@@ -328,15 +236,15 @@ img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
 }
 
 static void
-img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
+img_w4i1o4_32bit(byte *rgba, byte * restrict src, byte cov, int len,
 	byte * restrict dst, fz_pixmap *image, int u, int v, int fa, int fb)
 {
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h-1;
-	int alpha = FZ_EXPAND(argb[0]);
-	unsigned int rb = argb[1] | (argb[3] << 16);
-	unsigned int ag = 255 | (argb[2] << 16);
+	int alpha = FZ_EXPAND(rgba[3]);
+	unsigned int rb = rgba[0] | (rgba[2] << 16);
+	unsigned int ga = rgba[1] | 0xFF0000;
 	unsigned int *dst32 = (unsigned int *)(void *)dst;
 
 	if (alpha == 0)
@@ -345,10 +253,10 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 	{
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			unsigned int a, b;
 			cov += *src; *src = 0; src++;
-			dag = *dst32++;
+			drb = *dst32++;
 			ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
 			if (ca != 0)
 			{
@@ -396,16 +304,16 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 			}
 			if (ca != 0)
 			{
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 			u += fa;
 			v += fb;
@@ -415,10 +323,10 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 	{
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			unsigned int a, b;
 			cov += *src; *src = 0; src++;
-			dag = *dst32++;
+			drb = *dst32++;
 			if (cov != 0)
 			{
 				int ui, ui1, vi, vi1, ud, vd;
@@ -466,21 +374,21 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 				{
 					if (ca == 256)
 					{
-						dag = (rb<<8) | ag;
+						drb = (ga<<8) | rb;
 					}
 					else
 					{
-						drb = dag & MASK;
-						dag = (dag<<8) & MASK;
+						dga = drb & MASK;
+						drb = (drb<<8) & MASK;
+						cga = ga - (dga>>8);
 						crb = rb - (drb>>8);
-						cag = ag - (dag>>8);
+						dga += cga * ca;
 						drb += crb * ca;
-						dag += cag * ca;
+						dga &= MASK;
 						drb &= MASK;
-						dag &= MASK;
-						dag = drb | (dag>>8);
+						drb = dga | (drb>>8);
 					}
-					dst32[-1] = dag;
+					dst32[-1] = drb;
 				}
 			}
 			u += fa;
@@ -560,10 +468,8 @@ img_1o1_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
 
 void fz_accelerate(void)
 {
-	if (sizeof(int) == 4 && sizeof(unsigned int) == 4)
+	if (sizeof(int) == 4 && sizeof(unsigned int) == 4 && !fz_isbigendian())
 	{
-		fz_duff_4o4 = duff_4o4_32bit;
-		fz_duff_4i1o4 = duff_4i1o4_32bit;
 		fz_path_w4i1o4 = path_w4i1o4_32bit;
 		fz_text_w4i1o4 = text_w4i1o4_32bit;
 		fz_img_4o4 = img_4o4_32bit;
diff --git a/draw/archx86.c b/draw/archx86.c
index 0c313f33..5418e9f7 100644
--- a/draw/archx86.c
+++ b/draw/archx86.c
@@ -40,7 +40,7 @@ static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int d
 		{
 			int ts = *s++;
 			int ma = *mp++ + 1;
-			int sa = ((ts & 0xff) * ma) >> 8;
+			int sa = (((ts>>24) & 0xff) * ma) >> 8;
 			int ssa = 255 - sa;
 
 			__m64 d0 = _mm_cvtsi32_si64(*d);
@@ -50,11 +50,11 @@ static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int d
 			__m64 mma = _mm_set1_pi16(ma);
 			__m64 mssa = _mm_set1_pi16(ssa);
 
-			/* unpack 0000argb => a0r0g0b0 */
+			/* unpack 0000rgba => r0g0b0a0 */
 			__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
 			__m64 s1 = _mm_unpacklo_pi8(s0, mzero);
 
-			/* s1 * ma => a0r0g0b0 */
+			/* s1 * ma => r0g0b0a0 */
 			__m64 msma = _mm_mullo_pi16(s1, mma);
 			/* d1 * mssa */
 			__m64 mdssa = _mm_mullo_pi16(d1, mssa);
@@ -79,6 +79,8 @@ static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int d
 
 #if 0 /* TODO */
 
+/* Needs to be rgba, not bgra, as well as needing finishing */
+
 static inline unsigned
 getargb(unsigned *s, int w, int h, int u, int v)
 {
diff --git a/draw/imagedraw.c b/draw/imagedraw.c
index 81d2bb05..dd887f53 100644
--- a/draw/imagedraw.c
+++ b/draw/imagedraw.c
@@ -13,7 +13,7 @@ getmask(byte *s, int w, int h, int u, int v)
 }
 
 static inline byte *
-getargb(byte *s, int w, int h, int u, int v)
+getrgba(byte *s, int w, int h, int u, int v)
 {
 	if (u < 0) u = 0;
 	if (v < 0) v = 0;
@@ -23,7 +23,7 @@ getargb(byte *s, int w, int h, int u, int v)
 }
 
 static inline byte *
-getag(byte *s, int w, int h, int u, int v)
+getga(byte *s, int w, int h, int u, int v)
 {
 	if (u < 0) u = 0;
 	if (v < 0) v = 0;
@@ -49,14 +49,14 @@ lerp(int a, int b, int t)
 }
 
 static inline void
-lerpag(byte *dst, byte *a, byte *b, int t)
+lerpga(byte *dst, byte *a, byte *b, int t)
 {
 	dst[0] = lerp(a[0], b[0], t);
 	dst[1] = lerp(a[1], b[1], t);
 }
 
 static inline void
-lerpargb(byte *dst, byte *a, byte *b, int t)
+lerprgba(byte *dst, byte *a, byte *b, int t)
 {
 	dst[0] = lerp(a[0], b[0], t);
 	dst[1] = lerp(a[1], b[1], t);
@@ -81,7 +81,7 @@ samplemask(byte *s, int w, int h, int u, int v)
 }
 
 static inline void
-sampleag(byte *s, int w, int h, int u, int v, byte *out)
+samplega(byte *s, int w, int h, int u, int v, byte *out)
 {
 	byte ab[4];
 	byte cd[4];
@@ -89,17 +89,17 @@ sampleag(byte *s, int w, int h, int u, int v, byte *out)
 	int vi = v >> 16;
 	int ud = u & 0xFFFF;
 	int vd = v & 0xFFFF;
-	byte *a = getag(s, w, h, ui, vi);
-	byte *b = getag(s, w, h, ui+1, vi);
-	byte *c = getag(s, w, h, ui, vi+1);
-	byte *d = getag(s, w, h, ui+1, vi+1);
-	lerpag(ab, a, b, ud);
-	lerpag(cd, c, d, ud);
-	lerpag(out, ab, cd, vd);
+	byte *a = getga(s, w, h, ui, vi);
+	byte *b = getga(s, w, h, ui+1, vi);
+	byte *c = getga(s, w, h, ui, vi+1);
+	byte *d = getga(s, w, h, ui+1, vi+1);
+	lerpga(ab, a, b, ud);
+	lerpga(cd, c, d, ud);
+	lerpga(out, ab, cd, vd);
 }
 
 static inline void
-sampleargb(byte *s, int w, int h, int u, int v, byte *out)
+samplergba(byte *s, int w, int h, int u, int v, byte *out)
 {
 	byte ab[4];
 	byte cd[4];
@@ -107,13 +107,13 @@ sampleargb(byte *s, int w, int h, int u, int v, byte *out)
 	int vi = v >> 16;
 	int ud = u & 0xFFFF;
 	int vd = v & 0xFFFF;
-	byte *a = getargb(s, w, h, ui, vi);
-	byte *b = getargb(s, w, h, ui+1, vi);
-	byte *c = getargb(s, w, h, ui, vi+1);
-	byte *d = getargb(s, w, h, ui+1, vi+1);
-	lerpargb(ab, a, b, ud);
-	lerpargb(cd, c, d, ud);
-	lerpargb(out, ab, cd, vd);
+	byte *a = getrgba(s, w, h, ui, vi);
+	byte *b = getrgba(s, w, h, ui+1, vi);
+	byte *c = getrgba(s, w, h, ui, vi+1);
+	byte *d = getrgba(s, w, h, ui+1, vi+1);
+	lerprgba(ab, a, b, ud);
+	lerprgba(cd, c, d, ud);
+	lerprgba(out, ab, cd, vd);
 }
 
 static inline void
@@ -170,7 +170,7 @@ img_2o2(byte * restrict src, byte cov, int len, byte * restrict dst,
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	byte ag[2];
+	byte ga[2];
 
 	while (len--)
 	{
@@ -178,12 +178,12 @@ img_2o2(byte * restrict src, byte cov, int len, byte * restrict dst,
 		cov += *src; *src = 0; src++;
 		if (cov != 0)
 		{
-			sampleag(samples, w, h, u, v, ag);
-			sa = FZ_COMBINE(FZ_EXPAND(ag[0]), FZ_EXPAND(cov));
+			samplega(samples, w, h, u, v, ga);
+			sa = FZ_COMBINE(FZ_EXPAND(ga[1]), FZ_EXPAND(cov));
 			if (sa != 0)
 			{
-				dst[0] = FZ_BLEND(255, dst[0], sa);
-				dst[1] = FZ_BLEND(ag[1], dst[1], sa);
+				dst[0] = FZ_BLEND(ga[0], dst[0], sa);
+				dst[1] = FZ_BLEND(255, dst[1], sa);
 			}
 		}
 		dst += 2;
@@ -199,7 +199,7 @@ img_4o4(byte * restrict src, byte cov, int len, byte * restrict dst,
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	byte argb[4];
+	byte rgba[4];
 
 	while (len--)
 	{
@@ -207,14 +207,14 @@ img_4o4(byte * restrict src, byte cov, int len, byte * restrict dst,
 		cov += *src; *src = 0; src++;
 		if (cov != 0)
 		{
-			sampleargb(samples, w, h, u, v, argb);
-			sa = FZ_COMBINE(FZ_EXPAND(argb[0]), FZ_EXPAND(cov));
+			samplergba(samples, w, h, u, v, rgba);
+			sa = FZ_COMBINE(FZ_EXPAND(rgba[3]), FZ_EXPAND(cov));
 			if (sa != 0)
 			{
-				dst[0] = FZ_BLEND(255, dst[0], sa);
-				dst[1] = FZ_BLEND(argb[1], dst[1], sa);
-				dst[2] = FZ_BLEND(argb[2], dst[2], sa);
-				dst[3] = FZ_BLEND(argb[3], dst[3], sa);
+				dst[0] = FZ_BLEND(rgba[0], dst[0], sa);
+				dst[1] = FZ_BLEND(rgba[1], dst[1], sa);
+				dst[2] = FZ_BLEND(rgba[2], dst[2], sa);
+				dst[3] = FZ_BLEND(255, dst[3], sa);
 			}
 		}
 		dst += 4;
@@ -224,18 +224,18 @@ img_4o4(byte * restrict src, byte cov, int len, byte * restrict dst,
 }
 
 static void
-img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst,
+img_w2i1o2(byte *ga, byte * restrict src, byte cov, int len, byte * restrict dst,
 	fz_pixmap *image, int u, int v, int fa, int fb)
 {
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	int alpha = FZ_EXPAND(ag[0]);
-	byte g = ag[1];
+	byte g = ga[0];
+	byte a = ga[1];
 
-	if (alpha == 0)
+	if (a == 0)
 		return;
-	if (alpha != 256)
+	if (a != 255)
 	{
 		while (len--)
 		{
@@ -244,12 +244,12 @@ img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
-				ca = FZ_COMBINE(ca, alpha);
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
+				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[0] = FZ_BLEND(g, dst[0], ca);
+					dst[1] = FZ_BLEND(255, dst[1], ca);
 				}
 			}
 			dst += 2;
@@ -266,11 +266,11 @@ img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[0] = FZ_BLEND(g, dst[0], ca);
+					dst[1] = FZ_BLEND(255, dst[1], ca);
 				}
 			}
 			dst += 2;
@@ -281,20 +281,20 @@ img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst
 }
 
 static void
-img_w4i1o4(byte *argb, byte * restrict src, byte cov, int len, byte * restrict dst,
+img_w4i1o4(byte *rgba, byte * restrict src, byte cov, int len, byte * restrict dst,
 	fz_pixmap *image, int u, int v, int fa, int fb)
 {
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	int alpha = FZ_EXPAND(argb[0]);
-	byte r = argb[1];
-	byte g = argb[2];
-	byte b = argb[3];
+	byte r = rgba[0];
+	byte g = rgba[1];
+	byte b = rgba[2];
+	byte a = rgba[3];
 
-	if (alpha == 0)
+	if (a == 0)
 		return;
-	if (alpha != 256)
+	if (a != 255)
 	{
 		while (len--)
 		{
@@ -303,14 +303,14 @@ img_w4i1o4(byte *argb, byte * restrict src, byte cov, int len, byte * restrict d
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
-				ca = FZ_COMBINE(ca, alpha);
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
+				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(r, dst[1], ca);
-					dst[2] = FZ_BLEND(g, dst[2], ca);
-					dst[3] = FZ_BLEND(b, dst[3], ca);
+					dst[0] = FZ_BLEND(r, dst[0], ca);
+					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[2] = FZ_BLEND(b, dst[2], ca);
+					dst[3] = FZ_BLEND(255, dst[3], ca);
 				}
 			}
 			dst += 4;
@@ -327,13 +327,13 @@ img_w4i1o4(byte *argb, byte * restrict src, byte cov, int len, byte * restrict d
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(r, dst[1], ca);
-					dst[2] = FZ_BLEND(g, dst[2], ca);
-					dst[3] = FZ_BLEND(b, dst[3], ca);
+					dst[0] = FZ_BLEND(r, dst[0], ca);
+					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[2] = FZ_BLEND(b, dst[2], ca);
+					dst[3] = FZ_BLEND(255, dst[3], ca);
 				}
 			}
 			dst += 4;
diff --git a/draw/imageunpack.c b/draw/imageunpack.c
index 6a1a1628..48c420bf 100644
--- a/draw/imageunpack.c
+++ b/draw/imageunpack.c
@@ -19,19 +19,22 @@ static void decodetile(fz_pixmap *pix, int skip, float *decode)
 	int justinvert = 1;
 	unsigned int mask;
 
-	min[0] = 0;
-	max[0] = 255;
-	sub[0] = 255;
-
-	for (i = skip; i < n; i++)
+	for (i = 0; i < n-skip; i++)
 	{
-		min[i] = decode[(i - skip) * 2] * 255;
-		max[i] = decode[(i - skip) * 2 + 1] * 255;
+		min[i] = decode[i * 2] * 255;
+		max[i] = decode[i * 2 + 1] * 255;
 		sub[i] = max[i] - min[i];
 		needed |= (min[i] != 0) | (max[i] != 255);
 		justinvert &= min[i] == 255 && max[i] == 0 && sub[i] == -255;
 	}
 
+	if (skip)
+	{
+		min[i] = 0;
+		max[i] = 255;
+		sub[i] = 255;
+	}
+
 	if (fz_isbigendian())
 		mask = 0x00ff00ff;
 	else
@@ -115,8 +118,8 @@ static void init1(void)
 		{
 			x = tbit(bits, k);
 			t1pad0[i][k] = x;
-			t1pad1[i][k * 2 + 0] = 255;
-			t1pad1[i][k * 2 + 1] = x;
+			t1pad1[i][k * 2 + 0] = x;
+			t1pad1[i][k * 2 + 1] = 255;
 		}
 	}
 
@@ -178,7 +181,7 @@ static void loadtile1(byte * restrict src, int sw, byte * restrict dst, int dw,
 			dp = dst;
 			for (x = 0; x < w; x++)
 			{
-				if ((x % pad) == 0)
+				*dp++ = tbit(src, x);
+				if ((x % pad) == pad-1)
 					*dp++ = 255;
-				*dp++ = tbit(src, x);
 			}
@@ -204,14 +207,14 @@ static void loadtile1(byte * restrict src, int sw, byte * restrict dst, int dw,
 		while (h--) \
 		{ \
 			byte *dp = dst; \
-			tpad = 0; \
+			tpad = pad; \
 			for (x = 0; x < w; x++) \
 			{ \
-				if (!tpad--) { \
-					tpad = pad-1; \
+				*dp++ = getf(src, x); \
+				if (--tpad == 0) { \
+					tpad = pad; \
 					*dp++ = 255; \
 				} \
-				*dp++ = getf(src, x); \
 			} \
 			src += sw; \
 			dst += dw; \
@@ -247,8 +250,8 @@ static void loadtile8(byte * restrict src, int sw, byte * restrict dst, int dw,
 			int x;
 			for (x = w; x > 0; x --)
 			{
-				*dst++ = 255;
 				*dst++ = *src++;
+				*dst++ = 255;
 			}
 			src += sw;
 			dst += dw;
@@ -263,10 +266,10 @@ static void loadtile8(byte * restrict src, int sw, byte * restrict dst, int dw,
 			int x;
 			for (x = w; x > 0; x -= 3)
 			{
-				*dp++ = 255;
 				*dp++ = *src++;
 				*dp++ = *src++;
 				*dp++ = *src++;
+				*dp++ = 255;
 			}
 			src += sw;
 			dst += dw;
@@ -278,16 +281,16 @@ static void loadtile8(byte * restrict src, int sw, byte * restrict dst, int dw,
 		while (h--)
 		{
 			byte *dp = dst;
-			int tpad = 1;
+			int tpad = pad;
 			int x;
 			for (x = w; x > 0; x--)
 			{
+				*dp++ = *src++;
 				tpad--;
 				if (tpad == 0) {
 					tpad = pad;
 					*dp++ = 255;
 				}
-				*dp++ = *src++;
 			}
 			src += sw;
 			dst += dw;
diff --git a/draw/meshdraw.c b/draw/meshdraw.c
index c0e219c4..72a723e0 100644
--- a/draw/meshdraw.c
+++ b/draw/meshdraw.c
@@ -147,12 +147,12 @@ drawscan(fz_pixmap *pix, int y, int x1, int x2, int *v1, int *v2, int n)
 
 	while (w--)
 	{
-		*p++ = 255;
 		for (k = 0; k < n; k++)
 		{
 			*p++ = v[k] >> 16;
 			v[k] += dv[k];
 		}
+		*p++ = 255;
 	}
 }
 
@@ -376,11 +376,11 @@ fz_rendershade(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
 			d = dest->samples + ((bbox.x0 - dest->x) + (y - dest->y) * dest->w) * dest->n;
 			for (x = bbox.x0; x < bbox.x1; x++)
 			{
-				sa = s[0];
+				sa = s[1];
 				ssa = 255 - sa;
-				d[0] = s[0] + fz_mul255(d[0], ssa);
 				for (k = 0; k < dest->colorspace->n; k++)
-					d[k+1] = fz_mul255(clut[s[1]][k], sa) + fz_mul255(d[k+1], ssa);
+					d[k] = fz_mul255(clut[s[0]][k], sa) + fz_mul255(d[k], ssa);
+				d[k] = s[1] + fz_mul255(d[k], ssa);
 				s += 2;
 				d += 1 + dest->colorspace->n;
 			}
diff --git a/draw/porterduff.c b/draw/porterduff.c
index fd7a9d45..05f00f63 100644
--- a/draw/porterduff.c
+++ b/draw/porterduff.c
@@ -1,5 +1,74 @@
 #include "fitz.h"
 
+/*
+ * The functions in this file implement various flavours of Porter-Duff
+ * blending.
+ *
+ * We take the following as definitions:
+ *
+ *  Cx =          Color (from plane x)
+ *  ax =          Alpha (from plane x)
+ *  cx = Cx.ax  = Premultiplied color (from plane x)
+ *
+ * The general PorterDuff blending equation is:
+ *
+ * Blend Z = X op Y     cz = Fx.cx + Fy. cy    where Fx and Fy depend on op
+ *
+ * The two operations we use in this file are: '(X in Y) over Z' and
+ * 'S over Z'. The definitions of the 'over' and 'in' operations are as
+ * follows:
+ *
+ * For S over Z,    Fs = 1,  Fz = 1-as
+ * For X in Y,      Fx = ay, Fy = 0
+ *
+ * We have 2 choices; we can either work with premultiplied data, or non
+ * premultiplied data. Our code uses premultiplied data; both forms follow.
+ *
+ * First the premultiplied case:
+ *
+ * Let S = (X in Y)
+ * Let R = (X in Y) over Z = S over Z
+ *
+ * cs = cx.Fx + cy.Fy           (where Fx = ay, Fy = 0)
+ *    = cx.ay
+ * as = ax.Fx + ay.Fy
+ *    = ax.ay
+ *
+ * cr = cs.Fs + cz.Fz           (where Fs = 1, Fz = 1-as)
+ *    = cs    + cz.(1-as)
+ *    = cx.ay + cz.(1-ax.ay)
+ * ar = as.Fs + az.Fz
+ *    = as    + az.(1-as)
+ *    = ax.ay + az.(1-ax.ay)
+ *
+ * This has various nice properties, like not needing any divisions, and
+ * being symmetric in color and alpha, so this is what we use. Because we
+ * went through the pain of deriving the non premultiplied forms, we list
+ * them here too, though they are not used.
+ *
+ * Non Pre-multiplied case:
+ *
+ * Cs.as =  Fx.Cx.ax + Fy.Cy.ay           (where Fx = ay, Fy = 0)
+ *       =  Cx.ay.ax
+ * Cs    = (Cx.ay.ax)/(ay.ax)
+ *       =  Cx
+ * Cr.ar =  Fs.Cs.as + Fz.Cz.az           (where Fs = 1, Fz = 1-as)
+ *       =  Cs.as    + (1-as).Cz.az
+ *       =  Cx.ax.ay + Cz.az.(1-ax.ay)
+ * Cr    = (Cx.ax.ay + Cz.az.(1-ax.ay))/(ax.ay + az.(1-ax.ay))
+ *
+ * Much more complex, it seems. However, if we could restrict ourselves to
+ * the case where we were always plotting onto an opaque background (i.e.
+ * az = 1), then:
+ *
+ * Cr = Cx.(ax.ay) + Cz.(1-ax.ay)
+ *    = (Cx-Cz)*(ax.ay) + Cz              (a single MLA operation)
+ * ar = 1
+ *
+ * Sadly, this is not true in the general case, so we abandon this effort
+ * and stick to using the premultiplied form.
+ */
+
 typedef unsigned char byte;
 
 /*
@@ -11,6 +80,7 @@ static void
 duff_non(byte * restrict sp, int sw, int sn, byte * restrict dp, int dw, int w0, int h)
 {
 	int k;
+
 	sw -= w0*sn;
 	dw -= w0*sn;
 	while (h--)
@@ -18,12 +88,10 @@ duff_non(byte * restrict sp, int sw, int sn, byte * restrict dp, int dw, int w0,
 		int w = w0;
 		while (w--)
 		{
-			/* RJW: Alpha handling suspicious here; sp[0] counts twice */
-			int sa = FZ_EXPAND(sp[0]);
-			dp[0] = FZ_BLEND(255, dp[0], sa);
-			for (k = 1; k < sn; k++)
+			int ssa = 255 - sp[sn-1];
+			for (k = 0; k < sn; k++)
 			{
-				dp[k] = FZ_BLEND(sp[k], dp[k], sa);
+				dp[k] = sp[k] + fz_mul255(dp[k], ssa);
 			}
 			sp += sn;
 			dp += sn;
@@ -38,6 +106,7 @@ static void
 duff_nimon(byte * restrict sp, int sw, int sn, byte * restrict mp, int mw, int mn, byte * restrict dp, int dw, int w0, int h)
 {
 	int k;
+
 	sw -= w0*sn;
 	mw -= w0*mn;
 	dw -= w0*sn;
@@ -46,12 +115,11 @@ duff_nimon(byte * restrict sp, int sw, int sn, byte * restrict mp, int mw, int m
 		int w = w0;
 		while (w--)
 		{
-			/* TODO: validate this */
-			int ma = FZ_COMBINE(FZ_EXPAND(mp[0]), FZ_EXPAND(sp[0]));
-			dp[0] = FZ_BLEND(255, dp[0], ma);
-			for (k = 1; k < sn; k++)
+                        int ma = mp[0];
+			int ssa = 255-fz_mul255(sp[sn-1], ma);
+			for (k = 0; k < sn; k++)
 			{
-				dp[k] = FZ_BLEND(sp[k], dp[k], ma);
+				dp[k] = fz_mul255(sp[k], ma) + fz_mul255(dp[k], ssa);
 			}
 			sp += sn;
 			mp += mn;
@@ -64,22 +132,22 @@ duff_nimon(byte * restrict sp, int sw, int sn, byte * restrict mp, int mw, int m
 }
 
 static void
-duff_1o1(byte * restrict sp, int sw, byte * restrict dp, int dw, int w0, int h)
+duff_1o1(byte * restrict sp0, int sw, byte * restrict dp0, int dw, int w0, int h)
 {
 	/* duff_non(sp0, sw, 1, dp0, dw, w0, h); */
-	sw -= w0;
-	dw -= w0;
 	while (h--)
 	{
+		byte *sp = sp0;
+		byte *dp = dp0;
 		int w = w0;
 		while (w--)
 		{
-			dp[0] = FZ_BLEND(255, dp[0], FZ_EXPAND(sp[0]));
+			dp[0] = sp[0] + fz_mul255(dp[0], 255 - sp[0]);
 			sp ++;
 			dp ++;
 		}
-		sp += sw;
-		dp += dw;
+		sp0 += sw;
+		dp0 += dw;
 	}
 }
 
@@ -87,6 +155,7 @@ static void
 duff_4o4(byte *sp, int sw, byte *dp, int dw, int w0, int h)
 {
 	/* duff_non(sp0, sw, 4, dp0, dw, w0, h); */
+
 	sw -= w0<<2;
 	dw -= w0<<2;
 	while (h--)
@@ -94,11 +163,11 @@ duff_4o4(byte *sp, int sw, byte *dp, int dw, int w0, int h)
 		int w = w0;
 		while (w--)
 		{
-			int alpha = FZ_EXPAND(sp[0]);
-			dp[0] = FZ_BLEND(255, dp[0], alpha);
-			dp[1] = FZ_BLEND(sp[1], dp[1], alpha);
-			dp[2] = FZ_BLEND(sp[2], dp[2], alpha);
-			dp[3] = FZ_BLEND(sp[3], dp[3], alpha);
+			byte ssa = 255 - sp[3];
+			dp[0] = sp[0] + fz_mul255(dp[0], ssa);
+			dp[1] = sp[1] + fz_mul255(dp[1], ssa);
+			dp[2] = sp[2] + fz_mul255(dp[2], ssa);
+			dp[3] = sp[3] + fz_mul255(dp[3], ssa);
 			sp += 4;
 			dp += 4;
 		}
@@ -111,16 +180,16 @@ static void
 duff_1i1o1(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restrict dp, int dw, int w0, int h)
 {
 	/* duff_nimon(sp0, sw, 1, mp0, mw, 1, dp0, dw, w0, h); */
-	sw -= w0;
-	mw -= w0;
-	dw -= w0;
+
 	while (h--)
 	{
 		int w = w0;
 		while (w--)
 		{
-			int ma = FZ_COMBINE(FZ_EXPAND(mp[0]), FZ_EXPAND(sp[0]));
-			dp[0] = FZ_BLEND(255, dp[0], ma);
+			byte ma = mp[0];
+			byte sa = fz_mul255(sp[0], ma);
+			byte ssa = 255 - sa;
+			dp[0] = sa + fz_mul255(dp[0], ssa);
 			sp ++;
 			mp ++;
 			dp ++;
@@ -132,9 +201,37 @@ duff_1i1o1(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restri
 }
 
 static void
+duff_2i1o2(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restrict dp, int dw, int w0, int h)
+{
+	/* (S in M) over D for 2-byte premultiplied pixels and a 1-byte mask; same as: */
+	/* duff_nimon(sp, sw, 2, mp, mw, 1, dp, dw, w0, h); */
+	sw -= w0<<1;
+	dw -= w0<<1;
+	mw -= w0;
+	while (h--)
+	{
+		int w = w0;
+		while (w--)
+		{
+			byte ma = mp[0];
+			byte ssa = 255 - fz_mul255(sp[1], ma);
+			dp[0] = fz_mul255(sp[0], ma) + fz_mul255(dp[0], ssa);
+			dp[1] = fz_mul255(sp[1], ma) + fz_mul255(dp[1], ssa);
+			sp += 2;
+			mp += 1;
+			dp += 2;
+		}
+		sp += sw;
+		mp += mw;
+		dp += dw;
+	}
+}
+
+static void
 duff_4i1o4(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restrict dp, int dw, int w0, int h)
 {
 	/* duff_nimon(sp, sw, 4, mp, mw, 1, dp, dw, w0, h); */
+
 	sw -= w0<<2;
 	dw -= w0<<2;
 	mw -= w0;
@@ -143,11 +240,12 @@ duff_4i1o4(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restri
 		int w = w0;
 		while (w--)
 		{
-			int ma = FZ_COMBINE(FZ_EXPAND(mp[0]), FZ_EXPAND(sp[0]));
-			dp[0] = FZ_BLEND(255, dp[0], ma);
-			dp[1] = FZ_BLEND(sp[1], dp[1], ma);
-			dp[2] = FZ_BLEND(sp[2], dp[2], ma);
-			dp[3] = FZ_BLEND(sp[3], dp[3], ma);
+			byte ma = mp[0];
+			byte ssa = 255 - fz_mul255(sp[3], ma);
+			dp[0] = fz_mul255(sp[0], ma) + fz_mul255(dp[0], ssa);
+			dp[1] = fz_mul255(sp[1], ma) + fz_mul255(dp[1], ssa);
+			dp[2] = fz_mul255(sp[2], ma) + fz_mul255(dp[2], ssa);
+			dp[3] = fz_mul255(sp[3], ma) + fz_mul255(dp[3], ssa);
 			sp += 4;
 			mp += 1;
 			dp += 4;
@@ -176,39 +274,39 @@ path_1o1(byte * restrict src, byte cov, int len, byte * restrict dst)
 }
 
 static void
-path_w2i1o2(byte * restrict ag, byte * restrict src, byte cov, int len, byte * restrict dst)
+path_w2i1o2(byte * restrict ga, byte * restrict src, byte cov, int len, byte * restrict dst)
 {
-	int alpha = FZ_EXPAND(ag[0]);
-	byte g = ag[1];
+	byte g = ga[0];
+	int a = FZ_EXPAND(ga[1]);
 
 	while (len--)
 	{
 		int ca;
 		cov += *src; *src = 0; src++;
-		ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
-		dst[0] = FZ_BLEND(255, dst[0], ca);
-		dst[1] = FZ_BLEND(g, dst[1], ca);
+		ca = FZ_COMBINE(FZ_EXPAND(cov), a);
+		dst[0] = FZ_BLEND(g, dst[0], ca);
+		dst[1] = FZ_BLEND(255, dst[1], ca);
 		dst += 2;
 	}
 }
 
 static void
-path_w4i1o4(byte * restrict argb, byte * restrict src, byte cov, int len, byte * restrict dst)
+path_w4i1o4(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst)
 {
-	int alpha = FZ_EXPAND(argb[0]);
-	byte r = argb[1];
-	byte g = argb[2];
-	byte b = argb[3];
+	byte r = rgba[0];
+	byte g = rgba[1];
+	byte b = rgba[2];
+	int a = FZ_EXPAND(rgba[3]);
 
 	while (len--)
 	{
 		int ca;
 		cov += *src; *src = 0; src++;
-		ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
-		dst[0] = FZ_BLEND(255, dst[0], ca);
-		dst[1] = FZ_BLEND(r, dst[1], ca);
-		dst[2] = FZ_BLEND(g, dst[2], ca);
-		dst[3] = FZ_BLEND(b, dst[3], ca);
+		ca = FZ_COMBINE(FZ_EXPAND(cov), a);
+		dst[0] = FZ_BLEND(r, dst[0], ca);
+		dst[1] = FZ_BLEND(g, dst[1], ca);
+		dst[2] = FZ_BLEND(b, dst[2], ca);
+		dst[3] = FZ_BLEND(255, dst[3], ca);
 		dst += 4;
 	}
 }
@@ -220,6 +318,7 @@ path_w4i1o4(byte * restrict argb, byte * restrict src, byte cov, int len, byte *
 static void
 text_1o1(byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
 {
+
 	srcw -= w0;
 	dstw -= w0;
 	while (h--)
@@ -227,8 +326,7 @@ text_1o1(byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, i
 		int w = w0;
 		while (w--)
 		{
-			int c = FZ_EXPAND(src[0]);
-			dst[0] = FZ_BLEND(255, dst[0], c);
+			dst[0] = src[0] + fz_mul255(dst[0], 255 - src[0]);
 			src++;
 			dst++;
 		}
@@ -238,10 +336,10 @@ text_1o1(byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, i
 }
 
 static void
-text_w2i1o2(byte * restrict ag, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
+text_w2i1o2(byte * restrict ga, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
 {
-	int alpha = FZ_EXPAND(ag[0]);
-	byte g = ag[1];
+	byte g = ga[0];
+	int a = FZ_EXPAND(ga[1]);
 
 	srcw -= w0;
 	dstw -= w0<<1;
@@ -250,9 +348,9 @@ text_w2i1o2(byte * restrict ag, byte * restrict src, int srcw, byte * restrict d
 		int w = w0;
 		while (w--)
 		{
-			int c = FZ_COMBINE(FZ_EXPAND(src[0]), alpha);
-			dst[0] = FZ_BLEND(255, dst[0], c);
-			dst[1] = FZ_BLEND(g, dst[1], c);
+			int c = FZ_COMBINE(FZ_EXPAND(src[0]), a);
+			dst[0] = FZ_BLEND(g, dst[0], c);
+			dst[1] = FZ_BLEND(255, dst[1], c);
 			src ++;
 			dst += 2;
 		}
@@ -262,12 +360,12 @@ text_w2i1o2(byte * restrict ag, byte * restrict src, int srcw, byte * restrict d
 }
 
 static void
-text_w4i1o4(byte * restrict argb, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
+text_w4i1o4(byte * restrict rgba, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
 {
-	int alpha = FZ_EXPAND(argb[0]);
-	byte r = argb[1];
-	byte g = argb[2];
-	byte b = argb[3];
+	byte r = rgba[0];
+	byte g = rgba[1];
+	byte b = rgba[2];
+	int a = FZ_EXPAND(rgba[3]);
 
 	srcw -= w0;
 	dstw -= w0<<2;
@@ -276,11 +374,11 @@ text_w4i1o4(byte * restrict argb, byte * restrict src, int srcw, byte * restrict
 		int w = w0;
 		while (w--)
 		{
-			int c = FZ_COMBINE(FZ_EXPAND(src[0]), alpha);
-			dst[0] = FZ_BLEND(255, dst[0], c);
-			dst[1] = FZ_BLEND(r, dst[1], c);
-			dst[2] = FZ_BLEND(g, dst[2], c);
-			dst[3] = FZ_BLEND(b, dst[3], c);
+			int c = FZ_COMBINE(FZ_EXPAND(src[0]), a);
+			dst[0] = FZ_BLEND(r, dst[0], c);
+			dst[1] = FZ_BLEND(g, dst[1], c);
+			dst[2] = FZ_BLEND(b, dst[2], c);
+			dst[3] = FZ_BLEND(255, dst[3], c);
 			src ++;
 			dst += 4;
 		}
@@ -298,6 +396,7 @@ void (*fz_duff_nimon)(byte*,int,int,byte*,int,int,byte*,int,int,int) = duff_nimo
 void (*fz_duff_1o1)(byte*,int,byte*,int,int,int) = duff_1o1;
 void (*fz_duff_4o4)(byte*,int,byte*,int,int,int) = duff_4o4;
 void (*fz_duff_1i1o1)(byte*,int,byte*,int,byte*,int,int,int) = duff_1i1o1;
+void (*fz_duff_2i1o2)(byte*,int,byte*,int,byte*,int,int,int) = duff_2i1o2;
 void (*fz_duff_4i1o4)(byte*,int,byte*,int,byte*,int,int,int) = duff_4i1o4;
 
 void (*fz_path_1o1)(byte*,byte,int,byte*) = path_1o1;
diff --git a/fitz/dev_draw.c b/fitz/dev_draw.c
index 871c41c1..6050c09f 100644
--- a/fitz/dev_draw.c
+++ b/fitz/dev_draw.c
@@ -59,6 +59,8 @@ blendmaskover(fz_pixmap *src, fz_pixmap *msk, fz_pixmap *dst)
 
 	if (src->n == 1 && msk->n == 1 && dst->n == 1)
 		fz_duff_1i1o1(sp, src->w, mp, msk->w, dp, dst->w, w, h);
+	else if (src->n == 2 && msk->n == 1 && dst->n == 2)
+		fz_duff_2i1o2(sp, src->w * 2, mp, msk->w, dp, dst->w * 2, w, h);
 	else if (src->n == 4 && msk->n == 1 && dst->n == 4)
 		fz_duff_4i1o4(sp, src->w * 4, mp, msk->w, dp, dst->w * 4, w, h);
 	else if (src->n == dst->n)
@@ -76,8 +78,8 @@ fz_drawfillpath(void *user, fz_path *path, int evenodd, fz_matrix ctm,
 	fz_drawdevice *dev = user;
 	float expansion = fz_matrixexpansion(ctm);
 	float flatness = 0.3f / expansion;
-	unsigned char argb[FZ_MAXCOLORS + 1];
-	float rgb[FZ_MAXCOLORS];
+	unsigned char colorbv[FZ_MAXCOLORS + 1];
+	float colorfv[FZ_MAXCOLORS];
 	fz_bbox bbox;
 	int i;
 
@@ -93,11 +95,11 @@ fz_drawfillpath(void *user, fz_path *path, int evenodd, fz_matrix ctm,
 
 	if (dev->model)
 	{
-		fz_convertcolor(colorspace, color, dev->model, rgb);
-		argb[0] = alpha * 255;
+		fz_convertcolor(colorspace, color, dev->model, colorfv);
 		for (i = 0; i < dev->model->n; i++)
-			argb[i + 1] = rgb[i] * 255;
-		fz_scanconvert(dev->gel, dev->ael, evenodd, bbox, dev->dest, argb, nil, nil);
+			colorbv[i] = colorfv[i] * 255;
+		colorbv[i] = alpha * 255;
+		fz_scanconvert(dev->gel, dev->ael, evenodd, bbox, dev->dest, colorbv, nil, nil);
 	}
 	else
 	{
@@ -113,8 +115,8 @@ fz_drawstrokepath(void *user, fz_path *path, fz_strokestate *stroke, fz_matrix c
 	float expansion = fz_matrixexpansion(ctm);
 	float flatness = 0.3f / expansion;
 	float linewidth = stroke->linewidth;
-	unsigned char argb[FZ_MAXCOLORS + 1];
-	float rgb[FZ_MAXCOLORS];
+	unsigned char colorbv[FZ_MAXCOLORS + 1];
+	float colorfv[FZ_MAXCOLORS];
 	fz_bbox bbox;
 	int i;
 
@@ -136,11 +138,11 @@ fz_drawstrokepath(void *user, fz_path *path, fz_strokestate *stroke, fz_matrix c
 
 	if (dev->model)
 	{
-		fz_convertcolor(colorspace, color, dev->model, rgb);
-		argb[0] = alpha * 255;
+		fz_convertcolor(colorspace, color, dev->model, colorfv);
 		for (i = 0; i < dev->model->n; i++)
-			argb[i + 1] = rgb[i] * 255;
-		fz_scanconvert(dev->gel, dev->ael, 0, bbox, dev->dest, argb, nil, nil);
+			colorbv[i] = colorfv[i] * 255;
+		colorbv[i] = alpha * 255;
+		fz_scanconvert(dev->gel, dev->ael, 0, bbox, dev->dest, colorbv, nil, nil);
 	}
 	else
 	{
@@ -243,7 +245,7 @@ fz_drawclipstrokepath(void *user, fz_path *path, fz_strokestate *stroke, fz_matr
 }
 
 static void
-drawglyph(unsigned char *argb, fz_pixmap *dst, fz_pixmap *src, int xorig, int yorig, fz_bbox scissor)
+drawglyph(unsigned char *colorbv, fz_pixmap *dst, fz_pixmap *src, int xorig, int yorig, fz_bbox scissor)
 {
 	unsigned char *dp, *sp;
 	int w, h;
@@ -281,10 +283,10 @@ drawglyph(unsigned char *argb, fz_pixmap *dst, fz_pixmap *src, int xorig, int yo
 		switch (dst->n)
 		{
 		case 2:
-			fz_text_w2i1o2(argb, sp, src->w, dp, dst->w * 2, w, h);
+			fz_text_w2i1o2(colorbv, sp, src->w, dp, dst->w * 2, w, h);
 			break;
 		case 4:
-			fz_text_w4i1o4(argb, sp, src->w, dp, dst->w * 4, w, h);
+			fz_text_w4i1o4(colorbv, sp, src->w, dp, dst->w * 4, w, h);
 			break;
 		default:
 			assert("Write fz_text_wni1on" != NULL);
@@ -300,18 +302,18 @@ fz_drawfilltext(void *user, fz_text *text, fz_matrix ctm,
 	fz_colorspace *colorspace, float *color, float alpha)
 {
 	fz_drawdevice *dev = user;
-	unsigned char argb[FZ_MAXCOLORS + 1];
-	float rgb[FZ_MAXCOLORS];
+	unsigned char colorbv[FZ_MAXCOLORS + 1];
+	float colorfv[FZ_MAXCOLORS];
 	fz_matrix tm, trm;
 	fz_pixmap *glyph;
 	int i, x, y, gid;
 
 	if (dev->model)
 	{
-		fz_convertcolor(colorspace, color, dev->model, rgb);
-		argb[0] = alpha * 255;
+		fz_convertcolor(colorspace, color, dev->model, colorfv);
 		for (i = 0; i < dev->model->n; i++)
-			argb[i + 1] = rgb[i] * 255;
+			colorbv[i] = colorfv[i] * 255;
+		colorbv[i] = alpha * 255;
 	}
 
 	tm = text->trm;
@@ -334,7 +336,7 @@ fz_drawfilltext(void *user, fz_text *text, fz_matrix ctm,
 		if (glyph)
 		{
 			if (dev->model)
-				drawglyph(argb, dev->dest, glyph, x, y, dev->scissor);
+				drawglyph(colorbv, dev->dest, glyph, x, y, dev->scissor);
 			else
 				drawglyph(nil, dev->dest, glyph, x, y, dev->scissor);
 			fz_droppixmap(glyph);
@@ -448,8 +450,8 @@ fz_drawfillshade(void *user, fz_shade *shade, fz_matrix ctm)
 	fz_pixmap *dest = dev->dest;
 	fz_rect bounds;
 	fz_bbox bbox;
-	float rgb[FZ_MAXCOLORS];
-	unsigned char argb[FZ_MAXCOLORS + 1];
+	float colorfv[FZ_MAXCOLORS];
+	unsigned char colorbv[FZ_MAXCOLORS + 1];
 	unsigned char *s;
 	int x, y;
 
@@ -470,17 +472,18 @@ fz_drawfillshade(void *user, fz_shade *shade, fz_matrix ctm)
 	{
 		/* FIXME: Could use optimisation */
 		int i, n = dev->model->n + 1;
-		fz_convertcolor(shade->cs, shade->background, dev->model, rgb);
-		argb[n] = 255;
+		/* colorbv is stored reversed: alpha at [0], first colorant at [n-1] */
+		fz_convertcolor(shade->cs, shade->background, dev->model, colorfv);
+		colorbv[0] = 255;
 		for (i = 1; i < n; i++)
-			argb[n-i] = rgb[i-1] * 255;
+			colorbv[n-i] = colorfv[i-1] * 255;
 		for (y = bbox.y0; y < bbox.y1; y++)
 		{
 			s = dest->samples + ((bbox.x0 - dest->x) + (y - dest->y) * dest->w) * dest->n;
 			for (x = bbox.x0; x < bbox.x1; x++)
 			{
 				for (i = n; i > 0; i--)
-					*s++ = argb[i];
+					*s++ = colorbv[i-1];
 			}
 		}
 	}
@@ -587,8 +590,8 @@ fz_drawfillimagemask(void *user, fz_pixmap *image, fz_matrix ctm,
 	fz_colorspace *colorspace, float *color, float alpha)
 {
 	fz_drawdevice *dev = user;
-	unsigned char argb[FZ_MAXCOLORS + 1];
-	float rgb[FZ_MAXCOLORS];
+	unsigned char colorbv[FZ_MAXCOLORS + 1];
+	float colorfv[FZ_MAXCOLORS];
 	fz_bbox bbox;
 	int dx, dy;
 	fz_pixmap *scaled = nil;
@@ -608,11 +611,11 @@ fz_drawfillimagemask(void *user, fz_pixmap *image, fz_matrix ctm,
 
 	if (dev->dest->colorspace)
 	{
-		fz_convertcolor(colorspace, color, dev->model, rgb);
-		argb[0] = alpha * 255;
+		fz_convertcolor(colorspace, color, dev->model, colorfv);
 		for (i = 0; i < dev->model->n; i++)
-			argb[i + 1] = rgb[i] * 255;
-		fz_scanconvert(dev->gel, dev->ael, 0, bbox, dev->dest, argb, image, &invmat);
+			colorbv[i] = colorfv[i] * 255;
+		colorbv[i] = alpha * 255;
+		fz_scanconvert(dev->gel, dev->ael, 0, bbox, dev->dest, colorbv, image, &invmat);
 	}
 	else
 	{
diff --git a/fitz/filt_predict.c b/fitz/filt_predict.c
index a39231fb..07470647 100644
--- a/fitz/filt_predict.c
+++ b/fitz/filt_predict.c
@@ -165,8 +165,8 @@ fz_predictpng(fz_predict *p, unsigned char *in, unsigned char *out, int predicto
 		}
 		for (i = p->stride - bpp; i > 0; i--)
 		{
-			 *out = *in++ + out[-bpp];
-			 out++;
+			*out = *in++ + out[-bpp];
+			out++;
 		}
 		break;
 	case 2:
diff --git a/fitz/fitz_base.h b/fitz/fitz_base.h
index 4d48ac7d..c3b1f4a2 100644
--- a/fitz/fitz_base.h
+++ b/fitz/fitz_base.h
@@ -39,8 +39,6 @@
 
 extern int gettimeofday(struct timeval *tv, struct timezone *tz);
 
-#define restrict
-
 #define inline __inline
 
 #define __func__ __FUNCTION__
@@ -53,14 +51,18 @@ extern int gettimeofday(struct timeval *tv, struct timezone *tz);
 #define vsnprintf _vsnprintf
 #endif
 
-#else /* C99 or close enough */
+#else /* unix or close enough */
+
+#include <unistd.h>
+
+#endif
 
+#ifndef _C99
 #ifdef __GNUC__
 #define restrict __restrict
+#else
+#define restrict
 #endif
-
-#include <unistd.h>
-
 #endif
 
 /*
diff --git a/fitz/fitz_draw.h b/fitz/fitz_draw.h
index abd94a1e..0e13a831 100644
--- a/fitz/fitz_draw.h
+++ b/fitz/fitz_draw.h
@@ -38,13 +38,14 @@ typedef enum fz_blendkind_e
 } fz_blendkind;
 
 /*
-pixmaps have n components per pixel. the first is always alpha.
+pixmaps have n components per pixel. the last is always alpha.
 premultiplied alpha when rendering, but non-premultiplied for colorspace
 conversions and rescaling.
 */
 
 extern fz_colorspace *pdf_devicegray;
 extern fz_colorspace *pdf_devicergb;
+extern fz_colorspace *pdf_devicebgr;
 extern fz_colorspace *pdf_devicecmyk;
 extern fz_colorspace *pdf_devicelab;
 extern fz_colorspace *pdf_devicepattern;
@@ -451,7 +452,7 @@ fz_ael * fz_newael(void);
 void fz_freeael(fz_ael *ael);
 
 fz_error fz_scanconvert(fz_gel *gel, fz_ael *ael, int eofill,
-	fz_bbox clip, fz_pixmap *pix, unsigned char *argb, fz_pixmap *image, fz_matrix *invmat);
+	fz_bbox clip, fz_pixmap *pix, unsigned char *colorbv, fz_pixmap *image, fz_matrix *invmat);
 
 void fz_fillpath(fz_gel *gel, fz_path *path, fz_matrix ctm, float flatness);
 void fz_strokepath(fz_gel *gel, fz_path *path, fz_strokestate *stroke, fz_matrix ctm, float flatness, float linewidth);
@@ -484,6 +485,7 @@ extern void (*fz_duff_nimon)(unsigned char*,int,int,unsigned char*,int,int,unsig
 extern void (*fz_duff_1o1)(unsigned char*,int,unsigned char*,int,int,int);
 extern void (*fz_duff_4o4)(unsigned char*,int,unsigned char*,int,int,int);
 extern void (*fz_duff_1i1o1)(unsigned char*,int,unsigned char*,int,unsigned char*,int,int,int);
+extern void (*fz_duff_2i1o2)(unsigned char*,int,unsigned char*,int,unsigned char*,int,int,int);
 extern void (*fz_duff_4i1o4)(unsigned char*,int,unsigned char*,int,unsigned char*,int,int,int);
 
 extern void (*fz_path_1o1)(unsigned char*,unsigned char,int,unsigned char*);
diff --git a/fitz/res_colorspace.c b/fitz/res_colorspace.c
index c30ea7c4..79ca8ff4 100644
--- a/fitz/res_colorspace.c
+++ b/fitz/res_colorspace.c
@@ -72,8 +72,6 @@ fz_stdconvpixmap(fz_colorspace *srcs, fz_pixmap *src, fz_colorspace *dsts, fz_pi
 	{
 		for (x = 0; x < src->w; x++)
 		{
-			*d++ = *s++;
-
 			for (k = 0; k < src->n - 1; k++)
 				srcv[k] = *s++ / 255.0f;
 
@@ -81,6 +79,8 @@ fz_stdconvpixmap(fz_colorspace *srcs, fz_pixmap *src, fz_colorspace *dsts, fz_pi
 
 			for (k = 0; k < dst->n - 1; k++)
 				*d++ = dstv[k] * 255;
+
+			*d++ = *s++;
 		}
 	}
 }
diff --git a/mupdf/pdf_colorspace.c b/mupdf/pdf_colorspace.c
index d04cad4b..29be7139 100644
--- a/mupdf/pdf_colorspace.c
+++ b/mupdf/pdf_colorspace.c
@@ -30,8 +30,8 @@ static void fastgraytorgb(fz_pixmap *src, fz_pixmap *dst)
 	while (n--)
 	{
 		d[0] = s[0];
-		d[1] = s[1];
-		d[2] = s[1];
+		d[1] = s[0];
+		d[2] = s[0];
 		d[3] = s[1];
 		s += 2;
 		d += 4;
@@ -45,10 +45,10 @@ static void fastgraytocmyk(fz_pixmap *src, fz_pixmap *dst)
 	int n = src->w * src->h;
 	while (n--)
 	{
-		d[0] = s[0];
+		d[0] = 0;
 		d[1] = 0;
 		d[2] = 0;
-		d[3] = 0;
+		d[3] = s[0];
 		d[4] = s[1];
 		s += 2;
 		d += 5;
@@ -62,8 +62,22 @@ static void fastrgbtogray(fz_pixmap *src, fz_pixmap *dst)
 	int n = src->w * src->h;
 	while (n--)
 	{
-		d[0] = s[0];
-		d[1] = ((s[1]+1) * 77 + (s[2]+1) * 150 + (s[3]+1) * 28) >> 8;
+		d[0] = ((s[0]+1) * 77 + (s[1]+1) * 150 + (s[2]+1) * 28) >> 8;
+		d[1] = s[3];
+		s += 4;
+		d += 2;
+	}
+}
+
+static void fastbgrtogray(fz_pixmap *src, fz_pixmap *dst)
+{
+	unsigned char *s = src->samples;
+	unsigned char *d = dst->samples;
+	int n = src->w * src->h;
+	while (n--)
+	{
+		d[0] = ((s[0]+1) * 28 + (s[1]+1) * 150 + (s[2]+1) * 77) >> 8;
+		d[1] = s[3];
 		s += 4;
 		d += 2;
 	}
@@ -76,15 +90,36 @@ static void fastrgbtocmyk(fz_pixmap *src, fz_pixmap *dst)
 	int n = src->w * src->h;
 	while (n--)
 	{
-		unsigned char c = 255 - s[1];
-		unsigned char m = 255 - s[2];
-		unsigned char y = 255 - s[3];
+		unsigned char c = 255 - s[0];
+		unsigned char m = 255 - s[1];
+		unsigned char y = 255 - s[2];
 		unsigned char k = MIN(c, MIN(m, y));
-		d[0] = s[0];
-		d[1] = c - k;
-		d[2] = m - k;
-		d[3] = y - k;
-		d[4] = k;
+		d[0] = c - k;
+		d[1] = m - k;
+		d[2] = y - k;
+		d[3] = k;
+		d[4] = s[3];
+		s += 4;
+		d += 5;
+	}
+}
+
+static void fastbgrtocmyk(fz_pixmap *src, fz_pixmap *dst)
+{
+	unsigned char *s = src->samples;
+	unsigned char *d = dst->samples;
+	int n = src->w * src->h;
+	while (n--)
+	{
+		unsigned char c = 255 - s[2];
+		unsigned char m = 255 - s[1];
+		unsigned char y = 255 - s[0];
+		unsigned char k = MIN(c, MIN(m, y));
+		d[0] = c - k;
+		d[1] = m - k;
+		d[2] = y - k;
+		d[3] = k;
+		d[4] = s[3];
 		s += 4;
 		d += 5;
 	}
@@ -97,11 +132,11 @@ static void fastcmyktogray(fz_pixmap *src, fz_pixmap *dst)
 	int n = src->w * src->h;
 	while (n--)
 	{
-		unsigned char c = fz_mul255(s[1], 77);
-		unsigned char m = fz_mul255(s[2], 150);
-		unsigned char y = fz_mul255(s[3], 28);
-		d[0] = s[0];
-		d[1] = 255 - MIN(c + m + y + s[4], 255);
+		unsigned char c = fz_mul255(s[0], 77);
+		unsigned char m = fz_mul255(s[1], 150);
+		unsigned char y = fz_mul255(s[2], 28);
+		d[0] = 255 - MIN(c + m + y + s[3], 255);
+		d[1] = s[4];
 		s += 5;
 		d += 2;
 	}
@@ -114,15 +149,46 @@ static void fastcmyktorgb(fz_pixmap *src, fz_pixmap *dst)
 	int n = src->w * src->h;
 	while (n--)
 	{
-		d[0] = s[0];
-		d[1] = 255 - MIN(s[1] + s[4], 255);
-		d[2] = 255 - MIN(s[2] + s[4], 255);
-		d[3] = 255 - MIN(s[3] + s[4], 255);
+		d[0] = 255 - MIN(s[0] + s[3], 255);
+		d[1] = 255 - MIN(s[1] + s[3], 255);
+		d[2] = 255 - MIN(s[2] + s[3], 255);
+		d[3] = s[4];
 		s += 5;
 		d += 4;
 	}
 }
 
+/* Convert CMYK+alpha (5 bytes/pixel) to BGR+alpha (4 bytes/pixel). */
+static void fastcmyktobgr(fz_pixmap *src, fz_pixmap *dst)
+{
+	unsigned char *in = src->samples;
+	unsigned char *out = dst->samples;
+	int left = src->w * src->h;
+	for (; left--; in += 5, out += 4)
+	{
+		/* each channel is 255 - min(ink + black, 255); alpha is copied */
+		out[0] = 255 - MIN(in[2] + in[3], 255);
+		out[1] = 255 - MIN(in[1] + in[3], 255);
+		out[2] = 255 - MIN(in[0] + in[3], 255);
+		out[3] = in[4];
+	}
+}
+
+static void fastrgbtobgr(fz_pixmap *src, fz_pixmap *dst)
+{
+	unsigned char *s = src->samples;
+	unsigned char *d = dst->samples;
+	int n = src->w * src->h;
+	/* 4 bytes per pixel: swap R and B, copy G and the trailing alpha. */
+	for (; n--; s += 4, d += 4)
+	{
+		d[0] = s[2];
+		d[1] = s[1];
+		d[2] = s[0];
+		d[3] = s[3];
+	}
+}
+
 void pdf_convpixmap(fz_colorspace *ss, fz_pixmap *sp, fz_colorspace *ds, fz_pixmap *dp)
 {
 	pdf_logimage("convert pixmap from %s to %s\n", ss->name, ds->name);
@@ -130,6 +196,7 @@ void pdf_convpixmap(fz_colorspace *ss, fz_pixmap *sp, fz_colorspace *ds, fz_pixm
 	if (ss == pdf_devicegray)
 	{
 		if (ds == pdf_devicergb) fastgraytorgb(sp, dp);
+		else if (ds == pdf_devicebgr) fastgraytorgb(sp, dp); /* bgr == rgb here */
 		else if (ds == pdf_devicecmyk) fastgraytocmyk(sp, dp);
 		else fz_stdconvpixmap(ss, sp, ds, dp);
 	}
@@ -137,14 +204,25 @@ void pdf_convpixmap(fz_colorspace *ss, fz_pixmap *sp, fz_colorspace *ds, fz_pixm
 	else if (ss == pdf_devicergb)
 	{
 		if (ds == pdf_devicegray) fastrgbtogray(sp, dp);
+		else if (ds == pdf_devicebgr) fastrgbtobgr(sp, dp);
 		else if (ds == pdf_devicecmyk) fastrgbtocmyk(sp, dp);
 		else fz_stdconvpixmap(ss, sp, ds, dp);
 
 	}
 
+	else if (ss == pdf_devicebgr)
+	{
+		if (ds == pdf_devicegray) fastbgrtogray(sp, dp);
+		else if (ds == pdf_devicergb) fastrgbtobgr(sp, dp); /* the channel swap is its own inverse, so it also maps bgr to rgb */
+		else if (ds == pdf_devicecmyk) fastbgrtocmyk(sp, dp);
+		else fz_stdconvpixmap(ss, sp, ds, dp);
+
+	}
+
 	else if (ss == pdf_devicecmyk)
 	{
 		if (ds == pdf_devicegray) fastcmyktogray(sp, dp);
+		else if (ds == pdf_devicebgr) fastcmyktobgr(sp, dp);
 		else if (ds == pdf_devicergb) fastcmyktorgb(sp, dp);
 		else fz_stdconvpixmap(ss, sp, ds, dp);
 	}
@@ -157,7 +235,7 @@ void pdf_convcolor(fz_colorspace *ss, float *sv, fz_colorspace *ds, float *dv)
 
 	if (ss == pdf_devicegray)
 	{
-		if (ds == pdf_devicergb)
+		if ((ds == pdf_devicergb) || (ds == pdf_devicebgr))
 		{
 			dv[0] = sv[0];
 			dv[1] = sv[0];
@@ -180,11 +258,44 @@ void pdf_convcolor(fz_colorspace *ss, float *sv, fz_colorspace *ds, float *dv)
 		{
 			dv[0] = sv[0] * 0.3f + sv[1] * 0.59f + sv[2] * 0.11f;
 		}
+		else if (ds == pdf_devicebgr)
+		{
+			dv[0] = sv[2];
+			dv[1] = sv[1];
+			dv[2] = sv[0];
+		}
+		else if (ds == pdf_devicecmyk)
+		{
+			float c = 1 - sv[0];
+			float m = 1 - sv[1];
+			float y = 1 - sv[2];
+			float k = MIN(c, MIN(m, y));
+			dv[0] = c - k;
+			dv[1] = m - k;
+			dv[2] = y - k;
+			dv[3] = k;
+		}
+		else
+			fz_stdconvcolor(ss, sv, ds, dv);
+	}
+
+	else if (ss == pdf_devicebgr)
+	{
+		if (ds == pdf_devicegray)
+		{
+			dv[0] = sv[0] * 0.11f + sv[1] * 0.59f + sv[2] * 0.3f; /* rgb weights applied in B, G, R order */
+		}
+		else if (ds == pdf_devicergb) /* bgr -> rgb: reverse the component order */
+		{
+			dv[0] = sv[2];
+			dv[1] = sv[1];
+			dv[2] = sv[0];
+		}
 		else if (ds == pdf_devicecmyk)
 		{
-			float c = 1 - sv[1];
-			float m = 1 - sv[2];
-			float y = 1 - sv[3];
+			float c = 1 - sv[2];
+			float m = 1 - sv[1];
+			float y = 1 - sv[0];
 			float k = MIN(c, MIN(m, y));
 			dv[0] = c - k;
 			dv[1] = m - k;
@@ -210,6 +321,12 @@ void pdf_convcolor(fz_colorspace *ss, float *sv, fz_colorspace *ds, float *dv)
 			dv[1] = 1 - MIN(sv[1] + sv[3], 1);
 			dv[2] = 1 - MIN(sv[2] + sv[3], 1);
 		}
+		else if (ds == pdf_devicebgr)
+		{
+			dv[0] = 1 - MIN(sv[2] + sv[3], 1);
+			dv[1] = 1 - MIN(sv[1] + sv[3], 1);
+			dv[2] = 1 - MIN(sv[0] + sv[3], 1);
+		}
 		else
 			fz_stdconvcolor(ss, sv, ds, dv);
 	}
@@ -312,6 +429,28 @@ static void xyztodevicecmyk(fz_colorspace *cs, float *xyz, float *cmyk)
 }
 
 /*
+ * DeviceBGR piggybacks on DeviceRGB
+ */
+
+static void bgrtoxyz(fz_colorspace *cs, float *bgr, float *xyz)
+{
+	float rgb[3]; /* the same colour with components reordered as R, G, B */
+	int i;
+	for (i = 0; i < 3; i++)
+		rgb[i] = bgr[2 - i];
+	rgbtoxyz(pdf_devicergb, rgb, xyz); /* cs is ignored; DeviceRGB does the work */
+}
+
+static void xyztobgr(fz_colorspace *cs, float *xyz, float *bgr)
+{
+	float rgb[3]; /* filled by xyztorgb in R, G, B order */
+	int i;
+	xyztorgb(pdf_devicergb, xyz, rgb); /* cs is ignored; DeviceRGB does the work */
+	for (i = 0; i < 3; i++)
+		bgr[i] = rgb[2 - i];
+}
+
+/*
  * CIE Lab
  */
 
@@ -394,6 +533,16 @@ static struct calrgb kdevicergb =
 	{ 1,0,0, 0,1,0, 0,0,1 },
 };
 
+static struct calrgb kdevicebgr =
+{
+	{ -1, "DeviceBGR", 3, pdf_convpixmap, pdf_convcolor, bgrtoxyz, xyztobgr, nil }, /* embedded fz_colorspace, published as pdf_devicebgr via .super */
+	{ 1, 1, 1 },
+	{ 0, 0, 0 },
+	{ 1, 1, 1 },
+	{ 1,0,0, 0,1,0, 0,0,1 },
+	{ 1,0,0, 0,1,0, 0,0,1 }, /* NOTE(review): calibration rows presumably mirror kdevicergb — confirm */
+};
+
 static fz_colorspace kdevicecmyk =
 {
 	-1, "DeviceCMYK", 4, pdf_convpixmap, pdf_convcolor, devicecmyktoxyz, xyztodevicecmyk, nil
@@ -414,6 +563,7 @@ static fz_colorspace kdevicepattern =
 
 fz_colorspace *pdf_devicegray = &kdevicegray.super;
 fz_colorspace *pdf_devicergb = &kdevicergb.super;
+fz_colorspace *pdf_devicebgr = &kdevicebgr.super;
 fz_colorspace *pdf_devicecmyk = &kdevicecmyk;
 fz_colorspace *pdf_devicelab = &kdevicelab.super;
 fz_colorspace *pdf_devicepattern = &kdevicepattern;
diff --git a/mupdf/pdf_image.c b/mupdf/pdf_image.c
index 554dee3f..332c8d3a 100644
--- a/mupdf/pdf_image.c
+++ b/mupdf/pdf_image.c
@@ -540,13 +540,13 @@ pdf_loadtile(pdf_image *src, fz_pixmap *tile)
 			int sn = src->indexed->base->n;
 			for (x = 0; x < tile->w; x++)
 			{
-				dst[x * dn] = 255; /* alpha */
 				i = st[x] / bpcfact;
 				i = CLAMP(i, 0, high);
 				for (k = 0; k < sn; k++)
 				{
-					dst[x * dn + k + 1] = index[i * sn + k];
+					dst[x * dn + k] = index[i * sn + k];
 				}
+				dst[x * dn + sn] = 255; /* alpha */
 			}
 		}