7 files changed, 360 insertions, 349 deletions
diff --git a/draw/archarm.c b/draw/archarm.c
index c532a195..2904ce00 100644
--- a/draw/archarm.c
+++ b/draw/archarm.c
@@ -14,24 +14,25 @@ extern void fz_srow4_arm(byte *src, byte *dst, int w, int denom);
 extern void fz_scol4_arm(byte *src, byte *dst, int w, int denom);
 
 static void
-path_w4i1o4_arm(byte * restrict argb, byte * restrict src, byte cov, int len, byte * restrict dst)
+path_w4i1o4_arm(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst)
 {
 	/* The ARM code here is a hand coded implementation
 	 * of the optimized C version. */
+
 	if (len <= 0)
 		return;
 	asm volatile(
-	"ldr	%0, [%0]		@ %0 = argb			\n"
+	"ldr	%0, [%0]		@ %0 = rgba			\n"
 	"mov	r11,#0							\n"
 	"mov	r8, #0xFF00						\n"
-	"and	r14,%0,#255		@ r14= alpha			\n"
-	"orr	%0, %0, #255		@ %0 = argb |= 255		\n"
+	"mov	r14,%0,lsr #24		@ r14= alpha			\n"
+	"orr	%0, %0, #0xFF000000	@ %0 = rgba |= 0xFF000000	\n"
 	"orr	r8, r8, r8, LSL #16	@ r8 = 0xFF00FF00		\n"
 	"adds	r14,r14,r14,LSR #7	@ r14 = alpha += alpha>>7	\n"
 	"beq	9f			@ if (alpha == 0) bale		\n"
-	"and	r6, %0, r8		@ r6 = rb<<8			\n"
-	"bic	%0, %0, r8		@ %0 = ag			\n"
-	"mov	r6, r6, LSR #8		@ r6 = rb			\n"
+	"and	r6, %0, r8		@ r6 = ga<<8			\n"
+	"bic	%0, %0, r8		@ %0 = rb			\n"
+	"mov	r6, r6, LSR #8		@ r6 = ga			\n"
 	"cmp	r14,#256		@ if (alpha == 256)		\n"
 	"beq	4f			@     no-alpha loop		\n"
 	"B	2f			@ enter the loop		\n"
@@ -40,37 +41,37 @@ path_w4i1o4_arm(byte * restrict argb, byte * restrict src, byte cov, int len, by
 	"ble	9f							\n"
 	"2:								\n"
 	"ldrb	r12,[%1]		@ r12= *src			\n"
-	"ldr	r9, [%4], #4		@ r9 = dag = *dst32++		\n"
+	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
 	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
 	"add	%2, r12, %2		@ %2 = cov += r12		\n"
 	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
 	"beq	1b			@ if coverage == 0 loop back	\n"
 	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
 	"mul	r10,r14,r10		@ r10= ca *= alpha		\n"
-	"and	r7, r8, r9		@ r7 = drb =  dag     & MASK	\n"
+	"and	r7, r8, r9		@ r7 = dga =  drb     & MASK	\n"
 	"mov	r10,r10,LSR #8		@ r10= ca >>= 8			\n"
-	"and	r9, r8, r9, LSL #8	@ r9 = dag = (dag<<8) & MASK	\n"
-	"sub	r12,r6, r7, LSR #8	@ r12= crb = rb - (drb>>8)	\n"
-	"sub	r5, %0, r9, LSR #8	@ r5 = cag = ag - (dag>>8)	\n"
-	"mla	r7, r12,r10,r7		@ r7 = drb += crb * ca		\n"
+	"and	r9, r8, r9, LSL #8	@ r9 = drb = (drb<<8) & MASK	\n"
+	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
+	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
+	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
 	"subs	%3, %3, #1		@ len--				\n"
-	"mla	r9, r5, r10,r9		@ r9 = dag += cag * ca		\n"
-	"and	r7, r8, r7		@ r7 = drb &= MASK		\n"
-	"and	r9, r8, r9		@ r9 = dag &= MASK		\n"
-	"orr	r9, r7, r9, LSR #8	@ r9 = dag = drb | (dag>>8)	\n"
+	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
+	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
+	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
+	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
 	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
 	"bgt	2b							\n"
 	"b	9f							\n"
 	"@ --- Solid alpha loop	---------------------------------------	\n"
 	"3:	@ Loop used when coverage == 256			\n"
-	"orr	r9, %0, r6, LSL #8	@ r9 = argb			\n"
+	"orr	r9, %0, r6, LSL #8	@ r9 = rgba			\n"
 	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
 	"4:	@ Loop used for when coverage*alpha == 0		\n"
 	"subs	%3, %3, #1		@ len--				\n"
 	"ble	9f							\n"
 	"5:								\n"
 	"ldrb	r12,[%1]		@ r12= *src			\n"
-	"ldr	r9, [%4], #4		@ r9 = dag = *dst32++		\n"
+	"ldr	r9, [%4], #4		@ r9 = drb = *dst32++		\n"
 	"strb	r11,[%1], #1		@ r11= *src++ = 0		\n"
 	"add	%2, r12, %2		@ %2 = cov += r12		\n"
 	"ands	%2, %2, #255		@ %2 = cov &= 255		\n"
@@ -78,21 +79,21 @@ path_w4i1o4_arm(byte * restrict argb, byte * restrict src, byte cov, int len, by
 	"cmp	%2, #255		@ if coverage == solid		\n"
 	"beq	3b			@    loop back			\n"
 	"add	r10,%2, %2, LSR #7	@ r10= ca = cov+(cov>>7)	\n"
-	"and	r7, r8, r9		@ r7 = drb =  dag     & MASK	\n"
-	"and	r9, r8, r9, LSL #8	@ r9 = dag = (dag<<8) & MASK	\n"
-	"sub	r12,r6, r7, LSR #8	@ r12= crb = rb - (drb>>8)	\n"
-	"sub	r5, %0, r9, LSR #8	@ r5 = cag = ag - (dag>>8)	\n"
-	"mla	r7, r12,r10,r7		@ r7 = drb += crb * ca		\n"
+	"and	r7, r8, r9		@ r7 = dga =  drb     & MASK	\n"
+	"and	r9, r8, r9, LSL #8	@ r9 = dga = (drb<<8) & MASK	\n"
+	"sub	r12,r6, r7, LSR #8	@ r12= cga = ga - (dga>>8)	\n"
+	"sub	r5, %0, r9, LSR #8	@ r5 = crb = rb - (drb>>8)	\n"
+	"mla	r7, r12,r10,r7		@ r7 = dga += cga * ca		\n"
 	"subs	%3, %3, #1		@ len--				\n"
-	"mla	r9, r5, r10,r9		@ r9 = dag += cag * ca		\n"
-	"and	r7, r8, r7		@ r7 = drb &= MASK		\n"
-	"and	r9, r8, r9		@ r9 = dag &= MASK		\n"
-	"orr	r9, r7, r9, LSR #8	@ r9 = dag = drb | (dag>>8)	\n"
+	"mla	r9, r5, r10,r9		@ r9 = drb += crb * ca		\n"
+	"and	r7, r8, r7		@ r7 = dga &= MASK		\n"
+	"and	r9, r8, r9		@ r9 = drb &= MASK		\n"
+	"orr	r9, r7, r9, LSR #8	@ r9 = drb = dga | (drb>>8)	\n"
 	"str	r9, [%4, #-4]		@ dst32[-1] = r9		\n"
 	"bgt	5b							\n"
 	"9:				@ End				\n"
 	:
-	"+r" (argb),
+	"+r" (rgba),
 	"+r" (src),
 	"+r" (cov),
 	"+r" (len),
@@ -129,8 +130,8 @@ static void loadtile8_arm(byte * restrict src, int sw, byte * restrict dst, int
 			"2:						\n"
 			"LDRB	r4, [%[src]], #1	@ r4 = *src++	\n"
 			"SUBS	r5, r5, #1				\n"
-			"STRB	r11,[%[dst]], #1	@ *dst++ = 255	\n"
 			"STRB	r4, [%[dst]], #1	@ *dst++ = r4	\n"
+			"STRB	r11,[%[dst]], #1	@ *dst++ = 255	\n"
 			"BGT	2b					\n"
 			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
 			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
@@ -161,10 +162,10 @@ static void loadtile8_arm(byte * restrict src, int sw, byte * restrict dst, int
 			"LDRB	r6, [%[src]], #1	@ r6 = *src++	\n"
 			"LDRB	r7, [%[src]], #1	@ r7 = *src++	\n"
 			"SUBS	r5, r5, #3				\n"
-			"STRB	r11,[r8], #1		@ *dp++ = 255	\n"
 			"STRB	r4, [r8], #1		@ *dp++ = r4	\n"
 			"STRB	r6, [r8], #1		@ *dp++ = r6	\n"
 			"STRB	r7, [r8], #1		@ *dp++ = r7	\n"
+			"STRB	r11,[r8], #1		@ *dp++ = 255	\n"
 			"BGT	2b					\n"
 			"ADD	%[src],%[src],%[sw]	@ src += sw	\n"
 			"ADD	%[dst],%[dst],%[dw]	@ dst += dw	\n"
diff --git a/draw/archport.c b/draw/archport.c
index f4fea5bc..337ad3c1 100644
--- a/draw/archport.c
+++ b/draw/archport.c
@@ -7,15 +7,15 @@ typedef unsigned char byte;
 #define MASK 0xFF00FF00;
 
 static void
-path_w4i1o4_32bit(byte * restrict argb,
+path_w4i1o4_32bit(byte * restrict rgba,
 	byte * restrict src, byte cov, int len,
 	byte * restrict dst)
 {
 	/* COLOR * coverage + DST * (256-coverage) = (COLOR - DST)*coverage + DST*256 */
 	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	int alpha = argb[0];
-	unsigned int rb = argb[1] | (argb[3] << 16);
-	unsigned int ag = 255 | (argb[2] << 16);
+	int alpha = rgba[3];
+	unsigned int rb = rgba[0] | (rgba[2] << 16);
+	unsigned int ga = rgba[1] | 0xFF0000;
 
 	if (alpha == 0)
 		return;
@@ -25,23 +25,23 @@ path_w4i1o4_32bit(byte * restrict argb,
 		alpha += alpha>>7; /* alpha is now in the 0...256 range */
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			cov += *src; *src++ = 0;
 			ca = cov + (cov>>7); /* ca is in 0...256 range */
 			ca = (ca*alpha)>>8; /* ca is is in 0...256 range */
-			dag = *dst32++;
+			drb = *dst32++;
 			if (ca != 0)
 			{
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 		}
 	}
@@ -49,133 +49,42 @@ path_w4i1o4_32bit(byte * restrict argb,
 	{
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			cov += *src; *src++ = 0;
 			ca = cov + (cov>>7); /* ca is in 0...256 range */
-			dag = *dst32++;
+			drb = *dst32++;
 			if (ca == 0)
 				continue;
 			if (ca == 255)
 			{
-				dag = (rb<<8) | ag;
+				drb = (ga<<8) | rb;
 			}
 			else
 			{
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
+				drb = dga |(drb>>8);
 			}
-			dst32[-1] = dag;
+			dst32[-1] = drb;
 		}
 	}
 }
 
 static void
-duff_4o4_32bit(byte * restrict sp, int sw, byte * restrict dp, int dw, int w0, int h)
-{
-	unsigned int *sp32 = (unsigned int *)(void *)sp;
-	unsigned int *dp32 = (unsigned int *)(void *)dp;
-
-	/* duff_non(sp0, sw, 4, dp0, dw, w0, h); */
-
-	sw = (sw>>2)-w0;
-	dw = (dw>>2)-w0;
-	while (h--)
-	{
-		int w = w0;
-		while (w--)
-		{
-			unsigned int sag = *sp32++;
-			unsigned int dag = *dp32++;
-			unsigned int srb, drb;
-			int alpha = sag & 255;
-			if (alpha == 0)
-				continue;
-			alpha += alpha>>7;
-			sag |= 0xFF;
-			drb = dag & MASK;
-			dag = (dag<<8) & MASK;
-			srb = (sag>>8) & ~MASK;
-			sag = sag & ~MASK;
-			srb -= (drb>>8);
-			sag -= (dag>>8);
-			drb += srb * alpha;
-			dag += sag * alpha;
-			drb &= MASK;
-			dag &= MASK;
-			dag = drb | (dag>>8);
-			dp32[-1] = dag;
-		}
-		sp32 += sw;
-		dp32 += dw;
-	}
-}
-
-static void
-duff_4i1o4_32bit(byte * restrict sp, int sw,
-	byte * restrict mp, int mw,
-	byte * restrict dp, int dw, int w0, int h)
-{
-	unsigned int *sp32 = (unsigned int *)(void *)sp;
-	unsigned int *dp32 = (unsigned int *)(void *)dp;
-
-	/* duff_nimon(sp, sw, 4, mp, mw, 1, dp, dw, w0, h); */
-
-	sw = (sw>>2)-w0;
-	dw = (dw>>2)-w0;
-	mw -= w0;
-	while (h--)
-	{
-		int w = w0;
-		while (w--)
-		{
-			unsigned int sag = *sp32++;
-			unsigned int dag = *dp32++;
-			unsigned int srb, drb, alpha, ma;
-			alpha = sag & 255;
-			ma = *mp++;
-			if (alpha == 0)
-				continue;
-			ma += ma>>7;
-			if (ma == 0)
-				continue;
-			alpha += alpha>>7;
-			alpha = (alpha*ma)>>8;
-			sag |= 0xFF;
-			drb = dag & MASK;
-			dag = (dag<<8) & MASK;
-			srb = (sag>>8) & ~MASK;
-			sag = sag & ~MASK;
-			srb -= (drb>>8);
-			sag -= (dag>>8);
-			drb += srb * alpha;
-			dag += sag * alpha;
-			drb &= MASK;
-			dag &= MASK;
-			dag = drb | (dag>>8);
-			dp32[-1] = dag;
-		}
-		sp32 += sw;
-		mp += mw;
-		dp32 += dw;
-	}
-}
-
-static void
-text_w4i1o4_32bit(byte * restrict argb,
+text_w4i1o4_32bit(byte * restrict rgba,
 	byte * restrict src, int srcw,
 	byte * restrict dst, int dstw, int w0, int h)
 {
 	unsigned int *dst32 = (unsigned int *)(void *)dst;
-	unsigned int alpha = argb[0];
-	unsigned int rb = argb[1] | (argb[3] << 16);
-	unsigned int ag = 255 | (argb[2] << 16);
+	unsigned int alpha = rgba[3];
+	unsigned int rb = rgba[0] | (rgba[2] << 16); /* red in bits 0-7, blue in bits 16-23 */
+	unsigned int ga = rgba[1] | 0xFF0000; /* green in bits 0-7, alpha forced to 255 in bits 16-23 */
 
 	if (alpha == 0)
 		return;
@@ -185,29 +94,29 @@ text_w4i1o4_32bit(byte * restrict argb,
 
 	if (alpha != 255)
 	{
-		alpha += alpha>>7;
+		alpha += alpha>>7; /* alpha is now in the 0...256 range */
 		while (h--)
 		{
 			int w = w0;
 			while (w--)
 			{
-				unsigned int ca, drb, dag, crb, cag;
+				unsigned int ca, drb, dga, crb, cga;
 				ca = *src++;
-				dag = *dst32++;
+				drb = *dst32++; /* whole destination pixel; split into dga/drb below */
 				ca += ca>>7;
 				ca = (ca*alpha)>>8;
 				if (ca == 0)
 					continue;
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 			src += srcw;
 			dst32 += dstw;
@@ -215,28 +124,27 @@ text_w4i1o4_32bit(byte * restrict argb,
 	}
 	else
 	{
-		alpha += alpha>>7;
 		while (h--)
 		{
 			int w = w0;
 			while (w--)
 			{
-				unsigned int ca, drb, dag, crb, cag;
+				unsigned int ca, drb, dga, crb, cga;
 				ca = *src++;
-				dag = *dst32++;
+				drb = *dst32++;
 				ca += ca>>7;
 				if (ca == 0)
 					continue;
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 			src += srcw;
 			dst32 += dstw;
@@ -313,9 +221,9 @@ img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
 			a = (((c >>8)-(a >>8)) * vd + a ) & MASK;
 			a1 = (((c1>>8)-(a1>>8)) * vd + a1) & MASK;
 		}
-		sa = (a>>8) & 0xFF;
+		sa = (a1>>24);
 		sa = FZ_COMBINE(FZ_EXPAND(sa), FZ_EXPAND(cov));
-		a |= 0xFF00;
+		a1 |= 0xFF000000;
 		d = *dst32++;
 		d1 = d & MASK;
 		d = (d<<8) & MASK;
@@ -328,15 +236,15 @@ img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
 }
 
 static void
-img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
+img_w4i1o4_32bit(byte *rgba, byte * restrict src, byte cov, int len,
 	byte * restrict dst, fz_pixmap *image, int u, int v, int fa, int fb)
 {
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h-1;
-	int alpha = FZ_EXPAND(argb[0]);
-	unsigned int rb = argb[1] | (argb[3] << 16);
-	unsigned int ag = 255 | (argb[2] << 16);
+	int alpha = FZ_EXPAND(rgba[3]);
+	unsigned int rb = rgba[0] | (rgba[2] << 16);
+	unsigned int ga = rgba[1] | 0xFF0000;
 	unsigned int *dst32 = (unsigned int *)(void *)dst;
 
 	if (alpha == 0)
@@ -345,10 +253,10 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 	{
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			unsigned int a, b;
 			cov += *src; *src = 0; src++;
-			dag = *dst32++;
+			drb = *dst32++;
 			ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
 			if (ca != 0)
 			{
@@ -396,16 +304,16 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 			}
 			if (ca != 0)
 			{
-				drb = dag & MASK;
-				dag = (dag<<8) & MASK;
+				dga = drb & MASK;
+				drb = (drb<<8) & MASK;
+				cga = ga - (dga>>8);
 				crb = rb - (drb>>8);
-				cag = ag - (dag>>8);
+				dga += cga * ca;
 				drb += crb * ca;
-				dag += cag * ca;
+				dga &= MASK;
 				drb &= MASK;
-				dag &= MASK;
-				dag = drb | (dag>>8);
-				dst32[-1] = dag;
+				drb = dga | (drb>>8);
+				dst32[-1] = drb;
 			}
 			u += fa;
 			v += fb;
@@ -415,10 +323,10 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 	{
 		while (len--)
 		{
-			unsigned int ca, drb, dag, crb, cag;
+			unsigned int ca, drb, dga, crb, cga;
 			unsigned int a, b;
 			cov += *src; *src = 0; src++;
-			dag = *dst32++;
+			drb = *dst32++;
 			if (cov != 0)
 			{
 				int ui, ui1, vi, vi1, ud, vd;
@@ -466,21 +374,21 @@ img_w4i1o4_32bit(byte *argb, byte * restrict src, byte cov, int len,
 				{
 					if (ca == 256)
 					{
-						dag = (rb<<8) | ag;
+						drb = (ga<<8) | rb;
 					}
 					else
 					{
-						drb = dag & MASK;
-						dag = (dag<<8) & MASK;
+						dga = drb & MASK;
+						drb = (drb<<8) & MASK;
+						cga = ga - (dga>>8);
 						crb = rb - (drb>>8);
-						cag = ag - (dag>>8);
+						dga += cga * ca;
 						drb += crb * ca;
-						dag += cag * ca;
+						dga &= MASK;
 						drb &= MASK;
-						dag &= MASK;
-						dag = drb | (dag>>8);
+						drb = dga | (drb>>8);
 					}
-					dst32[-1] = dag;
+					dst32[-1] = drb;
 				}
 			}
 			u += fa;
@@ -560,10 +468,8 @@ img_1o1_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
 
 void fz_accelerate(void)
 {
-	if (sizeof(int) == 4 && sizeof(unsigned int) == 4)
+	if (sizeof(int) == 4 && sizeof(unsigned int) == 4 && !fz_isbigendian())
 	{
-		fz_duff_4o4 = duff_4o4_32bit;
-		fz_duff_4i1o4 = duff_4i1o4_32bit;
 		fz_path_w4i1o4 = path_w4i1o4_32bit;
 		fz_text_w4i1o4 = text_w4i1o4_32bit;
 		fz_img_4o4 = img_4o4_32bit;
diff --git a/draw/archx86.c b/draw/archx86.c
index 0c313f33..5418e9f7 100644
--- a/draw/archx86.c
+++ b/draw/archx86.c
@@ -40,7 +40,7 @@ static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int d
 		{
 			int ts = *s++;
 			int ma = *mp++ + 1;
-			int sa = ((ts & 0xff) * ma) >> 8;
+			int sa = (((ts>>24) & 0xff) * ma) >> 8;
 			int ssa = 255 - sa;
 
 			__m64 d0 = _mm_cvtsi32_si64(*d);
@@ -50,11 +50,11 @@ static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int d
 			__m64 mma = _mm_set1_pi16(ma);
 			__m64 mssa = _mm_set1_pi16(ssa);
 
-			/* unpack 0000argb => a0r0g0b0 */
+			/* unpack 0000rgba => r0g0b0a0 */
 			__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
 			__m64 s1 = _mm_unpacklo_pi8(s0, mzero);
 
-			/* s1 * ma => a0r0g0b0 */
+			/* s1 * ma => r0g0b0a0 */
 			__m64 msma = _mm_mullo_pi16(s1, mma);
 			/* d1 * mssa */
 			__m64 mdssa = _mm_mullo_pi16(d1, mssa);
@@ -79,6 +79,8 @@ static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int d
 
 #if 0 /* TODO */
 
+/* Needs to be rgba, not bgra, as well as needing finishing */
+
 static inline unsigned
 getargb(unsigned *s, int w, int h, int u, int v)
 {
diff --git a/draw/imagedraw.c b/draw/imagedraw.c
index 81d2bb05..dd887f53 100644
--- a/draw/imagedraw.c
+++ b/draw/imagedraw.c
@@ -13,7 +13,7 @@ getmask(byte *s, int w, int h, int u, int v)
 }
 
 static inline byte *
-getargb(byte *s, int w, int h, int u, int v)
+getrgba(byte *s, int w, int h, int u, int v)
 {
 	if (u < 0) u = 0;
 	if (v < 0) v = 0;
@@ -23,7 +23,7 @@ getargb(byte *s, int w, int h, int u, int v)
 }
 
 static inline byte *
-getag(byte *s, int w, int h, int u, int v)
+getga(byte *s, int w, int h, int u, int v)
 {
 	if (u < 0) u = 0;
 	if (v < 0) v = 0;
@@ -49,14 +49,14 @@ lerp(int a, int b, int t)
 }
 
 static inline void
-lerpag(byte *dst, byte *a, byte *b, int t)
+lerpga(byte *dst, byte *a, byte *b, int t)
 {
 	dst[0] = lerp(a[0], b[0], t);
 	dst[1] = lerp(a[1], b[1], t);
 }
 
 static inline void
-lerpargb(byte *dst, byte *a, byte *b, int t)
+lerprgba(byte *dst, byte *a, byte *b, int t)
 {
 	dst[0] = lerp(a[0], b[0], t);
 	dst[1] = lerp(a[1], b[1], t);
@@ -81,7 +81,7 @@ samplemask(byte *s, int w, int h, int u, int v)
 }
 
 static inline void
-sampleag(byte *s, int w, int h, int u, int v, byte *out)
+samplega(byte *s, int w, int h, int u, int v, byte *out)
 {
 	byte ab[4];
 	byte cd[4];
@@ -89,17 +89,17 @@ sampleag(byte *s, int w, int h, int u, int v, byte *out)
 	int vi = v >> 16;
 	int ud = u & 0xFFFF;
 	int vd = v & 0xFFFF;
-	byte *a = getag(s, w, h, ui, vi);
-	byte *b = getag(s, w, h, ui+1, vi);
-	byte *c = getag(s, w, h, ui, vi+1);
-	byte *d = getag(s, w, h, ui+1, vi+1);
-	lerpag(ab, a, b, ud);
-	lerpag(cd, c, d, ud);
-	lerpag(out, ab, cd, vd);
+	byte *a = getga(s, w, h, ui, vi);
+	byte *b = getga(s, w, h, ui+1, vi);
+	byte *c = getga(s, w, h, ui, vi+1);
+	byte *d = getga(s, w, h, ui+1, vi+1);
+	lerpga(ab, a, b, ud);
+	lerpga(cd, c, d, ud);
+	lerpga(out, ab, cd, vd);
 }
 
 static inline void
-sampleargb(byte *s, int w, int h, int u, int v, byte *out)
+samplergba(byte *s, int w, int h, int u, int v, byte *out)
 {
 	byte ab[4];
 	byte cd[4];
@@ -107,13 +107,13 @@ sampleargb(byte *s, int w, int h, int u, int v, byte *out)
 	int vi = v >> 16;
 	int ud = u & 0xFFFF;
 	int vd = v & 0xFFFF;
-	byte *a = getargb(s, w, h, ui, vi);
-	byte *b = getargb(s, w, h, ui+1, vi);
-	byte *c = getargb(s, w, h, ui, vi+1);
-	byte *d = getargb(s, w, h, ui+1, vi+1);
-	lerpargb(ab, a, b, ud);
-	lerpargb(cd, c, d, ud);
-	lerpargb(out, ab, cd, vd);
+	byte *a = getrgba(s, w, h, ui, vi);
+	byte *b = getrgba(s, w, h, ui+1, vi);
+	byte *c = getrgba(s, w, h, ui, vi+1);
+	byte *d = getrgba(s, w, h, ui+1, vi+1);
+	lerprgba(ab, a, b, ud);
+	lerprgba(cd, c, d, ud);
+	lerprgba(out, ab, cd, vd);
 }
 
 static inline void
@@ -170,7 +170,7 @@ img_2o2(byte * restrict src, byte cov, int len, byte * restrict dst,
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	byte ag[2];
+	byte ga[2];
 
 	while (len--)
 	{
@@ -178,12 +178,12 @@ img_2o2(byte * restrict src, byte cov, int len, byte * restrict dst,
 		cov += *src; *src = 0; src++;
 		if (cov != 0)
 		{
-			sampleag(samples, w, h, u, v, ag);
-			sa = FZ_COMBINE(FZ_EXPAND(ag[0]), FZ_EXPAND(cov));
+			samplega(samples, w, h, u, v, ga);
+			sa = FZ_COMBINE(FZ_EXPAND(ga[1]), FZ_EXPAND(cov));
 			if (sa != 0)
 			{
-				dst[0] = FZ_BLEND(255, dst[0], sa);
-				dst[1] = FZ_BLEND(ag[1], dst[1], sa);
+				dst[0] = FZ_BLEND(ga[0], dst[0], sa);
+				dst[1] = FZ_BLEND(255, dst[1], sa);
 			}
 		}
 		dst += 2;
@@ -199,7 +199,7 @@ img_4o4(byte * restrict src, byte cov, int len, byte * restrict dst,
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	byte argb[4];
+	byte rgba[4];
 
 	while (len--)
 	{
@@ -207,14 +207,14 @@ img_4o4(byte * restrict src, byte cov, int len, byte * restrict dst,
 		cov += *src; *src = 0; src++;
 		if (cov != 0)
 		{
-			sampleargb(samples, w, h, u, v, argb);
-			sa = FZ_COMBINE(FZ_EXPAND(argb[0]), FZ_EXPAND(cov));
+			samplergba(samples, w, h, u, v, rgba);
+			sa = FZ_COMBINE(FZ_EXPAND(rgba[3]), FZ_EXPAND(cov));
 			if (sa != 0)
 			{
-				dst[0] = FZ_BLEND(255, dst[0], sa);
-				dst[1] = FZ_BLEND(argb[1], dst[1], sa);
-				dst[2] = FZ_BLEND(argb[2], dst[2], sa);
-				dst[3] = FZ_BLEND(argb[3], dst[3], sa);
+				dst[0] = FZ_BLEND(rgba[0], dst[0], sa);
+				dst[1] = FZ_BLEND(rgba[1], dst[1], sa);
+				dst[2] = FZ_BLEND(rgba[2], dst[2], sa);
+				dst[3] = FZ_BLEND(255, dst[3], sa);
 			}
 		}
 		dst += 4;
@@ -224,18 +224,18 @@ img_4o4(byte * restrict src, byte cov, int len, byte * restrict dst,
 }
 
 static void
-img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst,
+img_w2i1o2(byte *ga, byte * restrict src, byte cov, int len, byte * restrict dst,
 	fz_pixmap *image, int u, int v, int fa, int fb)
 {
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	int alpha = FZ_EXPAND(ag[0]);
-	byte g = ag[1];
+	byte g = ga[0];
+	byte a = ga[1];
 
-	if (alpha == 0)
+	if (a == 0)
 		return;
-	if (alpha != 256)
+	if (a != 255)
 	{
 		while (len--)
 		{
@@ -244,12 +244,12 @@ img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
-				ca = FZ_COMBINE(ca, alpha);
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
+				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[0] = FZ_BLEND(g, dst[0], ca);
+					dst[1] = FZ_BLEND(255, dst[1], ca);
 				}
 			}
 			dst += 2;
@@ -266,11 +266,11 @@ img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[0] = FZ_BLEND(g, dst[0], ca);
+					dst[1] = FZ_BLEND(255, dst[1], ca);
 				}
 			}
 			dst += 2;
@@ -281,20 +281,20 @@ img_w2i1o2(byte *ag, byte * restrict src, byte cov, int len, byte * restrict dst
 }
 
 static void
-img_w4i1o4(byte *argb, byte * restrict src, byte cov, int len, byte * restrict dst,
+img_w4i1o4(byte *rgba, byte * restrict src, byte cov, int len, byte * restrict dst,
 	fz_pixmap *image, int u, int v, int fa, int fb)
 {
 	byte *samples = image->samples;
 	int w = image->w;
 	int h = image->h;
-	int alpha = FZ_EXPAND(argb[0]);
-	byte r = argb[1];
-	byte g = argb[2];
-	byte b = argb[3];
+	byte r = rgba[0];
+	byte g = rgba[1];
+	byte b = rgba[2];
+	byte a = rgba[3];
 
-	if (alpha == 0)
+	if (a == 0)
 		return;
-	if (alpha != 256)
+	if (a != 255)
 	{
 		while (len--)
 		{
@@ -303,14 +303,14 @@ img_w4i1o4(byte *argb, byte * restrict src, byte cov, int len, byte * restrict d
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
-				ca = FZ_COMBINE(ca, alpha);
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
+				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(r, dst[1], ca);
-					dst[2] = FZ_BLEND(g, dst[2], ca);
-					dst[3] = FZ_BLEND(b, dst[3], ca);
+					dst[0] = FZ_BLEND(r, dst[0], ca);
+					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[2] = FZ_BLEND(b, dst[2], ca);
+					dst[3] = FZ_BLEND(255, dst[3], ca);
 				}
 			}
 			dst += 4;
@@ -327,13 +327,13 @@ img_w4i1o4(byte *argb, byte * restrict src, byte cov, int len, byte * restrict d
 			if (cov != 0)
 			{
 				ca = samplemask(samples, w, h, u, v);
-				ca =FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(ca));
+				ca = FZ_COMBINE(FZ_EXPAND(cov), FZ_EXPAND(ca));
 				if (ca != 0)
 				{
-					dst[0] = FZ_BLEND(255, dst[0], ca);
-					dst[1] = FZ_BLEND(r, dst[1], ca);
-					dst[2] = FZ_BLEND(g, dst[2], ca);
-					dst[3] = FZ_BLEND(b, dst[3], ca);
+					dst[0] = FZ_BLEND(r, dst[0], ca);
+					dst[1] = FZ_BLEND(g, dst[1], ca);
+					dst[2] = FZ_BLEND(b, dst[2], ca);
+					dst[3] = FZ_BLEND(255, dst[3], ca);
 				}
 			}
 			dst += 4;
diff --git a/draw/imageunpack.c b/draw/imageunpack.c
index 6a1a1628..48c420bf 100644
--- a/draw/imageunpack.c
+++ b/draw/imageunpack.c
@@ -19,19 +19,22 @@ static void decodetile(fz_pixmap *pix, int skip, float *decode)
 	int justinvert = 1;
 	unsigned int mask;
 
-	min[0] = 0;
-	max[0] = 255;
-	sub[0] = 255;
-
-	for (i = skip; i < n; i++)
+	for (i = 0; i < n-skip; i++)
 	{
-		min[i] = decode[(i - skip) * 2] * 255;
-		max[i] = decode[(i - skip) * 2 + 1] * 255;
+		min[i] = decode[i * 2] * 255;
+		max[i] = decode[i * 2 + 1] * 255;
 		sub[i] = max[i] - min[i];
 		needed |= (min[i] != 0) | (max[i] != 255);
 		justinvert &= min[i] == 255 && max[i] == 0 && sub[i] == -255;
 	}
 
+	if (skip)
+	{
+		min[i] = 0;
+		max[i] = 255;
+		sub[i] = 255;
+	}
+
 	if (fz_isbigendian())
 		mask = 0x00ff00ff;
 	else
@@ -115,8 +118,8 @@ static void init1(void)
 		{
 			x = tbit(bits, k);
 			t1pad0[i][k] = x;
-			t1pad1[i][k * 2 + 0] = 255;
-			t1pad1[i][k * 2 + 1] = x;
+			t1pad1[i][k * 2 + 0] = x;
+			t1pad1[i][k * 2 + 1] = 255;
 		}
 	}
 
@@ -178,7 +181,7 @@ static void loadtile1(byte * restrict src, int sw, byte * restrict dst, int dw,
 			dp = dst;
 			for (x = 0; x < w; x++)
 			{
-				if ((x % pad) == 0)
+				*dp++ = tbit(src, x);
+				if ((x % pad) == pad-1)
 					*dp++ = 255;
-				*dp++ = tbit(src, x);
 			}
@@ -204,14 +207,14 @@ static void loadtile1(byte * restrict src, int sw, byte * restrict dst, int dw,
 		while (h--) \
 		{ \
 			byte *dp = dst; \
-			tpad = 0; \
+			tpad = pad; \
 			for (x = 0; x < w; x++) \
 			{ \
-				if (!tpad--) { \
-					tpad = pad-1; \
+				*dp++ = getf(src, x); \
+				if (--tpad == 0) { \
+					tpad = pad; \
 					*dp++ = 255; \
 				} \
-				*dp++ = getf(src, x); \
 			} \
 			src += sw; \
 			dst += dw; \
@@ -247,8 +250,8 @@ static void loadtile8(byte * restrict src, int sw, byte * restrict dst, int dw,
 			int x;
 			for (x = w; x > 0; x --)
 			{
-				*dst++ = 255;
 				*dst++ = *src++;
+				*dst++ = 255;
 			}
 			src += sw;
 			dst += dw;
@@ -263,10 +266,10 @@ static void loadtile8(byte * restrict src, int sw, byte * restrict dst, int dw,
 			int x;
 			for (x = w; x > 0; x -= 3)
 			{
-				*dp++ = 255;
 				*dp++ = *src++;
 				*dp++ = *src++;
 				*dp++ = *src++;
+				*dp++ = 255;
 			}
 			src += sw;
 			dst += dw;
@@ -278,16 +281,16 @@ static void loadtile8(byte * restrict src, int sw, byte * restrict dst, int dw,
 		while (h--)
 		{
 			byte *dp = dst;
-			int tpad = 1;
+			int tpad = pad;
 			int x;
 			for (x = w; x > 0; x--)
 			{
+				*dp++ = *src++;
 				tpad--;
 				if (tpad == 0) {
 					tpad = pad;
 					*dp++ = 255;
 				}
-				*dp++ = *src++;
 			}
 			src += sw;
 			dst += dw;
diff --git a/draw/meshdraw.c b/draw/meshdraw.c
index c0e219c4..72a723e0 100644
--- a/draw/meshdraw.c
+++ b/draw/meshdraw.c
@@ -147,12 +147,12 @@ drawscan(fz_pixmap *pix, int y, int x1, int x2, int *v1, int *v2, int n)
 
 	while (w--)
 	{
-		*p++ = 255;
 		for (k = 0; k < n; k++)
 		{
 			*p++ = v[k] >> 16;
 			v[k] += dv[k];
 		}
+		*p++ = 255;
 	}
 }
 
@@ -376,11 +376,11 @@ fz_rendershade(fz_shade *shade, fz_matrix ctm, fz_pixmap *dest, fz_bbox bbox)
 			d = dest->samples + ((bbox.x0 - dest->x) + (y - dest->y) * dest->w) * dest->n;
 			for (x = bbox.x0; x < bbox.x1; x++)
 			{
-				sa = s[0];
+				sa = s[1];
 				ssa = 255 - sa;
-				d[0] = s[0] + fz_mul255(d[0], ssa);
 				for (k = 0; k < dest->colorspace->n; k++)
-					d[k+1] = fz_mul255(clut[s[1]][k], sa) + fz_mul255(d[k+1], ssa);
+					d[k] = fz_mul255(clut[s[0]][k], sa) + fz_mul255(d[k], ssa);
+				d[k] = s[1] + fz_mul255(d[k], ssa);
 				s += 2;
 				d += 1 + dest->colorspace->n;
 			}
diff --git a/draw/porterduff.c b/draw/porterduff.c
index fd7a9d45..05f00f63 100644
--- a/draw/porterduff.c
+++ b/draw/porterduff.c
@@ -1,5 +1,74 @@
 #include "fitz.h"
 
+/*
+ * The functions in this file implement various flavours of Porter-Duff
+ * blending.
+ *
+ * We take the following as definitions:
+ *
+ *  Cx =          Color (from plane x)
+ *  ax =          Alpha (from plane x)
+ *  cx = Cx.ax  = Premultiplied color (from plane x)
+ *
+ * The general PorterDuff blending equation is:
+ *
+ * Blend Z = X op Y     cz = Fx.cx + Fy. cy    where Fx and Fy depend on op
+ *
+ * The two operations we use in this file are: '(X in Y) over Z' and
+ * 'S over Z'. The definitions of the 'over' and 'in' operations are as
+ * follows:
+ *
+ * For S over Z,    Fs = 1,  Fz = 1-as
+ * For X in Y,      Fx = ay, Fy = 0
+ *
+ * We have 2 choices; we can either work with premultiplied data, or non
+ * premultiplied data. Both forms are derived below.
+ *
+ * First the premultiplied case:
+ *
+ * Let S = (X in Y)
+ * Let R = (X in Y) over Z = S over Z
+ *
+ * cs = cx.Fx + cy.Fy           (where Fx = ay, Fy = 0)
+ *    = cx.ay
+ * as = ax.Fx + ay.Fy
+ *    = ax.ay
+ *
+ * cr = cs.Fs + cz.Fz           (where Fs = 1, Fz = 1-as)
+ *    = cs    + cz.(1-as)
+ *    = cx.ay + cz.(1-ax.ay)
+ * ar = as.Fs + az.Fz
+ *    = as    + az.(1-as)
+ *    = ax.ay + az.(1-ax.ay)
+ *
+ * This has various nice properties, like not needing any divisions, and
+ * being symmetric in color and alpha, so this is what we use. Because we
+ * went through the pain of deriving the non premultiplied forms, we list
+ * them here too, though they are not used.
+ *
+ * Non Pre-multiplied case:
+ *
+ * Cs.as =  Fx.Cx.ax + Fy.Cy.ay           (where Fx = ay, Fy = 0)
+ *       =  Cx.ay.ax
+ * Cs    = (Cx.ay.ax)/(ay.ax)
+ *       =  Cx
+ * Cr.ar =  Fs.Cs.as + Fz.Cz.az           (where Fs = 1, Fz = 1-as)
+ *       =  Cs.as    + (1-as).Cz.az
+ *       =  Cx.ax.ay + Cz.az.(1-ax.ay)
+ * Cr    = (Cx.ax.ay + Cz.az.(1-ax.ay))/(ax.ay + az.(1-ax.ay))
+ *
+ * Much more complex, it seems. However, if we could restrict ourselves to
+ * the case where we were always plotting onto an opaque background (i.e.
+ * az = 1), then:
+ *
+ * Cr = Cx.(ax.ay) + Cz.(1-ax.ay)
+ *    = (Cx-Cz)*(ax.ay) + Cz              (a single MLA operation)
+ * ar = 1
+ *
+ * Sadly, this is not true in the general case, so we abandon this effort
+ * and stick to using the premultiplied form.
+ */
+
 typedef unsigned char byte;
 
 /*
@@ -11,6 +80,7 @@ static void
 duff_non(byte * restrict sp, int sw, int sn, byte * restrict dp, int dw, int w0, int h)
 {
 	int k;
+
 	sw -= w0*sn;
 	dw -= w0*sn;
 	while (h--)
@@ -18,12 +88,10 @@ duff_non(byte * restrict sp, int sw, int sn, byte * restrict dp, int dw, int w0,
 		int w = w0;
 		while (w--)
 		{
-			/* RJW: Alpha handling suspicious here; sp[0] counts twice */
-			int sa = FZ_EXPAND(sp[0]);
-			dp[0] = FZ_BLEND(255, dp[0], sa);
-			for (k = 1; k < sn; k++)
+			int ssa = 255 - sp[sn-1];
+			for (k = 0; k < sn; k++)
 			{
-				dp[k] = FZ_BLEND(sp[k], dp[k], sa);
+				dp[k] = sp[k] + fz_mul255(dp[k], ssa);
 			}
 			sp += sn;
 			dp += sn;
@@ -38,6 +106,7 @@ static void
 duff_nimon(byte * restrict sp, int sw, int sn, byte * restrict mp, int mw, int mn, byte * restrict dp, int dw, int w0, int h)
 {
 	int k;
+
 	sw -= w0*sn;
 	mw -= w0*mn;
 	dw -= w0*sn;
@@ -46,12 +115,11 @@ duff_nimon(byte * restrict sp, int sw, int sn, byte * restrict mp, int mw, int m
 		int w = w0;
 		while (w--)
 		{
-			/* TODO: validate this */
-			int ma = FZ_COMBINE(FZ_EXPAND(mp[0]), FZ_EXPAND(sp[0]));
-			dp[0] = FZ_BLEND(255, dp[0], ma);
-			for (k = 1; k < sn; k++)
+                        int ma = mp[0];
+			int ssa = 255-fz_mul255(sp[sn-1], ma);
+			for (k = 0; k < sn; k++)
 			{
-				dp[k] = FZ_BLEND(sp[k], dp[k], ma);
+				dp[k] = fz_mul255(sp[k], ma) + fz_mul255(dp[k], ssa);
 			}
 			sp += sn;
 			mp += mn;
@@ -64,22 +132,22 @@ duff_nimon(byte * restrict sp, int sw, int sn, byte * restrict mp, int mw, int m
 }
 
 static void
-duff_1o1(byte * restrict sp, int sw, byte * restrict dp, int dw, int w0, int h)
+duff_1o1(byte * restrict sp0, int sw, byte * restrict dp0, int dw, int w0, int h)
 {
 	/* duff_non(sp0, sw, 1, dp0, dw, w0, h); */
-	sw -= w0;
-	dw -= w0;
 	while (h--)
 	{
+		byte *sp = sp0;
+		byte *dp = dp0;
 		int w = w0;
 		while (w--)
 		{
-			dp[0] = FZ_BLEND(255, dp[0], FZ_EXPAND(sp[0]));
+			dp[0] = sp[0] + fz_mul255(dp[0], 255 - sp[0]);
 			sp ++;
 			dp ++;
 		}
-		sp += sw;
-		dp += dw;
+		sp0 += sw;
+		dp0 += dw;
 	}
 }
 
@@ -87,6 +155,7 @@ static void
 duff_4o4(byte *sp, int sw, byte *dp, int dw, int w0, int h)
 {
 	/* duff_non(sp0, sw, 4, dp0, dw, w0, h); */
+
 	sw -= w0<<2;
 	dw -= w0<<2;
 	while (h--)
@@ -94,11 +163,11 @@ duff_4o4(byte *sp, int sw, byte *dp, int dw, int w0, int h)
 		int w = w0;
 		while (w--)
 		{
-			int alpha = FZ_EXPAND(sp[0]);
-			dp[0] = FZ_BLEND(255, dp[0], alpha);
-			dp[1] = FZ_BLEND(sp[1], dp[1], alpha);
-			dp[2] = FZ_BLEND(sp[2], dp[2], alpha);
-			dp[3] = FZ_BLEND(sp[3], dp[3], alpha);
+			byte ssa = 255 - sp[3];
+			dp[0] = sp[0] + fz_mul255(dp[0], ssa);
+			dp[1] = sp[1] + fz_mul255(dp[1], ssa);
+			dp[2] = sp[2] + fz_mul255(dp[2], ssa);
+			dp[3] = sp[3] + fz_mul255(dp[3], ssa);
 			sp += 4;
 			dp += 4;
 		}
@@ -111,16 +180,16 @@ static void
 duff_1i1o1(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restrict dp, int dw, int w0, int h)
 {
 	/* duff_nimon(sp0, sw, 1, mp0, mw, 1, dp0, dw, w0, h); */
-	sw -= w0;
-	mw -= w0;
-	dw -= w0;
+
 	while (h--)
 	{
 		int w = w0;
 		while (w--)
 		{
-			int ma = FZ_COMBINE(FZ_EXPAND(mp[0]), FZ_EXPAND(sp[0]));
-			dp[0] = FZ_BLEND(255, dp[0], ma);
+			byte ma = mp[0];
+			byte sa = fz_mul255(sp[0], ma);
+			byte ssa = 255 - sa;
+			dp[0] = sa + fz_mul255(dp[0], ssa);
 			sp ++;
 			mp ++;
 			dp ++;
@@ -132,9 +201,37 @@ duff_1i1o1(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restri
 }
 
 static void
+duff_2i1o2(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restrict dp, int dw, int w0, int h)
+{
+	/* Blend 2-byte source pixels into 2-byte dest pixels through a 1-byte mask; unrolled form of: */
+	/* duff_nimon(sp, sw, 2, mp, mw, 1, dp, dw, w0, h); */
+	sw -= w0<<1; /* the inner loop consumes 2*w0 source bytes per row, so keep only the remainder */
+	dw -= w0<<1; /* likewise for the destination (2 bytes per pixel) */
+	mw -= w0; /* the mask advances 1 byte per pixel */
+	while (h--)
+	{
+		int w = w0;
+		while (w--)
+		{
+			byte ma = mp[0]; /* mask value for this pixel */
+			byte ssa = 255 - fz_mul255(sp[1], ma); /* 255 minus masked sp[1]; sp[1] is presumably the alpha byte - TODO confirm */
+			dp[0] = fz_mul255(sp[0], ma) + fz_mul255(dp[0], ssa); /* masked source plus dest scaled by ssa */
+			dp[1] = fz_mul255(sp[1], ma) + fz_mul255(dp[1], ssa);
+			sp += 2;
+			mp += 1;
+			dp += 2;
+		}
+		sp += sw; /* add the per-row remainders computed above */
+		mp += mw;
+		dp += dw;
+	}
+}
+
+static void
 duff_4i1o4(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restrict dp, int dw, int w0, int h)
 {
 	/* duff_nimon(sp, sw, 4, mp, mw, 1, dp, dw, w0, h); */
+
 	sw -= w0<<2;
 	dw -= w0<<2;
 	mw -= w0;
@@ -143,11 +240,12 @@ duff_4i1o4(byte * restrict sp, int sw, byte * restrict mp, int mw, byte * restri
 		int w = w0;
 		while (w--)
 		{
-			int ma = FZ_COMBINE(FZ_EXPAND(mp[0]), FZ_EXPAND(sp[0]));
-			dp[0] = FZ_BLEND(255, dp[0], ma);
-			dp[1] = FZ_BLEND(sp[1], dp[1], ma);
-			dp[2] = FZ_BLEND(sp[2], dp[2], ma);
-			dp[3] = FZ_BLEND(sp[3], dp[3], ma);
+			byte ma = mp[0];
+			byte ssa = 255 - fz_mul255(sp[3], ma);
+			dp[0] = fz_mul255(sp[0], ma) + fz_mul255(dp[0], ssa);
+			dp[1] = fz_mul255(sp[1], ma) + fz_mul255(dp[1], ssa);
+			dp[2] = fz_mul255(sp[2], ma) + fz_mul255(dp[2], ssa);
+			dp[3] = fz_mul255(sp[3], ma) + fz_mul255(dp[3], ssa);
 			sp += 4;
 			mp += 1;
 			dp += 4;
@@ -176,39 +274,39 @@ path_1o1(byte * restrict src, byte cov, int len, byte * restrict dst)
 }
 
 static void
-path_w2i1o2(byte * restrict ag, byte * restrict src, byte cov, int len, byte * restrict dst)
+path_w2i1o2(byte * restrict ga, byte * restrict src, byte cov, int len, byte * restrict dst)
 {
-	int alpha = FZ_EXPAND(ag[0]);
-	byte g = ag[1];
+	byte g = ga[0];
+	int a = FZ_EXPAND(ga[1]);
 
 	while (len--)
 	{
 		int ca;
 		cov += *src; *src = 0; src++;
-		ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
-		dst[0] = FZ_BLEND(255, dst[0], ca);
-		dst[1] = FZ_BLEND(g, dst[1], ca);
+		ca = FZ_COMBINE(FZ_EXPAND(cov), a);
+		dst[0] = FZ_BLEND(g, dst[0], ca);
+		dst[1] = FZ_BLEND(255, dst[1], ca);
 		dst += 2;
 	}
 }
 
 static void
-path_w4i1o4(byte * restrict argb, byte * restrict src, byte cov, int len, byte * restrict dst)
+path_w4i1o4(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst)
 {
-	int alpha = FZ_EXPAND(argb[0]);
-	byte r = argb[1];
-	byte g = argb[2];
-	byte b = argb[3];
+	byte r = rgba[0];
+	byte g = rgba[1];
+	byte b = rgba[2];
+	int a = FZ_EXPAND(rgba[3]);
 
 	while (len--)
 	{
 		int ca;
 		cov += *src; *src = 0; src++;
-		ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
-		dst[0] = FZ_BLEND(255, dst[0], ca);
-		dst[1] = FZ_BLEND(r, dst[1], ca);
-		dst[2] = FZ_BLEND(g, dst[2], ca);
-		dst[3] = FZ_BLEND(b, dst[3], ca);
+		ca = FZ_COMBINE(FZ_EXPAND(cov), a);
+		dst[0] = FZ_BLEND(r, dst[0], ca);
+		dst[1] = FZ_BLEND(g, dst[1], ca);
+		dst[2] = FZ_BLEND(b, dst[2], ca);
+		dst[3] = FZ_BLEND(255, dst[3], ca);
 		dst += 4;
 	}
 }
@@ -220,6 +318,7 @@ path_w4i1o4(byte * restrict argb, byte * restrict src, byte cov, int len, byte *
 static void
 text_1o1(byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
 {
+
 	srcw -= w0;
 	dstw -= w0;
 	while (h--)
@@ -227,8 +326,7 @@ text_1o1(byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, i
 		int w = w0;
 		while (w--)
 		{
-			int c = FZ_EXPAND(src[0]);
-			dst[0] = FZ_BLEND(255, dst[0], c);
+			dst[0] = src[0] + fz_mul255(dst[0], 255 - src[0]);
 			src++;
 			dst++;
 		}
@@ -238,10 +336,10 @@ text_1o1(byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, i
 }
 
 static void
-text_w2i1o2(byte * restrict ag, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
+text_w2i1o2(byte * restrict ga, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
 {
-	int alpha = FZ_EXPAND(ag[0]);
-	byte g = ag[1];
+	byte g = ga[0];
+	int a = FZ_EXPAND(ga[1]);
 
 	srcw -= w0;
 	dstw -= w0<<1;
@@ -250,9 +348,9 @@ text_w2i1o2(byte * restrict ag, byte * restrict src, int srcw, byte * restrict d
 		int w = w0;
 		while (w--)
 		{
-			int c = FZ_COMBINE(FZ_EXPAND(src[0]), alpha);
-			dst[0] = FZ_BLEND(255, dst[0], c);
-			dst[1] = FZ_BLEND(g, dst[1], c);
+			int c = FZ_COMBINE(FZ_EXPAND(src[0]), a);
+			dst[0] = FZ_BLEND(g, dst[0], c);
+			dst[1] = FZ_BLEND(255, dst[1], c);
 			src ++;
 			dst += 2;
 		}
@@ -262,12 +360,12 @@ text_w2i1o2(byte * restrict ag, byte * restrict src, int srcw, byte * restrict d
 }
 
 static void
-text_w4i1o4(byte * restrict argb, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
+text_w4i1o4(byte * restrict rgba, byte * restrict src, int srcw, byte * restrict dst, int dstw, int w0, int h)
 {
-	int alpha = FZ_EXPAND(argb[0]);
-	byte r = argb[1];
-	byte g = argb[2];
-	byte b = argb[3];
+	byte r = rgba[0];
+	byte g = rgba[1];
+	byte b = rgba[2];
+	int a = FZ_EXPAND(rgba[3]);
 
 	srcw -= w0;
 	dstw -= w0<<2;
@@ -276,11 +374,11 @@ text_w4i1o4(byte * restrict argb, byte * restrict src, int srcw, byte * restrict
 		int w = w0;
 		while (w--)
 		{
-			int c = FZ_COMBINE(FZ_EXPAND(src[0]), alpha);
-			dst[0] = FZ_BLEND(255, dst[0], c);
-			dst[1] = FZ_BLEND(r, dst[1], c);
-			dst[2] = FZ_BLEND(g, dst[2], c);
-			dst[3] = FZ_BLEND(b, dst[3], c);
+			int c = FZ_COMBINE(FZ_EXPAND(src[0]), a);
+			dst[0] = FZ_BLEND(r, dst[0], c);
+			dst[1] = FZ_BLEND(g, dst[1], c);
+			dst[2] = FZ_BLEND(b, dst[2], c);
+			dst[3] = FZ_BLEND(255, dst[3], c);
 			src ++;
 			dst += 4;
 		}
@@ -298,6 +396,7 @@ void (*fz_duff_nimon)(byte*,int,int,byte*,int,int,byte*,int,int,int) = duff_nimo
 void (*fz_duff_1o1)(byte*,int,byte*,int,int,int) = duff_1o1;
 void (*fz_duff_4o4)(byte*,int,byte*,int,int,int) = duff_4o4;
 void (*fz_duff_1i1o1)(byte*,int,byte*,int,byte*,int,int,int) = duff_1i1o1;
+void (*fz_duff_2i1o2)(byte*,int,byte*,int,byte*,int,int,int) = duff_2i1o2;
 void (*fz_duff_4i1o4)(byte*,int,byte*,int,byte*,int,int,int) = duff_4i1o4;
 
 void (*fz_path_1o1)(byte*,byte,int,byte*) = path_1o1;