gka fixes. use truetypes in fontfile.

author: Tor Andersson <tor@ghostscript.com> 2004-11-28 17:41:15 +0100
committer: Tor Andersson <tor@ghostscript.com> 2004-11-28 17:41:15 +0100
commit: 85792218b05cb41d7dd4696443a4fdd6c16e1817 (patch)
tree: 8da4dab2e44204e3ebba47ed01ff029ab00df35d /render
parent: 2235b780ce692e1393fdd925eea0cdd9e1a422a1 (diff)
download: mupdf-85792218b05cb41d7dd4696443a4fdd6c16e1817.tar.xz
3 files changed, 93 insertions, 20 deletions
diff --git a/render/rastport.c b/render/rastport.c
index 9cae06d5..3eb89e43 100644
--- a/render/rastport.c
+++ b/render/rastport.c
@@ -264,15 +264,18 @@ static void msk_1o1(byte *src, byte *dst, int w)
 
 static void msk_w3i1o4(byte *rgb, byte *src, byte *dst, int n)
 {
+	byte rgb0 = rgb[0];
+	byte rgb1 = rgb[1];
+	byte rgb2 = rgb[2];
 	byte sa, ssa;
 	while (n--)
 	{
 		sa = src[0];
 		ssa = 255 - sa;
 		dst[0] = sa + fz_mul255(dst[0], ssa);
-		dst[1] = rgb[0] + fz_mul255((short)dst[1] - rgb[0], ssa);
-		dst[2] = rgb[1] + fz_mul255((short)dst[2] - rgb[1], ssa);
-		dst[3] = rgb[2] + fz_mul255((short)dst[3] - rgb[2], ssa);
+		dst[1] = rgb0 + fz_mul255((short)dst[1] - rgb0, ssa);
+		dst[2] = rgb1 + fz_mul255((short)dst[2] - rgb1, ssa);
+		dst[3] = rgb2 + fz_mul255((short)dst[3] - rgb2, ssa);
 		src ++;
 		dst += 4;
 	}
@@ -496,6 +499,9 @@ static void img_4o4(FZ_PSRC, FZ_PDST, FZ_PCTM)
 
 static void img_w3i1o4(byte *rgb, FZ_PSRC, FZ_PDST, FZ_PCTM)
 {
+	byte rgb0 = rgb[0];
+	byte rgb1 = rgb[1];
+	byte rgb2 = rgb[2];
 	byte sa, ssa;
 	while (h--)
 	{
@@ -508,9 +514,9 @@ static void img_w3i1o4(byte *rgb, FZ_PSRC, FZ_PDST, FZ_PCTM)
 			sa = samplemask(src, srcw, srch, u, v);
 			ssa = 255 - sa;
 			dstp[0] = sa + fz_mul255(dstp[0], ssa);
-			dstp[1] = rgb[0] + fz_mul255((short)dstp[1] - rgb[0], ssa);
-			dstp[2] = rgb[1] + fz_mul255((short)dstp[2] - rgb[1], ssa);
-			dstp[3] = rgb[2] + fz_mul255((short)dstp[3] - rgb[2], ssa);
+			dstp[1] = rgb0 + fz_mul255((short)dstp[1] - rgb0, ssa);
+			dstp[2] = rgb1 + fz_mul255((short)dstp[2] - rgb1, ssa);
+			dstp[3] = rgb2 + fz_mul255((short)dstp[3] - rgb2, ssa);
 			dstp += 4;
 			u += fa;
 			v += fb;
diff --git a/render/rastppc.c b/render/rastppc.c
index f26e5b66..276ee2d7 100644
--- a/render/rastppc.c
+++ b/render/rastppc.c
@@ -1,6 +1,7 @@
 /*
-PowerPC specific render optims live here
-*/
+ * PowerPC specific render optims live here
+ */
+
 #include <fitz.h>
 
 #ifdef HAVE_ALTIVEC
@@ -14,6 +15,7 @@ fz_accelrastfuncs(fz_rastfuncs *tab)
 #  ifdef HAVE_ALTIVEC
 	if (fz_cpuflags & HAVE_ALTIVEC)
 	{
+		puts("installed altivec rastfuncs");
 	}
 #  endif
 }
diff --git a/render/rastx86.c b/render/rastx86.c
index 79020fb1..9360b5e2 100644
--- a/render/rastx86.c
+++ b/render/rastx86.c
@@ -3,6 +3,8 @@ x86 specific render optims live here
 */
 #include <fitz.h>
 
+typedef unsigned char byte;
+
 /* always surround cpu specific code with HAVE_XXX */
 #ifdef HAVE_MMX
 
@@ -10,15 +12,77 @@ x86 specific render optims live here
    shouldn't require anything */
 #include <mmintrin.h>
 
+static void duff_4i1o4mmx(byte *sp0, int sw, byte *mp0, int mw, byte *dp0, int dw, int w0, int h)
+{
+	/* Composite 4-byte source pixels, scaled by a 1-byte mask, over the
+	   4-byte dest; sw/mw/dw are row strides in bytes, w0 x h the pixel area.
+	   Profile from rendering all pages of x11pdf ~/doc/OpenGL/Presentations/CEDEC2003_Venus_and_Vulcan.pdf:
+	    %     cumulative  self	     self     total
+	  time     seconds   seconds    calls ms/call ms/call  name
+	  30.50     20.04    20.04      261    76.76    76.76  duff_4i1o4
+	  21.67     22.02    10.95      221    49.55    49.55  duff_4i1o4mmx
+	*/
+	__m64 mzero = _mm_setzero_si64();
+	while (h--)
+	{
+		byte *sp = sp0;
+		byte *mp = mp0;
+		byte *dp = dp0;
+
+		unsigned *s = (unsigned *)sp;	/* one whole pixel per load/store */
+		unsigned *d = (unsigned *)dp;
+
+		int w = w0;
+
+		/* TODO: unroll and process two pixels/iteration */
+		while (w--)
+		{
+			int ts = *s++;
+			int ma = *mp++ + 1;	/* mask coverage biased to 1..256 so >> 8 normalizes */
+			int sa = ((ts & 0xff) * ma) >> 8;	/* low byte of source word (alpha) times mask, 0..255 */
+			int ssa = 255 - sa;	/* dest weight, 0..255; a negative weight would wrap in the 16-bit multiply */
+
+			__m64 d0 = _mm_cvtsi32_si64(*d);
+			__m64 s0 = _mm_cvtsi32_si64(ts);
+
+			/* splat both weights across four 16-bit lanes (ma needs 9 bits) */
+			__m64 mma = _mm_set1_pi16(ma);
+			__m64 mssa = _mm_set1_pi16(ssa);
+
+			/* unpack 0000argb => a0r0g0b0 */
+			__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
+			__m64 s1 = _mm_unpacklo_pi8(s0, mzero);
+
+			/* s1 * ma => a0r0g0b0 */
+			__m64 msma = _mm_mullo_pi16(s1, mma);
+			/* d1 * mssa */
+			__m64 mdssa = _mm_mullo_pi16(d1, mssa);
+
+			__m64 res0 = _mm_add_pi16(msma, mdssa);	/* NOTE(review): fits 16 bits only if source is premultiplied - confirm */
+			/* TODO: is it possible to get rid of the shift? */
+			__m64 res1 = _mm_srli_pi16(res0, 8);
+
+			/* pack the four 16-bit lanes back into 4 bytes */
+			__m64 res2 = _mm_packs_pu16(res1, mzero);
+
+			*d++ = _mm_cvtsi64_si32(res2);
+		}
+
+		sp0 += sw;
+		mp0 += mw;
+		dp0 += dw;
+	}
+
+	_mm_empty();	/* leave MMX state so later x87 floating point code works */
+}
+
 static inline unsigned
 getargb(unsigned *s, int w, int h, int u, int v)
 {
-	if (u < 0 || u >= w) return 0;
-	if (v < 0 || v >= h) return 0;
+	if (u < 0 | u >= w | v < 0 | v >= h) return 0;	/* outside the w x h image: return 0; | (not ||) evaluates all four bounds tests without short-circuiting */
 	return s[w * v + u];
 }
 
-/* this code has not been tested since refactoring */
 static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
 {
 	/* since mmx does not have an unsigned multiply instruction we use
@@ -33,12 +97,16 @@ static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
 		unsigned *d = (unsigned *)dst0;
 		int u = u0;
 		int v = v0;
-				int w = w0;
+		int w = w0;
+
+		__m64 mzero = _mm_setzero_si64();
+		__m64 m256 = _mm_set1_pi16(256);
+		__m64 malphamask = _mm_cvtsi32_si64(0xff);
 
 		while (w--)
 		{
 			int iu = u >> 17;
-			int iv = u >> 17;
+			int iv = v >> 17;
 
 			int fu = u & 0x7fff;
 			int fv = v & 0x7fff;
@@ -75,7 +143,6 @@ static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
 			}
 
 			/* unpack src into 4x16bit vectors */
-			__m64 mzero = _mm_setzero_si64();
 			__m64 ms0 = _mm_unpackhi_pi8(ms0s1, mzero);
 			__m64 ms1 = _mm_unpacklo_pi8(ms0s1, mzero);
 			__m64 ms2 = _mm_unpackhi_pi8(ms2s3, mzero);
@@ -110,14 +177,12 @@ static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
 			__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
 
 			/* get src alpha */
-			__m64 m256 = _mm_set1_pi16(256);
-			__m64 malphamask = _mm_cvtsi32_si64(0xff);
 
-			/* splat alpha TODO: better way? */
+			/* splat alpha */
 			__m64 a0001 = _mm_and_si64(malphamask, t8);
-			__m64 a0010 = _mm_slli_si64(a0001, 16);
-			__m64 a0011 = _mm_or_si64(a0001, a0010);
+			__m64 a0011 = _mm_unpacklo_pi16(a0001, a0001);
 			__m64 a1111 = _mm_unpacklo_pi16(a0011, a0011);
+
 			/* 255+1 - sa */
 			__m64 sna = _mm_sub_pi16(m256, a1111);
 
@@ -152,9 +217,9 @@ fz_accelrastfuncs(fz_rastfuncs *tab)
 #  ifdef HAVE_MMX
 	if (fz_cpuflags & HAVE_MMX)
 	{
+		tab->duff_4i1o4 = duff_4i1o4mmx;
 		tab->img_4o4 = img_4o4mmx;
 	}
 #  endif
 }
 #endif
-
author	Tor Andersson <tor@ghostscript.com>	2004-11-28 17:41:15 +0100
committer	Tor Andersson <tor@ghostscript.com>	2004-11-28 17:41:15 +0100
commit	85792218b05cb41d7dd4696443a4fdd6c16e1817 (patch)
tree	8da4dab2e44204e3ebba47ed01ff029ab00df35d /render
parent	2235b780ce692e1393fdd925eea0cdd9e1a422a1 (diff)
download	mupdf-85792218b05cb41d7dd4696443a4fdd6c16e1817.tar.xz