summaryrefslogtreecommitdiff
path: root/render/rastx86.c
diff options
context:
space:
mode:
Diffstat (limited to 'render/rastx86.c')
-rw-r--r--render/rastx86.c89
1 file changed, 77 insertions, 12 deletions
diff --git a/render/rastx86.c b/render/rastx86.c
index 79020fb1..9360b5e2 100644
--- a/render/rastx86.c
+++ b/render/rastx86.c
@@ -3,6 +3,8 @@ x86 specific render optims live here
*/
#include <fitz.h>
+typedef unsigned char byte;
+
/* always surround cpu specific code with HAVE_XXX */
#ifdef HAVE_MMX
@@ -10,15 +12,77 @@ x86 specific render optims live here
shouldn't require anything */
#include <mmintrin.h>
static void duff_4i1o4mmx(unsigned char *sp0, int sw, unsigned char *mp0, int mw, unsigned char *dp0, int dw, int w0, int h)
{
	/*
	Blend a 4-byte-per-pixel source region (argb, alpha in the low byte of
	each little-endian word) over the destination, modulated by a
	1-byte-per-pixel coverage mask.  Per channel:

	    dst = (src * ma + dst * (255 - sa)) >> 8

	where ma = mask + 1 (so coverage 0..255 scales to a 1..256 multiplier)
	and sa is the mask-scaled source alpha.

	sp0/dp0: pixel pointers (assumed 4-byte aligned -- TODO confirm callers)
	sw/mw/dw: byte strides per scanline; w0: width in pixels; h: rows.

	MMX version profiled noticeably faster than the scalar duff_4i1o4
	(x11pdf rendering all pages of CEDEC2003_Venus_and_Vulcan.pdf):
	    %    cumulative   self              self     total
	   time   seconds   seconds    calls  ms/call  ms/call  name
	  30.50     20.04     20.04      261    76.76    76.76  duff_4i1o4
	  21.67     22.02     10.95      221    49.55    49.55  duff_4i1o4mmx
	*/
	__m64 mzero = _mm_setzero_si64();

	while (h--)
	{
		unsigned char *mp = mp0;
		unsigned *s = (unsigned *)sp0;
		unsigned *d = (unsigned *)dp0;

		int w = w0;

		/* TODO: unroll and process two pixels/iteration */
		while (w--)
		{
			int ts = *s++;
			int ma = *mp++ + 1;	/* coverage 0..255 -> multiplier 1..256 */
			int sa = ((ts & 0xff) * ma) >> 8;	/* mask-scaled source alpha */
			/* BUGFIX: was 254 - sa.  With sa == 255 that gave -1,
			   and _mm_set1_pi16(-1) is 0xffff per lane, so the
			   dst*ssa product wrapped and corrupted fully-opaque
			   pixels.  The complement must be 255 - sa, matching
			   the scalar duff_4i1o4. */
			int ssa = 255 - sa;

			__m64 d0 = _mm_cvtsi32_si64(*d);
			__m64 s0 = _mm_cvtsi32_si64(ts);

			/* splat the two 9-bit blend factors into 4 x 16-bit lanes */
			__m64 mma = _mm_set1_pi16(ma);
			__m64 mssa = _mm_set1_pi16(ssa);

			/* unpack 0000argb => a0r0g0b0 */
			__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
			__m64 s1 = _mm_unpacklo_pi8(s0, mzero);

			/* src * ma, per channel */
			__m64 msma = _mm_mullo_pi16(s1, mma);
			/* dst * (255 - sa), per channel */
			__m64 mdssa = _mm_mullo_pi16(d1, mssa);

			__m64 res0 = _mm_add_pi16(msma, mdssa);
			/* TODO: is it possible to get rid of the shift? */
			__m64 res1 = _mm_srli_pi16(res0, 8);

			/* pack the four words back to bytes (unsigned saturation) */
			__m64 res2 = _mm_packs_pu16(res1, mzero);

			*d++ = _mm_cvtsi64_si32(res2);
		}

		sp0 += sw;
		mp0 += mw;
		dp0 += dw;
	}

	/* clear MMX state so subsequent x87 floating-point code works */
	_mm_empty();
}
+
static inline unsigned
getargb(unsigned *s, int w, int h, int u, int v)
{
	/*
	Fetch the argb word at (u, v) from a w x h image; coordinates
	outside the image read as 0 (fully transparent).
	*/
	if (u >= 0 && u < w && v >= 0 && v < h)
		return s[w * v + u];
	return 0;
}
-/* this code has not been tested since refactoring */
static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
{
/* since mmx does not have an unsigned multiply instruction we use
@@ -33,12 +97,16 @@ static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
unsigned *d = (unsigned *)dst0;
int u = u0;
int v = v0;
- int w = w0;
+ int w = w0;
+
+ __m64 mzero = _mm_setzero_si64();
+ __m64 m256 = _mm_set1_pi16(256);
+ __m64 malphamask = _mm_cvtsi32_si64(0xff);
while (w--)
{
int iu = u >> 17;
- int iv = u >> 17;
+ int iv = v >> 17;
int fu = u & 0x7fff;
int fv = v & 0x7fff;
@@ -75,7 +143,6 @@ static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
}
/* unpack src into 4x16bit vectors */
- __m64 mzero = _mm_setzero_si64();
__m64 ms0 = _mm_unpackhi_pi8(ms0s1, mzero);
__m64 ms1 = _mm_unpacklo_pi8(ms0s1, mzero);
__m64 ms2 = _mm_unpackhi_pi8(ms2s3, mzero);
@@ -110,14 +177,12 @@ static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
/* get src alpha */
- __m64 m256 = _mm_set1_pi16(256);
- __m64 malphamask = _mm_cvtsi32_si64(0xff);
- /* splat alpha TODO: better way? */
+ /* splat alpha */
__m64 a0001 = _mm_and_si64(malphamask, t8);
- __m64 a0010 = _mm_slli_si64(a0001, 16);
- __m64 a0011 = _mm_or_si64(a0001, a0010);
+ __m64 a0011 = _mm_unpacklo_pi16(a0001, a0001);
__m64 a1111 = _mm_unpacklo_pi16(a0011, a0011);
+
/* 255+1 - sa */
__m64 sna = _mm_sub_pi16(m256, a1111);
@@ -152,9 +217,9 @@ fz_accelrastfuncs(fz_rastfuncs *tab)
# ifdef HAVE_MMX
if (fz_cpuflags & HAVE_MMX)
{
+ tab->duff_4i1o4 = duff_4i1o4mmx;
tab->img_4o4 = img_4o4mmx;
}
# endif
}
#endif
-