diff options
author | Tor Andersson <tor@ghostscript.com> | 2004-11-26 03:20:37 +0100
committer | Tor Andersson <tor@ghostscript.com> | 2004-11-26 03:20:37 +0100
commit | 6c9bcc6f41bb738c216fc810246f60576d9ec6e4 (patch)
tree | a98834bbc2491c417743760d2869d1c066962b16 /render
parent | e8696b4046767fcbfc05056c13f919fdeff65158 (diff)
download | mupdf-6c9bcc6f41bb738c216fc810246f60576d9ec6e4.tar.xz
cpu dependent skeleton
Diffstat (limited to 'render')
-rw-r--r-- | render/rastport.c | 5
-rw-r--r-- | render/rastppc.c | 21
-rw-r--r-- | render/rastsparc.c | 21
-rw-r--r-- | render/rastx86.c | 160
-rw-r--r-- | render/render.c | 2
-rw-r--r-- | render/scale.c | 128
6 files changed, 331 insertions, 6 deletions
diff --git a/render/rastport.c b/render/rastport.c index a6dedf05..d329b9e2 100644 --- a/render/rastport.c +++ b/render/rastport.c @@ -442,8 +442,11 @@ static fz_rastfuncs deftab = }; void -fz_defaultrastfuncs(fz_rastfuncs *tab) +fz_loadrastfuncs(fz_rastfuncs *tab) { *tab = deftab; +#ifdef HAVE_CPUDEP + fz_accelrastfuncs(tab); +#endif } diff --git a/render/rastppc.c b/render/rastppc.c new file mode 100644 index 00000000..f26e5b66 --- /dev/null +++ b/render/rastppc.c @@ -0,0 +1,21 @@ +/* +PowerPC specific render optims live here +*/ +#include <fitz.h> + +#ifdef HAVE_ALTIVEC + +#endif /* HAVE_ALTIVEC */ + +#if defined (ARCH_PPC) +void +fz_accelrastfuncs(fz_rastfuncs *tab) +{ +# ifdef HAVE_ALTIVEC + if (fz_cpuflags & HAVE_ALTIVEC) + { + } +# endif +} +#endif + diff --git a/render/rastsparc.c b/render/rastsparc.c new file mode 100644 index 00000000..c18db9a5 --- /dev/null +++ b/render/rastsparc.c @@ -0,0 +1,21 @@ +/* +SPARC specific render optims live here +*/ +#include <fitz.h> + +#ifdef HAVE_VIS + +#endif + +#if defined (ARCH_SPARC) +void +fz_accelrastfuncs(fz_rastfuncs *tab) +{ +# ifdef HAVE_VIS + if (fz_cpuflags & HAVE_VIS) + { + } +# endif +} +#endif + diff --git a/render/rastx86.c b/render/rastx86.c new file mode 100644 index 00000000..79020fb1 --- /dev/null +++ b/render/rastx86.c @@ -0,0 +1,160 @@ +/* +x86 specific render optims live here +*/ +#include <fitz.h> + +/* always surround cpu specific code with HAVE_XXX */ +#ifdef HAVE_MMX + +/* -mmmx for gcc >= 3.4 enables the mmx intrinsic functions, icc and VC + shouldn't require anything */ +#include <mmintrin.h> + +static inline unsigned +getargb(unsigned *s, int w, int h, int u, int v) +{ + if (u < 0 || u >= w) return 0; + if (v < 0 || v >= h) return 0; + return s[w * v + u]; +} + +/* this code has not been tested since refactoring */ +static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM) +{ + /* since mmx does not have an unsigned multiply instruction we use + 17.15 fixed point */ + u0 <<= 1; v0 <<= 1; + fa <<= 1; 
fb <<= 1; + fc <<= 1; fd <<= 1; + + while (h--) + { + unsigned *s = (unsigned *)src; + unsigned *d = (unsigned *)dst0; + int u = u0; + int v = v0; + int w = w0; + + while (w--) + { + int iu = u >> 17; + int iv = u >> 17; + + int fu = u & 0x7fff; + int fv = v & 0x7fff; + + int atedge = + iu < 0 | iu >= (srcw - 1) | + iv < 0 | iv >= (srch - 1); + + __m64 ms0s1; + __m64 ms2s3; + + if (atedge) + { + unsigned s0, s1, s2, s3; + + /* edge cases use scalar loads */ + s0 = getargb(s, srcw, srch, iu + 0, iv + 0); + s1 = getargb(s, srcw, srch, iu + 1, iv + 0); + s2 = getargb(s, srcw, srch, iu + 0, iv + 1); + s3 = getargb(s, srcw, srch, iu + 1, iv + 1); + + /* move to mmx registers */ + ms0s1 = _mm_set_pi32(s0, s1); + ms2s3 = _mm_set_pi32(s2, s3); + } + else + { + __m64 *m0s = (__m64*)(s + srcw * (iv + 0) + iu); + __m64 *m2s = (__m64*)(s + srcw * (iv + 1) + iu); + + /* faster vector loads for interior */ + ms0s1 = *m0s; + ms2s3 = *m2s; + } + + /* unpack src into 4x16bit vectors */ + __m64 mzero = _mm_setzero_si64(); + __m64 ms0 = _mm_unpackhi_pi8(ms0s1, mzero); + __m64 ms1 = _mm_unpacklo_pi8(ms0s1, mzero); + __m64 ms2 = _mm_unpackhi_pi8(ms2s3, mzero); + __m64 ms3 = _mm_unpacklo_pi8(ms2s3, mzero); + + /* lerp fu */ + + __m64 mfu = _mm_set1_pi16(fu); + + /* t2 = (s1 - s0) * fu + s0 */ + __m64 t0 = _mm_sub_pi16(ms1, ms0); + __m64 t1 = _mm_mulhi_pi16(t0, mfu); + __m64 t2 = _mm_add_pi16(t1, ms0); + + /* t3 = (s3 - s2) * fu + s2 */ + __m64 t3 = _mm_sub_pi16(ms3, ms2); + __m64 t4 = _mm_mulhi_pi16(t3, mfu); + __m64 t5 = _mm_add_pi16(t4, ms2); + + /* lerp fv */ + + __m64 mfv = _mm_set1_pi16(fv); + + /* t8 = (t5 - t2) * fv + t2 */ + __m64 t6 = _mm_sub_pi16(t5, t2); + __m64 t7 = _mm_mulhi_pi16(t6, mfv); + __m64 t8 = _mm_add_pi16(t7, t2); + + /* load and prepare dst */ + __m64 d0 = _mm_cvtsi32_si64(*d); + + __m64 d1 = _mm_unpacklo_pi8(d0, mzero); + + /* get src alpha */ + __m64 m256 = _mm_set1_pi16(256); + __m64 malphamask = _mm_cvtsi32_si64(0xff); + + /* splat alpha TODO: better way? 
*/ + __m64 a0001 = _mm_and_si64(malphamask, t8); + __m64 a0010 = _mm_slli_si64(a0001, 16); + __m64 a0011 = _mm_or_si64(a0001, a0010); + __m64 a1111 = _mm_unpacklo_pi16(a0011, a0011); + /* 255+1 - sa */ + __m64 sna = _mm_sub_pi16(m256, a1111); + + /* blend src with dst */ + __m64 d2 = _mm_mullo_pi16(d1, sna); + __m64 d3 = _mm_srli_pi16(d2, 8); + __m64 d4 = _mm_add_pi16(t8, d3); + + /* pack and store new dst */ + __m64 d5 = _mm_packs_pu16(d4, mzero); + + *d++ = _mm_cvtsi64_si32(d5); + + u += fa; + v += fb; + } + + dst0 += dstw; + u0 += fc; + v0 += fd; + } + + _mm_empty(); +} + +#endif /* HAVE_MMX */ + +#if defined (ARCH_X86) || defined(ARCH_X86_64) +void +fz_accelrastfuncs(fz_rastfuncs *tab) +{ +# ifdef HAVE_MMX + if (fz_cpuflags & HAVE_MMX) + { + tab->img_4o4 = img_4o4mmx; + } +# endif +} +#endif + diff --git a/render/render.c b/render/render.c index b72f069e..7fac8322 100644 --- a/render/render.c +++ b/render/render.c @@ -37,7 +37,7 @@ fz_newrenderer(fz_renderer **gcp, fz_colorspace *pcm, int maskonly, int gcmem) if (error) goto cleanup; - fz_defaultrastfuncs(&gc->rast); + fz_loadrastfuncs(&gc->rast); gc->dest = nil; gc->over = nil; diff --git a/render/scale.c b/render/scale.c index 6b06dcfa..461f5100 100644 --- a/render/scale.c +++ b/render/scale.c @@ -1,5 +1,8 @@ #include <fitz.h> +typedef void (*rowfunc)(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom); +typedef void (*colfunc)(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom); + static void scalerow(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) { @@ -33,6 +36,66 @@ scalerow(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) } static void +scalerow1(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, left; + int sum; + + left = 0; + sum = 0; + + for (x = 0; x < w; x++) + { + sum += *src++; + if (++left == denom) + { + left = 0; + *dst++ = sum / denom; + sum = 0; + } + } + + /* left overs */ + if (left) + 
{ + *dst++ = sum / left; + } +} + +static void +scalerow2(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, left; + int sum0, sum1; + + left = 0; + sum0 = 0; + sum1 = 0; + + for (x = 0; x < w; x++) + { + sum0 += *src++; + sum1 += *src++; + if (++left == denom) + { + left = 0; + *dst++ = sum0 / denom; + *dst++ = sum1 / denom; + sum0 = 0; + sum1 = 0; + } + } + + /* left overs */ + if (left) + { + *dst++ = sum0 / left; + *dst++ = sum1 / left; + } +} + + +static void scalecols(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) { int x, y, k; @@ -53,6 +116,45 @@ scalecols(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) } } +static void +scalecols1(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, y, k; + unsigned char *s; + int sum; + + for (x = 0; x < w; x++) + { + s = src + x; + sum = 0; + for (y = 0; y < denom; y++) + sum += s[y * w]; + *dst++ = sum / denom; + } +} + +static void +scalecols2(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, y, k; + unsigned char *s; + int sum0, sum1; + + for (x = 0; x < w; x++) + { + s = src + (x * 2); + sum0 = 0; + sum1 = 0; + for (y = 0; y < denom; y++) + { + sum0 += s[y * w * 2 + 0]; + sum1 += s[y * w * 2 + 1]; + } + *dst++ = sum0 / denom; + *dst++ = sum1 / denom; + } +} + fz_error * fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom) { @@ -61,6 +163,8 @@ fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom) unsigned char *buf; int y, iy, oy; int ow, oh, n; + rowfunc rowfunc; + colfunc colfunc; ow = (src->w + xdenom - 1) / xdenom; oh = (src->h + ydenom - 1) / ydenom; @@ -77,23 +181,39 @@ fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom) return error; } + switch (n) + { + case 1: + rowfunc = scalerow1; + colfunc = scalecols1; + break; + case 2: + rowfunc = scalerow2; + colfunc = scalecols2; + break; + default: + rowfunc = scalerow; + 
colfunc = scalecols; + break; + } + for (y = 0, oy = 0; y < (src->h / ydenom) * ydenom; y += ydenom, oy++) { for (iy = 0; iy < ydenom; iy++) - scalerow(src->samples + (y + iy) * src->w * n, + rowfunc(src->samples + (y + iy) * src->w * n, buf + iy * ow * n, src->w, n, xdenom); - scalecols(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); + colfunc(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); } ydenom = src->h - y; if (ydenom) { for (iy = 0; iy < ydenom; iy++) - scalerow(src->samples + (y + iy) * src->w * n, + rowfunc(src->samples + (y + iy) * src->w * n, buf + iy * ow * n, src->w, n, xdenom); - scalecols(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); + colfunc(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); } fz_free(buf); |