diff options
-rw-r--r-- | Jamfile | 4 | ||||
-rw-r--r-- | base/cpudep.c | 195 | ||||
-rw-r--r-- | include/fitz.h | 1 | ||||
-rw-r--r-- | include/fitz/cpudep.h | 26 | ||||
-rw-r--r-- | include/fitz/render.h | 4 | ||||
-rw-r--r-- | render/rastport.c | 5 | ||||
-rw-r--r-- | render/rastppc.c | 21 | ||||
-rw-r--r-- | render/rastsparc.c | 21 | ||||
-rw-r--r-- | render/rastx86.c | 160 | ||||
-rw-r--r-- | render/render.c | 2 | ||||
-rw-r--r-- | render/scale.c | 128 | ||||
-rw-r--r-- | test/x11pdf.c | 2 |
12 files changed, 562 insertions, 7 deletions
@@ -26,6 +26,7 @@ Library libfitz : base/rect.c base/matrix.c base/hash.c + base/cpudep.c # object object/simple.c @@ -92,6 +93,9 @@ Library libfitz : render/scale.c render/rastshade.c render/rastport.c + render/rastppc.c + render/rastsparc.c + render/rastx86.c render/render.c ; diff --git a/base/cpudep.c b/base/cpudep.c new file mode 100644 index 00000000..0301213d --- /dev/null +++ b/base/cpudep.c @@ -0,0 +1,195 @@ +/* +run-time cpu feature detection code +mm, alphabet soup... + +Glenn Kennard <d98gk@efd.lth.se> +*/ + +#include <fitz.h> + +typedef struct { + void (*test)(void); /* executes one instruction from the feature's set */ + const unsigned flag; /* HAVE_* bit recorded when test() runs without SIGILL */ + const char *name; /* label printed by dumpflags() */ +} featuretest; + + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +/* probes that write MMX registers end with emms so later x87 code starts from an empty FPU tag word */ +static void mmx(void) +{ __asm__ ("pand %mm0, %mm0\n\temms\n\t"); } + +static void m3dnow(void) +{ __asm__ ("pavgusb %mm0, %mm0\n\temms\n\t"); } + +static void mmxext(void) /* aka Extended 3DNow! */ +{ __asm__ ("pmaxsw %mm0, %mm0\n\temms\n\t"); } + +static void sse(void) +{ __asm__ ("andps %xmm0, %xmm0\n\t"); } + +static void sse2(void) +{ __asm__ ("andpd %xmm0, %xmm0\n\t"); } + +static void sse3(void) +{ __asm__ ("haddps %%xmm0, %%xmm0\n\t" : : : "%xmm0"); } + +#ifdef ARCH_X86_64 +static void amd64(void) +{ __asm__ ("and %rax, %rax\n\t"); } +#endif + + +static const featuretest features[] = { + { mmx, HAVE_MMX, "mmx" }, + { m3dnow, HAVE_3DNOW, "3dnow" }, + { mmxext, HAVE_MMXEXT, "mmxext" }, + { sse, HAVE_SSE, "sse" }, + { sse2, HAVE_SSE2, "sse2" }, + { sse3, HAVE_SSE3, "sse3" }, +#ifdef ARCH_X86_64 + { amd64, HAVE_AMD64, "amd64" } +#endif +}; + +#endif + + +#if defined(ARCH_SPARC) +/* assembler must have -xarch=v8plusa passed to it (v9a for 64 bit binaries) */ +static void vis(void) +{ __asm__ ("fand %f8, %f8, %f8\n\t"); } + +static const featuretest features[] = { + { vis, HAVE_VIS, "vis" } +}; + +#endif + + +#if defined(ARCH_PPC) + +static void altivec(void) +{ __asm__ ("vand v0, v0, v0\n\t"); } + + +static const featuretest features[] = { + { altivec, HAVE_ALTIVEC, "altivec" }, 
+}; + +#endif + +#ifndef HAVE_CPUDEP +static const featuretest features[] = {}; +#endif + +#include <signal.h> /* signal/sigaction */ +#include <setjmp.h> /* sigsetjmp/siglongjmp */ + +/* global run-time constant */ +unsigned fz_cpuflags = 0; + +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump; + +static void +sigillhandler(int sig) +{ + if (canjump) { + canjump = 0; + siglongjmp(jmpbuf, 1); /* back into fz_cpudetect(); does not return */ + } + + signal(sig, SIG_DFL); /* not probing: hand the fault to the default action */ + raise(sig); +} + +static void +dumpflags(void) +{ + unsigned f = fz_cpuflags; + int i, n; + + fputs("detected cpu features: ", stdout); + n = 0; + for (i = 0; i < sizeof(features) / sizeof(featuretest); i++) + { + if (f & features[i].flag) + { + if (n ++) fputc(' ', stdout); /* separate names after the first */ + fputs(features[i].name, stdout); + } + } + if (!n) + fputs("none", stdout); + fputc('\n', stdout); +} + +/* called by runtime before main()... + * TODO: + * CPUACCEL=0 ./x11pdf disables detection + * CPUACCEL='mmx sse' enables only mmx and sse + * not set enables everything + */ +void fz_cpudetect(void) +{ + static int hasrun = 0; + + unsigned flags = 0; + int i; + void (*oldhandler)(int) = SIG_ERR; /* SIG_ERR = nothing saved yet (SIG_DFL may compare equal to NULL) */ + void (*tmphandler)(int); + + if (hasrun) + return; + hasrun = 1; + + for (i = 0; i < sizeof(features) / sizeof(featuretest); i++) + { + canjump = 0; + + tmphandler = signal(SIGILL, sigillhandler); + if (oldhandler == SIG_ERR) + oldhandler = tmphandler; + + if (sigsetjmp(jmpbuf, 1)) + { + /* test failed - disable feature */ + flags &= ~features[i].flag; + continue; + } + + canjump = 1; + + features[i].test(); + + canjump = 0; /* if we got here the test succeeded */ + flags |= features[i].flag; + } + + if (oldhandler != SIG_ERR) /* restore previous signal handler */ + signal(SIGILL, oldhandler); + + fz_cpuflags = flags; + + dumpflags(); +} + +static __attribute__((constructor)) void fzcpudetect(void) +{ + fz_cpudetect(); +} + +#ifdef TEST +#include <stdio.h> + +/* compile: gcc -DARCH_SPARC -DTEST cpudep.c -o cpudep */ +int +main(int n, char **a) +{ + dumpflags(); + return 0; +} +#endif + diff --git a/include/fitz.h b/include/fitz.h index 
4aa56011..8c38652f 100644 --- a/include/fitz.h +++ b/include/fitz.h @@ -4,6 +4,7 @@ #define _FITZ_H_ #include "fitz/sysdep.h" +#include "fitz/cpudep.h" #include "fitz/base.h" #include "fitz/math.h" diff --git a/include/fitz/cpudep.h b/include/fitz/cpudep.h new file mode 100644 index 00000000..261c3c79 --- /dev/null +++ b/include/fitz/cpudep.h @@ -0,0 +1,26 @@ +#if defined(ARCH_X86) || defined(ARCH_X86_64) +# define HAVE_CPUDEP +# define HAVE_MMX (1<<0) +# define HAVE_MMXEXT (1<<1) +# define HAVE_SSE (1<<2) +# define HAVE_SSE2 (1<<3) +# define HAVE_SSE3 (1<<4) +# define HAVE_3DNOW (1<<5) +# define HAVE_AMD64 (1<<6) + +#elif defined (ARCH_PPC) +# define HAVE_CPUDEP +# define HAVE_ALTIVEC (1<<7) + +#elif defined (ARCH_SPARC) +# define HAVE_CPUDEP +# define HAVE_VIS (1<<8) + +#endif + +/* call this before using fitz; repeated calls return without probing again */ +extern void fz_cpudetect(void); + +/* bitmask of the HAVE_* flags above, filled in by fz_cpudetect(); treat as constant! */ +extern unsigned fz_cpuflags; + diff --git a/include/fitz/render.h b/include/fitz/render.h index e2435382..d951b3da 100644 --- a/include/fitz/render.h +++ b/include/fitz/render.h @@ -3,6 +3,7 @@ typedef struct fz_rastfuncs_s fz_rastfuncs; #define FZ_BYTE unsigned char +/* TODO: use 'restrict' on pointers - they never alias, do they? 
*/ #define FZ_PSRC \ unsigned char *src, int srcw, int srch #define FZ_PDST \ @@ -57,7 +58,8 @@ struct fz_renderer_s int flag; }; -void fz_defaultrastfuncs(fz_rastfuncs *); +extern void fz_loadrastfuncs(fz_rastfuncs *); +extern void fz_accelrastfuncs(fz_rastfuncs *); fz_error *fz_newrenderer(fz_renderer **gcp, fz_colorspace *pcm, int maskonly, int gcmem); void fz_droprenderer(fz_renderer *gc); diff --git a/render/rastport.c b/render/rastport.c index a6dedf05..d329b9e2 100644 --- a/render/rastport.c +++ b/render/rastport.c @@ -442,8 +442,11 @@ static fz_rastfuncs deftab = }; void -fz_defaultrastfuncs(fz_rastfuncs *tab) +fz_loadrastfuncs(fz_rastfuncs *tab) { *tab = deftab; +#ifdef HAVE_CPUDEP + fz_accelrastfuncs(tab); +#endif } diff --git a/render/rastppc.c b/render/rastppc.c new file mode 100644 index 00000000..f26e5b66 --- /dev/null +++ b/render/rastppc.c @@ -0,0 +1,21 @@ +/* +PowerPC specific render optims live here +*/ +#include <fitz.h> + +#ifdef HAVE_ALTIVEC + +#endif /* HAVE_ALTIVEC */ + +#if defined (ARCH_PPC) +void +fz_accelrastfuncs(fz_rastfuncs *tab) +{ +# ifdef HAVE_ALTIVEC + if (fz_cpuflags & HAVE_ALTIVEC) + { + } +# endif +} +#endif + diff --git a/render/rastsparc.c b/render/rastsparc.c new file mode 100644 index 00000000..c18db9a5 --- /dev/null +++ b/render/rastsparc.c @@ -0,0 +1,21 @@ +/* +SPARC specific render optims live here +*/ +#include <fitz.h> + +#ifdef HAVE_VIS + +#endif + +#if defined (ARCH_SPARC) +void +fz_accelrastfuncs(fz_rastfuncs *tab) +{ +# ifdef HAVE_VIS + if (fz_cpuflags & HAVE_VIS) + { + } +# endif +} +#endif + diff --git a/render/rastx86.c b/render/rastx86.c new file mode 100644 index 00000000..79020fb1 --- /dev/null +++ b/render/rastx86.c @@ -0,0 +1,160 @@ +/* +x86 specific render optims live here +*/ +#include <fitz.h> + +/* always surround cpu specific code with HAVE_XXX */ +#ifdef HAVE_MMX + +/* -mmmx for gcc >= 3.4 enables the mmx intrinsic functions, icc and VC + shouldn't require anything */ +#include <mmintrin.h> + +static 
inline unsigned +getargb(unsigned *s, int w, int h, int u, int v) +{ + if (u < 0 || u >= w) return 0; + if (v < 0 || v >= h) return 0; + return s[w * v + u]; +} + +/* bilinear sample of src blended over dst, MMX version; this code has not been tested since refactoring */ +static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM) +{ + /* since mmx does not have an unsigned multiply instruction we use + 17.15 fixed point */ + u0 <<= 1; v0 <<= 1; + fa <<= 1; fb <<= 1; + fc <<= 1; fd <<= 1; + + while (h--) + { + unsigned *s = (unsigned *)src; + unsigned *d = (unsigned *)dst0; + int u = u0; + int v = v0; + int w = w0; + + while (w--) + { + int iu = u >> 17; + int iv = v >> 17; /* row index comes from v (was mistakenly taken from u) */ + + int fu = u & 0x7fff; + int fv = v & 0x7fff; /* NOTE(review): keeps the low 15 bits while iu/iv drop 17 - confirm the fixed-point layout */ + + int atedge = + iu < 0 | iu >= (srcw - 1) | + iv < 0 | iv >= (srch - 1); + + __m64 ms0s1; + __m64 ms2s3; + + if (atedge) + { + unsigned s0, s1, s2, s3; + + /* edge cases use scalar loads */ + s0 = getargb(s, srcw, srch, iu + 0, iv + 0); + s1 = getargb(s, srcw, srch, iu + 1, iv + 0); + s2 = getargb(s, srcw, srch, iu + 0, iv + 1); + s3 = getargb(s, srcw, srch, iu + 1, iv + 1); + + /* move to mmx registers: first pixel in the low half, same as the memory loads below */ + ms0s1 = _mm_set_pi32(s1, s0); + ms2s3 = _mm_set_pi32(s3, s2); + } + else + { + __m64 *m0s = (__m64*)(s + srcw * (iv + 0) + iu); + __m64 *m2s = (__m64*)(s + srcw * (iv + 1) + iu); + + /* faster vector loads for interior */ + ms0s1 = *m0s; + ms2s3 = *m2s; + } + + /* unpack src into 4x16bit vectors (low half holds the first pixel of each pair) */ + __m64 mzero = _mm_setzero_si64(); + __m64 ms0 = _mm_unpacklo_pi8(ms0s1, mzero); + __m64 ms1 = _mm_unpackhi_pi8(ms0s1, mzero); + __m64 ms2 = _mm_unpacklo_pi8(ms2s3, mzero); + __m64 ms3 = _mm_unpackhi_pi8(ms2s3, mzero); + + /* lerp fu */ + + __m64 mfu = _mm_set1_pi16(fu); + + /* t2 = (s1 - s0) * fu + s0 */ + __m64 t0 = _mm_sub_pi16(ms1, ms0); + __m64 t1 = _mm_mulhi_pi16(t0, mfu); /* NOTE(review): mulhi is (t0 * fu) >> 16 - confirm the intended weight scaling */ + __m64 t2 = _mm_add_pi16(t1, ms0); + + /* t3 = (s3 - s2) * fu + s2 */ + __m64 t3 = _mm_sub_pi16(ms3, ms2); + __m64 t4 = _mm_mulhi_pi16(t3, mfu); + __m64 t5 = _mm_add_pi16(t4, ms2); + + /* lerp fv */ + + __m64 mfv = _mm_set1_pi16(fv); 
+ + /* t8 = (t5 - t2) * fv + t2 */ + __m64 t6 = _mm_sub_pi16(t5, t2); + __m64 t7 = _mm_mulhi_pi16(t6, mfv); + __m64 t8 = _mm_add_pi16(t7, t2); + + /* load and prepare dst */ + __m64 d0 = _mm_cvtsi32_si64(*d); + + __m64 d1 = _mm_unpacklo_pi8(d0, mzero); + + /* get src alpha */ + __m64 m256 = _mm_set1_pi16(256); + __m64 malphamask = _mm_cvtsi32_si64(0xff); + + /* splat alpha TODO: better way? */ + __m64 a0001 = _mm_and_si64(malphamask, t8); + __m64 a0010 = _mm_slli_si64(a0001, 16); + __m64 a0011 = _mm_or_si64(a0001, a0010); + __m64 a1111 = _mm_unpacklo_pi16(a0011, a0011); + /* 255+1 - sa */ + __m64 sna = _mm_sub_pi16(m256, a1111); + + /* blend src with dst */ + __m64 d2 = _mm_mullo_pi16(d1, sna); + __m64 d3 = _mm_srli_pi16(d2, 8); + __m64 d4 = _mm_add_pi16(t8, d3); + + /* pack and store new dst */ + __m64 d5 = _mm_packs_pu16(d4, mzero); + + *d++ = _mm_cvtsi64_si32(d5); + + u += fa; + v += fb; + } + + dst0 += dstw; + u0 += fc; + v0 += fd; + } + + _mm_empty(); +} + +#endif /* HAVE_MMX */ + +#if defined (ARCH_X86) || defined(ARCH_X86_64) +void +fz_accelrastfuncs(fz_rastfuncs *tab) +{ +# ifdef HAVE_MMX + if (fz_cpuflags & HAVE_MMX) + { + tab->img_4o4 = img_4o4mmx; + } +# endif +} +#endif + diff --git a/render/render.c b/render/render.c index b72f069e..7fac8322 100644 --- a/render/render.c +++ b/render/render.c @@ -37,7 +37,7 @@ fz_newrenderer(fz_renderer **gcp, fz_colorspace *pcm, int maskonly, int gcmem) if (error) goto cleanup; - fz_defaultrastfuncs(&gc->rast); + fz_loadrastfuncs(&gc->rast); gc->dest = nil; gc->over = nil; diff --git a/render/scale.c b/render/scale.c index 6b06dcfa..461f5100 100644 --- a/render/scale.c +++ b/render/scale.c @@ -1,5 +1,8 @@ #include <fitz.h> +typedef void (*rowfunc)(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom); +typedef void (*colfunc)(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom); + static void scalerow(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) { @@ -33,6 +36,66 
@@ scalerow(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) } static void +scalerow1(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, left; + int sum; + + left = 0; + sum = 0; + + for (x = 0; x < w; x++) + { + sum += *src++; + if (++left == denom) + { + left = 0; + *dst++ = sum / denom; + sum = 0; + } + } + + /* left overs: average the final partial group of 'left' samples */ + if (left) + { + *dst++ = sum / left; + } +} + +static void +scalerow2(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, left; + int sum0, sum1; + + left = 0; + sum0 = 0; + sum1 = 0; + + for (x = 0; x < w; x++) + { + sum0 += *src++; + sum1 += *src++; + if (++left == denom) + { + left = 0; + *dst++ = sum0 / denom; + *dst++ = sum1 / denom; + sum0 = 0; + sum1 = 0; + } + } + + /* left overs: average the final partial group of 'left' samples */ + if (left) + { + *dst++ = sum0 / left; + *dst++ = sum1 / left; + } +} + + static void scalecols(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) { int x, y, k; @@ -53,6 +116,45 @@ scalecols(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) } } +static void +scalecols1(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, y; + unsigned char *s; + int sum; + + for (x = 0; x < w; x++) + { + s = src + x; + sum = 0; + for (y = 0; y < denom; y++) + sum += s[y * w]; + *dst++ = sum / denom; + } +} + +static void +scalecols2(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom) +{ + int x, y; + unsigned char *s; + int sum0, sum1; + + for (x = 0; x < w; x++) + { + s = src + (x * 2); + sum0 = 0; + sum1 = 0; + for (y = 0; y < denom; y++) + { + sum0 += s[y * w * 2 + 0]; + sum1 += s[y * w * 2 + 1]; + } + *dst++ = sum0 / denom; + *dst++ = sum1 / denom; + } +} + fz_error * fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom) { @@ -61,6 +163,8 @@ fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom) unsigned char *buf; int y, iy, oy; int ow, oh, n; + rowfunc 
rowfn; + colfunc colfn; ow = (src->w + xdenom - 1) / xdenom; oh = (src->h + ydenom - 1) / ydenom; @@ -77,23 +181,39 @@ fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom) return error; } + switch (n) + { + case 1: + rowfn = scalerow1; + colfn = scalecols1; + break; + case 2: + rowfn = scalerow2; + colfn = scalecols2; + break; + default: + rowfn = scalerow; + colfn = scalecols; + break; + } + for (y = 0, oy = 0; y < (src->h / ydenom) * ydenom; y += ydenom, oy++) { for (iy = 0; iy < ydenom; iy++) - scalerow(src->samples + (y + iy) * src->w * n, + rowfn(src->samples + (y + iy) * src->w * n, buf + iy * ow * n, src->w, n, xdenom); - scalecols(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); + colfn(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); } ydenom = src->h - y; if (ydenom) { for (iy = 0; iy < ydenom; iy++) - scalerow(src->samples + (y + iy) * src->w * n, + rowfn(src->samples + (y + iy) * src->w * n, buf + iy * ow * n, src->w, n, xdenom); - scalecols(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); + colfn(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom); } fz_free(buf); diff --git a/test/x11pdf.c b/test/x11pdf.c index b02b2c58..46e95632 100644 --- a/test/x11pdf.c +++ b/test/x11pdf.c @@ -362,6 +362,8 @@ int main(int argc, char **argv) if (argc - optind == 0) usage(); + fz_cpudetect(); + filename = argv[optind++]; xopen(); |