summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Jamfile4
-rw-r--r--base/cpudep.c195
-rw-r--r--include/fitz.h1
-rw-r--r--include/fitz/cpudep.h26
-rw-r--r--include/fitz/render.h4
-rw-r--r--render/rastport.c5
-rw-r--r--render/rastppc.c21
-rw-r--r--render/rastsparc.c21
-rw-r--r--render/rastx86.c160
-rw-r--r--render/render.c2
-rw-r--r--render/scale.c128
-rw-r--r--test/x11pdf.c2
12 files changed, 562 insertions, 7 deletions
diff --git a/Jamfile b/Jamfile
index ed7718fb..3edd7771 100644
--- a/Jamfile
+++ b/Jamfile
@@ -26,6 +26,7 @@ Library libfitz :
base/rect.c
base/matrix.c
base/hash.c
+ base/cpudep.c
# object
object/simple.c
@@ -92,6 +93,9 @@ Library libfitz :
render/scale.c
render/rastshade.c
render/rastport.c
+ render/rastppc.c
+ render/rastsparc.c
+ render/rastx86.c
render/render.c
;
diff --git a/base/cpudep.c b/base/cpudep.c
new file mode 100644
index 00000000..0301213d
--- /dev/null
+++ b/base/cpudep.c
@@ -0,0 +1,195 @@
+/*
+run-time cpu feature detection code
+mm, alphabet soup...
+
+Glenn Kennard <d98gk@efd.lth.se>
+*/
+
+#include <fitz.h>
+
+typedef struct {
+ void (*test)(void); /* probe routine: executes one instruction belonging to the feature */
+ const unsigned flag; /* HAVE_xxx bit stored in fz_cpuflags when the probe returns normally */
+ const char *name; /* label printed by dumpflags() */
+} featuretest;
+
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+/*
+ * MMX probe: executes one MMX instruction, which raises SIGILL on a cpu
+ * without MMX. The MMX registers alias the x87 register stack, so the
+ * probe ends with emms to leave the FPU usable for the floating point
+ * code that runs afterwards.
+ */
+static void mmx(void)
+{ __asm__ ("pand %mm0, %mm0\n\temms\n\t"); }
+
+/*
+ * 3DNow! probe: pavgusb raises SIGILL where 3DNow! is missing. It works on
+ * the MMX registers, so emms follows to release the x87 state again.
+ */
+static void m3dnow(void)
+{ __asm__ ("pavgusb %mm0, %mm0\n\temms\n\t"); }
+
+/*
+ * MMX extensions probe (aka Extended 3DNow!): pmaxsw on MMX registers raises
+ * SIGILL where the extension is missing; emms releases the x87 state again.
+ */
+static void mmxext(void)
+{ __asm__ ("pmaxsw %mm0, %mm0\n\temms\n\t"); }
+
+/* SSE probe: executes a single andps; the cpu raises SIGILL if SSE is absent */
+static void
+sse(void)
+{
+	__asm__ ("andps %xmm0, %xmm0\n\t");
+}
+
+/* SSE2 probe: executes a single andpd; the cpu raises SIGILL if SSE2 is absent */
+static void
+sse2(void)
+{
+	__asm__ ("andpd %xmm0, %xmm0\n\t");
+}
+
+/* SSE3 probe: executes a single haddps; xmm0 is declared as clobbered */
+static void
+sse3(void)
+{
+	__asm__ (
+		"haddps %%xmm0, %%xmm0\n\t"
+		: /* no outputs */
+		: /* no inputs */
+		: "%xmm0");
+}
+
+#ifdef ARCH_X86_64
+/* 64-bit probe: executes an instruction on the 64-bit register rax */
+static void
+amd64(void)
+{
+	__asm__ ("and %rax, %rax\n\t");
+}
+#endif
+
+
+/* x86 probe table: one entry per feature, visited in order by fz_cpudetect() */
+static const featuretest features[] = {
+	{ .test = mmx,    .flag = HAVE_MMX,    .name = "mmx" },
+	{ .test = m3dnow, .flag = HAVE_3DNOW,  .name = "3dnow" },
+	{ .test = mmxext, .flag = HAVE_MMXEXT, .name = "mmxext" },
+	{ .test = sse,    .flag = HAVE_SSE,    .name = "sse" },
+	{ .test = sse2,   .flag = HAVE_SSE2,   .name = "sse2" },
+	{ .test = sse3,   .flag = HAVE_SSE3,   .name = "sse3" },
+#ifdef ARCH_X86_64
+	{ .test = amd64,  .flag = HAVE_AMD64,  .name = "amd64" }
+#endif
+};
+
+#endif
+
+
+#if defined(ARCH_SPARC)
+/*
+ * VIS probe: executes a single fand.
+ * The assembler must be given -xarch=v8plusa (v9a for 64 bit binaries).
+ */
+static void
+vis(void)
+{
+	__asm__ ("fand %f8, %f8, %f8\n\t");
+}
+
+/* SPARC probe table */
+static const featuretest features[] = {
+	{ .test = vis, .flag = HAVE_VIS, .name = "vis" }
+};
+
+#endif
+
+
+#if defined(ARCH_PPC)
+
+/* AltiVec probe: executes a single vand */
+static void
+altivec(void)
+{
+	__asm__ ("vand v0, v0, v0\n\t");
+}
+
+
+/* PowerPC probe table */
+static const featuretest features[] = {
+	{ .test = altivec, .flag = HAVE_ALTIVEC, .name = "altivec" },
+};
+
+#endif
+
+#ifndef HAVE_CPUDEP
+static const featuretest features[] = {}; /* no probes for this architecture; NOTE(review): an empty initializer list is a compiler extension before C23 */
+#endif
+
+#include <signal.h> /* signal/sigaction */
+#include <setjmp.h> /* sigsetjmp/siglongjmp */
+
+/* global run-time constant */
+unsigned fz_cpuflags = 0;
+
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump;
+
+/*
+ * SIGILL handler installed while the feature probes run.
+ *
+ * With canjump set, control returns to the sigsetjmp() point in
+ * fz_cpudetect() with a non-zero value, which marks the probed feature as
+ * missing. With canjump clear the signal did not come from a probe: the
+ * default disposition is restored and the signal is raised again.
+ */
+static void
+sigillhandler(int sig)
+{
+	if (!canjump) {
+		signal(sig, SIG_DFL);
+		raise(sig);
+		/*
+		 * raise() may return, e.g. when sig is still blocked while this
+		 * handler runs. jmpbuf is not armed on this path, so jumping
+		 * through it must be avoided.
+		 */
+		return;
+	}
+
+	canjump = 0;
+	siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Print the names of all features whose bit is set in fz_cpuflags to
+ * stdout on a single line, separated by spaces, or "none" when no bit
+ * from the table is set.
+ */
+static void
+dumpflags(void)
+{
+	unsigned f = fz_cpuflags;
+	unsigned i;
+	int n = 0;
+
+	fputs("detected cpu features:", stdout);
+	for (i = 0; i < sizeof(features) / sizeof(featuretest); i++)
+	{
+		if (f & features[i].flag)
+		{
+			/* separator first, so adjacent names do not run together */
+			fputc(' ', stdout);
+			fputs(features[i].name, stdout);
+			n++;
+		}
+	}
+	if (!n)
+		fputs(" none", stdout);
+	fputc('\n', stdout);
+}
+
+/* called by runtime before main()...
+ * TODO:
+ * CPUACCEL=0 ./x11pdf disables detection
+ * CPUACCEL='mmx sse' enables only mmx and sse
+ * not set enables everything
+ *
+ * Runs every probe in the features table with a temporary SIGILL handler
+ * installed. A probe that returns normally sets its flag; a probe that
+ * raises SIGILL leaves it clear. The result is stored in fz_cpuflags and
+ * printed. Only the first call does any work.
+ */
+void fz_cpudetect(void)
+{
+	static int hasrun = 0;
+
+	unsigned flags = 0;
+	unsigned i;
+	void (*oldhandler)(int);
+
+	if (hasrun)
+		return;
+	hasrun = 1;
+
+	/*
+	 * Save the previous disposition exactly once. It cannot be recognised
+	 * by comparing against NULL later on, because SIG_DFL may itself be a
+	 * null pointer.
+	 */
+	oldhandler = signal(SIGILL, sigillhandler);
+
+	for (i = 0; i < sizeof(features) / sizeof(featuretest); i++)
+	{
+		canjump = 0;
+
+		/* re-arm: some signal() implementations reset the disposition
+		 * to SIG_DFL once a signal has been delivered */
+		signal(SIGILL, sigillhandler);
+
+		if (sigsetjmp(jmpbuf, 1))
+		{
+			/* test failed - disable feature */
+			flags &= ~features[i].flag;
+			continue;
+		}
+
+		canjump = 1;
+
+		features[i].test();
+
+		/* if we got here the test succeeded */
+		flags |= features[i].flag;
+	}
+
+	canjump = 0;
+
+	/* restore previous signal handler */
+	if (oldhandler != SIG_ERR)
+		signal(SIGILL, oldhandler);
+
+	fz_cpuflags = flags;
+
+	dumpflags();
+}
+
+static __attribute__((constructor)) void fzcpudetect(void) /* constructor attribute: the compiler arranges for this to run before main() */
+{
+ fz_cpudetect();
+}
+
+#ifdef TEST
+#include <stdio.h>
+
+/* compile: gcc -DARCH_SPARC -DTEST cpudep.c -o cpudep */
+int
+main(int n, char **a) /* test driver; both arguments are unused */
+{
+ dumpflags(); /* prints the features recorded in fz_cpuflags */
+ return 0;
+}
+#endif
+
diff --git a/include/fitz.h b/include/fitz.h
index 4aa56011..8c38652f 100644
--- a/include/fitz.h
+++ b/include/fitz.h
@@ -4,6 +4,7 @@
#define _FITZ_H_
#include "fitz/sysdep.h"
+#include "fitz/cpudep.h"
#include "fitz/base.h"
#include "fitz/math.h"
diff --git a/include/fitz/cpudep.h b/include/fitz/cpudep.h
new file mode 100644
index 00000000..261c3c79
--- /dev/null
+++ b/include/fitz/cpudep.h
@@ -0,0 +1,26 @@
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+# define HAVE_CPUDEP
+# define HAVE_MMX (1<<0)
+# define HAVE_MMXEXT (1<<1)
+# define HAVE_SSE (1<<2)
+# define HAVE_SSE2 (1<<3)
+# define HAVE_SSE3 (1<<4)
+# define HAVE_3DNOW (1<<5)
+# define HAVE_AMD64 (1<<6)
+
+#elif defined (ARCH_PPC)
+# define HAVE_CPUDEP
+# define HAVE_ALTIVEC (1<<7)
+
+#elif defined (ARCH_SPARC)
+# define HAVE_CPUDEP
+# define HAVE_VIS (1<<8)
+
+#endif
+
+/* call this before using fitz */
+extern void fz_cpudetect(void);
+
+/* treat as constant! */
+extern unsigned fz_cpuflags;
+
diff --git a/include/fitz/render.h b/include/fitz/render.h
index e2435382..d951b3da 100644
--- a/include/fitz/render.h
+++ b/include/fitz/render.h
@@ -3,6 +3,7 @@ typedef struct fz_rastfuncs_s fz_rastfuncs;
#define FZ_BYTE unsigned char
+/* TODO: use 'restrict' on pointers - they never alias, do they? */
#define FZ_PSRC \
unsigned char *src, int srcw, int srch
#define FZ_PDST \
@@ -57,7 +58,8 @@ struct fz_renderer_s
int flag;
};
-void fz_defaultrastfuncs(fz_rastfuncs *);
+extern void fz_loadrastfuncs(fz_rastfuncs *);
+extern void fz_accelrastfuncs(fz_rastfuncs *);
fz_error *fz_newrenderer(fz_renderer **gcp, fz_colorspace *pcm, int maskonly, int gcmem);
void fz_droprenderer(fz_renderer *gc);
diff --git a/render/rastport.c b/render/rastport.c
index a6dedf05..d329b9e2 100644
--- a/render/rastport.c
+++ b/render/rastport.c
@@ -442,8 +442,11 @@ static fz_rastfuncs deftab =
};
void
-fz_defaultrastfuncs(fz_rastfuncs *tab)
+fz_loadrastfuncs(fz_rastfuncs *tab)
{
*tab = deftab;
+#ifdef HAVE_CPUDEP
+ fz_accelrastfuncs(tab);
+#endif
}
diff --git a/render/rastppc.c b/render/rastppc.c
new file mode 100644
index 00000000..f26e5b66
--- /dev/null
+++ b/render/rastppc.c
@@ -0,0 +1,21 @@
+/*
+PowerPC specific render optims live here
+*/
+#include <fitz.h>
+
+#ifdef HAVE_ALTIVEC
+
+#endif /* HAVE_ALTIVEC */
+
+#if defined (ARCH_PPC)
+void
+fz_accelrastfuncs(fz_rastfuncs *tab) /* PowerPC hook; currently leaves tab unchanged */
+{
+# ifdef HAVE_ALTIVEC
+ if (fz_cpuflags & HAVE_ALTIVEC) /* run-time check of the detected cpu features */
+ { /* placeholder: no AltiVec routines are installed yet */
+ }
+# endif
+}
+#endif
+
diff --git a/render/rastsparc.c b/render/rastsparc.c
new file mode 100644
index 00000000..c18db9a5
--- /dev/null
+++ b/render/rastsparc.c
@@ -0,0 +1,21 @@
+/*
+SPARC specific render optims live here
+*/
+#include <fitz.h>
+
+#ifdef HAVE_VIS
+
+#endif
+
+#if defined (ARCH_SPARC)
+void
+fz_accelrastfuncs(fz_rastfuncs *tab) /* SPARC hook; currently leaves tab unchanged */
+{
+# ifdef HAVE_VIS
+ if (fz_cpuflags & HAVE_VIS) /* run-time check of the detected cpu features */
+ { /* placeholder: no VIS routines are installed yet */
+ }
+# endif
+}
+#endif
+
diff --git a/render/rastx86.c b/render/rastx86.c
new file mode 100644
index 00000000..79020fb1
--- /dev/null
+++ b/render/rastx86.c
@@ -0,0 +1,160 @@
+/*
+x86 specific render optims live here
+*/
+#include <fitz.h>
+
+/* always surround cpu specific code with HAVE_XXX */
+#ifdef HAVE_MMX
+
+/* -mmmx for gcc >= 3.4 enables the mmx intrinsic functions, icc and VC
+ shouldn't require anything */
+#include <mmintrin.h>
+
+/*
+ * Fetch the pixel at column u, row v from a w x h image of 32-bit pixels.
+ * Coordinates outside the image yield 0.
+ */
+static inline unsigned
+getargb(unsigned *s, int w, int h, int u, int v)
+{
+	int inside = (u >= 0 && u < w) && (v >= 0 && v < h);
+
+	return inside ? s[w * v + u] : 0;
+}
+
+/*
+ * MMX version of the img_4o4 rasterizer slot: for every destination pixel
+ * it bilinearly samples four neighbouring 32-bit source pixels and blends
+ * the result over the destination using the sample's alpha.
+ *
+ * this code has not been tested since refactoring
+ *
+ * NOTE(review): the integer part of u/v is taken as (x >> 17) while the
+ * fraction keeps only the low 15 bits (x & 0x7fff); confirm the intended
+ * fixed-point layout before relying on the interpolation weights.
+ */
+static void img_4o4mmx(FZ_PSRC, FZ_PDST, FZ_PCTM)
+{
+	/* since mmx does not have an unsigned multiply instruction we use
+	   17.15 fixed point */
+	u0 <<= 1; v0 <<= 1;
+	fa <<= 1; fb <<= 1;
+	fc <<= 1; fd <<= 1;
+
+	while (h--)
+	{
+		unsigned *s = (unsigned *)src;
+		unsigned *d = (unsigned *)dst0;
+		int u = u0;
+		int v = v0;
+		int w = w0;
+
+		while (w--)
+		{
+			int iu = u >> 17;
+			/* the row index comes from v (it was derived from u before) */
+			int iv = v >> 17;
+
+			int fu = u & 0x7fff;
+			int fv = v & 0x7fff;
+
+			int atedge =
+				iu < 0 | iu >= (srcw - 1) |
+				iv < 0 | iv >= (srch - 1);
+
+			__m64 ms0s1;
+			__m64 ms2s3;
+
+			if (atedge)
+			{
+				unsigned s0, s1, s2, s3;
+
+				/* edge cases use scalar loads */
+				s0 = getargb(s, srcw, srch, iu + 0, iv + 0);
+				s1 = getargb(s, srcw, srch, iu + 1, iv + 0);
+				s2 = getargb(s, srcw, srch, iu + 0, iv + 1);
+				s3 = getargb(s, srcw, srch, iu + 1, iv + 1);
+
+				/* move to mmx registers; the left pixel goes into the
+				   low half, matching what the 64-bit loads below
+				   produce on little-endian x86 */
+				ms0s1 = _mm_set_pi32(s1, s0);
+				ms2s3 = _mm_set_pi32(s3, s2);
+			}
+			else
+			{
+				__m64 *m0s = (__m64*)(s + srcw * (iv + 0) + iu);
+				__m64 *m2s = (__m64*)(s + srcw * (iv + 1) + iu);
+
+				/* faster vector loads for interior */
+				ms0s1 = *m0s;
+				ms2s3 = *m2s;
+			}
+
+			/* unpack src into 4x16bit vectors (low half = left pixel) */
+			__m64 mzero = _mm_setzero_si64();
+			__m64 ms0 = _mm_unpacklo_pi8(ms0s1, mzero);
+			__m64 ms1 = _mm_unpackhi_pi8(ms0s1, mzero);
+			__m64 ms2 = _mm_unpacklo_pi8(ms2s3, mzero);
+			__m64 ms3 = _mm_unpackhi_pi8(ms2s3, mzero);
+
+			/* lerp fu */
+
+			__m64 mfu = _mm_set1_pi16(fu);
+
+			/* t2 = (s1 - s0) * fu + s0 */
+			__m64 t0 = _mm_sub_pi16(ms1, ms0);
+			__m64 t1 = _mm_mulhi_pi16(t0, mfu);
+			__m64 t2 = _mm_add_pi16(t1, ms0);
+
+			/* t3 = (s3 - s2) * fu + s2 */
+			__m64 t3 = _mm_sub_pi16(ms3, ms2);
+			__m64 t4 = _mm_mulhi_pi16(t3, mfu);
+			__m64 t5 = _mm_add_pi16(t4, ms2);
+
+			/* lerp fv */
+
+			__m64 mfv = _mm_set1_pi16(fv);
+
+			/* t8 = (t5 - t2) * fv + t2 */
+			__m64 t6 = _mm_sub_pi16(t5, t2);
+			__m64 t7 = _mm_mulhi_pi16(t6, mfv);
+			__m64 t8 = _mm_add_pi16(t7, t2);
+
+			/* load and prepare dst */
+			__m64 d0 = _mm_cvtsi32_si64(*d);
+
+			__m64 d1 = _mm_unpacklo_pi8(d0, mzero);
+
+			/* get src alpha */
+			__m64 m256 = _mm_set1_pi16(256);
+			__m64 malphamask = _mm_cvtsi32_si64(0xff);
+
+			/* splat alpha TODO: better way? */
+			__m64 a0001 = _mm_and_si64(malphamask, t8);
+			__m64 a0010 = _mm_slli_si64(a0001, 16);
+			__m64 a0011 = _mm_or_si64(a0001, a0010);
+			__m64 a1111 = _mm_unpacklo_pi16(a0011, a0011);
+			/* 255+1 - sa */
+			__m64 sna = _mm_sub_pi16(m256, a1111);
+
+			/* blend src with dst */
+			__m64 d2 = _mm_mullo_pi16(d1, sna);
+			__m64 d3 = _mm_srli_pi16(d2, 8);
+			__m64 d4 = _mm_add_pi16(t8, d3);
+
+			/* pack and store new dst */
+			__m64 d5 = _mm_packs_pu16(d4, mzero);
+
+			*d++ = _mm_cvtsi64_si32(d5);
+
+			u += fa;
+			v += fb;
+		}
+
+		dst0 += dstw;
+		u0 += fc;
+		v0 += fd;
+	}
+
+	/* leave mmx state so later floating point code works */
+	_mm_empty();
+}
+
+#endif /* HAVE_MMX */
+
+#if defined (ARCH_X86) || defined(ARCH_X86_64)
+void
+fz_accelrastfuncs(fz_rastfuncs *tab) /* x86 hook: replaces entries of tab with routines the running cpu supports */
+{
+# ifdef HAVE_MMX
+ if (fz_cpuflags & HAVE_MMX) /* run-time check of the detected cpu features */
+ {
+ tab->img_4o4 = img_4o4mmx;
+ }
+# endif
+}
+#endif
+
diff --git a/render/render.c b/render/render.c
index b72f069e..7fac8322 100644
--- a/render/render.c
+++ b/render/render.c
@@ -37,7 +37,7 @@ fz_newrenderer(fz_renderer **gcp, fz_colorspace *pcm, int maskonly, int gcmem)
if (error)
goto cleanup;
- fz_defaultrastfuncs(&gc->rast);
+ fz_loadrastfuncs(&gc->rast);
gc->dest = nil;
gc->over = nil;
diff --git a/render/scale.c b/render/scale.c
index 6b06dcfa..461f5100 100644
--- a/render/scale.c
+++ b/render/scale.c
@@ -1,5 +1,8 @@
#include <fitz.h>
+typedef void (*rowfunc)(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom);
+typedef void (*colfunc)(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom);
+
static void
scalerow(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
{
@@ -33,6 +36,66 @@ scalerow(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
}
static void
+scalerow1(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
+{
+	/*
+	 * Single-component row reduction: every run of denom source samples is
+	 * replaced by its average; a shorter trailing run is averaged over its
+	 * own length. ncomp is not used.
+	 */
+	int acc = 0;
+	int count = 0;
+	int remaining;
+
+	for (remaining = w; remaining > 0; remaining--)
+	{
+		acc += *src++;
+		count++;
+		if (count == denom)
+		{
+			*dst++ = acc / denom;
+			acc = 0;
+			count = 0;
+		}
+	}
+
+	/* left overs */
+	if (count != 0)
+		*dst = acc / count;
+}
+
+static void
+scalerow2(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
+{
+	/*
+	 * Two-component row reduction: each run of denom source pixels is
+	 * replaced by the per-component average; a shorter trailing run is
+	 * averaged over its own length. ncomp is not used.
+	 */
+	int acc[2] = { 0, 0 };
+	int count = 0;
+	int remaining;
+
+	for (remaining = w; remaining > 0; remaining--)
+	{
+		acc[0] += src[0];
+		acc[1] += src[1];
+		src += 2;
+		count++;
+		if (count == denom)
+		{
+			dst[0] = acc[0] / denom;
+			dst[1] = acc[1] / denom;
+			dst += 2;
+			acc[0] = 0;
+			acc[1] = 0;
+			count = 0;
+		}
+	}
+
+	/* left overs */
+	if (count != 0)
+	{
+		dst[0] = acc[0] / count;
+		dst[1] = acc[1] / count;
+	}
+}
+
+
+static void
scalecols(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
{
int x, y, k;
@@ -53,6 +116,45 @@ scalecols(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
}
}
+static void
+scalecols1(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
+{
+	/*
+	 * Single-component column reduction: src holds denom rows of w
+	 * samples; each output sample is the average of one column.
+	 * ncomp is not used.
+	 */
+	int col, row;
+
+	for (col = 0; col < w; col++)
+	{
+		int total = 0;
+
+		for (row = 0; row < denom; row++)
+			total += src[row * w + col];
+		dst[col] = total / denom;
+	}
+}
+
+static void
+scalecols2(unsigned char *src, unsigned char *dst, int w, int ncomp, int denom)
+{
+	/*
+	 * Two-component column reduction: src holds denom rows of w pixels
+	 * with two samples each; each output pixel is the per-component
+	 * average of one column. ncomp is not used.
+	 */
+	int col, row;
+
+	for (col = 0; col < w; col++)
+	{
+		unsigned char *top = src + (col * 2);
+		int total0 = 0;
+		int total1 = 0;
+
+		for (row = 0; row < denom; row++)
+		{
+			total0 += top[row * w * 2 + 0];
+			total1 += top[row * w * 2 + 1];
+		}
+		dst[col * 2 + 0] = total0 / denom;
+		dst[col * 2 + 1] = total1 / denom;
+	}
+}
+
fz_error *
fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom)
{
@@ -61,6 +163,8 @@ fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom)
unsigned char *buf;
int y, iy, oy;
int ow, oh, n;
+ rowfunc rowfunc;
+ colfunc colfunc;
ow = (src->w + xdenom - 1) / xdenom;
oh = (src->h + ydenom - 1) / ydenom;
@@ -77,23 +181,39 @@ fz_scalepixmap(fz_pixmap **dstp, fz_pixmap *src, int xdenom, int ydenom)
return error;
}
+ switch (n)
+ {
+ case 1:
+ rowfunc = scalerow1;
+ colfunc = scalecols1;
+ break;
+ case 2:
+ rowfunc = scalerow2;
+ colfunc = scalecols2;
+ break;
+ default:
+ rowfunc = scalerow;
+ colfunc = scalecols;
+ break;
+ }
+
for (y = 0, oy = 0; y < (src->h / ydenom) * ydenom; y += ydenom, oy++)
{
for (iy = 0; iy < ydenom; iy++)
- scalerow(src->samples + (y + iy) * src->w * n,
+ rowfunc(src->samples + (y + iy) * src->w * n,
buf + iy * ow * n,
src->w, n, xdenom);
- scalecols(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom);
+ colfunc(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom);
}
ydenom = src->h - y;
if (ydenom)
{
for (iy = 0; iy < ydenom; iy++)
- scalerow(src->samples + (y + iy) * src->w * n,
+ rowfunc(src->samples + (y + iy) * src->w * n,
buf + iy * ow * n,
src->w, n, xdenom);
- scalecols(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom);
+ colfunc(buf, dst->samples + oy * dst->w * n, dst->w, n, ydenom);
}
fz_free(buf);
diff --git a/test/x11pdf.c b/test/x11pdf.c
index b02b2c58..46e95632 100644
--- a/test/x11pdf.c
+++ b/test/x11pdf.c
@@ -362,6 +362,8 @@ int main(int argc, char **argv)
if (argc - optind == 0)
usage();
+ fz_cpudetect();
+
filename = argv[optind++];
xopen();