diff options
Diffstat (limited to 'third_party/libpng16/contrib')
-rw-r--r-- | third_party/libpng16/contrib/intel/INSTALL | 158 | ||||
-rw-r--r-- | third_party/libpng16/contrib/intel/filter_sse2_intrinsics.c | 379 | ||||
-rw-r--r-- | third_party/libpng16/contrib/intel/intel_init.c | 54 | ||||
-rw-r--r-- | third_party/libpng16/contrib/intel/intel_sse.patch | 164 |
4 files changed, 0 insertions, 755 deletions
diff --git a/third_party/libpng16/contrib/intel/INSTALL b/third_party/libpng16/contrib/intel/INSTALL deleted file mode 100644 index cd5cdd94eb..0000000000 --- a/third_party/libpng16/contrib/intel/INSTALL +++ /dev/null @@ -1,158 +0,0 @@ -Enabling SSE support - -Copyright (c) 2016 Google, Inc. -Written by Mike Klein, Matt Sarett - -This INSTALL file written by Glenn Randers-Pehrson, 2016. - -If you have moved intel_init.c and filter_sse2_intrinsics.c to a different -directory, be sure to update the '#include "../../pngpriv.h"' line in both -files if necessary to point to the correct relative location of pngpriv.h -with respect to the new location of those files. - -To enable SSE support in libpng, follow the instructions in I, II, or III, -below: - -I. Using patched "configure" scripts: - -First, apply intel_sse.patch in your build directory. - - patch -i contrib/intel/intel_sse.patch -p1 - -Then, if you are not building in a new GIT clone, e.g., in a tar -distribution, remove any existing pre-built configure scripts: - - ./configure --enable-maintainer-mode - make maintainer-clean - ./autogen.sh --maintainer --clean - -Finally, configure libpng with -DPNG_INTEL_SSE in CPPFLAGS: - - ./autogen.sh --maintainer - CPPFLAGS="-DPNG_INTEL_SSE" ./configure [options] - make CPPFLAGS="-DPNG_INTEL_SSE" [options] - make - -II. Using a custom makefile: - -If you are using a custom makefile makefile, you will have to update it -manually to include contrib/intel/*.o in the dependencies, and to define -PNG_INTEL_SSE. - -III. Using manually updated "configure" scripts: - -If you prefer, manually edit pngpriv.h, configure.ac, and Makefile.am, -following the instructions below, then follow the instructions in -section II of INSTALL in the main libpng directory, then configure libpng -with -DPNG_INTEL_SSE in CPPFLAGS. - -1. Add the following code to configure.ac under HOST SPECIFIC OPTIONS -directly beneath the section for ARM: - ------------------cut---------------- -# INTEL -# ===== -# -# INTEL SSE (SIMD) support. - -AC_ARG_ENABLE([intel-sse], - AS_HELP_STRING([[[--enable-intel-sse]]], - [Enable Intel SSE optimizations: =no/off, yes/on:] - [no/off: disable the optimizations;] - [yes/on: enable the optimizations.] - [If not specified: determined by the compiler.]), - [case "$enableval" in - no|off) - # disable the default enabling: - AC_DEFINE([PNG_INTEL_SSE_OPT], [0], - [Disable Intel SSE optimizations]) - # Prevent inclusion of the assembler files below: - enable_intel_sse=no;; - yes|on) - AC_DEFINE([PNG_INTEL_SSE_OPT], [1], - [Enable Intel SSE optimizations]);; - *) - AC_MSG_ERROR([--enable-intel-sse=${enable_intel_sse}: invalid value]) - esac]) - -# Add Intel specific files to all builds where the host_cpu is Intel ('x86*') -# or where Intel optimizations were explicitly requested (this allows a -# fallback if a future host CPU does not match 'x86*') -AM_CONDITIONAL([PNG_INTEL_SSE], - [test "$enable_intel_sse" != 'no' && - case "$host_cpu" in - i?86|x86_64) :;; - *) test "$enable_intel_sse" != '';; - esac]) ------------------cut---------------- - -2. Add the following code to Makefile.am under HOST SPECIFIC OPTIONS -directly beneath the "if PNG_ARM_NEON ... endif" statement: - ------------------cut---------------- -if PNG_INTEL_SSE -libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\ - contrib/intel/filter_sse2_intrinsics.c -endif ------------------cut---------------- - -3. Add the following lines to pngpriv.h, following the PNG_ARM_NEON_OPT -code: - ------------------cut---------------- -#ifndef PNG_INTEL_SSE_OPT -# ifdef PNG_INTEL_SSE - /* Only check for SSE if the build configuration has been modified to - * enable SSE optimizations. This means that these optimizations will - * be off by default. See contrib/intel for more details. - */ -# if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \ - defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ - (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -# define PNG_INTEL_SSE_OPT 1 -# endif -# endif -#endif - -#if PNG_INTEL_SSE_OPT > 0 -# ifndef PNG_INTEL_SSE_IMPLEMENTATION -# if defined(__SSE4_1__) || defined(__AVX__) - /* We are not actually using AVX, but checking for AVX is the best - way we can detect SSE4.1 and SSSE3 on MSVC. - */ -# define PNG_INTEL_SSE_IMPLEMENTATION 3 -# elif defined(__SSSE3__) -# define PNG_INTEL_SSE_IMPLEMENTATION 2 -# elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ - (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -# define PNG_INTEL_SSE_IMPLEMENTATION 1 -# else -# define PNG_INTEL_SSE_IMPLEMENTATION 0 -# endif -# endif - -# if PNG_INTEL_SSE_IMPLEMENTATION > 0 -# define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_sse2 -# endif -#endif - ------------------cut---------------- - -4. Add the following lines to pngpriv.h, following the prototype for -png_read_filter_row_paeth4_neon: - ------------------cut---------------- -PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_sse2,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_sse2,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_sse2,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_sse2,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_sse2,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); - ------------------cut---------------- diff --git a/third_party/libpng16/contrib/intel/filter_sse2_intrinsics.c b/third_party/libpng16/contrib/intel/filter_sse2_intrinsics.c deleted file mode 100644 index aea3f86af5..0000000000 --- a/third_party/libpng16/contrib/intel/filter_sse2_intrinsics.c +++ /dev/null @@ -1,379 +0,0 @@ - -/* filter_sse2_intrinsics.c - SSE2 optimized filter functions - * - * Copyright (c) 2016 Google, Inc. - * Written by Mike Klein and Matt Sarett - * Derived from arm/filter_neon_intrinsics.c, which was - * Copyright (c) 2014,2016 Glenn Randers-Pehrson - * - * Last changed in libpng 1.6.22 [May 26, 2016] - * - * This code is released under the libpng license. - * For conditions of distribution and use, see the disclaimer - * and license in png.h - */ - -#include "../../pngpriv.h" - -#ifdef PNG_READ_SUPPORTED - -#if PNG_INTEL_SSE_IMPLEMENTATION > 0 - -#include <immintrin.h> - -/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). - * They're positioned like this: - * prev: c b - * row: a d - * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be - * whichever of a, b, or c is closest to p=a+b-c. - */ - -static __m128i load4(const void* p) { - return _mm_cvtsi32_si128(*(const int*)p); -} - -static void store4(void* p, __m128i v) { - *(int*)p = _mm_cvtsi128_si32(v); -} - -static __m128i load3(const void* p) { - /* We'll load 2 bytes, then 1 byte, - * then mask them together, and finally load into SSE. - */ - const png_uint_16* p01 = p; - const png_byte* p2 = (const png_byte*)(p01+1); - - png_uint_32 v012 = (png_uint_32)(*p01) - | (png_uint_32)(*p2) << 16; - return load4(&v012); -} - -static void store3(void* p, __m128i v) { - /* We'll pull from SSE as a 32-bit int, then write - * its bottom two bytes, then its third byte. - */ - png_uint_32 v012; - store4(&v012, v); - - png_uint_16* p01 = p; - png_byte* p2 = (png_byte*)(p01+1); - *p01 = v012; - *p2 = v012 >> 16; -} - -void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, - png_const_bytep prev) -{ - /* The Sub filter predicts each pixel as the previous pixel, a. - * There is no pixel to the left of the first pixel. It's encoded directly. - * That works with our main loop if we just say that left pixel was zero. - */ - png_debug(1, "in png_read_filter_row_sub3_sse2"); - __m128i a, d = _mm_setzero_si128(); - - int rb = row_info->rowbytes; - while (rb >= 4) { - a = d; d = load4(row); - d = _mm_add_epi8(d, a); - store3(row, d); - - row += 3; - rb -= 3; - } - if (rb > 0) { - a = d; d = load3(row); - d = _mm_add_epi8(d, a); - store3(row, d); - - row += 3; - rb -= 3; - } -} - -void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, - png_const_bytep prev) -{ - /* The Sub filter predicts each pixel as the previous pixel, a. - * There is no pixel to the left of the first pixel. It's encoded directly. - * That works with our main loop if we just say that left pixel was zero. - */ - png_debug(1, "in png_read_filter_row_sub4_sse2"); - __m128i a, d = _mm_setzero_si128(); - - int rb = row_info->rowbytes; - while (rb > 0) { - a = d; d = load4(row); - d = _mm_add_epi8(d, a); - store4(row, d); - - row += 4; - rb -= 4; - } -} - -void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, - png_const_bytep prev) -{ - /* The Avg filter predicts each pixel as the (truncated) average of a and b. - * There's no pixel to the left of the first pixel. Luckily, it's - * predicted to be half of the pixel above it. So again, this works - * perfectly with our loop if we make sure a starts at zero. - */ - png_debug(1, "in png_read_filter_row_avg3_sse2"); - const __m128i zero = _mm_setzero_si128(); - __m128i b; - __m128i a, d = zero; - - int rb = row_info->rowbytes; - while (rb >= 4) { - b = load4(prev); - a = d; d = load4(row ); - - /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ - __m128i avg = _mm_avg_epu8(a,b); - /* ...but we can fix it up by subtracting off 1 if it rounded up. */ - avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), - _mm_set1_epi8(1))); - d = _mm_add_epi8(d, avg); - store3(row, d); - - prev += 3; - row += 3; - rb -= 3; - } - if (rb > 0) { - b = load3(prev); - a = d; d = load3(row ); - - /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ - __m128i avg = _mm_avg_epu8(a,b); - /* ...but we can fix it up by subtracting off 1 if it rounded up. */ - avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), - _mm_set1_epi8(1))); - - d = _mm_add_epi8(d, avg); - store3(row, d); - - prev += 3; - row += 3; - rb -= 3; - } -} - -void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, - png_const_bytep prev) -{ - /* The Avg filter predicts each pixel as the (truncated) average of a and b. - * There's no pixel to the left of the first pixel. Luckily, it's - * predicted to be half of the pixel above it. So again, this works - * perfectly with our loop if we make sure a starts at zero. - */ - png_debug(1, "in png_read_filter_row_avg4_sse2"); - const __m128i zero = _mm_setzero_si128(); - __m128i b; - __m128i a, d = zero; - - int rb = row_info->rowbytes; - while (rb > 0) { - b = load4(prev); - a = d; d = load4(row ); - - /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ - __m128i avg = _mm_avg_epu8(a,b); - /* ...but we can fix it up by subtracting off 1 if it rounded up. */ - avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), - _mm_set1_epi8(1))); - - d = _mm_add_epi8(d, avg); - store4(row, d); - - prev += 4; - row += 4; - rb -= 4; - } -} - -/* Returns |x| for 16-bit lanes. */ -static __m128i abs_i16(__m128i x) { -#if PNG_INTEL_SSE_IMPLEMENTATION >= 2 - return _mm_abs_epi16(x); -#else - /* Read this all as, return x<0 ? -x : x. - * To negate two's complement, you flip all the bits then add 1. - */ - __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); - - /* Flip negative lanes. */ - x = _mm_xor_si128(x, is_negative); - - /* +1 to negative lanes, else +0. */ - x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); - return x; -#endif -} - -/* Bytewise c ? t : e. */ -static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { -#if PNG_INTEL_SSE_IMPLEMENTATION >= 3 - return _mm_blendv_epi8(e,t,c); -#else - return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); -#endif -} - -void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, - png_const_bytep prev) -{ - /* Paeth tries to predict pixel d using the pixel to the left of it, a, - * and two pixels from the previous row, b and c: - * prev: c b - * row: a d - * The Paeth function predicts d to be whichever of a, b, or c is nearest to - * p=a+b-c. - * - * The first pixel has no left context, and so uses an Up filter, p = b. - * This works naturally with our main loop's p = a+b-c if we force a and c - * to zero. - * Here we zero b and d, which become c and a respectively at the start of - * the loop. - */ - png_debug(1, "in png_read_filter_row_paeth3_sse2"); - const __m128i zero = _mm_setzero_si128(); - __m128i c, b = zero, - a, d = zero; - - int rb = row_info->rowbytes; - while (rb >= 4) { - /* It's easiest to do this math (particularly, deal with pc) with 16-bit - * intermediates. - */ - c = b; b = _mm_unpacklo_epi8(load4(prev), zero); - a = d; d = _mm_unpacklo_epi8(load4(row ), zero); - - /* (p-a) == (a+b-c - a) == (b-c) */ - __m128i pa = _mm_sub_epi16(b,c); - - /* (p-b) == (a+b-c - b) == (a-c) */ - __m128i pb = _mm_sub_epi16(a,c); - - /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ - __m128i pc = _mm_add_epi16(pa,pb); - - pa = abs_i16(pa); /* |p-a| */ - pb = abs_i16(pb); /* |p-b| */ - pc = abs_i16(pc); /* |p-c| */ - - __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); - - /* Paeth breaks ties favoring a over b over c. */ - __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, - if_then_else(_mm_cmpeq_epi16(smallest, pb), b, - c)); - - /* Note `_epi8`: we need addition to wrap modulo 255. */ - d = _mm_add_epi8(d, nearest); - store3(row, _mm_packus_epi16(d,d)); - - prev += 3; - row += 3; - rb -= 3; - } - if (rb > 0) { - /* It's easiest to do this math (particularly, deal with pc) with 16-bit - * intermediates. - */ - c = b; b = _mm_unpacklo_epi8(load3(prev), zero); - a = d; d = _mm_unpacklo_epi8(load3(row ), zero); - - /* (p-a) == (a+b-c - a) == (b-c) */ - __m128i pa = _mm_sub_epi16(b,c); - - /* (p-b) == (a+b-c - b) == (a-c) */ - __m128i pb = _mm_sub_epi16(a,c); - - /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ - __m128i pc = _mm_add_epi16(pa,pb); - - pa = abs_i16(pa); /* |p-a| */ - pb = abs_i16(pb); /* |p-b| */ - pc = abs_i16(pc); /* |p-c| */ - - __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); - - /* Paeth breaks ties favoring a over b over c. */ - __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, - if_then_else(_mm_cmpeq_epi16(smallest, pb), b, - c)); - - /* Note `_epi8`: we need addition to wrap modulo 255. */ - d = _mm_add_epi8(d, nearest); - store3(row, _mm_packus_epi16(d,d)); - - prev += 3; - row += 3; - rb -= 3; - } -} - -void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, - png_const_bytep prev) -{ - /* Paeth tries to predict pixel d using the pixel to the left of it, a, - * and two pixels from the previous row, b and c: - * prev: c b - * row: a d - * The Paeth function predicts d to be whichever of a, b, or c is nearest to - * p=a+b-c. - * - * The first pixel has no left context, and so uses an Up filter, p = b. - * This works naturally with our main loop's p = a+b-c if we force a and c - * to zero. - * Here we zero b and d, which become c and a respectively at the start of - * the loop. - */ - png_debug(1, "in png_read_filter_row_paeth4_sse2"); - const __m128i zero = _mm_setzero_si128(); - __m128i c, b = zero, - a, d = zero; - - int rb = row_info->rowbytes; - while (rb > 0) { - /* It's easiest to do this math (particularly, deal with pc) with 16-bit - * intermediates. - */ - c = b; b = _mm_unpacklo_epi8(load4(prev), zero); - a = d; d = _mm_unpacklo_epi8(load4(row ), zero); - - /* (p-a) == (a+b-c - a) == (b-c) */ - __m128i pa = _mm_sub_epi16(b,c); - - /* (p-b) == (a+b-c - b) == (a-c) */ - __m128i pb = _mm_sub_epi16(a,c); - - /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ - __m128i pc = _mm_add_epi16(pa,pb); - - pa = abs_i16(pa); /* |p-a| */ - pb = abs_i16(pb); /* |p-b| */ - pc = abs_i16(pc); /* |p-c| */ - - __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); - - /* Paeth breaks ties favoring a over b over c. */ - __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, - if_then_else(_mm_cmpeq_epi16(smallest, pb), b, - c)); - - /* Note `_epi8`: we need addition to wrap modulo 255. */ - d = _mm_add_epi8(d, nearest); - store4(row, _mm_packus_epi16(d,d)); - - prev += 4; - row += 4; - rb -= 4; - } -} - -#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ -#endif /* READ */ diff --git a/third_party/libpng16/contrib/intel/intel_init.c b/third_party/libpng16/contrib/intel/intel_init.c deleted file mode 100644 index 328e90e9af..0000000000 --- a/third_party/libpng16/contrib/intel/intel_init.c +++ /dev/null @@ -1,54 +0,0 @@ - -/* intel_init.c - SSE2 optimized filter functions - * - * Copyright (c) 2016 Google, Inc. - * Written by Mike Klein and Matt Sarett - * Derived from arm/arm_init.c, which was - * Copyright (c) 2014,2016 Glenn Randers-Pehrson - * - * Last changed in libpng 1.6.22 [May 26, 2016] - * - * This code is released under the libpng license. - * For conditions of distribution and use, see the disclaimer - * and license in png.h - */ - -#include "../../pngpriv.h" - -#ifdef PNG_READ_SUPPORTED -#if PNG_INTEL_SSE_IMPLEMENTATION > 0 - -void -png_init_filter_functions_sse2(png_structp pp, unsigned int bpp) -{ - /* The techniques used to implement each of these filters in SSE operate on - * one pixel at a time. - * So they generally speed up 3bpp images about 3x, 4bpp images about 4x. - * They can scale up to 6 and 8 bpp images and down to 2 bpp images, - * but they'd not likely have any benefit for 1bpp images. - * Most of these can be implemented using only MMX and 64-bit registers, - * but they end up a bit slower than using the equally-ubiquitous SSE2. - */ - png_debug(1, "in png_init_filter_functions_sse2"); - if (bpp == 3) - { - pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2; - pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_sse2; - pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = - png_read_filter_row_paeth3_sse2; - } - else if (bpp == 4) - { - pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_sse2; - pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_sse2; - pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = - png_read_filter_row_paeth4_sse2; - } - - /* No need optimize PNG_FILTER_VALUE_UP. The compiler should - * autovectorize. - */ -} - -#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ -#endif /* PNG_READ_SUPPORTED */ diff --git a/third_party/libpng16/contrib/intel/intel_sse.patch b/third_party/libpng16/contrib/intel/intel_sse.patch deleted file mode 100644 index d9d02bb8fc..0000000000 --- a/third_party/libpng16/contrib/intel/intel_sse.patch +++ /dev/null @@ -1,164 +0,0 @@ -diff --git libpng-1.6.22-orig/configure.ac libpng-1.6.22/configure.ac ---- libpng-1.6.22-orig/configure.ac 2016-05-25 18:59:10.000000000 -0400 -+++ libpng-1.6.22/configure.ac 2016-05-25 19:48:10.631751170 -0400 -@@ -341,16 +341,50 @@ AC_ARG_ENABLE([arm-neon], - - AM_CONDITIONAL([PNG_ARM_NEON], - [test "$enable_arm_neon" != 'no' && - case "$host_cpu" in - arm*|aarch64*) :;; - *) test "$enable_arm_neon" != '';; - esac]) - -+# INTEL -+# ===== -+# -+# INTEL SSE (SIMD) support. -+ -+AC_ARG_ENABLE([intel-sse], -+ AS_HELP_STRING([[[--enable-intel-sse]]], -+ [Enable Intel SSE optimizations: =no/off, yes/on:] -+ [no/off: disable the optimizations;] -+ [yes/on: enable the optimizations.] -+ [If not specified: determined by the compiler.]), -+ [case "$enableval" in -+ no|off) -+ # disable the default enabling: -+ AC_DEFINE([PNG_INTEL_SSE_OPT], [0], -+ [Disable Intel SSE optimizations]) -+ # Prevent inclusion of the assembler files below: -+ enable_intel_sse=no;; -+ yes|on) -+ AC_DEFINE([PNG_INTEL_SSE_OPT], [1], -+ [Enable Intel SSE optimizations]);; -+ *) -+ AC_MSG_ERROR([--enable-intel-sse=${enable_intel_sse}: invalid value]) -+ esac]) -+ -+# Add Intel specific files to all builds where the host_cpu is Intel ('x86*') -+# or where Intel optimizations were explicitly requested (this allows a -+# fallback if a future host CPU does not match 'x86*') -+AM_CONDITIONAL([PNG_INTEL_SSE], -+ [test "$enable_intel_sse" != 'no' && -+ case "$host_cpu" in -+ i?86|x86_64) :;; -+ *) test "$enable_intel_sse" != '';; -+ esac]) - AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]]) - - # Config files, substituting as above - AC_CONFIG_FILES([Makefile libpng.pc:libpng.pc.in]) - AC_CONFIG_FILES([libpng-config:libpng-config.in], - [chmod +x libpng-config]) - - AC_OUTPUT -diff --git libpng-1.6.22-orig/Makefile.am libpng-1.6.22/Makefile.am ---- libpng-1.6.22-orig/Makefile.am 2016-05-17 18:15:12.000000000 -0400 -+++ libpng-1.6.22/Makefile.am 2016-05-25 19:48:10.631751170 -0400 -@@ -89,16 +89,20 @@ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SO - pngset.c pngtrans.c pngwio.c pngwrite.c pngwtran.c pngwutil.c\ - png.h pngconf.h pngdebug.h pnginfo.h pngpriv.h pngstruct.h pngusr.dfa - - if PNG_ARM_NEON - libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += arm/arm_init.c\ - arm/filter_neon.S arm/filter_neon_intrinsics.c - endif - -+if PNG_INTEL_SSE -+libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\ -+ contrib/intel/filter_sse2_intrinsics.c -+endif - nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h - - libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_LDFLAGS = -no-undefined -export-dynamic \ - -version-number @PNGLIB_MAJOR@@PNGLIB_MINOR@:@PNGLIB_RELEASE@:0 - - if HAVE_LD_VERSION_SCRIPT - # Versioned symbols and restricted exports - if HAVE_SOLARIS_LD -diff --git libpng-1.6.22-orig/pngpriv.h libpng-1.6.22/pngpriv.h ---- libpng-1.6.22-orig/pngpriv.h 2016-05-25 18:59:10.000000000 -0400 -+++ libpng-1.6.22/pngpriv.h 2016-05-25 19:48:10.635751171 -0400 -@@ -177,16 +177,52 @@ - # endif /* !PNG_ARM_NEON_IMPLEMENTATION */ - - # ifndef PNG_ARM_NEON_IMPLEMENTATION - /* Use the intrinsics code by default. */ - # define PNG_ARM_NEON_IMPLEMENTATION 1 - # endif - #endif /* PNG_ARM_NEON_OPT > 0 */ - -+#ifndef PNG_INTEL_SSE_OPT -+# ifdef PNG_INTEL_SSE -+ /* Only check for SSE if the build configuration has been modified to -+ * enable SSE optimizations. This means that these optimizations will -+ * be off by default. See contrib/intel for more details. -+ */ -+# if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \ -+ defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ -+ (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -+# define PNG_INTEL_SSE_OPT 1 -+# endif -+# endif -+#endif -+ -+#if PNG_INTEL_SSE_OPT > 0 -+# ifndef PNG_INTEL_SSE_IMPLEMENTATION -+# if defined(__SSE4_1__) || defined(__AVX__) -+ /* We are not actually using AVX, but checking for AVX is the best -+ way we can detect SSE4.1 and SSSE3 on MSVC. -+ */ -+# define PNG_INTEL_SSE_IMPLEMENTATION 3 -+# elif defined(__SSSE3__) -+# define PNG_INTEL_SSE_IMPLEMENTATION 2 -+# elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ -+ (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -+# define PNG_INTEL_SSE_IMPLEMENTATION 1 -+# else -+# define PNG_INTEL_SSE_IMPLEMENTATION 0 -+# endif -+# endif -+ -+# if PNG_INTEL_SSE_IMPLEMENTATION > 0 -+# define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_sse2 -+# endif -+#endif -+ - /* Is this a build of a DLL where compilation of the object modules requires - * different preprocessor settings to those required for a simple library? If - * so PNG_BUILD_DLL must be set. - * - * If libpng is used inside a DLL but that DLL does not export the libpng APIs - * PNG_BUILD_DLL must not be set. To avoid the code below kicking in build a - * static library of libpng then link the DLL against that. - */ -@@ -1184,16 +1220,29 @@ PNG_INTERNAL_FUNCTION(void,png_read_filt - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); - PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_neon,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); - PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); - PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop - row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); - -+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop -+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_sse2,(png_row_infop -+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_sse2,(png_row_infop -+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_sse2,(png_row_infop -+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_sse2,(png_row_infop -+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_sse2,(png_row_infop -+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); -+ - /* Choose the best filter to use and filter the row data */ - PNG_INTERNAL_FUNCTION(void,png_write_find_filter,(png_structrp png_ptr, - png_row_infop row_info),PNG_EMPTY); - - #ifdef PNG_SEQUENTIAL_READ_SUPPORTED - PNG_INTERNAL_FUNCTION(void,png_read_IDAT_data,(png_structrp png_ptr, - png_bytep output, png_alloc_size_t avail_out),PNG_EMPTY); - /* Read 'avail_out' bytes of data from the IDAT stream. If the output buffer |