diff options
author | Nicolas Pena <npm@chromium.org> | 2017-09-01 13:25:16 -0400 |
---|---|---|
committer | Chromium commit bot <commit-bot@chromium.org> | 2017-09-05 14:10:37 +0000 |
commit | 088ca03f25fe1f6d75c0ff3b71e0ad3d018a5e0c (patch) | |
tree | 27f55db27d9112910d9219efa58474a0c2bd9b52 /third_party/libopenjpeg20/mct.c | |
parent | 740bcd892d22136873b2b123b94e51bf6e77b8f9 (diff) | |
download | pdfium-088ca03f25fe1f6d75c0ff3b71e0ad3d018a5e0c.tar.xz |
Upgrade OpenJPEG to 2.2.0
This CL upgrades OpenJPEG by copying the files from 2.2.0 and then applying
patches. Patch files that are no longer relevant are deleted. The relevant
ones are applied manually due to changes in formatting in OpenJPEG. Patch 34
is added to account for opj_malloc changes in PDFium.
Bug: chromium:718731
Change-Id: I3d316893eab5e235c9f71222a6818b8ae0c98383
Reviewed-on: https://pdfium-review.googlesource.com/12770
Commit-Queue: dsinclair <dsinclair@chromium.org>
Reviewed-by: dsinclair <dsinclair@chromium.org>
Diffstat (limited to 'third_party/libopenjpeg20/mct.c')
-rw-r--r-- | third_party/libopenjpeg20/mct.c | 828 |
1 files changed, 421 insertions, 407 deletions
diff --git a/third_party/libopenjpeg20/mct.c b/third_party/libopenjpeg20/mct.c index 7731c15ddb..20b9e121b0 100644 --- a/third_party/libopenjpeg20/mct.c +++ b/third_party/libopenjpeg20/mct.c @@ -1,6 +1,6 @@ /* - * The copyright in this software is being made available under the 2-clauses - * BSD License, included below. This software may be subject to other third + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. This software may be subject to other third * party and contributor rights, including patent rights, and no such rights * are granted under this license. * @@ -8,10 +8,10 @@ * Copyright (c) 2002-2014, Professor Benoit Macq * Copyright (c) 2001-2003, David Janssens * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux + * Copyright (c) 2003-2007, Francois-Olivier Devaux * Copyright (c) 2003-2014, Antonin Descampe * Copyright (c) 2005, Herve Drolon, FreeImage Team - * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR + * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR * Copyright (c) 2012, CS Systemes d'Information, France * All rights reserved. * @@ -62,14 +62,14 @@ static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 }; /* </summary> */ static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 }; -const OPJ_FLOAT64 * opj_mct_get_mct_norms () +const OPJ_FLOAT64 * opj_mct_get_mct_norms() { - return opj_mct_norms; + return opj_mct_norms; } -const OPJ_FLOAT64 * opj_mct_get_mct_norms_real () +const OPJ_FLOAT64 * opj_mct_get_mct_norms_real() { - return opj_mct_norms_real; + return opj_mct_norms_real; } /* <summary> */ @@ -77,63 +77,67 @@ const OPJ_FLOAT64 * opj_mct_get_mct_norms_real () /* </summary> */ #ifdef USE_SSE2 void opj_mct_encode( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, - OPJ_UINT32 n) + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - for(i = 0; i < (len & ~3U); i += 4) { - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); - __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); - __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); - y = _mm_add_epi32(g, g); - y = _mm_add_epi32(y, b); - y = _mm_add_epi32(y, r); - y = _mm_srai_epi32(y, 2); - u = _mm_sub_epi32(b, g); - v = _mm_sub_epi32(r, g); - _mm_store_si128((__m128i *)&(c0[i]), y); - _mm_store_si128((__m128i *)&(c1[i]), u); - _mm_store_si128((__m128i *)&(c2[i]), v); - } - - for(; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = (r + (g * 2) + b) >> 2; - OPJ_INT32 u = b - g; - OPJ_INT32 v = r - g; - c0[i] = y; - c1[i] = u; - c2[i] = v; - } + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + /* buffer are aligned on 16 bytes */ + assert(((size_t)c0 & 0xf) == 0); + assert(((size_t)c1 & 0xf) == 0); + assert(((size_t)c2 & 0xf) == 0); + + for (i = 0; i < (len & ~3U); i += 4) { + __m128i y, u, v; + __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); + __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); + __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); + y = _mm_add_epi32(g, g); + y = _mm_add_epi32(y, b); + y = _mm_add_epi32(y, r); + y = _mm_srai_epi32(y, 2); + u = _mm_sub_epi32(b, g); + v = _mm_sub_epi32(r, g); + _mm_store_si128((__m128i *) & (c0[i]), y); + _mm_store_si128((__m128i *) & (c1[i]), u); + _mm_store_si128((__m128i *) & (c2[i]), v); + } + + for (; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = (r + (g * 2) + b) >> 2; + OPJ_INT32 u = b - g; + OPJ_INT32 v = r - g; + c0[i] = y; + c1[i] = u; + c2[i] = v; + } } #else void opj_mct_encode( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, - OPJ_UINT32 n) + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - for(i = 0; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = (r + (g * 2) + b) >> 2; - OPJ_INT32 u = b - g; - OPJ_INT32 v = r - g; - c0[i] = y; - c1[i] = u; - c2[i] = v; - } + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for (i = 0; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = (r + (g * 2) + b) >> 2; + OPJ_INT32 u = b - g; + OPJ_INT32 v = r - g; + c0[i] = y; + c1[i] = u; + c2[i] = v; + } } #endif @@ -142,66 +146,67 @@ void opj_mct_encode( /* </summary> */ #ifdef USE_SSE2 void opj_mct_decode( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, - OPJ_UINT32 n) + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - for(i = 0; i < (len & ~3U); i += 4) { - __m128i r, g, b; - __m128i y = _mm_load_si128((const __m128i *)&(c0[i])); - __m128i u = _mm_load_si128((const __m128i *)&(c1[i])); - __m128i v = _mm_load_si128((const __m128i *)&(c2[i])); - g = y; - g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); - r = _mm_add_epi32(v, g); - b = _mm_add_epi32(u, g); - _mm_store_si128((__m128i *)&(c0[i]), r); - _mm_store_si128((__m128i *)&(c1[i]), g); - _mm_store_si128((__m128i *)&(c2[i]), b); - } - for (; i < len; ++i) { - OPJ_INT32 y = c0[i]; - OPJ_INT32 u = c1[i]; - OPJ_INT32 v = c2[i]; - OPJ_INT32 g = y - ((u + v) >> 2); - OPJ_INT32 r = v + g; - OPJ_INT32 b = u + g; - c0[i] = r; - c1[i] = g; - c2[i] = b; - } + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for (i = 0; i < (len & ~3U); i += 4) { + __m128i r, g, b; + __m128i y = _mm_load_si128((const __m128i *) & (c0[i])); + __m128i u = _mm_load_si128((const __m128i *) & (c1[i])); + __m128i v = _mm_load_si128((const __m128i *) & (c2[i])); + g = y; + g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); + r = _mm_add_epi32(v, g); + b = _mm_add_epi32(u, g); + _mm_store_si128((__m128i *) & (c0[i]), r); + _mm_store_si128((__m128i *) & (c1[i]), g); + _mm_store_si128((__m128i *) & (c2[i]), b); + } + for (; i < len; ++i) { + OPJ_INT32 y = c0[i]; + OPJ_INT32 u = c1[i]; + OPJ_INT32 v = c2[i]; + OPJ_INT32 g = y - ((u + v) >> 2); + OPJ_INT32 r = v + g; + OPJ_INT32 b = u + g; + c0[i] = r; + c1[i] = g; + c2[i] = b; + } } #else void opj_mct_decode( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, - OPJ_UINT32 n) + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_UINT32 i; - for (i = 0; i < n; ++i) { - OPJ_INT32 y = c0[i]; - OPJ_INT32 u = c1[i]; - OPJ_INT32 v = c2[i]; - OPJ_INT32 g = y - ((u + v) >> 2); - OPJ_INT32 r = v + g; - OPJ_INT32 b = u + g; - c0[i] = r; - c1[i] = g; - c2[i] = b; - } + OPJ_UINT32 i; + for (i = 0; i < n; ++i) { + OPJ_INT32 y = c0[i]; + OPJ_INT32 u = c1[i]; + OPJ_INT32 v = c2[i]; + OPJ_INT32 g = y - ((u + v) >> 2); + OPJ_INT32 r = v + g; + OPJ_INT32 b = u + g; + c0[i] = r; + c1[i] = g; + c2[i] = b; + } } #endif /* <summary> */ /* Get norm of basis function of reversible MCT. */ /* </summary> */ -OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) { - return opj_mct_norms[compno]; +OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) +{ + return opj_mct_norms[compno]; } /* <summary> */ @@ -209,164 +214,171 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) { /* </summary> */ #ifdef USE_SSE4 void opj_mct_encode_real( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, - OPJ_UINT32 n) + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - const __m128i ry = _mm_set1_epi32(2449); - const __m128i gy = _mm_set1_epi32(4809); - const __m128i by = _mm_set1_epi32(934); - const __m128i ru = _mm_set1_epi32(1382); - const __m128i gu = _mm_set1_epi32(2714); - /* const __m128i bu = _mm_set1_epi32(4096); */ - /* const __m128i rv = _mm_set1_epi32(4096); */ - const __m128i gv = _mm_set1_epi32(3430); - const __m128i bv = _mm_set1_epi32(666); - const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), _MM_SHUFFLE(1, 0, 1, 0)); - - for(i = 0; i < (len & ~3U); i += 4) { - __m128i lo, hi; - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); - __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); - __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ry); - hi = _mm_mul_epi32(hi, ry); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - y = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gy); - hi = _mm_mul_epi32(hi, gy); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, by); - hi = _mm_mul_epi32(hi, by); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *)&(c0[i]), y); - - /*lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - u = _mm_blend_epi16(lo, hi, 0xCC); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ru); - hi = _mm_mul_epi32(hi, ru); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gu); - hi = _mm_mul_epi32(hi, gu); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *)&(c1[i]), u); - - /*lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - v = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gv); - hi = _mm_mul_epi32(hi, gv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, bv); - hi = _mm_mul_epi32(hi, bv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32-13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *)&(c2[i]), v); - } - for(; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + const __m128i ry = _mm_set1_epi32(2449); + const __m128i gy = _mm_set1_epi32(4809); + const __m128i by = _mm_set1_epi32(934); + const __m128i ru = _mm_set1_epi32(1382); + const __m128i gu = _mm_set1_epi32(2714); + /* const __m128i bu = _mm_set1_epi32(4096); */ + /* const __m128i rv = _mm_set1_epi32(4096); */ + const __m128i gv = _mm_set1_epi32(3430); + const __m128i bv = _mm_set1_epi32(666); + const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), + _MM_SHUFFLE(1, 0, 1, 0)); + + for (i = 0; i < (len & ~3U); i += 4) { + __m128i lo, hi; + __m128i y, u, v; + __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); + __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); + __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); + + lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, ry); + hi = _mm_mul_epi32(hi, ry); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + y = _mm_blend_epi16(lo, hi, 0xCC); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gy); + hi = _mm_mul_epi32(hi, gy); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, by); + hi = _mm_mul_epi32(hi, by); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *) & (c0[i]), y); + + /*lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, mulround); + hi = _mm_mul_epi32(hi, mulround);*/ + lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); + hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); + lo = _mm_slli_epi64(lo, 12); + hi = _mm_slli_epi64(hi, 12); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + u = _mm_blend_epi16(lo, hi, 0xCC); + + lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, ru); + hi = _mm_mul_epi32(hi, ru); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gu); + hi = _mm_mul_epi32(hi, gu); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *) & (c1[i]), u); + + /*lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, mulround); + hi = _mm_mul_epi32(hi, mulround);*/ + lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); + hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); + lo = _mm_slli_epi64(lo, 12); + hi = _mm_slli_epi64(hi, 12); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + v = _mm_blend_epi16(lo, hi, 0xCC); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gv); + hi = _mm_mul_epi32(hi, gv); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, bv); + hi = _mm_mul_epi32(hi, bv); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32 - 13); + v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *) & (c2[i]), v); + } + for (; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, + 4809) + opj_int_fix_mul(b, 934); + OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, + 2714) + opj_int_fix_mul(b, 4096); + OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, + 3430) - opj_int_fix_mul(b, 666); + c0[i] = y; + c1[i] = u; + c2[i] = v; + } } #else void opj_mct_encode_real( - OPJ_INT32* restrict c0, - OPJ_INT32* restrict c1, - OPJ_INT32* restrict c2, - OPJ_UINT32 n) + OPJ_INT32* OPJ_RESTRICT c0, + OPJ_INT32* OPJ_RESTRICT c1, + OPJ_INT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_UINT32 i; - for(i = 0; i < n; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } + OPJ_UINT32 i; + for (i = 0; i < n; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, + 4809) + opj_int_fix_mul(b, 934); + OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, + 2714) + opj_int_fix_mul(b, 4096); + OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, + 3430) - opj_int_fix_mul(b, 666); + c0[i] = y; + c1[i] = u; + c2[i] = v; + } } #endif @@ -374,183 +386,185 @@ void opj_mct_encode_real( /* Inverse irreversible MCT. */ /* </summary> */ void opj_mct_decode_real( - OPJ_FLOAT32* restrict c0, - OPJ_FLOAT32* restrict c1, - OPJ_FLOAT32* restrict c2, - OPJ_UINT32 n) + OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, + OPJ_UINT32 n) { - OPJ_UINT32 i; + OPJ_UINT32 i; #ifdef USE_SSE - __m128 vrv, vgu, vgv, vbu; - vrv = _mm_set1_ps(1.402f); - vgu = _mm_set1_ps(0.34413f); - vgv = _mm_set1_ps(0.71414f); - vbu = _mm_set1_ps(1.772f); - for (i = 0; i < (n >> 3); ++i) { - __m128 vy, vu, vv; - __m128 vr, vg, vb; - - vy = _mm_load_ps(c0); - vu = _mm_load_ps(c1); - vv = _mm_load_ps(c2); - vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); - vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); - vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); - _mm_store_ps(c0, vr); - _mm_store_ps(c1, vg); - _mm_store_ps(c2, vb); - c0 += 4; - c1 += 4; - c2 += 4; - - vy = _mm_load_ps(c0); - vu = _mm_load_ps(c1); - vv = _mm_load_ps(c2); - vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); - vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); - vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); - _mm_store_ps(c0, vr); - _mm_store_ps(c1, vg); - _mm_store_ps(c2, vb); - c0 += 4; - c1 += 4; - c2 += 4; - } - n &= 7; + __m128 vrv, vgu, vgv, vbu; + vrv = _mm_set1_ps(1.402f); + vgu = _mm_set1_ps(0.34413f); + vgv = _mm_set1_ps(0.71414f); + vbu = _mm_set1_ps(1.772f); + for (i = 0; i < (n >> 3); ++i) { + __m128 vy, vu, vv; + __m128 vr, vg, vb; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + } + n &= 7; #endif - for(i = 0; i < n; ++i) { - OPJ_FLOAT32 y = c0[i]; - OPJ_FLOAT32 u = c1[i]; - OPJ_FLOAT32 v = c2[i]; - OPJ_FLOAT32 r = y + (v * 1.402f); - OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); - OPJ_FLOAT32 b = y + (u * 1.772f); - c0[i] = r; - c1[i] = g; - c2[i] = b; - } + for (i = 0; i < n; ++i) { + OPJ_FLOAT32 y = c0[i]; + OPJ_FLOAT32 u = c1[i]; + OPJ_FLOAT32 v = c2[i]; + OPJ_FLOAT32 r = y + (v * 1.402f); + OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); + OPJ_FLOAT32 b = y + (u * 1.772f); + c0[i] = r; + c1[i] = g; + c2[i] = b; + } } /* <summary> */ /* Get norm of basis function of irreversible MCT. */ /* </summary> */ -OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno) { - return opj_mct_norms_real[compno]; +OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno) +{ + return opj_mct_norms_real[compno]; } OPJ_BOOL opj_mct_encode_custom( - OPJ_BYTE * pCodingdata, - OPJ_UINT32 n, - OPJ_BYTE ** pData, - OPJ_UINT32 pNbComp, - OPJ_UINT32 isSigned) + OPJ_BYTE * pCodingdata, + OPJ_UINT32 n, + OPJ_BYTE ** pData, + OPJ_UINT32 pNbComp, + OPJ_UINT32 isSigned) { - OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata; - OPJ_UINT32 i; - OPJ_UINT32 j; - OPJ_UINT32 k; - OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp; - OPJ_INT32 * lCurrentData = 00; - OPJ_INT32 * lCurrentMatrix = 00; - OPJ_INT32 ** lData = (OPJ_INT32 **) pData; - OPJ_UINT32 lMultiplicator = 1 << 13; - OPJ_INT32 * lMctPtr; + OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata; + OPJ_UINT32 i; + OPJ_UINT32 j; + OPJ_UINT32 k; + OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp; + OPJ_INT32 * lCurrentData = 00; + OPJ_INT32 * lCurrentMatrix = 00; + OPJ_INT32 ** lData = (OPJ_INT32 **) pData; + OPJ_UINT32 lMultiplicator = 1 << 13; + OPJ_INT32 * lMctPtr; OPJ_ARG_NOT_USED(isSigned); - lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof(OPJ_INT32)); - if (! lCurrentData) { - return OPJ_FALSE; - } + lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof( + OPJ_INT32)); + if (! lCurrentData) { + return OPJ_FALSE; + } - lCurrentMatrix = lCurrentData + pNbComp; + lCurrentMatrix = lCurrentData + pNbComp; - for (i =0;i<lNbMatCoeff;++i) { - lCurrentMatrix[i] = (OPJ_INT32) (*(lMct++) * (OPJ_FLOAT32)lMultiplicator); - } + for (i = 0; i < lNbMatCoeff; ++i) { + lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator); + } - for (i = 0; i < n; ++i) { - lMctPtr = lCurrentMatrix; - for (j=0;j<pNbComp;++j) { - lCurrentData[j] = (*(lData[j])); - } + for (i = 0; i < n; ++i) { + lMctPtr = lCurrentMatrix; + for (j = 0; j < pNbComp; ++j) { + lCurrentData[j] = (*(lData[j])); + } - for (j=0;j<pNbComp;++j) { - *(lData[j]) = 0; - for (k=0;k<pNbComp;++k) { - *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]); - ++lMctPtr; - } + for (j = 0; j < pNbComp; ++j) { + *(lData[j]) = 0; + for (k = 0; k < pNbComp; ++k) { + *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]); + ++lMctPtr; + } - ++lData[j]; - } - } + ++lData[j]; + } + } - opj_free(lCurrentData); + opj_free(lCurrentData); - return OPJ_TRUE; + return OPJ_TRUE; } OPJ_BOOL opj_mct_decode_custom( - OPJ_BYTE * pDecodingData, - OPJ_UINT32 n, - OPJ_BYTE ** pData, - OPJ_UINT32 pNbComp, - OPJ_UINT32 isSigned) + OPJ_BYTE * pDecodingData, + OPJ_UINT32 n, + OPJ_BYTE ** pData, + OPJ_UINT32 pNbComp, + OPJ_UINT32 isSigned) { - OPJ_FLOAT32 * lMct; - OPJ_UINT32 i; - OPJ_UINT32 j; - OPJ_UINT32 k; + OPJ_FLOAT32 * lMct; + OPJ_UINT32 i; + OPJ_UINT32 j; + OPJ_UINT32 k; - OPJ_FLOAT32 * lCurrentData = 00; - OPJ_FLOAT32 * lCurrentResult = 00; - OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData; + OPJ_FLOAT32 * lCurrentData = 00; + OPJ_FLOAT32 * lCurrentResult = 00; + OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData; OPJ_ARG_NOT_USED(isSigned); - lCurrentData = (OPJ_FLOAT32 *) opj_malloc (2 * pNbComp * sizeof(OPJ_FLOAT32)); - if (! lCurrentData) { - return OPJ_FALSE; - } - lCurrentResult = lCurrentData + pNbComp; - - for (i = 0; i < n; ++i) { - lMct = (OPJ_FLOAT32 *) pDecodingData; - for (j=0;j<pNbComp;++j) { - lCurrentData[j] = (OPJ_FLOAT32) (*(lData[j])); - } - for (j=0;j<pNbComp;++j) { - lCurrentResult[j] = 0; - for (k=0;k<pNbComp;++k) { - lCurrentResult[j] += *(lMct++) * lCurrentData[k]; - } - *(lData[j]++) = (OPJ_FLOAT32) (lCurrentResult[j]); - } - } - opj_free(lCurrentData); - return OPJ_TRUE; + lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32)); + if (! lCurrentData) { + return OPJ_FALSE; + } + lCurrentResult = lCurrentData + pNbComp; + + for (i = 0; i < n; ++i) { + lMct = (OPJ_FLOAT32 *) pDecodingData; + for (j = 0; j < pNbComp; ++j) { + lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j])); + } + for (j = 0; j < pNbComp; ++j) { + lCurrentResult[j] = 0; + for (k = 0; k < pNbComp; ++k) { + lCurrentResult[j] += *(lMct++) * lCurrentData[k]; + } + *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]); + } + } + opj_free(lCurrentData); + return OPJ_TRUE; } -void opj_calculate_norms( OPJ_FLOAT64 * pNorms, - OPJ_UINT32 pNbComps, - OPJ_FLOAT32 * pMatrix) +void opj_calculate_norms(OPJ_FLOAT64 * pNorms, + OPJ_UINT32 pNbComps, + OPJ_FLOAT32 * pMatrix) { - OPJ_UINT32 i,j,lIndex; - OPJ_FLOAT32 lCurrentValue; - OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms; - OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix; - - for (i=0;i<pNbComps;++i) { - lNorms[i] = 0; - lIndex = i; - - for (j=0;j<pNbComps;++j) { - lCurrentValue = lMatrix[lIndex]; - lIndex += pNbComps; - lNorms[i] += lCurrentValue * lCurrentValue; - } - lNorms[i] = sqrt(lNorms[i]); - } + OPJ_UINT32 i, j, lIndex; + OPJ_FLOAT32 lCurrentValue; + OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms; + OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix; + + for (i = 0; i < pNbComps; ++i) { + lNorms[i] = 0; + lIndex = i; + + for (j = 0; j < pNbComps; ++j) { + lCurrentValue = lMatrix[lIndex]; + lIndex += pNbComps; + lNorms[i] += lCurrentValue * lCurrentValue; + } + lNorms[i] = sqrt(lNorms[i]); + } } |