From 160bd0e26f5dfe5fa11322f61b3d156c2214cba8 Mon Sep 17 00:00:00 2001
From: brucedawson <brucedawson@chromium.org>
Date: Mon, 27 Jun 2016 06:58:37 -0700
Subject: Double AdobeCMYK_to_sRGB speed with faster rounding

FXSYS_round is painfully slow on Windows. It does range checking and
then calls an extremely expensive function. It ends up consuming half
the CPU time when decoding the images in PDFs such as this one:

    https://www.ets.org/Media/Tests/GRE/pdf/gre_research_validity_data.pdf

SSE can be used to optimize this:

  __m128 cmyk = {c * 255, m * 255, y * 255, k * 255};
  uint32_t output[4];
  _mm_storeu_si128((__m128i*)output, _mm_cvtps_epi32(cmyk));

but it is cryptic, only works on x86/x64, and gives basically identical
performance to the solution used here: int(c * 255.f + 0.5f).

The rounding behavior is not identical but in practice this rarely
matters, and in this specific case it does not matter because the edge
cases that vary are not hit.

The three divisions at the end were changed to multiplies because
profiling showed they were a significant cost.

This change reduces the image-decode stalls in the PDF listed above by
about 40%, making for a noticeably better experience. Further
optimizations are possible but would require significantly more time and
testing.

BUG=617365

Review-Url: https://codereview.chromium.org/2096723003
---
 core/fxcodec/codec/fx_codec_icc.cpp | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

(limited to 'core/fxcodec')

diff --git a/core/fxcodec/codec/fx_codec_icc.cpp b/core/fxcodec/codec/fx_codec_icc.cpp
index 29cfe0e9f4..8ffbffe985 100644
--- a/core/fxcodec/codec/fx_codec_icc.cpp
+++ b/core/fxcodec/codec/fx_codec_icc.cpp
@@ -1977,13 +1977,31 @@ void AdobeCMYK_to_sRGB(FX_FLOAT c,
                        FX_FLOAT& R,
                        FX_FLOAT& G,
                        FX_FLOAT& B) {
-  uint8_t c1 = FXSYS_round(c * 255);
-  uint8_t m1 = FXSYS_round(m * 255);
-  uint8_t y1 = FXSYS_round(y * 255);
-  uint8_t k1 = FXSYS_round(k * 255);
+  // Convert to uint8_t with round-to-nearest. Avoid FXSYS_round here: it is
+  // incredibly expensive with VC++ (tested on VC++ 2015) because the round()
+  // call it makes is very slow.
+  // Adding 0.5f and truncating can round the wrong direction in some edge
+  // cases but these do not matter in this context. For instance, the float that
+  // is one ULP (unit in the last place) before 0.5 should round to zero but
+  // this will round it to one. These edge cases are never hit in this function
+  // due to the very limited precision of the input integers.
+  // This method also doesn't handle negative or extremely large numbers, but
+  // those are not needed here.
+  uint8_t c1 = int(c * 255.f + 0.5f);
+  uint8_t m1 = int(m * 255.f + 0.5f);
+  uint8_t y1 = int(y * 255.f + 0.5f);
+  uint8_t k1 = int(k * 255.f + 0.5f);
+
+  ASSERT(c1 == FXSYS_round(c * 255));
+  ASSERT(m1 == FXSYS_round(m * 255));
+  ASSERT(y1 == FXSYS_round(y * 255));
+  ASSERT(k1 == FXSYS_round(k * 255));
+
   uint8_t r, g, b;
   AdobeCMYK_to_sRGB1(c1, m1, y1, k1, r, g, b);
-  R = 1.0f * r / 255;
-  G = 1.0f * g / 255;
-  B = 1.0f * b / 255;
+  // Multiply by a constant rather than dividing because division is much
+  // more expensive.
+  R = r * (1.0f / 255);
+  G = g * (1.0f / 255);
+  B = b * (1.0f / 255);
 }
-- 
cgit v1.2.3