author    | Henrique Nakashima <hnakashima@chromium.org> | 2017-08-03 13:29:22 -0400
committer | Chromium commit bot <commit-bot@chromium.org> | 2017-08-03 22:00:23 +0000
commit    | f24fc1e69d77db16527de99bff192693878f4080 (patch)
tree      | be79d47e1367cdee5ea7eef363addb5233d3b514 /testing/tools/safetynet_conclusions.py
parent    | b35dbadce21f684619377ce545e066de6494a441 (diff)
download  | pdfium-f24fc1e69d77db16527de99bff192693878f4080.tar.xz
Add script to compare performance of two versions of pdfium.
Run from the pdfium root:
$ testing/tools/safetynet_compare.py testing
This compares the current branch with and without local changes.
$ testing/tools/safetynet_compare.py testing/corpus --branch-before x
This compares the current branch + local changes against branch x.
It runs only the corpus tests.
$ testing/tools/safetynet_compare.py testing --branch-before x
--branch-after y --build-dir=~/output_compare
This compares branch x and branch y. x and y can be revision hashes.
The callgrind.out files of cases with significant changes will be
created in ~/output_compare.
$ testing/tools/safetynet_compare.py -h
Print all options.
Change-Id: I43aaf5fe890745db611fb3bc00a656ef799fdfef
Reviewed-on: https://pdfium-review.googlesource.com/7390
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Henrique Nakashima <hnakashima@chromium.org>
Diffstat (limited to 'testing/tools/safetynet_conclusions.py')
-rw-r--r-- | testing/tools/safetynet_conclusions.py | 297
1 file changed, 297 insertions, 0 deletions
diff --git a/testing/tools/safetynet_conclusions.py b/testing/tools/safetynet_conclusions.py
new file mode 100644
index 0000000000..112274e669
--- /dev/null
+++ b/testing/tools/safetynet_conclusions.py
@@ -0,0 +1,297 @@
+# Copyright 2017 The PDFium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Classes that draw conclusions out of a comparison and represent them."""
+
+from collections import Counter
+
+
+FORMAT_RED = '\033[01;31m{0}\033[00m'
+FORMAT_GREEN = '\033[01;32m{0}\033[00m'
+FORMAT_MAGENTA = '\033[01;35m{0}\033[00m'
+FORMAT_CYAN = '\033[01;36m{0}\033[00m'
+FORMAT_NORMAL = '{0}'
+
+RATING_FAILURE = 'failure'
+RATING_REGRESSION = 'regression'
+RATING_IMPROVEMENT = 'improvement'
+RATING_NO_CHANGE = 'no_change'
+RATING_SMALL_CHANGE = 'small_change'
+
+RATINGS = [
+    RATING_FAILURE,
+    RATING_REGRESSION,
+    RATING_IMPROVEMENT,
+    RATING_NO_CHANGE,
+    RATING_SMALL_CHANGE
+]
+
+RATING_TO_COLOR = {
+    RATING_FAILURE: FORMAT_MAGENTA,
+    RATING_REGRESSION: FORMAT_RED,
+    RATING_IMPROVEMENT: FORMAT_CYAN,
+    RATING_NO_CHANGE: FORMAT_GREEN,
+    RATING_SMALL_CHANGE: FORMAT_NORMAL,
+}
+
+
+class ComparisonConclusions(object):
+  """All conclusions drawn from a comparison.
+
+  This is initialized empty and then processes pairs of results for each test
+  case, determining the rating for that case, which can be:
+    "failure" if either or both runs for the case failed.
+    "regression" if there is a significant increase in time for the test case.
+    "improvement" if there is a significant decrease in time for the test case.
+    "no_change" if the time for the test case did not change at all.
+    "small_change" if the time for the test case changed but within the
+        threshold.
+  """
+
+  def __init__(self, threshold_significant):
+    """Initializes an empty ComparisonConclusions.
+
+    Args:
+      threshold_significant: Float with the tolerance beyond which changes in
+          measurements are considered significant.
+
+          The change is considered as a multiplication rather than an addition
+          of a fraction of the previous measurement, that is, a
+          threshold_significant of 1.0 will flag test cases that became over
+          100% slower (> 200% of the previous time measured) or over 100%
+          faster (< 50% of the previous time measured).
+
+          threshold_significant 0.02 -> 98.04% to 102% is not significant
+          threshold_significant 0.1 -> 90.9% to 110% is not significant
+          threshold_significant 0.25 -> 80% to 125% is not significant
+          threshold_significant 1 -> 50% to 200% is not significant
+          threshold_significant 4 -> 20% to 500% is not significant
+
+    """
+    self.threshold_significant = threshold_significant
+    self.threshold_significant_negative = (1 / (1 + threshold_significant)) - 1
+
+    self.params = {'threshold': threshold_significant}
+    self.summary = ComparisonSummary()
+    self.case_results = {}
+
+  def ProcessCase(self, case_name, before, after):
+    """Feeds a test case results to the ComparisonConclusions.
+
+    Args:
+      case_name: String identifying the case.
+      before: Measurement for the "before" version of the code.
+      after: Measurement for the "after" version of the code.
+    """
+
+    # Switch 0 to None to simplify the json dict output. All zeros are
+    # considered failed runs, so they will be represented by "null".
+    if not before:
+      before = None
+    if not after:
+      after = None
+
+    if not before or not after:
+      ratio = None
+      rating = RATING_FAILURE
+    else:
+      ratio = (float(after) / before) - 1.0
+      if ratio > self.threshold_significant:
+        rating = RATING_REGRESSION
+      elif ratio < self.threshold_significant_negative:
+        rating = RATING_IMPROVEMENT
+      elif ratio == 0:
+        rating = RATING_NO_CHANGE
+      else:
+        rating = RATING_SMALL_CHANGE
+
+    case_result = CaseResult(case_name, before, after, ratio, rating)
+
+    self.summary.ProcessCaseResult(case_result)
+    self.case_results[case_name] = case_result
+
+  def GetSummary(self):
+    """Gets the ComparisonSummary with consolidated totals."""
+    return self.summary
+
+  def GetCaseResults(self):
+    """Gets a dict mapping each test case identifier to its CaseResult."""
+    return self.case_results
+
+  def GetOutputDict(self):
+    """Returns a conclusions dict with all the conclusions drawn.
+
+    Returns:
+      A serializable dict with the format illustrated below:
+      {
+        "params": {
+          "threshold": 0.02
+        },
+        "summary": {
+          "total": 123,
+          "failure": 1,
+          "regression": 2,
+          "improvement": 1,
+          "no_change": 100,
+          "small_change": 19
+        },
+        "comparison_by_case": {
+          "testing/resources/new_test.pdf": {
+            "before": None,
+            "after": 1000,
+            "ratio": None,
+            "rating": "failure"
+          },
+          "testing/resources/test1.pdf": {
+            "before": 100,
+            "after": 120,
+            "ratio": 0.2,
+            "rating": "regression"
+          },
+          "testing/resources/test2.pdf": {
+            "before": 100,
+            "after": 2000,
+            "ratio": 19.0,
+            "rating": "regression"
+          },
+          "testing/resources/test3.pdf": {
+            "before": 1000,
+            "after": 1005,
+            "ratio": 0.005,
+            "rating": "small_change"
+          },
+          "testing/resources/test4.pdf": {
+            "before": 1000,
+            "after": 1000,
+            "ratio": 0.0,
+            "rating": "no_change"
+          },
+          "testing/resources/test5.pdf": {
+            "before": 1000,
+            "after": 600,
+            "ratio": -0.4,
+            "rating": "improvement"
+          }
+        }
+      }
+    """
+    output_dict = {}
+    output_dict['params'] = {'threshold': self.threshold_significant}
+    output_dict['summary'] = self.summary.GetOutputDict()
+    output_dict['comparison_by_case'] = {
+        cr.case_name: cr.GetOutputDict()
+        for cr in self.GetCaseResults().values()
+    }
+    return output_dict
+
+
+class ComparisonSummary(object):
+  """Totals computed for a comparison."""
+
+  def __init__(self):
+    self.rating_counter = Counter()
+
+  def ProcessCaseResult(self, case_result):
+    self.rating_counter[case_result.rating] += 1
+
+  def GetTotal(self):
+    """Gets the number of test cases processed."""
+    return sum(self.rating_counter.values())
+
+  def GetCount(self, rating):
+    """Gets the number of test cases processed with a given rating."""
+    return self.rating_counter[rating]
+
+  def GetOutputDict(self):
+    """Returns a dict that can be serialized with all the totals."""
+    result = {'total': self.GetTotal()}
+    for rating in RATINGS:
+      result[rating] = self.GetCount(rating)
+    return result
+
+
+class CaseResult(object):
+  """The conclusion for the comparison of a single test case."""
+
+  def __init__(self, case_name, before, after, ratio, rating):
+    """Initializes an empty ComparisonConclusions.
+
+    Args:
+      case_name: String identifying the case.
+      before: Measurement for the "before" version of the code.
+      after: Measurement for the "after" version of the code.
+      ratio: Difference between |after| and |before| as a fraction of |before|.
+      rating: Rating for this test case.
+    """
+    self.case_name = case_name
+    self.before = before
+    self.after = after
+    self.ratio = ratio
+    self.rating = rating
+
+  def GetOutputDict(self):
+    """Returns a dict with the test case's conclusions."""
+    return {'before': self.before,
+            'after': self.after,
+            'ratio': self.ratio,
+            'rating': self.rating}
+
+
+def PrintConclusionsDictHumanReadable(conclusions_dict, colored, key=None):
+  """Prints a conclusions dict in a human-readable way.
+
+  Args:
+    conclusions_dict: Dict to print.
+    colored: Whether to color the output to highlight significant changes.
+    key: String with the CaseResult dictionary key to sort the cases.
+  """
+  # Print header
+  print '=' * 80
+  print '{0:>11s} {1:>15s} {2}' .format(
+      '% Change',
+      'Time after',
+      'Test case')
+  print '-' * 80
+
+  color = FORMAT_NORMAL
+
+  # Print cases
+  if key is not None:
+    case_pairs = sorted(conclusions_dict['comparison_by_case'].iteritems(),
+                        key=lambda kv: kv[1][key])
+  else:
+    case_pairs = sorted(conclusions_dict['comparison_by_case'].iteritems())
+
+  for case_name, case_dict in case_pairs:
+    if case_dict['rating'] == RATING_FAILURE:
+      print '%s to measure time for %s' % (
+          RATING_TO_COLOR[RATING_FAILURE].format('Failed'), case_name)
+      continue
+
+    if colored:
+      color = RATING_TO_COLOR[case_dict['rating']]
+
+    print '{0} {1:15,d} {2}' .format(
+        color.format('{:+11.4%}'.format(case_dict['ratio'])),
+        case_dict['after'],
+        case_name)
+
+  # Print totals
+  totals = conclusions_dict['summary']
+  print '=' * 80
+  print 'Test cases run: %d' % totals['total']
+
+  if colored:
+    color = FORMAT_MAGENTA if totals[RATING_FAILURE] else FORMAT_GREEN
+  print ('Failed to measure: %s'
+         % color.format(totals[RATING_FAILURE]))
+
+  if colored:
+    color = FORMAT_RED if totals[RATING_REGRESSION] else FORMAT_GREEN
+  print ('Regressions: %s'
+         % color.format(totals[RATING_REGRESSION]))
+
+  if colored:
+    color = FORMAT_CYAN if totals[RATING_IMPROVEMENT] else FORMAT_GREEN
+  print ('Improvements: %s'
+         % color.format(totals[RATING_IMPROVEMENT]))