From 9774984f96946eb96eed29abfcbe824cb5858bbb Mon Sep 17 00:00:00 2001
From: Dan Sinclair <dsinclair@chromium.org>
Date: Tue, 16 May 2017 12:59:10 -0400
Subject: Add formcalc lexer tests.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

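This CL adds tests for CXFA_FMLexer and updates the lexer so that a token
ending exactly at the end of the input is returned rather than treated as
an end-of-input error.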

Change-Id: I4cb7000212dda6d2b32211005a1c22deabb813ae
Reviewed-on: https://pdfium-review.googlesource.com/5554
Commit-Queue: dsinclair <dsinclair@chromium.org>
Reviewed-by: Nicolás Peña <npm@chromium.org>
---
 BUILD.gn                              |   1 +
 xfa/fxfa/fm2js/xfa_lexer.cpp          |  57 ++++----
 xfa/fxfa/fm2js/xfa_lexer_unittest.cpp | 239 ++++++++++++++++++++++++++++++++++
 3 files changed, 267 insertions(+), 30 deletions(-)
 create mode 100644 xfa/fxfa/fm2js/xfa_lexer_unittest.cpp

diff --git a/BUILD.gn b/BUILD.gn
index b7b3ff6e0b..25ff2c6fd3 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -1908,6 +1908,7 @@ test("pdfium_unittests") {
       "xfa/fxfa/app/cxfa_textparser_unittest.cpp",
       "xfa/fxfa/app/xfa_ffbarcode_unittest.cpp",
       "xfa/fxfa/cxfa_ffapp_unittest.cpp",
+      "xfa/fxfa/fm2js/xfa_lexer_unittest.cpp",
       "xfa/fxfa/fm2js/xfa_simpleexpression_unittest.cpp",
       "xfa/fxfa/parser/xfa_utils_unittest.cpp",
     ]
diff --git a/xfa/fxfa/fm2js/xfa_lexer.cpp b/xfa/fxfa/fm2js/xfa_lexer.cpp
index dfac51ab2c..bdffa7e998 100644
--- a/xfa/fxfa/fm2js/xfa_lexer.cpp
+++ b/xfa/fxfa/fm2js/xfa_lexer.cpp
@@ -124,9 +124,13 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
   }
 
   while (1) {
-    // Make sure we don't walk off the end of the string.
-    if (m_ptr > m_end)
+    // Make sure we don't walk off the end of the string. If we don't currently
+    // have a token type then mark it EOF.
+    if (m_ptr > m_end) {
+      if (p->m_type == TOKreserver)
+        p->m_type = TOKeof;
       return p;
+    }
 
     ch = *m_ptr;
     if (!IsValid(m_ptr)) {
@@ -172,7 +176,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
       case '=':
         ++m_ptr;
         if (m_ptr > m_end) {
-          Error(kFMErrEndOfInput);
+          p->m_type = TOKassign;
           return p;
         }
 
@@ -192,7 +196,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
       case '<':
         ++m_ptr;
         if (m_ptr > m_end) {
-          Error(kFMErrEndOfInput);
+          p->m_type = TOKlt;
           return p;
         }
 
@@ -215,7 +219,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
       case '>':
         ++m_ptr;
         if (m_ptr > m_end) {
-          Error(kFMErrEndOfInput);
+          p->m_type = TOKgt;
           return p;
         }
 
@@ -275,7 +279,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
       case '/': {
         ++m_ptr;
         if (m_ptr > m_end) {
-          Error(kFMErrEndOfInput);
+          p->m_type = TOKdiv;
           return p;
         }
 
@@ -295,7 +299,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
       case '.':
         ++m_ptr;
         if (m_ptr > m_end) {
-          Error(kFMErrEndOfInput);
+          p->m_type = TOKdot;
           return p;
         }
 
@@ -369,15 +373,18 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
     }
 
     ++p;
-    if (p > m_end) {
-      Error(kFMErrEndOfInput);
-      return p;
-    }
-
     if (ch != '"') {
+      // We've hit the end of the input before the closing quote; flag it.
+      if (p > m_end) {
+        Error(kFMErrEndOfInput);
+        return p;
+      }
       ch = *p;
       continue;
     }
+    // We've hit the end of the input, return the string.
+    if (p > m_end)
+      break;
 
     if (!IsValid(p)) {
       ch = *p;
@@ -405,7 +412,8 @@ const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) {
   uint16_t ch = *p;
   ++p;
   if (p > m_end) {
-    Error(kFMErrEndOfInput);
+    t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
+    t->m_type = IsKeyword(t->m_wstring);
     return p;
   }
 
@@ -432,10 +440,8 @@ const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) {
       break;
     }
     ++p;
-    if (p > m_end) {
-      Error(kFMErrEndOfInput);
-      return p;
-    }
+    if (p > m_end)
+      break;
   }
   t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
   t->m_type = IsKeyword(t->m_wstring);
@@ -445,29 +451,20 @@ const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) {
 const wchar_t* CXFA_FMLexer::Comment(const wchar_t* p) {
   ++p;
 
-  if (p > m_end) {
-    Error(kFMErrEndOfInput);
+  if (p > m_end)
     return p;
-  }
 
   unsigned ch = *p;
   while (ch) {
-    if (ch == L'\r') {
-      ++p;
-      if (p > m_end)
-        Error(kFMErrEndOfInput);
-      return p;
-    }
-
     ++p;
-    if (p > m_end) {
-      Error(kFMErrEndOfInput);
+    if (ch == L'\r')
       return p;
-    }
     if (ch == L'\n') {
       ++m_uCurrentLine;
       return p;
     }
+    if (p > m_end)
+      return p;
     ch = *p;
   }
   return p;
diff --git a/xfa/fxfa/fm2js/xfa_lexer_unittest.cpp b/xfa/fxfa/fm2js/xfa_lexer_unittest.cpp
new file mode 100644
index 0000000000..fac0c9ac55
--- /dev/null
+++ b/xfa/fxfa/fm2js/xfa_lexer_unittest.cpp
@@ -0,0 +1,239 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "xfa/fxfa/fm2js/xfa_lexer.h"
+
+#include <vector>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "testing/test_support.h"
+#include "third_party/base/ptr_util.h"
+
+TEST(CXFA_FMLexerTest, EmptyString) {  // First token of empty input is EOF.
+  CXFA_FMLexer lexer(L"", nullptr);
+  CXFA_FMToken* token = lexer.NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);
+}
+
+TEST(CXFA_FMLexerTest, Numbers) {  // Number tokens keep their source text.
+  auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"-12", nullptr);
+  CXFA_FMToken* token = lexer->NextToken();
+  // TODO(dsinclair): Should this return -12 instead of two tokens?
+  EXPECT_EQ(TOKminus, token->m_type);  // The sign is lexed on its own.
+  token = lexer->NextToken();
+  EXPECT_EQ(L"12", token->m_wstring);  // NOTE: m_type is not asserted here.
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"1.5362", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);  // Fraction stays in one token.
+  EXPECT_EQ(L"1.5362", token->m_wstring);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"0.875", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);
+  EXPECT_EQ(L"0.875", token->m_wstring);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"5.56e-2", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);  // Signed exponent is included.
+  EXPECT_EQ(L"5.56e-2", token->m_wstring);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"1.234E10", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);  // Upper-case exponent marker.
+  EXPECT_EQ(L"1.234E10", token->m_wstring);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"123456789.012345678", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);
+  // TODO(dsinclair): This should round as per IEEE 64-bit values.
+  // EXPECT_EQ(L"123456789.01234567", token->m_wstring);
+  EXPECT_EQ(L"123456789.012345678", token->m_wstring);  // Pins current text.
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"99999999999999999", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);
+  // TODO(dsinclair): This is spec'd as rounding when > 16 significant digits
+  // prior to the exponent.
+  // EXPECT_EQ(L"100000000000000000", token->m_wstring);
+  EXPECT_EQ(L"99999999999999999", token->m_wstring);  // Pins current text.
+}
+
+// The quotes are stripped in CXFA_FMStringExpression::ToJavaScript.
+TEST(CXFA_FMLexerTest, Strings) {  // m_wstring retains the outer quotes.
+  auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(
+      L"\"The cat jumped over the fence.\"", nullptr);
+  CXFA_FMToken* token = lexer->NextToken();
+  EXPECT_EQ(TOKstring, token->m_type);
+  EXPECT_EQ(L"\"The cat jumped over the fence.\"", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);  // Nothing follows the closing quote.
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"\"\"", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKstring, token->m_type);  // Empty string literal.
+  EXPECT_EQ(L"\"\"", token->m_wstring);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(
+      L"\"The message reads: \"\"Warning: Insufficient Memory\"\"\"", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKstring, token->m_type);  // Doubled quotes stay in the token.
+  EXPECT_EQ(L"\"The message reads: \"\"Warning: Insufficient Memory\"\"\"",
+            token->m_wstring);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(
+      L"\"\\u0047\\u006f\\u0066\\u0069\\u0073\\u0068\\u0021\\u000d\\u000a\"",
+      nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKstring, token->m_type);  // Escape sequences are kept verbatim.
+  EXPECT_EQ(
+      L"\"\\u0047\\u006f\\u0066\\u0069\\u0073\\u0068\\u0021\\u000d\\u000a\"",
+      token->m_wstring);
+}
+
+// Note, 'this' is a keyword but is not matched by the lexer.
+TEST(CXFA_FMLexerTest, OperatorsAndKeywords) {
+  struct {  // Source text paired with the token type it must produce.
+    const wchar_t* op;
+    XFA_FM_TOKEN token;
+  } op[] = {{L"+", TOKplus},
+            {L"/", TOKdiv},
+            {L"-", TOKminus},
+            {L"&", TOKand},
+            {L"|", TOKor},
+            {L"*", TOKmul},
+            {L"<", TOKlt},
+            {L">", TOKgt},
+            {L"==", TOKeq},
+            {L"<>", TOKne},
+            {L"<=", TOKle},
+            {L">=", TOKge},
+            {L"and", TOKksand},
+            {L"break", TOKbreak},
+            {L"continue", TOKcontinue},
+            {L"do", TOKdo},
+            {L"downto", TOKdownto},
+            {L"else", TOKelse},
+            {L"elseif", TOKelseif},
+            {L"end", TOKend},
+            {L"endfor", TOKendfor},
+            {L"endfunc", TOKendfunc},
+            {L"endif", TOKendif},
+            {L"endwhile", TOKendwhile},
+            {L"eq", TOKkseq},
+            {L"exit", TOKexit},
+            {L"for", TOKfor},
+            {L"foreach", TOKforeach},
+            {L"func", TOKfunc},
+            {L"ge", TOKksge},
+            {L"gt", TOKksgt},
+            {L"if", TOKif},
+            {L"in", TOKin},
+            {L"infinity", TOKinfinity},
+            {L"le", TOKksle},
+            {L"lt", TOKkslt},
+            {L"nan", TOKnan},
+            {L"ne", TOKksne},
+            {L"not", TOKksnot},
+            {L"null", TOKnull},
+            {L"or", TOKksor},
+            {L"return", TOKreturn},
+            {L"step", TOKstep},
+            {L"then", TOKthen},
+            {L"throw", TOKthrow},
+            {L"upto", TOKupto},
+            {L"var", TOKvar},
+            {L"while", TOKwhile},
+
+            // The following are defined but aren't in the spec.
+            {L"(", TOKlparen},
+            {L")", TOKrparen},
+            {L",", TOKcomma},
+            {L".", TOKdot},
+            {L"[", TOKlbracket},
+            {L"]", TOKrbracket},
+            {L"..", TOKdotdot},
+            {L".#", TOKdotscream},
+            {L".*", TOKdotstar}};
+
+  for (size_t i = 0; i < FX_ArraySize(op); ++i) {  // One fresh lexer per entry.
+    auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(op[i].op, nullptr);
+    CXFA_FMToken* token = lexer->NextToken();
+    EXPECT_EQ(op[i].token, token->m_type);  // Only the first token is checked.
+  }
+}
+
+TEST(CXFA_FMLexerTest, Comments) {  // Both "//" and ";" start a comment.
+  auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"// Empty.", nullptr);
+  CXFA_FMToken* token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);  // A comment alone yields no token.
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"//", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);  // Marker with no comment text.
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"123 // Empty.\n\"str\"", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);
+  EXPECT_EQ(L"123", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKstring, token->m_type);  // Lexing resumes after the newline.
+  EXPECT_EQ(L"\"str\"", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L";", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);  // Marker with no comment text.
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"; Empty.", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);  // A comment alone yields no token.
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"123 ;Empty.\n\"str\"", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);
+  EXPECT_EQ(L"123", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKstring, token->m_type);  // Lexing resumes after the newline.
+  EXPECT_EQ(L"\"str\"", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);
+}
+
+TEST(CXFA_FMLexerTest, Identifiers) {  // Each input is a single identifier.
+  std::vector<const wchar_t*> identifiers = {
+      L"a", L"an_identifier", L"_ident", L"$ident", L"!ident", L"GetAddr"};
+  for (const auto* ident : identifiers) {  // One fresh lexer per input.
+    auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(ident, nullptr);
+    CXFA_FMToken* token = lexer->NextToken();
+    EXPECT_EQ(TOKidentifier, token->m_type);
+    EXPECT_EQ(ident, token->m_wstring);  // Token text is the whole input.
+  }
+}
+
+TEST(CXFA_FMLexerTest, Whitespace) {  // Whitespace yields no tokens.
+  auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L" \t\xc\x9\xb", nullptr);
+  CXFA_FMToken* token = lexer->NextToken();  // Input: space, tab, FF, tab, VT.
+  EXPECT_EQ(TOKeof, token->m_type);
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"123 \t\xc\x9\xb 456", nullptr);
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);
+  EXPECT_EQ(L"123", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKnumber, token->m_type);  // The whitespace run only separates.
+  EXPECT_EQ(L"456", token->m_wstring);
+
+  token = lexer->NextToken();
+  EXPECT_EQ(TOKeof, token->m_type);
+}
-- 
cgit v1.2.3