From 756023071d1c4574fcb433c4bc7f13e7b763f763 Mon Sep 17 00:00:00 2001
From: Ryan Harrison <rharrison@chromium.org>
Date: Tue, 18 Jul 2017 10:27:00 -0400
Subject: Correct lexer handling of FormCalc identifiers

This makes the lexer stricter about which characters are valid in
identifiers, so that it conforms to the grammar in the FormCalc spec.
This should eliminate a class of inputs generated by ClusterFuzz that
break later stages of the transpile.

BUG: chromium:736234, pdfium:783, pdfium:784

Change-Id: I3987d6778a82b71d768fa751035993c0af2577ee
Reviewed-on: https://pdfium-review.googlesource.com/8010
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
---
 xfa/fxfa/fm2js/DEPS                      |  3 ++
 xfa/fxfa/fm2js/cxfa_fmlexer.cpp          | 65 +++++++++++++++-----------------
 xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp | 35 ++++++++++++++++-
 3 files changed, 68 insertions(+), 35 deletions(-)
 create mode 100644 xfa/fxfa/fm2js/DEPS

diff --git a/xfa/fxfa/fm2js/DEPS b/xfa/fxfa/fm2js/DEPS
new file mode 100644
index 0000000000..2be03524b6
--- /dev/null
+++ b/xfa/fxfa/fm2js/DEPS
@@ -0,0 +1,3 @@
+include_rules = [
+  '+third_party/icu',
+]
diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
index 7537041bb7..908cd2f04a 100644
--- a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
+++ b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
@@ -1,4 +1,4 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
+// Copyright 2014 PDFium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -10,14 +10,26 @@
 
 #include "core/fxcrt/fx_extension.h"
 #include "third_party/base/ptr_util.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
 
 namespace {
 
-bool IsValid(const wchar_t* p) {
+bool IsValidFormCalcCharacter(const wchar_t* p) {
   return *p == 0 || (*p >= 0x09 && *p <= 0x0D) ||
          (*p >= 0x20 && *p <= 0xd7FF) || (*p >= 0xE000 && *p <= 0xFFFD);
 }
 
+bool IsValidIdentifierCharacter(const wchar_t* p) {
+  return u_isalnum(*p) || *p == 0x005F ||  // '_'
+         *p == 0x0024;                     // '$'
+}
+
+bool IsValidInitialIdentifierCharacter(const wchar_t* p) {
+  return u_isalpha(*p) || *p == 0x005F ||  // '_'
+         *p == 0x0024 ||                   // '$'
+         *p == 0x0021;                     // '!'
+}
+
 const XFA_FMKeyword keyWords[] = {
     {TOKand, 0x00000026, L"&"},
     {TOKlparen, 0x00000028, L"("},
@@ -118,9 +130,9 @@ CXFA_FMToken* CXFA_FMLexer::NextToken() {
 }
 
 std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
-  uint16_t ch = 0;
+  wchar_t ch = 0;
   auto p = pdfium::MakeUnique<CXFA_FMToken>(m_uCurrentLine);
-  if (!IsValid(m_ptr)) {
+  if (!IsValidFormCalcCharacter(m_ptr)) {
     ch = *m_ptr;
     m_LexerError = true;
     return p;
@@ -136,7 +148,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
     }
 
     ch = *m_ptr;
-    if (!IsValid(m_ptr)) {
+    if (!IsValidFormCalcCharacter(m_ptr)) {
       m_LexerError = true;
       return p;
     }
@@ -183,7 +195,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
           return p;
         }
 
-        if (IsValid(m_ptr)) {
+        if (IsValidFormCalcCharacter(m_ptr)) {
           ch = *m_ptr;
           if (ch == '=') {
             p->m_type = TOKeq;
@@ -203,7 +215,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
           return p;
         }
 
-        if (IsValid(m_ptr)) {
+        if (IsValidFormCalcCharacter(m_ptr)) {
           ch = *m_ptr;
           if (ch == '=') {
             p->m_type = TOKle;
@@ -226,7 +238,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
           return p;
         }
 
-        if (IsValid(m_ptr)) {
+        if (IsValidFormCalcCharacter(m_ptr)) {
           ch = *m_ptr;
           if (ch == '=') {
             p->m_type = TOKge;
@@ -286,7 +298,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
           return p;
         }
 
-        if (!IsValid(m_ptr)) {
+        if (!IsValidFormCalcCharacter(m_ptr)) {
           ch = *m_ptr;
           m_LexerError = true;
           return p;
@@ -306,7 +318,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
           return p;
         }
 
-        if (IsValid(m_ptr)) {
+        if (IsValidFormCalcCharacter(m_ptr)) {
           ch = *m_ptr;
           if (ch == '.') {
             p->m_type = TOKdotdot;
@@ -336,6 +348,10 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
         ++m_ptr;
         break;
       default: {
+        if (!IsValidInitialIdentifierCharacter(m_ptr)) {
+          m_LexerError = true;
+          return p;
+        }
         m_ptr = Identifiers(p.get(), m_ptr);
         return p;
       }
@@ -368,7 +384,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
 
   uint16_t ch = *p;
   while (ch) {
-    if (!IsValid(p)) {
+    if (!IsValidFormCalcCharacter(p)) {
       ch = *p;
       t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
       m_LexerError = true;
@@ -389,7 +405,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
     if (p > m_end)
       break;
 
-    if (!IsValid(p)) {
+    if (!IsValidFormCalcCharacter(p)) {
       ch = *p;
       t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
       m_LexerError = true;
@@ -412,34 +428,15 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
 
 const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) {
   const wchar_t* pStart = p;
-  uint16_t ch = *p;
   ++p;
-  if (p > m_end) {
-    t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
-    t->m_type = IsKeyword(t->m_wstring);
-    return p;
-  }
-
-  if (!IsValid(p)) {
-    t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
-    m_LexerError = true;
-    return p;
-  }
-
-  ch = *p;
-  while (ch) {
-    if (!IsValid(p)) {
+  while (p <= m_end && *p) {
+    if (!IsValidFormCalcCharacter(p)) {
       t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
       m_LexerError = true;
       return p;
     }
 
-    ch = *p;
-    if (ch == 0 || ch == 0x0A || ch == 0x0D || ch == 0x09 || ch == 0x0B ||
-        ch == 0x0C || ch == 0x20 || ch == '.' || ch == ';' || ch == '"' ||
-        ch == '=' || ch == '<' || ch == '>' || ch == ',' || ch == '(' ||
-        ch == ')' || ch == ']' || ch == '[' || ch == '&' || ch == '|' ||
-        ch == '+' || ch == '-' || ch == '*' || ch == '/') {
+    if (!IsValidIdentifierCharacter(p)) {
       break;
     }
     ++p;
diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp
index 7ca12d2fad..5a8139d416 100644
--- a/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp
+++ b/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp
@@ -208,7 +208,7 @@ TEST(CXFA_FMLexerTest, Comments) {
   EXPECT_EQ(TOKeof, token->m_type);
 }
 
-TEST(CXFA_FMLexerTest, Identifiers) {
+TEST(CXFA_FMLexerTest, ValidIdentifiers) {
   std::vector<const wchar_t*> identifiers = {
       L"a", L"an_identifier", L"_ident", L"$ident", L"!ident", L"GetAddr"};
   for (const auto* ident : identifiers) {
@@ -219,6 +219,39 @@ TEST(CXFA_FMLexerTest, Identifiers) {
   }
 }
 
+TEST(CXFA_FMLexerTest, InvalidIdentifiers) {
+  auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"#a");
+  lexer->NextToken();
+  // TODO(rharrison): Add an expects for the return being nullptr here.
+  // See https://crbug.com/pdfium/814
+  EXPECT_TRUE(lexer->HasError());
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"1a");
+  lexer->NextToken();
+  // TODO(rharrison): Add an expects for the return being nullptr here.
+  // See https://crbug.com/pdfium/814
+  EXPECT_TRUE(lexer->HasError());
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"an@identifier");
+  lexer->NextToken();
+  EXPECT_FALSE(lexer->HasError());
+  lexer->NextToken();
+  // TODO(rharrison): Add an expects for the return being nullptr here.
+  // See https://crbug.com/pdfium/814
+  EXPECT_TRUE(lexer->HasError());
+  // TODO(rharrison): Add a test verifying that if another call to NextToken
+  // occurs, the error state is retained instead of continuing the parse.
+  // See https://crbug.com/pdfium/814
+
+  lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"_ident@");
+  lexer->NextToken();
+  EXPECT_FALSE(lexer->HasError());
+  lexer->NextToken();
+  // TODO(rharrison): Add an expects for the return being nullptr here.
+  // See https://crbug.com/pdfium/814
+  EXPECT_TRUE(lexer->HasError());
+}
+
 TEST(CXFA_FMLexerTest, Whitespace) {
   auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L" \t\xc\x9\xb");
   CXFA_FMToken* token = lexer->NextToken();
-- 
cgit v1.2.3