summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRyan Harrison <rharrison@chromium.org>2017-07-18 10:27:00 -0400
committerChromium commit bot <commit-bot@chromium.org>2017-07-18 14:48:37 +0000
commit756023071d1c4574fcb433c4bc7f13e7b763f763 (patch)
tree3388aed81fc470ddfff8afa7def359070c2ad086
parent574366b637c1e937efc7b1becb1d151c3599f7af (diff)
downloadpdfium-756023071d1c4574fcb433c4bc7f13e7b763f763.tar.xz
Correct lexer handling of FormCalc identifiers
This makes the lexer stricter on valid characters for identifiers, and conform to the grammar in the FormCalc spec. This should remove a class of inputs that ClusterFuzz is attempting that are breaking later stages of the transpile. BUG: chromium:736234, pdfium:783, pdfium:784 Change-Id: I3987d6778a82b71d768fa751035993c0af2577ee Reviewed-on: https://pdfium-review.googlesource.com/8010 Commit-Queue: Ryan Harrison <rharrison@chromium.org> Reviewed-by: Tom Sepez <tsepez@chromium.org>
-rw-r--r--xfa/fxfa/fm2js/DEPS3
-rw-r--r--xfa/fxfa/fm2js/cxfa_fmlexer.cpp65
-rw-r--r--xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp35
3 files changed, 68 insertions, 35 deletions
diff --git a/xfa/fxfa/fm2js/DEPS b/xfa/fxfa/fm2js/DEPS
new file mode 100644
index 0000000000..2be03524b6
--- /dev/null
+++ b/xfa/fxfa/fm2js/DEPS
@@ -0,0 +1,3 @@
+include_rules = [
+ '+third_party/icu',
+]
diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
index 7537041bb7..908cd2f04a 100644
--- a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
+++ b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
@@ -1,4 +1,4 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
+// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -10,14 +10,26 @@
#include "core/fxcrt/fx_extension.h"
#include "third_party/base/ptr_util.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
namespace {
-bool IsValid(const wchar_t* p) {
+bool IsValidFormCalcCharacter(const wchar_t* p) {
return *p == 0 || (*p >= 0x09 && *p <= 0x0D) ||
(*p >= 0x20 && *p <= 0xd7FF) || (*p >= 0xE000 && *p <= 0xFFFD);
}
+bool IsValidIdentifierCharacter(const wchar_t* p) {
+ return u_isalnum(*p) || *p == 0x005F || // '_'
+ *p == 0x0024; // '$'
+}
+
+bool IsValidInitialIdentifierCharacter(const wchar_t* p) {
+ return u_isalpha(*p) || *p == 0x005F || // '_'
+ *p == 0x0024 || // '$'
+ *p == 0x0021; // '!'
+}
+
const XFA_FMKeyword keyWords[] = {
{TOKand, 0x00000026, L"&"},
{TOKlparen, 0x00000028, L"("},
@@ -118,9 +130,9 @@ CXFA_FMToken* CXFA_FMLexer::NextToken() {
}
std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
- uint16_t ch = 0;
+ wchar_t ch = 0;
auto p = pdfium::MakeUnique<CXFA_FMToken>(m_uCurrentLine);
- if (!IsValid(m_ptr)) {
+ if (!IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
m_LexerError = true;
return p;
@@ -136,7 +148,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
}
ch = *m_ptr;
- if (!IsValid(m_ptr)) {
+ if (!IsValidFormCalcCharacter(m_ptr)) {
m_LexerError = true;
return p;
}
@@ -183,7 +195,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '=') {
p->m_type = TOKeq;
@@ -203,7 +215,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '=') {
p->m_type = TOKle;
@@ -226,7 +238,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '=') {
p->m_type = TOKge;
@@ -286,7 +298,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (!IsValid(m_ptr)) {
+ if (!IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
m_LexerError = true;
return p;
@@ -306,7 +318,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '.') {
p->m_type = TOKdotdot;
@@ -336,6 +348,10 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
++m_ptr;
break;
default: {
+ if (!IsValidInitialIdentifierCharacter(m_ptr)) {
+ m_LexerError = true;
+ return p;
+ }
m_ptr = Identifiers(p.get(), m_ptr);
return p;
}
@@ -368,7 +384,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
uint16_t ch = *p;
while (ch) {
- if (!IsValid(p)) {
+ if (!IsValidFormCalcCharacter(p)) {
ch = *p;
t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
m_LexerError = true;
@@ -389,7 +405,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
if (p > m_end)
break;
- if (!IsValid(p)) {
+ if (!IsValidFormCalcCharacter(p)) {
ch = *p;
t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
m_LexerError = true;
@@ -412,34 +428,15 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) {
const wchar_t* pStart = p;
- uint16_t ch = *p;
++p;
- if (p > m_end) {
- t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
- t->m_type = IsKeyword(t->m_wstring);
- return p;
- }
-
- if (!IsValid(p)) {
- t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
- m_LexerError = true;
- return p;
- }
-
- ch = *p;
- while (ch) {
- if (!IsValid(p)) {
+ while (p <= m_end && *p) {
+ if (!IsValidFormCalcCharacter(p)) {
t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
m_LexerError = true;
return p;
}
- ch = *p;
- if (ch == 0 || ch == 0x0A || ch == 0x0D || ch == 0x09 || ch == 0x0B ||
- ch == 0x0C || ch == 0x20 || ch == '.' || ch == ';' || ch == '"' ||
- ch == '=' || ch == '<' || ch == '>' || ch == ',' || ch == '(' ||
- ch == ')' || ch == ']' || ch == '[' || ch == '&' || ch == '|' ||
- ch == '+' || ch == '-' || ch == '*' || ch == '/') {
+ if (!IsValidIdentifierCharacter(p)) {
break;
}
++p;
diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp
index 7ca12d2fad..5a8139d416 100644
--- a/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp
+++ b/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp
@@ -208,7 +208,7 @@ TEST(CXFA_FMLexerTest, Comments) {
EXPECT_EQ(TOKeof, token->m_type);
}
-TEST(CXFA_FMLexerTest, Identifiers) {
+TEST(CXFA_FMLexerTest, ValidIdentifiers) {
std::vector<const wchar_t*> identifiers = {
L"a", L"an_identifier", L"_ident", L"$ident", L"!ident", L"GetAddr"};
for (const auto* ident : identifiers) {
@@ -219,6 +219,39 @@ TEST(CXFA_FMLexerTest, Identifiers) {
}
}
+TEST(CXFA_FMLexerTest, InvalidIdentifiers) {
+ auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"#a");
+ lexer->NextToken();
+ // TODO(rharrison): Add an expects for the return being nullptr here.
+ // See https://crbug.com/pdfium/814
+ EXPECT_TRUE(lexer->HasError());
+
+ lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"1a");
+ lexer->NextToken();
+ // TODO(rharrison): Add an expects for the return being nullptr here.
+ // See https://crbug.com/pdfium/814
+ EXPECT_TRUE(lexer->HasError());
+
+ lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"an@identifier");
+ lexer->NextToken();
+ EXPECT_FALSE(lexer->HasError());
+ lexer->NextToken();
+ // TODO(rharrison): Add an expects for the return being nullptr here.
+ // See https://crbug.com/pdfium/814
+ EXPECT_TRUE(lexer->HasError());
+ // TODO(rharrison): Add a test that if another call to NextToken occurs,
+ // the error state is retained, instead of continuing the parse.
+ // See https://crbug.com/pdfium/814
+
+ lexer = pdfium::MakeUnique<CXFA_FMLexer>(L"_ident@");
+ lexer->NextToken();
+ EXPECT_FALSE(lexer->HasError());
+ lexer->NextToken();
+ // TODO(rharrison): Add an expects for the return being nullptr here.
+ // See https://crbug.com/pdfium/814
+ EXPECT_TRUE(lexer->HasError());
+}
+
TEST(CXFA_FMLexerTest, Whitespace) {
auto lexer = pdfium::MakeUnique<CXFA_FMLexer>(L" \t\xc\x9\xb");
CXFA_FMToken* token = lexer->NextToken();