summary | refs | log | tree | commit | diff
path: root/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
diff options
context:
space:
mode:
author Ryan Harrison <rharrison@chromium.org> 2017-07-18 10:27:00 -0400
committer Chromium commit bot <commit-bot@chromium.org> 2017-07-18 14:48:37 +0000
commit 756023071d1c4574fcb433c4bc7f13e7b763f763 (patch)
tree 3388aed81fc470ddfff8afa7def359070c2ad086 /xfa/fxfa/fm2js/cxfa_fmlexer.cpp
parent 574366b637c1e937efc7b1becb1d151c3599f7af (diff)
download pdfium-756023071d1c4574fcb433c4bc7f13e7b763f763.tar.xz
Correct lexer handling of FormCalc identifiers
This makes the lexer stricter about which characters are valid in identifiers, so that it conforms to the grammar in the FormCalc spec. This should eliminate a class of ClusterFuzz-generated inputs that break later stages of the transpile.

BUG: chromium:736234, pdfium:783, pdfium:784
Change-Id: I3987d6778a82b71d768fa751035993c0af2577ee
Reviewed-on: https://pdfium-review.googlesource.com/8010
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Diffstat (limited to 'xfa/fxfa/fm2js/cxfa_fmlexer.cpp')
-rw-r--r-- xfa/fxfa/fm2js/cxfa_fmlexer.cpp 65
1 files changed, 31 insertions, 34 deletions
diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
index 7537041bb7..908cd2f04a 100644
--- a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
+++ b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp
@@ -1,4 +1,4 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
+// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -10,14 +10,26 @@
#include "core/fxcrt/fx_extension.h"
#include "third_party/base/ptr_util.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
namespace {
-bool IsValid(const wchar_t* p) {
+bool IsValidFormCalcCharacter(const wchar_t* p) {
return *p == 0 || (*p >= 0x09 && *p <= 0x0D) ||
(*p >= 0x20 && *p <= 0xd7FF) || (*p >= 0xE000 && *p <= 0xFFFD);
}
+bool IsValidIdentifierCharacter(const wchar_t* p) {
+ return u_isalnum(*p) || *p == 0x005F || // '_'
+ *p == 0x0024; // '$'
+}
+
+bool IsValidInitialIdentifierCharacter(const wchar_t* p) {
+ return u_isalpha(*p) || *p == 0x005F || // '_'
+ *p == 0x0024 || // '$'
+ *p == 0x0021; // '!'
+}
+
const XFA_FMKeyword keyWords[] = {
{TOKand, 0x00000026, L"&"},
{TOKlparen, 0x00000028, L"("},
@@ -118,9 +130,9 @@ CXFA_FMToken* CXFA_FMLexer::NextToken() {
}
std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
- uint16_t ch = 0;
+ wchar_t ch = 0;
auto p = pdfium::MakeUnique<CXFA_FMToken>(m_uCurrentLine);
- if (!IsValid(m_ptr)) {
+ if (!IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
m_LexerError = true;
return p;
@@ -136,7 +148,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
}
ch = *m_ptr;
- if (!IsValid(m_ptr)) {
+ if (!IsValidFormCalcCharacter(m_ptr)) {
m_LexerError = true;
return p;
}
@@ -183,7 +195,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '=') {
p->m_type = TOKeq;
@@ -203,7 +215,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '=') {
p->m_type = TOKle;
@@ -226,7 +238,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '=') {
p->m_type = TOKge;
@@ -286,7 +298,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (!IsValid(m_ptr)) {
+ if (!IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
m_LexerError = true;
return p;
@@ -306,7 +318,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
return p;
}
- if (IsValid(m_ptr)) {
+ if (IsValidFormCalcCharacter(m_ptr)) {
ch = *m_ptr;
if (ch == '.') {
p->m_type = TOKdotdot;
@@ -336,6 +348,10 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() {
++m_ptr;
break;
default: {
+ if (!IsValidInitialIdentifierCharacter(m_ptr)) {
+ m_LexerError = true;
+ return p;
+ }
m_ptr = Identifiers(p.get(), m_ptr);
return p;
}
@@ -368,7 +384,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
uint16_t ch = *p;
while (ch) {
- if (!IsValid(p)) {
+ if (!IsValidFormCalcCharacter(p)) {
ch = *p;
t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
m_LexerError = true;
@@ -389,7 +405,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
if (p > m_end)
break;
- if (!IsValid(p)) {
+ if (!IsValidFormCalcCharacter(p)) {
ch = *p;
t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
m_LexerError = true;
@@ -412,34 +428,15 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) {
const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) {
const wchar_t* pStart = p;
- uint16_t ch = *p;
++p;
- if (p > m_end) {
- t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
- t->m_type = IsKeyword(t->m_wstring);
- return p;
- }
-
- if (!IsValid(p)) {
- t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
- m_LexerError = true;
- return p;
- }
-
- ch = *p;
- while (ch) {
- if (!IsValid(p)) {
+ while (p <= m_end && *p) {
+ if (!IsValidFormCalcCharacter(p)) {
t->m_wstring = CFX_WideStringC(pStart, (p - pStart));
m_LexerError = true;
return p;
}
- ch = *p;
- if (ch == 0 || ch == 0x0A || ch == 0x0D || ch == 0x09 || ch == 0x0B ||
- ch == 0x0C || ch == 0x20 || ch == '.' || ch == ';' || ch == '"' ||
- ch == '=' || ch == '<' || ch == '>' || ch == ',' || ch == '(' ||
- ch == ')' || ch == ']' || ch == '[' || ch == '&' || ch == '|' ||
- ch == '+' || ch == '-' || ch == '*' || ch == '/') {
+ if (!IsValidIdentifierCharacter(p)) {
break;
}
++p;