From 756023071d1c4574fcb433c4bc7f13e7b763f763 Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Tue, 18 Jul 2017 10:27:00 -0400 Subject: Correct lexer handling of FormCalc identifiers This makes the lexer stricter on valid characters for identifiers, and conform to the grammar in the FormCalc spec. This should remove a class of inputs that ClusterFuzz is attempting that are breaking later stages of the transpile. BUG: chromium:736234, pdfium:783, pdfium:784 Change-Id: I3987d6778a82b71d768fa751035993c0af2577ee Reviewed-on: https://pdfium-review.googlesource.com/8010 Commit-Queue: Ryan Harrison Reviewed-by: Tom Sepez --- xfa/fxfa/fm2js/DEPS | 3 ++ xfa/fxfa/fm2js/cxfa_fmlexer.cpp | 65 +++++++++++++++----------------- xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp | 35 ++++++++++++++++- 3 files changed, 68 insertions(+), 35 deletions(-) create mode 100644 xfa/fxfa/fm2js/DEPS (limited to 'xfa') diff --git a/xfa/fxfa/fm2js/DEPS b/xfa/fxfa/fm2js/DEPS new file mode 100644 index 0000000000..2be03524b6 --- /dev/null +++ b/xfa/fxfa/fm2js/DEPS @@ -0,0 +1,3 @@ +include_rules = [ + '+third_party/icu', +] diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp index 7537041bb7..908cd2f04a 100644 --- a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp +++ b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp @@ -1,4 +1,4 @@ -// Copyright 2014 PDFium Authors. All rights reserved. +// Copyright 2014 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -10,14 +10,26 @@ #include "core/fxcrt/fx_extension.h" #include "third_party/base/ptr_util.h" +#include "third_party/icu/source/common/unicode/uchar.h" namespace { -bool IsValid(const wchar_t* p) { +bool IsValidFormCalcCharacter(const wchar_t* p) { return *p == 0 || (*p >= 0x09 && *p <= 0x0D) || (*p >= 0x20 && *p <= 0xd7FF) || (*p >= 0xE000 && *p <= 0xFFFD); } +bool IsValidIdentifierCharacter(const wchar_t* p) { + return u_isalnum(*p) || *p == 0x005F || // '_' + *p == 0x0024; // '$' +} + +bool IsValidInitialIdentifierCharacter(const wchar_t* p) { + return u_isalpha(*p) || *p == 0x005F || // '_' + *p == 0x0024 || // '$' + *p == 0x0021; // '!' +} + const XFA_FMKeyword keyWords[] = { {TOKand, 0x00000026, L"&"}, {TOKlparen, 0x00000028, L"("}, @@ -118,9 +130,9 @@ CXFA_FMToken* CXFA_FMLexer::NextToken() { } std::unique_ptr CXFA_FMLexer::Scan() { - uint16_t ch = 0; + wchar_t ch = 0; auto p = pdfium::MakeUnique(m_uCurrentLine); - if (!IsValid(m_ptr)) { + if (!IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; m_LexerError = true; return p; @@ -136,7 +148,7 @@ std::unique_ptr CXFA_FMLexer::Scan() { } ch = *m_ptr; - if (!IsValid(m_ptr)) { + if (!IsValidFormCalcCharacter(m_ptr)) { m_LexerError = true; return p; } @@ -183,7 +195,7 @@ std::unique_ptr CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '=') { p->m_type = TOKeq; @@ -203,7 +215,7 @@ std::unique_ptr CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '=') { p->m_type = TOKle; @@ -226,7 +238,7 @@ std::unique_ptr CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '=') { p->m_type = TOKge; @@ -286,7 +298,7 @@ std::unique_ptr CXFA_FMLexer::Scan() { return p; } - if (!IsValid(m_ptr)) { + if (!IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; m_LexerError = true; return p; @@ -306,7 +318,7 @@ std::unique_ptr 
CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '.') { p->m_type = TOKdotdot; @@ -336,6 +348,10 @@ std::unique_ptr CXFA_FMLexer::Scan() { ++m_ptr; break; default: { + if (!IsValidInitialIdentifierCharacter(m_ptr)) { + m_LexerError = true; + return p; + } m_ptr = Identifiers(p.get(), m_ptr); return p; } @@ -368,7 +384,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { uint16_t ch = *p; while (ch) { - if (!IsValid(p)) { + if (!IsValidFormCalcCharacter(p)) { ch = *p; t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; @@ -389,7 +405,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { if (p > m_end) break; - if (!IsValid(p)) { + if (!IsValidFormCalcCharacter(p)) { ch = *p; t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; @@ -412,34 +428,15 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) { const wchar_t* pStart = p; - uint16_t ch = *p; ++p; - if (p > m_end) { - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); - t->m_type = IsKeyword(t->m_wstring); - return p; - } - - if (!IsValid(p)) { - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); - m_LexerError = true; - return p; - } - - ch = *p; - while (ch) { - if (!IsValid(p)) { + while (p <= m_end && *p) { + if (!IsValidFormCalcCharacter(p)) { t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; return p; } - ch = *p; - if (ch == 0 || ch == 0x0A || ch == 0x0D || ch == 0x09 || ch == 0x0B || - ch == 0x0C || ch == 0x20 || ch == '.' 
|| ch == ';' || ch == '"' || - ch == '=' || ch == '<' || ch == '>' || ch == ',' || ch == '(' || - ch == ')' || ch == ']' || ch == '[' || ch == '&' || ch == '|' || - ch == '+' || ch == '-' || ch == '*' || ch == '/') { + if (!IsValidIdentifierCharacter(p)) { break; } ++p; diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp index 7ca12d2fad..5a8139d416 100644 --- a/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp +++ b/xfa/fxfa/fm2js/cxfa_fmlexer_unittest.cpp @@ -208,7 +208,7 @@ TEST(CXFA_FMLexerTest, Comments) { EXPECT_EQ(TOKeof, token->m_type); } -TEST(CXFA_FMLexerTest, Identifiers) { +TEST(CXFA_FMLexerTest, ValidIdentifiers) { std::vector identifiers = { L"a", L"an_identifier", L"_ident", L"$ident", L"!ident", L"GetAddr"}; for (const auto* ident : identifiers) { @@ -219,6 +219,39 @@ TEST(CXFA_FMLexerTest, Identifiers) { } } +TEST(CXFA_FMLexerTest, InvalidIdentifiers) { + auto lexer = pdfium::MakeUnique(L"#a"); + lexer->NextToken(); + // TODO(rharrison): Add an expects for the return being nullptr here. + // See https://crbug.com/pdfium/814 + EXPECT_TRUE(lexer->HasError()); + + lexer = pdfium::MakeUnique(L"1a"); + lexer->NextToken(); + // TODO(rharrison): Add an expects for the return being nullptr here. + // See https://crbug.com/pdfium/814 + EXPECT_TRUE(lexer->HasError()); + + lexer = pdfium::MakeUnique(L"an@identifier"); + lexer->NextToken(); + EXPECT_FALSE(lexer->HasError()); + lexer->NextToken(); + // TODO(rharrison): Add an expects for the return being nullptr here. + // See https://crbug.com/pdfium/814 + EXPECT_TRUE(lexer->HasError()); + // TODO(rharrison): Add a test for if another call to NextToken occurs, + // the error state will be retained, instead of continuing the parse. + // See https://crbug.com/pdfium/814 + + lexer = pdfium::MakeUnique(L"_ident@"); + lexer->NextToken(); + EXPECT_FALSE(lexer->HasError()); + lexer->NextToken(); + // TODO(rharrison): Add an expects for the return being nullptr here. 
+ // See https://crbug.com/pdfium/814 + EXPECT_TRUE(lexer->HasError()); +} + TEST(CXFA_FMLexerTest, Whitespace) { auto lexer = pdfium::MakeUnique(L" \t\xc\x9\xb"); CXFA_FMToken* token = lexer->NextToken(); -- cgit v1.2.3