diff options
author | Ryan Harrison <rharrison@chromium.org> | 2017-07-18 10:27:00 -0400 |
---|---|---|
committer | Chromium commit bot <commit-bot@chromium.org> | 2017-07-18 14:48:37 +0000 |
commit | 756023071d1c4574fcb433c4bc7f13e7b763f763 (patch) | |
tree | 3388aed81fc470ddfff8afa7def359070c2ad086 /xfa/fxfa/fm2js/cxfa_fmlexer.cpp | |
parent | 574366b637c1e937efc7b1becb1d151c3599f7af (diff) | |
download | pdfium-756023071d1c4574fcb433c4bc7f13e7b763f763.tar.xz |
Correct lexer handling of FormCalc identifiers
This makes the lexer stricter on valid characters for identifiers, and
conform to the grammar in the FormCalc spec. This should remove a
class of inputs that ClusterFuzz is attempting that are breaking later
stages of the transpile.
BUG: chromium:736234, pdfium:783, pdfium:784
Change-Id: I3987d6778a82b71d768fa751035993c0af2577ee
Reviewed-on: https://pdfium-review.googlesource.com/8010
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Diffstat (limited to 'xfa/fxfa/fm2js/cxfa_fmlexer.cpp')
-rw-r--r-- | xfa/fxfa/fm2js/cxfa_fmlexer.cpp | 65 |
1 files changed, 31 insertions, 34 deletions
diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp index 7537041bb7..908cd2f04a 100644 --- a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp +++ b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp @@ -1,4 +1,4 @@ -// Copyright 2014 PDFium Authors. All rights reserved. +// Copright 2014 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -10,14 +10,26 @@ #include "core/fxcrt/fx_extension.h" #include "third_party/base/ptr_util.h" +#include "third_party/icu/source/common/unicode/uchar.h" namespace { -bool IsValid(const wchar_t* p) { +bool IsValidFormCalcCharacter(const wchar_t* p) { return *p == 0 || (*p >= 0x09 && *p <= 0x0D) || (*p >= 0x20 && *p <= 0xd7FF) || (*p >= 0xE000 && *p <= 0xFFFD); } +bool IsValidIdentifierCharacter(const wchar_t* p) { + return u_isalnum(*p) || *p == 0x005F || // '_' + *p == 0x0024; // '$' +} + +bool IsValidInitialIdentifierCharacter(const wchar_t* p) { + return u_isalpha(*p) || *p == 0x005F || // '_' + *p == 0x0024 || // '$' + *p == 0x0021; // '!' +} + const XFA_FMKeyword keyWords[] = { {TOKand, 0x00000026, L"&"}, {TOKlparen, 0x00000028, L"("}, @@ -118,9 +130,9 @@ CXFA_FMToken* CXFA_FMLexer::NextToken() { } std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { - uint16_t ch = 0; + wchar_t ch = 0; auto p = pdfium::MakeUnique<CXFA_FMToken>(m_uCurrentLine); - if (!IsValid(m_ptr)) { + if (!IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; m_LexerError = true; return p; @@ -136,7 +148,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { } ch = *m_ptr; - if (!IsValid(m_ptr)) { + if (!IsValidFormCalcCharacter(m_ptr)) { m_LexerError = true; return p; } @@ -183,7 +195,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '=') { p->m_type = TOKeq; @@ -203,7 +215,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '=') { p->m_type = TOKle; @@ -226,7 +238,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '=') { p->m_type = TOKge; @@ -286,7 +298,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { return p; } - if (!IsValid(m_ptr)) { + if (!IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; m_LexerError = true; return p; @@ -306,7 +318,7 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { return p; } - if (IsValid(m_ptr)) { + if (IsValidFormCalcCharacter(m_ptr)) { ch = *m_ptr; if (ch == '.') { p->m_type = TOKdotdot; @@ -336,6 +348,10 @@ std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::Scan() { ++m_ptr; break; default: { + if (!IsValidInitialIdentifierCharacter(m_ptr)) { + m_LexerError = true; + return p; + } m_ptr = Identifiers(p.get(), m_ptr); return p; } @@ -368,7 +384,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { uint16_t ch = *p; while (ch) { - if (!IsValid(p)) { + if (!IsValidFormCalcCharacter(p)) { ch = *p; t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; @@ -389,7 +405,7 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { if (p > m_end) break; - if (!IsValid(p)) { + if (!IsValidFormCalcCharacter(p)) { ch = *p; t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; @@ -412,34 +428,15 @@ const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) { const wchar_t* pStart = p; - uint16_t ch = *p; ++p; - if (p > m_end) { - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); - t->m_type = IsKeyword(t->m_wstring); - return p; - } - - if (!IsValid(p)) { - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); - m_LexerError = true; - return p; - } - - ch = *p; - while (ch) { - if (!IsValid(p)) { + while (p <= m_end && *p) { + if (!IsValidFormCalcCharacter(p)) { t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; return p; } - ch = *p; - if (ch == 0 || ch == 0x0A || ch == 0x0D || ch == 0x09 || ch == 0x0B || - ch == 0x0C || ch == 0x20 || ch == '.' || ch == ';' || ch == '"' || - ch == '=' || ch == '<' || ch == '>' || ch == ',' || ch == '(' || - ch == ')' || ch == ']' || ch == '[' || ch == '&' || ch == '|' || - ch == '+' || ch == '-' || ch == '*' || ch == '/') { + if (!IsValidIdentifierCharacter(p)) { break; } ++p; |