From 585f4f0828d2bef2ea132a19e28f21af67bb6bdd Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Mon, 24 Jul 2017 14:30:50 -0400 Subject: Remove extracting characters from input Throughout the lexer there is use of a pattern where a character is extracted from the location currently being pointed at in the input and tests are done on this value. This is uneeded, since normally the pointer isn't being modified while the character is stored, so the pointer can be dereferenced directly. This CL also cleans ups the implementation of string extraction from the input to be easier to read, since it did depend on the character being extracted to work. The IsFooCharacter methods are also changed to take in a character instead of a pointer, since they are only testing the character value not the pointer. BUG=pdfium:814 Change-Id: I8eda01e0af59ff1f6c6e97d6fb504856c7a08a24 Reviewed-on: https://pdfium-review.googlesource.com/8690 Commit-Queue: Ryan Harrison Reviewed-by: dsinclair Reviewed-by: Tom Sepez --- xfa/fxfa/fm2js/cxfa_fmlexer.cpp | 316 ++++++++++++++++------------------------ xfa/fxfa/fm2js/cxfa_fmlexer.h | 2 +- 2 files changed, 127 insertions(+), 191 deletions(-) diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp index 6b8508a503..0b94cde944 100644 --- a/xfa/fxfa/fm2js/cxfa_fmlexer.cpp +++ b/xfa/fxfa/fm2js/cxfa_fmlexer.cpp @@ -14,20 +14,20 @@ namespace { -bool IsValidFormCalcCharacter(const wchar_t* p) { - return *p == 0 || (*p >= 0x09 && *p <= 0x0D) || - (*p >= 0x20 && *p <= 0xd7FF) || (*p >= 0xE000 && *p <= 0xFFFD); +bool IsValidFormCalcCharacter(wchar_t c) { + return c == 0 || (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0xd7FF) || + (c >= 0xE000 && c <= 0xFFFD); } -bool IsValidIdentifierCharacter(const wchar_t* p) { - return u_isalnum(*p) || *p == 0x005F || // '_' - *p == 0x0024; // '$' +bool IsValidIdentifierCharacter(wchar_t c) { + return u_isalnum(c) || c == 0x005F || // '_' + c == 0x0024; // '$' } -bool IsValidInitialIdentifierCharacter(const wchar_t* p) { - return u_isalpha(*p) || *p == 0x005F || // '_' - *p == 0x0024 || // '$' - *p == 0x0021; // '!' +bool IsValidInitialIdentifierCharacter(wchar_t c) { + return u_isalpha(c) || c == 0x005F || // '_' + c == 0x0024 || // '$' + c == 0x0021; // '!' } const XFA_FMKeyword keyWords[] = { @@ -119,47 +119,17 @@ CXFA_FMLexer::CXFA_FMLexer(const CFX_WideStringC& wsFormCalc) CXFA_FMLexer::~CXFA_FMLexer() {} CXFA_FMToken* CXFA_FMLexer::NextToken() { - // Make sure we don't walk off the end of the string. - if (m_ptr > m_end) { - m_pToken = pdfium::MakeUnique(m_uCurrentLine); - m_pToken->m_type = TOKeof; - } else { - m_pToken = Scan(); - } - return m_pToken.get(); -} - -std::unique_ptr CXFA_FMLexer::Scan() { - wchar_t ch = 0; - auto p = pdfium::MakeUnique(m_uCurrentLine); - if (!IsValidFormCalcCharacter(m_ptr)) { - ch = *m_ptr; - m_LexerError = true; - return p; - } - - while (1) { - // Make sure we don't walk off the end of the string. If we don't currently - // have a token type then mark it EOF. - if (m_ptr > m_end) { - if (p->m_type == TOKreserver) - p->m_type = TOKeof; - return p; - } - - ch = *m_ptr; - if (!IsValidFormCalcCharacter(m_ptr)) { + m_pToken = pdfium::MakeUnique(m_uCurrentLine); + while (m_ptr <= m_end && *m_ptr) { + if (!IsValidFormCalcCharacter(*m_ptr)) { m_LexerError = true; - return p; + return m_pToken.get(); } - switch (ch) { - case 0: - p->m_type = TOKeof; - return p; + switch (*m_ptr) { case 0x0A: ++m_uCurrentLine; - p->m_uLinenum = m_uCurrentLine; + m_pToken->m_uLinenum = m_uCurrentLine; ++m_ptr; break; case 0x0D: @@ -170,9 +140,9 @@ std::unique_ptr CXFA_FMLexer::Scan() { break; } case '"': { - p->m_type = TOKstring; - m_ptr = String(p.get(), m_ptr); - return p; + m_pToken->m_type = TOKstring; + m_ptr = String(m_pToken.get(), m_ptr); + return m_pToken.get(); } case '0': case '1': @@ -184,129 +154,121 @@ std::unique_ptr CXFA_FMLexer::Scan() { case '7': case '8': case '9': { - p->m_type = TOKnumber; - m_ptr = Number(p.get(), m_ptr); - return p; + m_pToken->m_type = TOKnumber; + m_ptr = Number(m_pToken.get(), m_ptr); + return m_pToken.get(); } case '=': ++m_ptr; if (m_ptr > m_end) { - p->m_type = TOKassign; - return p; + m_pToken->m_type = TOKassign; + return m_pToken.get(); } - if (IsValidFormCalcCharacter(m_ptr)) { - ch = *m_ptr; - if (ch == '=') { - p->m_type = TOKeq; + if (IsValidFormCalcCharacter(*m_ptr)) { + if (*m_ptr == '=') { + m_pToken->m_type = TOKeq; ++m_ptr; } else { - p->m_type = TOKassign; + m_pToken->m_type = TOKassign; } } else { - ch = *m_ptr; m_LexerError = true; } - return p; + return m_pToken.get(); case '<': ++m_ptr; if (m_ptr > m_end) { - p->m_type = TOKlt; - return p; + m_pToken->m_type = TOKlt; + return m_pToken.get(); } - if (IsValidFormCalcCharacter(m_ptr)) { - ch = *m_ptr; - if (ch == '=') { - p->m_type = TOKle; + if (IsValidFormCalcCharacter(*m_ptr)) { + if (*m_ptr == '=') { + m_pToken->m_type = TOKle; ++m_ptr; - } else if (ch == '>') { - p->m_type = TOKne; + } else if (*m_ptr == '>') { + m_pToken->m_type = TOKne; ++m_ptr; } else { - p->m_type = TOKlt; + m_pToken->m_type = TOKlt; } } else { - ch = *m_ptr; m_LexerError = true; } - return p; + return m_pToken.get(); case '>': ++m_ptr; if (m_ptr > m_end) { - p->m_type = TOKgt; - return p; + m_pToken->m_type = TOKgt; + return m_pToken.get(); } - if (IsValidFormCalcCharacter(m_ptr)) { - ch = *m_ptr; - if (ch == '=') { - p->m_type = TOKge; + if (IsValidFormCalcCharacter(*m_ptr)) { + if (*m_ptr == '=') { + m_pToken->m_type = TOKge; ++m_ptr; } else { - p->m_type = TOKgt; + m_pToken->m_type = TOKgt; } } else { - ch = *m_ptr; m_LexerError = true; } - return p; + return m_pToken.get(); case ',': - p->m_type = TOKcomma; + m_pToken->m_type = TOKcomma; ++m_ptr; - return p; + return m_pToken.get(); case '(': - p->m_type = TOKlparen; + m_pToken->m_type = TOKlparen; ++m_ptr; - return p; + return m_pToken.get(); case ')': - p->m_type = TOKrparen; + m_pToken->m_type = TOKrparen; ++m_ptr; - return p; + return m_pToken.get(); case '[': - p->m_type = TOKlbracket; + m_pToken->m_type = TOKlbracket; ++m_ptr; - return p; + return m_pToken.get(); case ']': - p->m_type = TOKrbracket; + m_pToken->m_type = TOKrbracket; ++m_ptr; - return p; + return m_pToken.get(); case '&': ++m_ptr; - p->m_type = TOKand; - return p; + m_pToken->m_type = TOKand; + return m_pToken.get(); case '|': ++m_ptr; - p->m_type = TOKor; - return p; + m_pToken->m_type = TOKor; + return m_pToken.get(); case '+': ++m_ptr; - p->m_type = TOKplus; - return p; + m_pToken->m_type = TOKplus; + return m_pToken.get(); case '-': ++m_ptr; - p->m_type = TOKminus; - return p; + m_pToken->m_type = TOKminus; + return m_pToken.get(); case '*': ++m_ptr; - p->m_type = TOKmul; - return p; + m_pToken->m_type = TOKmul; + return m_pToken.get(); case '/': { ++m_ptr; if (m_ptr > m_end) { - p->m_type = TOKdiv; - return p; + m_pToken->m_type = TOKdiv; + return m_pToken.get(); } - if (!IsValidFormCalcCharacter(m_ptr)) { - ch = *m_ptr; + if (!IsValidFormCalcCharacter(*m_ptr)) { m_LexerError = true; - return p; + return m_pToken.get(); } - ch = *m_ptr; - if (ch != '/') { - p->m_type = TOKdiv; - return p; + if (*m_ptr != '/') { + m_pToken->m_type = TOKdiv; + return m_pToken.get(); } m_ptr = Comment(m_ptr); break; @@ -314,33 +276,31 @@ std::unique_ptr CXFA_FMLexer::Scan() { case '.': ++m_ptr; if (m_ptr > m_end) { - p->m_type = TOKdot; - return p; + m_pToken->m_type = TOKdot; + return m_pToken.get(); } - if (IsValidFormCalcCharacter(m_ptr)) { - ch = *m_ptr; - if (ch == '.') { - p->m_type = TOKdotdot; + if (IsValidFormCalcCharacter(*m_ptr)) { + if (*m_ptr == '.') { + m_pToken->m_type = TOKdotdot; ++m_ptr; - } else if (ch == '*') { - p->m_type = TOKdotstar; + } else if (*m_ptr == '*') { + m_pToken->m_type = TOKdotstar; ++m_ptr; - } else if (ch == '#') { - p->m_type = TOKdotscream; + } else if (*m_ptr == '#') { + m_pToken->m_type = TOKdotscream; ++m_ptr; - } else if (ch <= '9' && ch >= '0') { - p->m_type = TOKnumber; + } else if (*m_ptr <= '9' && *m_ptr >= '0') { + m_pToken->m_type = TOKnumber; --m_ptr; - m_ptr = Number(p.get(), m_ptr); + m_ptr = Number(m_pToken.get(), m_ptr); } else { - p->m_type = TOKdot; + m_pToken->m_type = TOKdot; } } else { - ch = *m_ptr; m_LexerError = true; } - return p; + return m_pToken.get(); case 0x09: case 0x0B: case 0x0C: @@ -348,15 +308,20 @@ std::unique_ptr CXFA_FMLexer::Scan() { ++m_ptr; break; default: { - if (!IsValidInitialIdentifierCharacter(m_ptr)) { + if (!IsValidInitialIdentifierCharacter(*m_ptr)) { m_LexerError = true; - return p; + return m_pToken.get(); } - m_ptr = Identifiers(p.get(), m_ptr); - return p; + m_ptr = Identifiers(m_pToken.get(), m_ptr); + return m_pToken.get(); } } } + + // If there isn't currently a token type then mark it EOF. + if (m_pToken->m_type == TOKreserver) + m_pToken->m_type = TOKeof; + return m_pToken.get(); } const wchar_t* CXFA_FMLexer::Number(CXFA_FMToken* t, const wchar_t* p) { @@ -374,55 +339,36 @@ const wchar_t* CXFA_FMLexer::Number(CXFA_FMToken* t, const wchar_t* p) { } const wchar_t* CXFA_FMLexer::String(CXFA_FMToken* t, const wchar_t* p) { - const wchar_t* pStart = p; - + const wchar_t* start = p; ++p; - if (p > m_end) { - m_LexerError = true; - return p; - } - - uint16_t ch = *p; - while (ch) { - if (!IsValidFormCalcCharacter(p)) { - ch = *p; - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); - m_LexerError = true; - return p; - } + while (p <= m_end && *p) { + if (!IsValidFormCalcCharacter(*p)) + break; - ++p; - if (ch != '"') { - // We've hit the end of the input, return the string. + if (*p == '"') { + // Check for escaped "s, i.e. "". + ++p; + // If the end of the input has been reached it was not escaped. if (p > m_end) { - m_LexerError = true; + t->m_wstring = CFX_WideStringC(start, (p - start)); + return p; + } + // If the next character is not a " then the end of the string has been + // found. + if (*p != '"') { + if (!IsValidFormCalcCharacter(*p)) { + break; + } + t->m_wstring = CFX_WideStringC(start, (p - start)); return p; } - ch = *p; - continue; - } - // We've hit the end of the input, return the string. - if (p > m_end) - break; - - if (!IsValidFormCalcCharacter(p)) { - ch = *p; - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); - m_LexerError = true; - return p; } - ch = *p; - if (ch != '"') - break; - ++p; - if (p > m_end) { - m_LexerError = true; - return p; - } - ch = *p; } - t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); + + // Didn't find the end of the string. + t->m_wstring = CFX_WideStringC(start, (p - start)); + m_LexerError = true; return p; } @@ -430,18 +376,16 @@ const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) { const wchar_t* pStart = p; ++p; while (p <= m_end && *p) { - if (!IsValidFormCalcCharacter(p)) { + if (!IsValidFormCalcCharacter(*p)) { t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); m_LexerError = true; return p; } - if (!IsValidIdentifierCharacter(p)) { + if (!IsValidIdentifierCharacter(*p)) { break; } ++p; - if (p > m_end) - break; } t->m_wstring = CFX_WideStringC(pStart, (p - pStart)); t->m_type = IsKeyword(t->m_wstring); @@ -449,23 +393,15 @@ const wchar_t* CXFA_FMLexer::Identifiers(CXFA_FMToken* t, const wchar_t* p) { } const wchar_t* CXFA_FMLexer::Comment(const wchar_t* p) { - ++p; - - if (p > m_end) - return p; - - unsigned ch = *p; - while (ch) { - ++p; - if (ch == L'\r') - return p; - if (ch == L'\n') { + p++; + while (p <= m_end && *p) { + if (*p == L'\r') + return ++p; + if (*p == L'\n') { ++m_uCurrentLine; - return p; + return ++p; } - if (p > m_end) - return p; - ch = *p; + ++p; } return p; } diff --git a/xfa/fxfa/fm2js/cxfa_fmlexer.h b/xfa/fxfa/fm2js/cxfa_fmlexer.h index 3b5b3727e7..11076f8d72 100644 --- a/xfa/fxfa/fm2js/cxfa_fmlexer.h +++ b/xfa/fxfa/fm2js/cxfa_fmlexer.h @@ -123,8 +123,8 @@ class CXFA_FMLexer { const wchar_t* String(CXFA_FMToken* t, const wchar_t* p); const wchar_t* Identifiers(CXFA_FMToken* t, const wchar_t* p); const wchar_t* Comment(const wchar_t* p); + XFA_FM_TOKEN IsKeyword(const CFX_WideStringC& p); - std::unique_ptr Scan(); const wchar_t* m_ptr; const wchar_t* const m_end; -- cgit v1.2.3