From 69472875a28a4e2d40623893e029af129f5e88e2 Mon Sep 17 00:00:00 2001
From: Dan Sinclair <dsinclair@chromium.org>
Date: Wed, 28 Oct 2015 10:20:35 -0400
Subject: Merge to XFA: Add helpers to check the PDF_CharType.

This CL adds helpers to provide more descriptive access to
PDF_CharType.

TBR=thestig@chromium.org

Review URL: https://codereview.chromium.org/1407913004 .

(cherry picked from commit e3e5675bcdd26b8df7286e10a42d585df6d2321d)

Review URL: https://codereview.chromium.org/1419893004 .
---
 .../src/fpdfapi/fpdf_parser/fpdf_parser_parser.cpp | 196 +++++++++------------
 .../fpdfapi/fpdf_parser/fpdf_parser_utility.cpp    |  77 ++++----
 2 files changed, 122 insertions(+), 151 deletions(-)

(limited to 'core/src/fpdfapi/fpdf_parser')

diff --git a/core/src/fpdfapi/fpdf_parser/fpdf_parser_parser.cpp b/core/src/fpdfapi/fpdf_parser/fpdf_parser_parser.cpp
index c1b78f1d81..e8842888c8 100644
--- a/core/src/fpdfapi/fpdf_parser/fpdf_parser_parser.cpp
+++ b/core/src/fpdfapi/fpdf_parser/fpdf_parser_parser.cpp
@@ -630,7 +630,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
       uint8_t byte = buffer[i];
       switch (status) {
         case 0:
-          if (PDF_CharType[byte] == 'W') {
+          if (PDFCharIsWhitespace(byte)) {
             status = 1;
           }
           if (byte <= '9' && byte >= '0') {
@@ -658,7 +658,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
           }
           break;
         case 1:
-          if (PDF_CharType[byte] == 'W') {
+          if (PDFCharIsWhitespace(byte)) {
             break;
           } else if (byte <= '9' && byte >= '0') {
             start_pos = pos + i;
@@ -679,7 +679,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
           if (byte <= '9' && byte >= '0') {
             objnum = objnum * 10 + byte - '0';
             break;
-          } else if (PDF_CharType[byte] == 'W') {
+          } else if (PDFCharIsWhitespace(byte)) {
             status = 3;
           } else {
             --i;
@@ -692,7 +692,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
             start_pos1 = pos + i;
             status = 4;
             gennum = byte - '0';
-          } else if (PDF_CharType[byte] == 'W') {
+          } else if (PDFCharIsWhitespace(byte)) {
             break;
           } else if (byte == 't') {
             status = 7;
@@ -706,7 +706,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
           if (byte <= '9' && byte >= '0') {
             gennum = gennum * 10 + byte - '0';
             break;
-          } else if (PDF_CharType[byte] == 'W') {
+          } else if (PDFCharIsWhitespace(byte)) {
             status = 5;
           } else {
             --i;
@@ -717,7 +717,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
           if (byte == 'o') {
             status = 6;
             inside_index = 1;
-          } else if (PDF_CharType[byte] == 'W') {
+          } else if (PDFCharIsWhitespace(byte)) {
             break;
           } else if (byte <= '9' && byte >= '0') {
             objnum = gennum;
@@ -752,7 +752,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
               }
               break;
             case 3:
-              if (PDF_CharType[byte] == 'W' || PDF_CharType[byte] == 'D') {
+              if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) {
                 if (objnum > 0x1000000) {
                   status = 0;
                   break;
@@ -826,7 +826,7 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
           break;
         case 7:
           if (inside_index == 7) {
-            if (PDF_CharType[byte] == 'W' || PDF_CharType[byte] == 'D') {
+            if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) {
               last_trailer = pos + i - 7;
               m_Syntax.RestorePos(pos + i - m_Syntax.m_HeaderOffset);
               CPDF_Object* pObj = m_Syntax.GetObject(m_pDocument, 0, 0, 0);
@@ -937,13 +937,13 @@ FX_BOOL CPDF_Parser::RebuildCrossRef() {
           status = 0;
           break;
         case 13:
-          if (PDF_CharType[byte] == 'D' || PDF_CharType[byte] == 'W') {
+          if (PDFCharIsDelimiter(byte) || PDFCharIsWhitespace(byte)) {
             --i;
             status = 0;
           }
           break;
         case 14:
-          if (PDF_CharType[byte] == 'W') {
+          if (PDFCharIsWhitespace(byte)) {
             status = 0;
           } else if (byte == '%' || byte == '(' || byte == '<' ||
                      byte == '\\') {
@@ -1646,15 +1646,13 @@ FX_DWORD CPDF_Parser::LoadLinearizedMainXRefTable() {
   uint8_t ch = 0;
   FX_DWORD dwCount = 0;
   m_Syntax.GetNextChar(ch);
-  int32_t type = PDF_CharType[ch];
-  while (type == 'W') {
+  while (PDFCharIsWhitespace(ch)) {
     ++dwCount;
     if (m_Syntax.m_FileLen >=
         (FX_FILESIZE)(m_Syntax.SavePos() + m_Syntax.m_HeaderOffset)) {
       break;
     }
     m_Syntax.GetNextChar(ch);
-    type = PDF_CharType[ch];
   }
   m_LastXRefOffset += dwCount;
   FX_POSITION pos = m_ObjectStreamMap.GetStartPosition();
@@ -1771,77 +1769,66 @@ void CPDF_SyntaxParser::GetNextWord() {
   if (!GetNextChar(ch)) {
     return;
   }
-  uint8_t type = PDF_CharType[ch];
   while (1) {
-    while (type == 'W') {
-      if (!GetNextChar(ch)) {
+    while (PDFCharIsWhitespace(ch)) {
+      if (!GetNextChar(ch))
         return;
-      }
-      type = PDF_CharType[ch];
     }
-    if (ch != '%') {
+    if (ch != '%')
       break;
-    }
+
     while (1) {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return;
-      }
-      if (ch == '\r' || ch == '\n') {
+      if (ch == '\r' || ch == '\n')
         break;
-      }
     }
-    type = PDF_CharType[ch];
   }
-  if (type == 'D') {
+
+  if (PDFCharIsDelimiter(ch)) {
     m_bIsNumber = FALSE;
     m_WordBuffer[m_WordSize++] = ch;
     if (ch == '/') {
       while (1) {
-        if (!GetNextChar(ch)) {
+        if (!GetNextChar(ch))
           return;
-        }
-        type = PDF_CharType[ch];
-        if (type != 'R' && type != 'N') {
+
+        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
           m_Pos--;
           return;
         }
-        if (m_WordSize < MAX_WORD_BUFFER) {
+
+        if (m_WordSize < MAX_WORD_BUFFER)
           m_WordBuffer[m_WordSize++] = ch;
-        }
       }
     } else if (ch == '<') {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return;
-      }
-      if (ch == '<') {
+      if (ch == '<')
         m_WordBuffer[m_WordSize++] = ch;
-      } else {
+      else
         m_Pos--;
-      }
     } else if (ch == '>') {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return;
-      }
-      if (ch == '>') {
+      if (ch == '>')
         m_WordBuffer[m_WordSize++] = ch;
-      } else {
+      else
         m_Pos--;
-      }
     }
     return;
   }
+
   while (1) {
-    if (m_WordSize < MAX_WORD_BUFFER) {
+    if (m_WordSize < MAX_WORD_BUFFER)
       m_WordBuffer[m_WordSize++] = ch;
-    }
-    if (type != 'N') {
+
+    if (!PDFCharIsNumeric(ch))
       m_bIsNumber = FALSE;
-    }
-    if (!GetNextChar(ch)) {
+    if (!GetNextChar(ch))
       return;
-    }
-    type = PDF_CharType[ch];
-    if (type == 'D' || type == 'W') {
+
+    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
       m_Pos--;
       break;
     }
@@ -1996,33 +1983,29 @@ void CPDF_SyntaxParser::ToNextLine() {
 }
 void CPDF_SyntaxParser::ToNextWord() {
   uint8_t ch;
-  if (!GetNextChar(ch)) {
+  if (!GetNextChar(ch))
     return;
-  }
-  uint8_t type = PDF_CharType[ch];
+
   while (1) {
-    while (type == 'W') {
+    while (PDFCharIsWhitespace(ch)) {
       m_dwWordPos = m_Pos;
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return;
-      }
-      type = PDF_CharType[ch];
     }
-    if (ch != '%') {
+
+    if (ch != '%')
       break;
-    }
+
     while (1) {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return;
-      }
-      if (ch == '\r' || ch == '\n') {
+      if (ch == '\r' || ch == '\n')
         break;
-      }
     }
-    type = PDF_CharType[ch];
   }
   m_Pos--;
 }
+
 CFX_ByteString CPDF_SyntaxParser::GetNextWord(FX_BOOL& bIsNumber) {
   GetNextWord();
   bIsNumber = m_bIsNumber;
@@ -2511,21 +2494,21 @@ FX_BOOL CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
                                        const uint8_t* tag,
                                        FX_DWORD taglen,
                                        FX_BOOL checkKeyword) {
-  uint8_t type = PDF_CharType[tag[0]];
-  FX_BOOL bCheckLeft = type != 'D' && type != 'W';
-  type = PDF_CharType[tag[taglen - 1]];
-  FX_BOOL bCheckRight = type != 'D' && type != 'W';
+  bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]);
+  bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) &&
+                     !PDFCharIsWhitespace(tag[taglen - 1]);
   uint8_t ch;
   if (bCheckRight && startpos + (int32_t)taglen <= limit &&
       GetCharAt(startpos + (int32_t)taglen, ch)) {
-    uint8_t type = PDF_CharType[ch];
-    if (type == 'N' || type == 'R' || (checkKeyword && type == 'D')) {
+    if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
+        (checkKeyword && PDFCharIsDelimiter(ch))) {
       return FALSE;
     }
   }
+
   if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) {
-    uint8_t type = PDF_CharType[ch];
-    if (type == 'N' || type == 'R' || (checkKeyword && type == 'D')) {
+    if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
+        (checkKeyword && PDFCharIsDelimiter(ch))) {
       return FALSE;
     }
   }
@@ -3769,84 +3752,79 @@ inline void CPDF_DataAvail::SetStartOffset(FX_FILESIZE dwOffset) {
 FX_BOOL CPDF_DataAvail::GetNextToken(CFX_ByteString& token) {
   m_WordSize = 0;
   uint8_t ch;
-  if (!GetNextChar(ch)) {
+  if (!GetNextChar(ch))
     return FALSE;
-  }
-  uint8_t type = PDF_CharType[ch];
+
   while (1) {
-    while (type == 'W') {
-      if (!GetNextChar(ch)) {
+    while (PDFCharIsWhitespace(ch)) {
+      if (!GetNextChar(ch))
         return FALSE;
-      }
-      type = PDF_CharType[ch];
     }
-    if (ch != '%') {
+
+    if (ch != '%')
       break;
-    }
+
     while (1) {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return FALSE;
-      }
-      if (ch == '\r' || ch == '\n') {
+      if (ch == '\r' || ch == '\n')
         break;
-      }
     }
-    type = PDF_CharType[ch];
   }
-  if (type == 'D') {
+
+  if (PDFCharIsDelimiter(ch)) {
     m_WordBuffer[m_WordSize++] = ch;
     if (ch == '/') {
       while (1) {
-        if (!GetNextChar(ch)) {
+        if (!GetNextChar(ch))
           return FALSE;
-        }
-        type = PDF_CharType[ch];
-        if (type != 'R' && type != 'N') {
+
+        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
           m_Pos--;
           CFX_ByteString ret(m_WordBuffer, m_WordSize);
           token = ret;
           return TRUE;
         }
-        if (m_WordSize < MAX_WORD_BUFFER) {
+
+        if (m_WordSize < MAX_WORD_BUFFER)
           m_WordBuffer[m_WordSize++] = ch;
-        }
       }
     } else if (ch == '<') {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return FALSE;
-      }
-      if (ch == '<') {
+
+      if (ch == '<')
         m_WordBuffer[m_WordSize++] = ch;
-      } else {
+      else
         m_Pos--;
-      }
     } else if (ch == '>') {
-      if (!GetNextChar(ch)) {
+      if (!GetNextChar(ch))
         return FALSE;
-      }
-      if (ch == '>') {
+
+      if (ch == '>')
         m_WordBuffer[m_WordSize++] = ch;
-      } else {
+      else
         m_Pos--;
-      }
     }
+
     CFX_ByteString ret(m_WordBuffer, m_WordSize);
     token = ret;
     return TRUE;
   }
+
   while (1) {
-    if (m_WordSize < MAX_WORD_BUFFER) {
+    if (m_WordSize < MAX_WORD_BUFFER)
       m_WordBuffer[m_WordSize++] = ch;
-    }
-    if (!GetNextChar(ch)) {
+
+    if (!GetNextChar(ch))
       return FALSE;
-    }
-    type = PDF_CharType[ch];
-    if (type == 'D' || type == 'W') {
+
+    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
       m_Pos--;
       break;
     }
   }
+
   CFX_ByteString ret(m_WordBuffer, m_WordSize);
   token = ret;
   return TRUE;
diff --git a/core/src/fpdfapi/fpdf_parser/fpdf_parser_utility.cpp b/core/src/fpdfapi/fpdf_parser/fpdf_parser_utility.cpp
index e1e60ecae3..335101e85b 100644
--- a/core/src/fpdfapi/fpdf_parser/fpdf_parser_utility.cpp
+++ b/core/src/fpdfapi/fpdf_parser/fpdf_parser_utility.cpp
@@ -5,6 +5,12 @@
 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
 
 #include "../../../include/fpdfapi/fpdf_parser.h"
+
+// Indexed by 8-bit character code, contains either:
+//   'W' - for whitespace: NUL, TAB, CR, LF, FF, 0x80, 0xff
+//   'N' - for numeric: 0123456789+-.
+//   'D' - for delimiter: %()/<>[]{}
+//   'R' - otherwise.
 const char PDF_CharType[256] = {
     // NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL  BS   HT   LF   VT   FF   CR   SO
     // SI
@@ -72,45 +78,37 @@ void CPDF_SimpleParser::ParseWord(const uint8_t*& pStart,
   dwSize = 0;
   type = PDFWORD_EOF;
   uint8_t ch;
-  char chartype;
   while (1) {
-    if (m_dwSize <= m_dwCurPos) {
+    if (m_dwSize <= m_dwCurPos)
       return;
-    }
     ch = m_pData[m_dwCurPos++];
-    chartype = PDF_CharType[ch];
-    while (chartype == 'W') {
-      if (m_dwSize <= m_dwCurPos) {
+    while (PDFCharIsWhitespace(ch)) {
+      if (m_dwSize <= m_dwCurPos)
         return;
-      }
       ch = m_pData[m_dwCurPos++];
-      chartype = PDF_CharType[ch];
     }
-    if (ch != '%') {
+
+    if (ch != '%')
       break;
-    }
+
     while (1) {
-      if (m_dwSize <= m_dwCurPos) {
+      if (m_dwSize <= m_dwCurPos)
         return;
-      }
       ch = m_pData[m_dwCurPos++];
-      if (ch == '\r' || ch == '\n') {
+      if (ch == '\r' || ch == '\n')
         break;
-      }
     }
-    chartype = PDF_CharType[ch];
   }
+
   FX_DWORD start_pos = m_dwCurPos - 1;
   pStart = m_pData + start_pos;
-  if (chartype == 'D') {
+  if (PDFCharIsDelimiter(ch)) {
     if (ch == '/') {
       while (1) {
-        if (m_dwSize <= m_dwCurPos) {
+        if (m_dwSize <= m_dwCurPos)
           return;
-        }
         ch = m_pData[m_dwCurPos++];
-        chartype = PDF_CharType[ch];
-        if (chartype != 'R' && chartype != 'N') {
+        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
           m_dwCurPos--;
           dwSize = m_dwCurPos - start_pos;
           type = PDFWORD_NAME;
@@ -121,41 +119,36 @@ void CPDF_SimpleParser::ParseWord(const uint8_t*& pStart,
       type = PDFWORD_DELIMITER;
       dwSize = 1;
       if (ch == '<') {
-        if (m_dwSize <= m_dwCurPos) {
+        if (m_dwSize <= m_dwCurPos)
           return;
-        }
         ch = m_pData[m_dwCurPos++];
-        if (ch == '<') {
+        if (ch == '<')
           dwSize = 2;
-        } else {
+        else
           m_dwCurPos--;
-        }
       } else if (ch == '>') {
-        if (m_dwSize <= m_dwCurPos) {
+        if (m_dwSize <= m_dwCurPos)
           return;
-        }
         ch = m_pData[m_dwCurPos++];
-        if (ch == '>') {
+        if (ch == '>')
           dwSize = 2;
-        } else {
+        else
           m_dwCurPos--;
-        }
       }
     }
     return;
   }
+
   type = PDFWORD_NUMBER;
   dwSize = 1;
   while (1) {
-    if (chartype != 'N') {
+    if (!PDFCharIsNumeric(ch))
       type = PDFWORD_TEXT;
-    }
-    if (m_dwSize <= m_dwCurPos) {
+    if (m_dwSize <= m_dwCurPos)
       return;
-    }
     ch = m_pData[m_dwCurPos++];
-    chartype = PDF_CharType[ch];
-    if (chartype == 'D' || chartype == 'W') {
+
+    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
       m_dwCurPos--;
       break;
     }
@@ -331,23 +324,23 @@ CFX_ByteString PDF_NameEncode(const CFX_ByteString& orig) {
   int i;
   for (i = 0; i < src_len; i++) {
     uint8_t ch = src_buf[i];
-    if (ch >= 0x80 || PDF_CharType[ch] == 'W' || ch == '#' ||
-        PDF_CharType[ch] == 'D') {
+    if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' ||
+        PDFCharIsDelimiter(ch)) {
       dest_len += 3;
     } else {
       dest_len++;
     }
   }
-  if (dest_len == src_len) {
+  if (dest_len == src_len)
     return orig;
-  }
+
   CFX_ByteString res;
   FX_CHAR* dest_buf = res.GetBuffer(dest_len);
   dest_len = 0;
   for (i = 0; i < src_len; i++) {
     uint8_t ch = src_buf[i];
-    if (ch >= 0x80 || PDF_CharType[ch] == 'W' || ch == '#' ||
-        PDF_CharType[ch] == 'D') {
+    if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' ||
+        PDFCharIsDelimiter(ch)) {
       dest_buf[dest_len++] = '#';
       dest_buf[dest_len++] = "0123456789ABCDEF"[ch / 16];
       dest_buf[dest_len++] = "0123456789ABCDEF"[ch % 16];
-- 
cgit v1.2.3