From 06c6855258bf25246c46a1f628b8a8a8185029a7 Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Wed, 13 Sep 2017 11:16:32 -0400 Subject: Rewrite IsHyphen using string operations The existing code did end of range checks by making sure that the value was never less than 0. This isn't correct when using an unsigned type, since 0 - 1 will wrap around to the max possible value, and thus will still not be less than 0. Additionally the existing code was hard to follow due to the complexity of some of the low level operations being performed. It has been rewritten using higher level string operations to make it clearer and correct. BUG=chromium:763256 Change-Id: Ib8bf5ca0e29e73724c4a1c4781362e8a8fc30149 Reviewed-on: https://pdfium-review.googlesource.com/13690 Commit-Queue: Ryan Harrison Reviewed-by: Tom Sepez --- core/fpdftext/cpdf_textpage.cpp | 63 ++++++++++++++++++---------------- core/fpdftext/cpdf_textpage.h | 2 +- core/fxcrt/cfx_binarybuf.cpp | 4 +++ core/fxcrt/cfx_binarybuf.h | 4 ++- core/fxcrt/cfx_bytestring_unittest.cpp | 8 +++++ core/fxcrt/cfx_string_c_template.h | 14 ++++++++ core/fxcrt/cfx_widestring_unittest.cpp | 8 +++++ core/fxcrt/cfx_widetextbuf.cpp | 4 +++ core/fxcrt/cfx_widetextbuf.h | 2 +- 9 files changed, 77 insertions(+), 32 deletions(-) (limited to 'core') diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index 3006ebce2b..ee1d51bfd8 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp @@ -106,6 +106,10 @@ float MaskPercentFilled(const std::vector& mask, return count / (end - start); } +bool IsHyphenCode(wchar_t c) { + return c == 0x2D || c == 0xAD; +} + } // namespace PDFTEXT_Obj::PDFTEXT_Obj() {} @@ -1215,36 +1219,37 @@ CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode( return m_TextlineDir; } -bool CPDF_TextPage::IsHyphen(wchar_t curChar) { - CFX_WideString strCurText = m_TempTextBuf.MakeString(); - if (strCurText.IsEmpty()) - strCurText = m_TextBuf.AsStringC(); - FX_STRSIZE 
nCount = strCurText.GetLength(); - if (nCount < 1) +bool CPDF_TextPage::IsHyphen(wchar_t curChar) const { + CFX_WideStringC curText; + if (!m_TempTextBuf.IsEmpty()) + curText = m_TempTextBuf.AsStringC(); + else if (!m_TextBuf.IsEmpty()) + curText = m_TextBuf.AsStringC(); + else return false; - FX_STRSIZE nIndex = nCount - 1; - wchar_t wcTmp = strCurText[nIndex]; - while (wcTmp == 0x20 && nIndex > 0 && nIndex <= nCount - 1) - wcTmp = strCurText[--nIndex]; - if (0x2D == wcTmp || 0xAD == wcTmp) { - if (--nIndex > 0) { - wchar_t preChar = strCurText[nIndex]; - if (FXSYS_iswalpha(preChar) && FXSYS_iswalpha(curChar)) - return true; - } - const PAGECHAR_INFO* preInfo; - if (!m_TempCharList.empty()) - preInfo = &m_TempCharList.back(); - else if (!m_CharList.empty()) - preInfo = &m_CharList.back(); - else - return false; - if (FPDFTEXT_CHAR_PIECE == preInfo->m_Flag && - (0xAD == preInfo->m_Unicode || 0x2D == preInfo->m_Unicode)) { - return true; - } - } - return false; + + curText = curText.TrimmedRight(0x20); + if (curText.GetLength() < 2) + return false; + + // Extracting the last 2 characters, since they are all that matter + curText = curText.Right(2); + if (!IsHyphenCode(curText.Last())) + return false; + + if (FXSYS_iswalpha(curText.First() && FXSYS_iswalnum(curChar))) + return true; + + const PAGECHAR_INFO* preInfo; + if (!m_TempCharList.empty()) + preInfo = &m_TempCharList.back(); + else if (!m_CharList.empty()) + preInfo = &m_CharList.back(); + else + return false; + + return FPDFTEXT_CHAR_PIECE == preInfo->m_Flag && + IsHyphenCode(preInfo->m_Unicode); } CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject( diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h index 6a4c87368d..41892ea97c 100644 --- a/core/fpdftext/cpdf_textpage.h +++ b/core/fpdftext/cpdf_textpage.h @@ -132,7 +132,7 @@ class CPDF_TextPage { Hyphen, }; - bool IsHyphen(wchar_t curChar); + bool IsHyphen(wchar_t curChar) const; bool IsControlChar(const PAGECHAR_INFO& 
charInfo); void ProcessObject(); void ProcessFormObject(CPDF_FormObject* pFormObj, diff --git a/core/fxcrt/cfx_binarybuf.cpp b/core/fxcrt/cfx_binarybuf.cpp index 73fe945fd6..b826fdd5b5 100644 --- a/core/fxcrt/cfx_binarybuf.cpp +++ b/core/fxcrt/cfx_binarybuf.cpp @@ -28,6 +28,10 @@ void CFX_BinaryBuf::Delete(FX_STRSIZE start_index, FX_STRSIZE count) { m_DataSize -= count; } +FX_STRSIZE CFX_BinaryBuf::GetLength() const { + return m_DataSize; +} + void CFX_BinaryBuf::Clear() { m_DataSize = 0; } diff --git a/core/fxcrt/cfx_binarybuf.h b/core/fxcrt/cfx_binarybuf.h index 3081d02902..4c795eda0b 100644 --- a/core/fxcrt/cfx_binarybuf.h +++ b/core/fxcrt/cfx_binarybuf.h @@ -17,10 +17,12 @@ class CFX_BinaryBuf { public: CFX_BinaryBuf(); explicit CFX_BinaryBuf(FX_STRSIZE size); - ~CFX_BinaryBuf(); + virtual ~CFX_BinaryBuf(); uint8_t* GetBuffer() const { return m_pBuffer.get(); } FX_STRSIZE GetSize() const { return m_DataSize; } + virtual FX_STRSIZE GetLength() const; + bool IsEmpty() const { return GetLength() == 0; } void Clear(); void EstimateSize(FX_STRSIZE size, FX_STRSIZE alloc_step = 0); diff --git a/core/fxcrt/cfx_bytestring_unittest.cpp b/core/fxcrt/cfx_bytestring_unittest.cpp index dcb8577b65..a4c5187c11 100644 --- a/core/fxcrt/cfx_bytestring_unittest.cpp +++ b/core/fxcrt/cfx_bytestring_unittest.cpp @@ -999,6 +999,14 @@ TEST(fxcrt, ByteStringCMid) { EXPECT_EQ("", longer_string.Mid(4, 3)); } +TEST(fxcrt, ByteStringCTrimmedRight) { + CFX_ByteStringC fred("FRED"); + EXPECT_EQ("FRED", fred.TrimmedRight('E')); + EXPECT_EQ("FRE", fred.TrimmedRight('D')); + CFX_ByteStringC fredd("FREDD"); + EXPECT_EQ("FRE", fred.TrimmedRight('D')); +} + TEST(fxcrt, ByteStringCElementAccess) { // CFX_ByteStringC includes the NUL terminator for non-empty strings. 
CFX_ByteStringC abc("abc"); diff --git a/core/fxcrt/cfx_string_c_template.h b/core/fxcrt/cfx_string_c_template.h index 8585d73ae6..b0e17a1222 100644 --- a/core/fxcrt/cfx_string_c_template.h +++ b/core/fxcrt/cfx_string_c_template.h @@ -176,6 +176,20 @@ class CFX_StringCTemplate { return Mid(GetLength() - count, count); } + CFX_StringCTemplate TrimmedRight(CharType ch) const { + if (IsEmpty()) + return CFX_StringCTemplate(); + + FX_STRSIZE pos = GetLength(); + while (pos && CharAt(pos - 1) == ch) + pos--; + + if (pos == 0) + return CFX_StringCTemplate(); + + return CFX_StringCTemplate(m_Ptr.Get(), pos); + } + bool operator<(const CFX_StringCTemplate& that) const { int result = FXSYS_cmp(reinterpret_cast(m_Ptr.Get()), reinterpret_cast(that.m_Ptr.Get()), diff --git a/core/fxcrt/cfx_widestring_unittest.cpp b/core/fxcrt/cfx_widestring_unittest.cpp index 7b12d50601..1c8aca9aca 100644 --- a/core/fxcrt/cfx_widestring_unittest.cpp +++ b/core/fxcrt/cfx_widestring_unittest.cpp @@ -1038,6 +1038,14 @@ TEST(fxcrt, WideStringCAnyAllNoneOf) { EXPECT_FALSE(pdfium::ContainsValue(str, L'z')); } +TEST(fxcrt, WideStringCTrimmedRight) { + CFX_WideStringC fred(L"FRED"); + EXPECT_EQ(L"FRED", fred.TrimmedRight(L'E')); + EXPECT_EQ(L"FRE", fred.TrimmedRight(L'D')); + CFX_WideStringC fredd(L"FREDD"); + EXPECT_EQ(L"FRE", fred.TrimmedRight(L'D')); +} + TEST(fxcrt, WideStringFormatWidth) { { CFX_WideString str; diff --git a/core/fxcrt/cfx_widetextbuf.cpp b/core/fxcrt/cfx_widetextbuf.cpp index 246124b95f..81b5fd4b05 100644 --- a/core/fxcrt/cfx_widetextbuf.cpp +++ b/core/fxcrt/cfx_widetextbuf.cpp @@ -6,6 +6,10 @@ #include "core/fxcrt/cfx_widetextbuf.h" +FX_STRSIZE CFX_WideTextBuf::GetLength() const { + return m_DataSize / sizeof(wchar_t); +} + void CFX_WideTextBuf::AppendChar(wchar_t ch) { ExpandBuf(sizeof(wchar_t)); *(wchar_t*)(m_pBuffer.get() + m_DataSize) = ch; diff --git a/core/fxcrt/cfx_widetextbuf.h b/core/fxcrt/cfx_widetextbuf.h index 0c9b6ff99d..389f2e52a3 100644 --- 
a/core/fxcrt/cfx_widetextbuf.h +++ b/core/fxcrt/cfx_widetextbuf.h @@ -14,7 +14,7 @@ class CFX_WideTextBuf : public CFX_BinaryBuf { public: void AppendChar(wchar_t wch); - FX_STRSIZE GetLength() const { return m_DataSize / sizeof(wchar_t); } + FX_STRSIZE GetLength() const override; wchar_t* GetBuffer() const { return reinterpret_cast(m_pBuffer.get()); } -- cgit v1.2.3