summaryrefslogtreecommitdiff
path: root/core/src/fpdftext
diff options
context:
space:
mode:
authorTom Sepez <tsepez@chromium.org>2016-02-11 09:06:11 -0800
committerTom Sepez <tsepez@chromium.org>2016-02-11 09:06:11 -0800
commit6b90e983a9e3e7aee0c637a7b0c3c51f0dfc1faf (patch)
tree4a7ee0b7d6f381b7e3b4270bf74746db45715a76 /core/src/fpdftext
parente238549d212e97fe01dba3740949a98172c21454 (diff)
downloadpdfium-6b90e983a9e3e7aee0c637a7b0c3c51f0dfc1faf.tar.xz
Remove unused NormalizeString()
- NormalizeString() has 1 caller: - CTextBaseLine::WriteOutput(), which has only 1 caller: - CTextPage::WriteOutput(), which has only 1 caller: - PDF_GetPageText_Unicode(), which has only 1 caller: - PDF_GetPageText(), which has no callers. Removing this also makes CheckRotate(), NormalizeCompositeChar(), and CTextPage unused. Removing those makes CTextBaseLine unused. Removing that makes txtproc.h unused. PDF_GetTextStream_Unicode() is also unused. ... and so on until entire files can be removed. R=thestig@chromium.org Review URL: https://codereview.chromium.org/1689843002 .
Diffstat (limited to 'core/src/fpdftext')
-rw-r--r--core/src/fpdftext/fpdf_text.cpp792
-rw-r--r--core/src/fpdftext/fpdf_text_search.cpp292
-rw-r--r--core/src/fpdftext/text_int.h5
-rw-r--r--core/src/fpdftext/txtproc.h84
4 files changed, 0 insertions, 1173 deletions
diff --git a/core/src/fpdftext/fpdf_text.cpp b/core/src/fpdftext/fpdf_text.cpp
deleted file mode 100644
index c052676a19..0000000000
--- a/core/src/fpdftext/fpdf_text.cpp
+++ /dev/null
@@ -1,792 +0,0 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include <cctype>
-#include <cwctype>
-#include <memory>
-#include <vector>
-
-#include "core/include/fpdfapi/fpdf_page.h"
-#include "core/include/fpdfapi/fpdf_pageobj.h"
-#include "core/include/fpdfapi/fpdf_resource.h"
-#include "core/include/fpdftext/fpdf_text.h"
-#include "core/include/fxcrt/fx_bidi.h"
-#include "core/include/fxcrt/fx_ucd.h"
-#include "core/src/fpdftext/text_int.h"
-#include "core/src/fpdftext/txtproc.h"
-#include "third_party/base/stl_util.h"
-
-// Converts a single wide character to a byte string.
-//
-// With |destcp| == 0, characters below 0x80 are emitted as-is. Otherwise the
-// character is converted through FXSYS_WideCharToMultiByte() for code page
-// |destcp|. Whenever no direct conversion is produced, the result is the
-// string returned by FCS_GetAltStr(), or |defchar| if that returns null.
-CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode,
-                                  int destcp,
-                                  const FX_CHAR* defchar) {
-  if (destcp != 0) {
-    char encoded[10];
-    int used_default = 0;
-    int encoded_len = FXSYS_WideCharToMultiByte(
-        destcp, 0, (wchar_t*)&unicode, 1, encoded, 10, NULL, &used_default);
-    // Accept the conversion only if it produced bytes and did not have to
-    // substitute the code page's default character.
-    if (encoded_len && !used_default) {
-      return CFX_ByteString(encoded, encoded_len);
-    }
-  } else if (unicode < 0x80) {
-    return CFX_ByteString((char)unicode);
-  }
-  const FX_CHAR* fallback = FCS_GetAltStr(unicode);
-  if (!fallback) {
-    fallback = defchar;
-  }
-  return CFX_ByteString(fallback);
-}
-// Creates an empty page. The option flags start out FALSE so that
-// WriteOutput() never reads an indeterminate value if a caller forgets to
-// set them.
-CTextPage::CTextPage()
-    : m_bAutoWidth(FALSE), m_bKeepColumn(FALSE), m_bBreakSpace(FALSE) {}
-// Deletes every baseline and every column object held by the page.
-CTextPage::~CTextPage() {
-  for (int line = 0; line < m_BaseLines.GetSize(); line++) {
-    delete m_BaseLines.GetAt(line);
-  }
-  for (int column = 0; column < m_TextColumns.GetSize(); column++) {
-    delete m_TextColumns.GetAt(column);
-  }
-}
-void CTextPage::ProcessObject(CPDF_PageObject* pObject) {  // Splits one text object into text boxes and files them via InsertTextBox(); non-text objects are ignored.
- if (pObject->m_Type != CPDF_PageObject::TEXT) {
- return;
- }
- CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
- CPDF_Font* pFont = pText->m_TextState.GetFont();
- int count = pText->CountItems();
- FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2);  // two floats per item, read below as [left, right]; freed on both exit paths
- pText->CalcCharPos(pPosArray);
-
- FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
- FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
- FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
- FX_FLOAT spacew = 0;
- if (space_charcode != -1) {  // other sites in this file spell the same sentinel as (FX_DWORD)-1
- spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
- }
- if (spacew == 0) {  // no usable space glyph width: fall back to a quarter of the horizontal font size
- spacew = fontsize_h / 4;
- }
- if (pText->m_TextState.GetBaselineAngle() != 0) {  // non-zero baseline angle: emit one text box per character
- int cc = 0;  // index into pPosArray; advances only for char codes other than -1
- CFX_Matrix matrix;
- pText->GetTextMatrix(&matrix);
- for (int i = 0; i < pText->m_nChars; i++) {
- FX_DWORD charcode = pText->m_nChars == 1
- ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes  // with exactly one char, the pointer field itself is read as the char code
- : pText->m_pCharCodes[i];
- if (charcode == (FX_DWORD)-1) {
- continue;
- }
- FX_RECT char_box;
- pFont->GetCharBBox(charcode, char_box);
- FX_FLOAT char_left =
- pPosArray ? pPosArray[cc * 2]  // NOTE(review): pPosArray is never reassigned after allocation; the ':' arm is only reachable if FX_Alloc2D can return null - confirm
- : char_box.left * pText->m_TextState.GetFontSize() / 1000;
- FX_FLOAT char_right =
- pPosArray ? pPosArray[cc * 2 + 1]
- : char_box.right * pText->m_TextState.GetFontSize() / 1000;
- FX_FLOAT char_top =
- char_box.top * pText->m_TextState.GetFontSize() / 1000;
- FX_FLOAT char_bottom =
- char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
- cc++;
- FX_FLOAT char_origx, char_origy;
- matrix.Transform(char_left, 0, char_origx, char_origy);  // char_origy is used as the baseline key below; char_origx is not used afterwards
- matrix.TransformRect(char_left, char_right, char_top, char_bottom);
- CFX_ByteString str;
- pFont->AppendChar(str, charcode);
- InsertTextBox(NULL, char_origy, char_left, char_right, char_top,  // NULL baseline: InsertTextBox looks one up (or creates one) for char_origy each time
- char_bottom, spacew, fontsize_v, str, pFont);
- }
- FX_Free(pPosArray);
- return;
- }
- FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
- for (int ii = 0; ii < count * 2; ii++) {  // rescale every stored position by the horizontal/nominal font-size ratio
- pPosArray[ii] *= ratio_h;
- }
- FX_FLOAT baseline = pText->m_PosY;
- CTextBaseLine* pBaseLine = NULL;  // reused across segments; reset to NULL whenever InsertTextBox() is given an empty segment
- FX_FLOAT topy = pText->m_Top;
- FX_FLOAT bottomy = pText->m_Bottom;
- FX_FLOAT leftx = pText->m_Left;
- int cc = 0;
- CFX_ByteString segment;  // char codes accumulated for the text box currently being built
- int space_count = 0;  // NOTE(review): never incremented in this function, so both space_count branches below look dead
- FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
- for (int i = 0; i < pText->m_nChars; i++) {
- FX_DWORD charcode = pText->m_nChars == 1
- ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
- : pText->m_pCharCodes[i];
- if (charcode == (FX_DWORD)-1) {
- continue;
- }
- FX_FLOAT char_left = pPosArray[cc * 2];
- FX_FLOAT char_right = pPosArray[cc * 2 + 1];
- cc++;
- if (char_left < last_left || (char_left - last_right) > spacew / 2) {  // char moved backwards, or the gap exceeds half a space width: flush the current segment
- pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
- leftx + segment_right, topy, bottomy, spacew,
- fontsize_v, segment, pFont);
- segment_left = char_left;
- segment = "";
- }
- if (space_count > 1) {
- pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
- leftx + segment_right, topy, bottomy, spacew,
- fontsize_v, segment, pFont);
- segment = "";
- } else if (space_count == 1) {
- pFont->AppendChar(segment, ' ');
- }
- if (segment.GetLength() == 0) {  // first char of a new segment fixes its left edge
- segment_left = char_left;
- }
- segment_right = char_right;
- pFont->AppendChar(segment, charcode);
- space_count = 0;
- last_left = char_left;
- last_right = char_right;
- }
- if (segment.GetLength())  // flush whatever is left after the last character
- pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
- leftx + segment_right, topy, bottomy, spacew,
- fontsize_v, segment, pFont);
- FX_Free(pPosArray);
-}
-// Decodes |str| (char codes of |pFont|) to wide text and stores it as a text
-// box on a baseline.
-//
-// If |pBaseLine| is null, the baseline whose m_BaseLine equals |basey| is
-// reused; otherwise a new one is created and inserted at the position where
-// the scan over m_BaseLines stopped. Returns the baseline that received the
-// box, or NULL (without inserting anything) when |str| is empty.
-CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine,
-                                        FX_FLOAT basey,
-                                        FX_FLOAT leftx,
-                                        FX_FLOAT rightx,
-                                        FX_FLOAT topy,
-                                        FX_FLOAT bottomy,
-                                        FX_FLOAT spacew,
-                                        FX_FLOAT fontsize_v,
-                                        CFX_ByteString& str,
-                                        CPDF_Font* pFont) {
-  if (str.GetLength() == 0) {
-    return NULL;
-  }
-  if (!pBaseLine) {
-    const int line_count = m_BaseLines.GetSize();
-    int insert_at = 0;
-    while (insert_at < line_count) {
-      CTextBaseLine* pCandidate = m_BaseLines.GetAt(insert_at);
-      if (pCandidate->m_BaseLine == basey) {
-        pBaseLine = pCandidate;
-        break;
-      }
-      // Stop at the first line with a smaller baseline value; a new line,
-      // if needed, goes in front of it.
-      if (pCandidate->m_BaseLine < basey) {
-        break;
-      }
-      ++insert_at;
-    }
-    if (!pBaseLine) {
-      pBaseLine = new CTextBaseLine;
-      pBaseLine->m_BaseLine = basey;
-      m_BaseLines.InsertAt(insert_at, pBaseLine);
-    }
-  }
-  CFX_WideString text;
-  const FX_CHAR* pStr = str;
-  int len = str.GetLength();
-  for (int offset = 0; offset < len;) {
-    FX_DWORD ch = pFont->GetNextChar(pStr, len, offset);
-    CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
-    if (!unicode_str.IsEmpty()) {
-      text += unicode_str;
-    } else {
-      // No mapping available: keep the raw char code as the character.
-      text += (FX_WCHAR)ch;
-    }
-  }
-  pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v,
-                           text);
-  return pBaseLine;
-}
-// Renders the collected baselines into |lines|, one string per baseline,
-// top to bottom in m_BaseLines order.
-//
-// Steps: compute the horizontal extent of all lines, merge adjacent boxes
-// and mergeable neighbouring baselines, optionally derive a character width
-// for the page (m_bAutoWidth) and raise |iMinWidth| to it, optionally detect
-// columns (m_bKeepColumn), then emit each baseline. An empty string is added
-// before a baseline whose distance from the previous one is at least 1.5x
-// either line's vertical font size.
-void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) {
-  FX_FLOAT MinLeftX = 1000000;
-  FX_FLOAT MaxRightX = 0;
-  int i;
-  for (i = 0; i < m_BaseLines.GetSize(); i++) {
-    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
-    FX_FLOAT leftx, rightx;
-    if (pBaseLine->GetWidth(leftx, rightx)) {
-      if (leftx < MinLeftX) {
-        MinLeftX = leftx;
-      }
-      if (rightx > MaxRightX) {
-        MaxRightX = rightx;
-      }
-    }
-  }
-  for (i = 0; i < m_BaseLines.GetSize(); i++) {
-    m_BaseLines.GetAt(i)->MergeBoxes();
-  }
-  // Fold each baseline into its predecessor when they can be merged; the
-  // index is stepped back so the merged line is compared with the next one.
-  for (i = 1; i < m_BaseLines.GetSize(); i++) {
-    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
-    CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1);
-    if (pBaseLine->CanMerge(pPrevLine)) {
-      pPrevLine->Merge(pBaseLine);
-      delete pBaseLine;
-      m_BaseLines.RemoveAt(i);
-      i--;
-    }
-  }
-  if (m_bAutoWidth) {
-    // Per-line width in characters; owned by the vector so it is released on
-    // every path.
-    std::vector<int> widths(m_BaseLines.GetSize(), 0);
-    for (i = 0; i < m_BaseLines.GetSize(); i++) {
-      CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
-      int TotalChars = 0;
-      FX_FLOAT TotalWidth = 0;
-      int minchars;
-      pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
-      if (TotalChars) {
-        FX_FLOAT charwidth = TotalWidth / TotalChars;
-        widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
-      }
-      if (widths[i] > 1000) {
-        widths[i] = 1000;
-      }
-      if (widths[i] < minchars) {
-        widths[i] = minchars;
-      }
-    }
-    int AvgWidth = 0, widthcount = 0;
-    int MaxWidth = 0;
-    for (i = 0; i < m_BaseLines.GetSize(); i++) {
-      if (widths[i]) {
-        AvgWidth += widths[i];
-        widthcount++;
-      }
-      if (MaxWidth < widths[i]) {
-        MaxWidth = widths[i];
-      }
-    }
-    // Guard the average: with no non-zero widths (e.g. no baselines at all)
-    // the original divided by zero and converted the result to int.
-    if (widthcount > 0) {
-      AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
-    }
-    // Cap the widest line at 120% of the average.
-    if (MaxWidth > AvgWidth * 6 / 5) {
-      MaxWidth = AvgWidth * 6 / 5;
-    }
-    if (iMinWidth < MaxWidth) {
-      iMinWidth = MaxWidth;
-    }
-  }
-  // Baseline merging above may have inserted new boxes, so merge again.
-  for (i = 0; i < m_BaseLines.GetSize(); i++) {
-    m_BaseLines.GetAt(i)->MergeBoxes();
-  }
-  if (m_bKeepColumn) {
-    FindColumns();
-  }
-  FX_FLOAT lastheight = -1;
-  FX_FLOAT lastbaseline = -1;
-  for (i = 0; i < m_BaseLines.GetSize(); i++) {
-    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
-    if (lastheight >= 0) {
-      FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
-      if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
-        lines.Add(L"");
-      }
-    }
-    lastheight = pBaseLine->m_MaxFontSizeV;
-    lastbaseline = pBaseLine->m_BaseLine;
-    CFX_WideString str;
-    pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
-    lines.Add(str);
-  }
-}
-// Appends the normalized form of |wChar| to |sDest|.
-//
-// The character is first passed through FX_GetMirrorChar(). If
-// FX_Unicode_GetNormalization() reports no expansion (count < 1), the
-// mirrored character itself is appended; otherwise every character of the
-// expansion is appended in order.
-void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) {
-  wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
-  // First call with a null destination only queries the expansion length.
-  FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, NULL);
-  if (nCount < 1) {
-    sDest += wChar;
-    return;
-  }
-  // The vector owns the scratch buffer, replacing manual new[]/delete[].
-  std::vector<FX_WCHAR> expansion(nCount);
-  FX_Unicode_GetNormalization(wChar, expansion.data());
-  for (int nIndex = 0; nIndex < nCount; nIndex++) {
-    sDest += expansion[nIndex];
-  }
-}
-
-// Rewrites |str| in place after splitting it into directional runs with
-// CFX_BidiChar.
-//
-// Each run is recorded in |order| as a (start, count, direction) triple. If
-// there is at least one RIGHT run and RIGHT runs are at least as numerous as
-// LEFT runs, the string is treated as right-to-left and the runs are emitted
-// from last to first; otherwise they are emitted first to last. Runs that
-// are reversed go through NormalizeCompositeChar(); the others are copied
-// verbatim.
-//
-// NOTE(review): the literal direction values 0/1/2 below are compared
-// against CFX_BidiChar::Direction results; the enum is declared outside this
-// file - confirm the mapping. |order| stores FX_WORD, so start/count values
-// are narrowed to that type.
-void NormalizeString(CFX_WideString& str) {
-  if (str.GetLength() <= 0) {
-    return;
-  }
-  CFX_WideString sBuffer;
-  std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
-  std::vector<FX_WORD> order;
-  FX_BOOL bR2L = FALSE;
-  int32_t start = 0, count = 0, i = 0;
-  int nR2L = 0, nL2R = 0;
-  for (i = 0; i < str.GetLength(); i++) {
-    if (pBidiChar->AppendChar(str.GetAt(i))) {
-      CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
-      order.push_back(start);
-      order.push_back(count);
-      order.push_back(ret);
-      if (!bR2L) {
-        if (ret == CFX_BidiChar::RIGHT) {
-          nR2L++;
-        } else if (ret == CFX_BidiChar::LEFT) {
-          nL2R++;
-        }
-      }
-    }
-  }
-  // Flush the run that is still pending after the last character.
-  if (pBidiChar->EndChar()) {
-    CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
-    order.push_back(start);
-    order.push_back(count);
-    order.push_back(ret);
-    if (!bR2L) {
-      if (ret == CFX_BidiChar::RIGHT) {
-        nR2L++;
-      } else if (ret == CFX_BidiChar::LEFT) {
-        nL2R++;
-      }
-    }
-  }
-  if (nR2L > 0 && nR2L >= nL2R) {
-    bR2L = TRUE;
-  }
-  if (bR2L) {
-    int count = pdfium::CollectionSize<int>(order);
-    // Walk the triples backwards; |j| indexes the direction slot.
-    for (int j = count - 1; j > 0; j -= 3) {
-      int ret = order[j];
-      int count1 = order[j - 1];
-      int start = order[j - 2];
-      if (ret == 2 || ret == 0) {
-        for (int i = start + count1 - 1; i >= start; i--) {
-          NormalizeCompositeChar(str[i], sBuffer);
-        }
-      } else {
-        // Look back for the nearest run with direction 2, remembering
-        // whether the last run stepped over had direction 0.
-        i = j;
-        FX_BOOL bSymbol = FALSE;
-        while (i > 0 && order[i] != 2) {
-          bSymbol = !order[i];
-          i -= 3;
-        }
-        int end = start + count1;
-        int n = 0;
-        if (bSymbol) {
-          n = i + 6;
-        } else {
-          n = i + 3;
-        }
-        if (n >= j) {
-          for (int m = start; m < end; m++) {
-            sBuffer += str[m];
-          }
-        } else {
-          // Emit the runs n..j in forward order, then resume before them.
-          i = j;
-          j = n;
-          for (; n <= i; n += 3) {
-            int start = order[n - 2];
-            int count1 = order[n - 1];
-            int end = start + count1;
-            for (int m = start; m < end; m++) {
-              sBuffer += str[m];
-            }
-          }
-        }
-      }
-    }
-  } else {
-    int count = pdfium::CollectionSize<int>(order);
-    FX_BOOL bL2R = FALSE;
-    for (int j = 0; j < count; j += 3) {
-      int start = order[j];
-      int count1 = order[j + 1];
-      int ret = order[j + 2];
-      if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
-        int i = j + 3;
-        // bR2L is FALSE in this branch, so this scan never advances |i|.
-        while (bR2L && i < count) {
-          if (order[i + 2] == 1) {
-            break;
-          } else {
-            i += 3;
-          }
-        }
-        // NOTE(review): if the first run has direction 2, this condition
-        // holds again after the restart and |j| is reset every time - that
-        // looks non-terminating; confirm before reusing this code.
-        if (i == 3) {
-          j = -3;
-          bL2R = TRUE;
-          continue;
-        }
-        int end = str.GetLength() - 1;
-        if (i < count) {
-          end = order[i] - 1;
-        }
-        j = i - 3;
-        // Fixed: index the string with the loop variable |n|. The original
-        // used |i| (an index into |order|), appending the same character
-        // repeatedly instead of the reversed range.
-        for (int n = end; n >= start; n--) {
-          NormalizeCompositeChar(str[n], sBuffer);
-        }
-      } else {
-        int end = start + count1;
-        for (int i = start; i < end; i++) {
-          sBuffer += str[i];
-        }
-      }
-    }
-  }
-  str.Empty();
-  str += sBuffer;
-}
-// Returns TRUE when |str| consists solely of decimal digits, '-', '+', '.'
-// and spaces. An empty string also yields TRUE.
-static FX_BOOL IsNumber(CFX_WideString& str) {
-  const int length = str.GetLength();
-  int pos = 0;
-  while (pos < length) {
-    const FX_WCHAR ch = str[pos++];
-    // TODO(dsinclair): --.+ +.-- should probably not be a number.
-    const bool allowed = std::iswdigit(ch) || ch == '-' || ch == '+' ||
-                         ch == '.' || ch == ' ';
-    if (!allowed)
-      return FALSE;
-  }
-  return TRUE;
-}
-// Detects right-aligned columns and attaches numeric text boxes to them.
-//
-// 1. Every box's right edge is either folded into the running average of an
-//    existing column (see FindColumn()) or starts a new column.
-// 2. Columns supported by fewer than a quarter of the baselines are dropped.
-// 3. Boxes accepted by IsNumber() get m_pColumn set to the column matching
-//    their right edge (possibly null).
-void CTextPage::FindColumns() {
-  for (int line = 0; line < m_BaseLines.GetSize(); line++) {
-    CTextBaseLine* pLine = m_BaseLines.GetAt(line);
-    for (int box = 0; box < pLine->m_TextList.GetSize(); box++) {
-      CTextBox* pBox = pLine->m_TextList.GetAt(box);
-      CTextColumn* pColumn = FindColumn(pBox->m_Right);
-      if (!pColumn) {
-        pColumn = new CTextColumn;
-        pColumn->m_Count = 1;
-        pColumn->m_AvgPos = pBox->m_Right;
-        pColumn->m_TextPos = -1;
-        m_TextColumns.Add(pColumn);
-        continue;
-      }
-      // Incremental mean over all right edges assigned to this column.
-      pColumn->m_AvgPos =
-          (pColumn->m_Count * pColumn->m_AvgPos + pBox->m_Right) /
-          (pColumn->m_Count + 1);
-      pColumn->m_Count++;
-    }
-  }
-  const int mincount = m_BaseLines.GetSize() / 4;
-  int col = 0;
-  while (col < m_TextColumns.GetSize()) {
-    CTextColumn* pCandidate = m_TextColumns.GetAt(col);
-    if (pCandidate->m_Count >= mincount) {
-      col++;
-      continue;
-    }
-    // Removing shifts the next column into slot |col|; do not advance.
-    delete pCandidate;
-    m_TextColumns.RemoveAt(col);
-  }
-  for (int line = 0; line < m_BaseLines.GetSize(); line++) {
-    CTextBaseLine* pLine = m_BaseLines.GetAt(line);
-    for (int box = 0; box < pLine->m_TextList.GetSize(); box++) {
-      CTextBox* pBox = pLine->m_TextList.GetAt(box);
-      if (!IsNumber(pBox->m_Text)) {
-        continue;
-      }
-      pBox->m_pColumn = FindColumn(pBox->m_Right);
-    }
-  }
-}
-// Returns the first column whose average position lies strictly within one
-// unit of |xpos|, or NULL when there is none.
-CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) {
-  const int column_count = m_TextColumns.GetSize();
-  for (int idx = 0; idx < column_count; idx++) {
-    CTextColumn* pCandidate = m_TextColumns.GetAt(idx);
-    const bool below_upper = pCandidate->m_AvgPos < xpos + 1;
-    const bool above_lower = pCandidate->m_AvgPos > xpos - 1;
-    if (below_upper && above_lower) {
-      return pCandidate;
-    }
-  }
-  return NULL;
-}
-void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {}  // No-op: the body is empty and |pTextObj| is not used.
-// Starts with an inverted vertical extent (top far below bottom) so the
-// first box passed to InsertTextBox() always widens it. m_BaseLine is left
-// for the creator to assign.
-CTextBaseLine::CTextBaseLine()
-    : m_Top(-100000), m_Bottom(100000), m_MaxFontSizeV(0) {}
-// Deletes every text box held by this baseline.
-CTextBaseLine::~CTextBaseLine() {
-  const int box_count = m_TextList.GetSize();
-  int idx = 0;
-  while (idx < box_count) {
-    delete m_TextList.GetAt(idx);
-    ++idx;
-  }
-}
-// Adds a text box to this baseline, keeping m_TextList ordered by left
-// edge, and widens the baseline's vertical extent and maximum vertical font
-// size to include the new box. A box whose left edge equals an existing
-// one is placed after it.
-void CTextBaseLine::InsertTextBox(FX_FLOAT leftx,
-                                  FX_FLOAT rightx,
-                                  FX_FLOAT topy,
-                                  FX_FLOAT bottomy,
-                                  FX_FLOAT spacew,
-                                  FX_FLOAT fontsize_v,
-                                  const CFX_WideString& text) {
-  if (topy > m_Top) {
-    m_Top = topy;
-  }
-  if (bottomy < m_Bottom) {
-    m_Bottom = bottomy;
-  }
-  if (fontsize_v > m_MaxFontSizeV) {
-    m_MaxFontSizeV = fontsize_v;
-  }
-
-  CTextBox* pNewBox = new CTextBox;
-  pNewBox->m_Text = text;
-  pNewBox->m_Left = leftx;
-  pNewBox->m_Right = rightx;
-  pNewBox->m_Top = topy;
-  pNewBox->m_Bottom = bottomy;
-  pNewBox->m_SpaceWidth = spacew;
-  pNewBox->m_FontSizeV = fontsize_v;
-  pNewBox->m_pColumn = NULL;
-
-  // Find the first existing box that starts to the right of the new one.
-  int insert_at = 0;
-  while (insert_at < m_TextList.GetSize() &&
-         !(m_TextList.GetAt(insert_at)->m_Left > leftx)) {
-    insert_at++;
-  }
-  m_TextList.InsertAt(insert_at, pNewBox);
-}
-FX_BOOL GetIntersection(FX_FLOAT low1,
- FX_FLOAT high1,
- FX_FLOAT low2,
- FX_FLOAT high2,
- FX_FLOAT& interlow,
- FX_FLOAT& interhigh);
-// Decides whether this baseline and |pOther| can be folded into one line.
-//
-// Returns FALSE when the vertical extents do not intersect, when the
-// intersection is shorter than half the height of both lines, or when some
-// pair of boxes overlaps horizontally (by at least half of either box's
-// space width) while the baselines are at least half a box height apart.
-FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) {
-  FX_FLOAT inter_top, inter_bottom;
-  if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
-                       inter_bottom, inter_top)) {
-    return FALSE;
-  }
-  FX_FLOAT inter_h = inter_top - inter_bottom;
-  if (inter_h < (m_Top - m_Bottom) / 2 &&
-      inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
-    return FALSE;
-  }
-  FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
-  for (int i = 0; i < m_TextList.GetSize(); i++) {
-    CTextBox* pText = m_TextList.GetAt(i);
-    for (int j = 0; j < pOther->m_TextList.GetSize(); j++) {
-      CTextBox* pOtherText = pOther->m_TextList.GetAt(j);
-      FX_FLOAT inter_left, inter_right;
-      if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left,
-                           pOtherText->m_Right, inter_left, inter_right)) {
-        continue;
-      }
-      FX_FLOAT inter_w = inter_right - inter_left;
-      if (inter_w < pText->m_SpaceWidth / 2 &&
-          inter_w < pOtherText->m_SpaceWidth / 2) {
-        continue;
-      }
-      // Fixed: box height is top minus bottom, matching the line-height
-      // computation above. The original used bottom minus top, which is
-      // non-positive, so the non-negative |dy| always passed the test and
-      // any horizontal overlap blocked the merge.
-      if (dy >= (pText->m_Top - pText->m_Bottom) / 2 ||
-          dy >= (pOtherText->m_Top - pOtherText->m_Bottom) / 2) {
-        return FALSE;
-      }
-    }
-  }
-  return TRUE;
-}
-// Copies every text box of |pOther| into this baseline via InsertTextBox().
-// |pOther| keeps its own boxes; nothing is removed from it here.
-void CTextBaseLine::Merge(CTextBaseLine* pOther) {
-  const int other_count = pOther->m_TextList.GetSize();
-  for (int idx = 0; idx < other_count; idx++) {
-    const CTextBox* pSrc = pOther->m_TextList.GetAt(idx);
-    InsertTextBox(pSrc->m_Left, pSrc->m_Right, pSrc->m_Top, pSrc->m_Bottom,
-                  pSrc->m_SpaceWidth, pSrc->m_FontSizeV, pSrc->m_Text);
-  }
-}
-// Reports the horizontal extent of the line, ignoring leading and trailing
-// boxes whose text is exactly a single space.
-//
-// On success |leftx| is the left edge of the first such box and |rightx|
-// the right edge of the last one. Returns FALSE, leaving both outputs
-// untouched, when the line has no box other than single-space boxes.
-FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) {
-  const int box_count = m_TextList.GetSize();
-  int first = 0;
-  for (; first < box_count; first++) {
-    if (m_TextList.GetAt(first)->m_Text != L" ") {
-      break;
-    }
-  }
-  if (first == box_count) {
-    return FALSE;
-  }
-  leftx = m_TextList.GetAt(first)->m_Left;
-
-  // A non-space box exists, so this backward scan stops at index >= first.
-  int last = box_count - 1;
-  for (; last >= 0; last--) {
-    if (m_TextList.GetAt(last)->m_Text != L" ") {
-      break;
-    }
-  }
-  rightx = m_TextList.GetAt(last)->m_Right;
-  return TRUE;
-}
-// Joins neighbouring text boxes that are closer than two space widths.
-//
-// The space width comes from the left box, or from the right box when the
-// left one has none. Depending on the gap, a separator is appended before
-// the right box's text. After a join the same index is examined again, so
-// chains of boxes collapse into one.
-void CTextBaseLine::MergeBoxes() {
-  int i = 0;
-  while (i < m_TextList.GetSize() - 1) {
-    CTextBox* pCur = m_TextList.GetAt(i);
-    CTextBox* pNext = m_TextList.GetAt(i + 1);
-    FX_FLOAT dx = pNext->m_Left - pCur->m_Right;
-    FX_FLOAT spacew = pCur->m_SpaceWidth;
-    if (spacew == 0.0) {
-      spacew = pNext->m_SpaceWidth;
-    }
-    if (!(spacew > 0.0 && dx < spacew * 2)) {
-      i++;
-      continue;
-    }
-    pCur->m_Right = pNext->m_Right;
-    if (dx > spacew * 1.5) {
-      pCur->m_Text += L" ";
-    } else if (dx > spacew / 3) {
-      pCur->m_Text += L' ';
-    }
-    pCur->m_Text += pNext->m_Text;
-    if (pNext->m_SpaceWidth == 0.0) {
-      pCur->m_SpaceWidth = spacew;
-    } else {
-      pCur->m_SpaceWidth = pNext->m_SpaceWidth;
-    }
-    m_TextList.RemoveAt(i + 1);
-    delete pNext;
-  }
-}
-// Appends this line's boxes to |str|, laid out on a grid that is
-// |iTextWidth| characters wide for a page width of |pagewidth|.
-//
-// Each box is placed at the column derived from its left edge, or - for
-// boxes attached to a column - so that the text ends at the column's
-// average position. Spaces pad up to that column, the text is passed
-// through NormalizeString(), and one trailing space follows every box.
-void CTextBaseLine::WriteOutput(CFX_WideString& str,
-                                FX_FLOAT leftx,
-                                FX_FLOAT pagewidth,
-                                int iTextWidth) {
-  int lastpos = -1;
-  const int box_count = m_TextList.GetSize();
-  for (int idx = 0; idx < box_count; idx++) {
-    CTextBox* pBox = m_TextList.GetAt(idx);
-    const int text_len = pBox->m_Text.GetLength();
-    FX_FLOAT anchor = pBox->m_pColumn ? pBox->m_pColumn->m_AvgPos
-                                      : pBox->m_Left;
-    int xpos = (int)((anchor - leftx) * iTextWidth / pagewidth + 0.5);
-    if (pBox->m_pColumn) {
-      xpos -= text_len;
-    }
-    // Never move backwards: always start past the previous box.
-    if (xpos <= lastpos) {
-      xpos = lastpos + 1;
-    }
-    for (int pad = xpos - lastpos - 1; pad > 0; pad--) {
-      str += ' ';
-    }
-    CFX_WideString normalized(pBox->m_Text);
-    NormalizeString(normalized);
-    str += normalized;
-    str += ' ';
-    lastpos = xpos + text_len;
-  }
-}
-// Accumulates character statistics for this line.
-//
-// |count| and |width| are added to (the caller supplies their starting
-// values); |minchars| is reset and receives text length + 1 per counted
-// box. Boxes narrower than 0.002 are skipped entirely.
-void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) {
-  minchars = 0;
-  const int box_count = m_TextList.GetSize();
-  for (int idx = 0; idx < box_count; idx++) {
-    const CTextBox* pBox = m_TextList.GetAt(idx);
-    if (pBox->m_Right - pBox->m_Left < 0.002) {
-      continue;
-    }
-    const int text_len = pBox->m_Text.GetLength();
-    count += text_len;
-    width += pBox->m_Right - pBox->m_Left;
-    minchars += text_len + 1;
-  }
-}
-#define PI 3.1415926535897932384626433832795
-// Rotates |page| (and |page_bbox| with it) when more than two thirds of its
-// text objects share the same baseline rotation of 90, 180 or 270 degrees.
-//
-// Text objects with a zero angle or an angle that is not a multiple of 90
-// degrees count towards the total but towards no rotation bucket. Nothing
-// happens when the page has no text objects or no bucket dominates.
-static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) {
-  // rotated_count[k] counts text objects rotated by (k + 1) * 90 degrees.
-  int total_count = 0, rotated_count[3] = {0, 0, 0};
-  FX_POSITION pos = page.GetFirstObjectPosition();
-  while (pos) {
-    CPDF_PageObject* pObj = page.GetNextObject(pos);
-    if (pObj->m_Type != CPDF_PageObject::TEXT) {
-      continue;
-    }
-    total_count++;
-    CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
-    FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
-    if (angle == 0.0) {
-      continue;
-    }
-    // Round to the nearest degree in both directions. The original added
-    // 0.5 unconditionally, so e.g. -90 degrees truncated to -89 and was
-    // discarded.
-    double exact_degree = angle * 180 / PI;
-    int degree =
-        (int)(exact_degree < 0 ? exact_degree - 0.5 : exact_degree + 0.5);
-    if (degree % 90) {
-      continue;
-    }
-    // Normalize into [0, 360) and split into quarter turns. The original
-    // used "% 3", which mapped 270 degrees to "no rotation" and 360 degrees
-    // to the 90-degree bucket, leaving rotated_count[2] unreachable.
-    degree = ((degree % 360) + 360) % 360;
-    int quarter_turns = degree / 90;
-    if (quarter_turns == 0) {
-      continue;
-    }
-    rotated_count[quarter_turns - 1]++;
-  }
-  if (total_count == 0) {
-    return;
-  }
-  CFX_Matrix matrix;
-  if (rotated_count[0] > total_count * 2 / 3) {
-    matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
-  } else if (rotated_count[1] > total_count * 2 / 3) {
-    matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
-  } else if (rotated_count[2] > total_count * 2 / 3) {
-    matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
-  } else {
-    return;
-  }
-  page.Transform(matrix);
-  page_bbox.Transform(&matrix);
-}
-// Extracts the text of one page as an array of wide-string lines.
-//
-// |lines| is cleared first and stays empty when |pPage| is null. |flags|
-// selects auto-rotation (PDF2TXT_AUTO_ROTATE), automatic width
-// (PDF2TXT_AUTO_WIDTH), column detection (PDF2TXT_KEEP_COLUMN) and whether
-// objects outside the page bounding box are kept
-// (PDF2TXT_INCLUDE_INVISIBLE). |iMinWidth| is forwarded to
-// CTextPage::WriteOutput().
-void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
-                             CPDF_Document* pDoc,
-                             CPDF_Dictionary* pPage,
-                             int iMinWidth,
-                             FX_DWORD flags) {
-  lines.RemoveAll();
-  if (!pPage) {
-    return;
-  }
-  CPDF_Page page;
-  page.Load(pDoc, pPage);
-  CPDF_ParseOptions parse_options;
-  parse_options.m_bTextOnly = TRUE;
-  parse_options.m_bSeparateForm = FALSE;
-  page.ParseContent(&parse_options);
-  CFX_FloatRect page_bbox = page.GetPageBBox();
-  if (flags & PDF2TXT_AUTO_ROTATE) {
-    CheckRotate(page, page_bbox);
-  }
-  CTextPage texts;
-  texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
-  texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
-  texts.m_bBreakSpace = TRUE;
-  const bool include_invisible = (flags & PDF2TXT_INCLUDE_INVISIBLE) != 0;
-  for (FX_POSITION pos = page.GetFirstObjectPosition(); pos;) {
-    CPDF_PageObject* pObject = page.GetNextObject(pos);
-    if (!include_invisible) {
-      // Skip objects that are not fully contained in the page box.
-      CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right,
-                         pObject->m_Top);
-      if (!page_bbox.Contains(rect)) {
-        continue;
-      }
-    }
-    texts.ProcessObject(pObject);
-  }
-  texts.WriteOutput(lines, iMinWidth);
-}
-// Byte-string variant of PDF_GetPageText_Unicode(): extracts the page text
-// as wide lines, then converts every character with CharFromUnicodeAlt()
-// using the code page from FXSYS_GetACP() and "?" as the default.
-void PDF_GetPageText(CFX_ByteStringArray& lines,
-                     CPDF_Document* pDoc,
-                     CPDF_Dictionary* pPage,
-                     int iMinWidth,
-                     FX_DWORD flags) {
-  lines.RemoveAll();
-  CFX_WideStringArray wide_lines;
-  PDF_GetPageText_Unicode(wide_lines, pDoc, pPage, iMinWidth, flags);
-  const int line_count = wide_lines.GetSize();
-  for (int line = 0; line < line_count; line++) {
-    CFX_WideString wide = wide_lines[line];
-    CFX_ByteString narrow;
-    const int char_count = wide.GetLength();
-    for (int pos = 0; pos < char_count; pos++) {
-      narrow += CharFromUnicodeAlt(wide[pos], FXSYS_GetACP(), "?");
-    }
-    lines.Add(narrow);
-  }
-}
-// Loads and parses the page described by |pPage| (text only, forms not
-// separated) and appends its text to |buffer| through
-// GetTextStream_Unicode() with line feeds enabled. |flags| is not read.
-void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
-                               CPDF_Document* pDoc,
-                               CPDF_Dictionary* pPage,
-                               FX_DWORD flags) {
-  buffer.EstimateSize(0, 10240);
-
-  CPDF_Page text_page;
-  text_page.Load(pDoc, pPage);
-
-  CPDF_ParseOptions parse_options;
-  parse_options.m_bTextOnly = TRUE;
-  parse_options.m_bSeparateForm = FALSE;
-  text_page.ParseContent(&parse_options);
-
-  GetTextStream_Unicode(buffer, &text_page, TRUE);
-}
diff --git a/core/src/fpdftext/fpdf_text_search.cpp b/core/src/fpdftext/fpdf_text_search.cpp
deleted file mode 100644
index 5bbfbd9198..0000000000
--- a/core/src/fpdftext/fpdf_text_search.cpp
+++ /dev/null
@@ -1,292 +0,0 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/include/fpdfapi/fpdf_page.h"
-#include "core/include/fpdfapi/fpdf_pageobj.h"
-#include "core/src/fpdftext/text_int.h"
-
-class CPDF_TextStream {  // Appends the text of successive text objects to a caller-owned wide text buffer.
- public:
- CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF)
- : m_Buffer(buffer), m_bUseLF(bUseLF), m_pLastObj(nullptr) {}
- ~CPDF_TextStream() {}
- FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);  // returns TRUE only when bFirstLine is set and a line break is detected
-
- CFX_WideTextBuf& m_Buffer;  // held by reference; must outlive this object
- FX_BOOL m_bUseLF;  // when set, line breaks are written as "\r\n" and a trailing '-' is dropped; otherwise a space is written
- const CPDF_TextObject* m_pLastObj;  // previously processed object, nullptr before the first one
-};
-// Heuristically decides whether two text objects are duplicates of each
-// other.
-//
-// Null inputs are never the same. Two objects with empty bounding boxes
-// are. Otherwise the boxes must intersect, the intersection width must not
-// differ from the first object's width by more than half of that width,
-// the font sizes must match, and both objects must hold the same sequence
-// of char codes.
-FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1,
-                                  const CPDF_TextObject* pTextObj2) {
-  if (!pTextObj1 || !pTextObj2) {
-    return FALSE;
-  }
-  CFX_FloatRect rcSecond(pTextObj2->m_Left, pTextObj2->m_Bottom,
-                         pTextObj2->m_Right, pTextObj2->m_Top);
-  CFX_FloatRect rcFirst(pTextObj1->m_Left, pTextObj1->m_Bottom,
-                        pTextObj1->m_Right, pTextObj1->m_Top);
-  if (rcSecond.IsEmpty() && rcFirst.IsEmpty()) {
-    return TRUE;
-  }
-  if (!rcSecond.IsEmpty() || !rcFirst.IsEmpty()) {
-    // From here on |rcSecond| holds the intersection of the two boxes.
-    rcSecond.Intersect(rcFirst);
-    if (rcSecond.IsEmpty()) {
-      return FALSE;
-    }
-    const bool width_mismatch =
-        FXSYS_fabs(rcSecond.Width() - rcFirst.Width()) > rcFirst.Width() / 2;
-    if (width_mismatch) {
-      return FALSE;
-    }
-    if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
-      return FALSE;
-    }
-  }
-  const int item_count = pTextObj2->CountItems();
-  if (item_count != pTextObj1->CountItems()) {
-    return FALSE;
-  }
-  for (int idx = 0; idx < item_count; idx++) {
-    CPDF_TextObjectItem itemSecond, itemFirst;
-    pTextObj2->GetItemInfo(idx, &itemSecond);
-    pTextObj1->GetItemInfo(idx, &itemFirst);
-    if (itemFirst.m_CharCode != itemSecond.m_CharCode) {
-      return FALSE;
-    }
-  }
-  return TRUE;
-}
-// Returns a width for |charCode| in |pFont|, trying in order: the font's
-// char width, the width of the char as a one-character string, and finally
-// the width of the char's bounding box. A char code of -1 yields 0.
-int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) {
-  if (charCode == -1) {
-    return 0;
-  }
-  int width = pFont->GetCharWidthF(charCode);
-  if (width != 0) {
-    return width;
-  }
-  CFX_ByteString encoded;
-  pFont->AppendChar(encoded, charCode);
-  width = pFont->GetStringWidth(encoded, 1);
-  if (width != 0) {
-    return width;
-  }
-  FX_RECT bbox;
-  pFont->GetCharBBox(charCode, bbox);
-  return bbox.right - bbox.left;
-}
-int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj,  // Classifies the gap between two consecutive text objects. Returns -1 (same object), 2 (vertical offset beyond threshold), 1 (horizontal gap), 3 (space char overlapping the previous char) or 0.
- const CPDF_TextObject* pObj) {
- if (FPDFText_IsSameTextObject(pPrevObj, pObj)) {
- return -1;
- }
- CPDF_TextObjectItem item;
- int nItem = pPrevObj->CountItems();  // NOTE(review): pPrevObj is dereferenced without a null check, and nItem == 0 would request item -1 below - confirm callers rule both out
- pPrevObj->GetItemInfo(nItem - 1, &item);  // last item of the previous object
- FX_WCHAR preChar = 0, curChar = 0;
- CFX_WideString wstr =
- pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
- if (wstr.GetLength()) {
- preChar = wstr.GetAt(0);
- }
- FX_FLOAT last_pos = item.m_OriginX;
- int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
- FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
- last_width = FXSYS_fabs(last_width);
- pObj->GetItemInfo(0, &item);  // first item of the current object
- wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
- if (wstr.GetLength()) {
- curChar = wstr.GetAt(0);
- }
- int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
- FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
- this_width = FXSYS_fabs(this_width);
- FX_FLOAT threshold =
- last_width > this_width ? last_width / 4 : this_width / 4;  // a quarter of the wider of the two characters
- CFX_Matrix prev_matrix, prev_reverse;
- pPrevObj->GetTextMatrix(&prev_matrix);
- prev_reverse.SetReverse(prev_matrix);
- FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
- prev_reverse.Transform(x, y);  // current object's position run through the reverse of the previous object's text matrix
- if (FXSYS_fabs(y) > threshold * 2) {
- return 2;
- }
- threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);  // threshold is recomputed from the raw glyph widths for the horizontal test
- threshold = threshold > 400
- ? (threshold < 700 ? threshold / 4 : threshold / 5)
- : (threshold / 2);
- threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize())
- : FXSYS_fabs(pObj->GetFontSize());
- threshold /= 1000;
- if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&  // unbraced: the body of this if is the single nested if statement that follows
- preChar != L' ')
- if (curChar != L' ' && preChar != L' ') {  // repeats the space checks already made by the enclosing condition
- if ((x - last_pos - last_width) > threshold ||
- (last_pos - x - last_width) > threshold) {
- return 1;
- }
- if (x < 0 && (last_pos - x - last_width) > threshold) {
- return 1;
- }
- if ((x - last_pos - last_width) > this_width ||
- (x - last_pos - this_width) > last_width) {
- return 1;
- }
- }
- if (last_pos + last_width > x + this_width && curChar == L' ') {
- return 3;
- }
- return 0;
-}
-FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj,  // Appends the text of |pObj| to m_Buffer, first inserting a separator derived from its position relative to the previous object.
- FX_BOOL bFirstLine) {
- CPDF_Font* pFont = pObj->GetFont();
- CFX_Matrix matrix;
- pObj->GetTextMatrix(&matrix);
- int item_index = 0;
- if (m_pLastObj) {
- int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
- if (result == 2) {  // vertical offset: treated as a line break
- int len = m_Buffer.GetLength();
- if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
- m_Buffer.Delete(len - 1, 1);  // drop the trailing '-' instead of writing a line break
- } else {
- if (bFirstLine) {
- return TRUE;  // only the first line was requested; m_pLastObj is left unchanged
- }
- if (m_bUseLF) {
- m_Buffer.AppendChar(L'\r');
- m_Buffer.AppendChar(L'\n');
- } else {
- m_Buffer.AppendChar(' ');
- }
- }
- } else if (result == 1) {
- m_Buffer.AppendChar(L' ');
- } else if (result == -1) {  // duplicate of the previous object: remember it but emit nothing
- m_pLastObj = pObj;
- return FALSE;
- } else if (result == 3) {
- item_index = 1;  // skip the first item of this object
- }
- }
- m_pLastObj = pObj;
- int nItems = pObj->CountItems();
- FX_FLOAT Ignorekerning = 0;
- for (int i = 1; i < nItems - 1; i += 2) {  // inspects odd-indexed items; stops at the first one that is not a -1 (kerning) item
- CPDF_TextObjectItem item;
- pObj->GetItemInfo(i, &item);
- if (item.m_CharCode == (FX_DWORD)-1) {
- if (i == 1) {
- Ignorekerning = item.m_OriginX;
- } else if (Ignorekerning > item.m_OriginX) {
- Ignorekerning = item.m_OriginX;
- }
- } else {
- Ignorekerning = 0;
- break;
- }
- }
- FX_FLOAT spacing = 0;
- for (; item_index < nItems; item_index++) {
- CPDF_TextObjectItem item;
- pObj->GetItemInfo(item_index, &item);
- if (item.m_CharCode == (FX_DWORD)-1) {
- CFX_WideString wstr = m_Buffer.GetWideString();
- if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
- continue;
- }
- FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
- spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
- continue;
- }
- FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
- if (nItems > 3 && !spacing) {
- charSpace = 0;
- }
- if ((spacing || charSpace) && item_index > 0) {
- int last_width = 0;  // NOTE(review): never assigned again, so the comparison below always picks this_width unless it is 0
- FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
- FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
- FX_FLOAT threshold = 0;
- if (space_charcode != -1) {
- threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
- }
- if (threshold > fontsize_h / 3) {
- threshold = 0;
- } else {
- threshold /= 2;
- }
- if (threshold == 0) {
- threshold = fontsize_h;  // dead store: overwritten two lines below before being read
- int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
- threshold = this_width > last_width ? (FX_FLOAT)this_width
- : (FX_FLOAT)last_width;
- int nDivide = 6;
- if (threshold < 300) {
- nDivide = 2;
- } else if (threshold < 500) {
- nDivide = 4;
- } else if (threshold < 700) {
- nDivide = 5;
- }
- threshold = threshold / nDivide;
- threshold = fontsize_h * threshold / 1000;
- }
- if (charSpace > 0.001) {
- spacing += matrix.TransformDistance(charSpace);
- } else if (charSpace < -0.001) {
- spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
- }
- if (threshold && (spacing && spacing >= threshold)) {
- m_Buffer.AppendChar(L' ');
- }
- if (item.m_CharCode == (FX_DWORD)-1) {  // cannot be true here: -1 items already hit `continue` at the top of the loop body
- continue;
- }
- spacing = 0;
- }
- CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
- if (unicode_str.IsEmpty()) {
- m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);  // no mapping returned: the raw char code is written as the character
- } else {
- m_Buffer << unicode_str;
- }
- }
- return FALSE;
-}
-// Feeds every text object of |pPage|, in list order, through a
-// CPDF_TextStream that appends to |buffer|. Null entries and non-text
-// objects are skipped. |bUseLF| is handed to the stream unchanged.
-void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
-                           CPDF_PageObjectList* pPage,
-                           FX_BOOL bUseLF) {
-  CPDF_TextStream stream(buffer, bUseLF);
-  for (FX_POSITION pos = pPage->GetFirstObjectPosition(); pos;) {
-    CPDF_PageObject* pCurrent = pPage->GetNextObject(pos);
-    if (!pCurrent || pCurrent->m_Type != CPDF_PageObject::TEXT) {
-      continue;
-    }
-    stream.ProcessObject((CPDF_TextObject*)pCurrent, FALSE);
-  }
-}
-// Returns the text of the first line of the page described by |pPage|.
-//
-// The page is loaded and parsed (text only, forms not separated) and its
-// text objects are streamed in order until CPDF_TextStream::ProcessObject()
-// reports the end of the first line.
-CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
-                                            CPDF_Dictionary* pPage) {
-  CFX_WideTextBuf buffer;
-  buffer.EstimateSize(0, 1024);
-
-  CPDF_Page text_page;
-  text_page.Load(pDoc, pPage);
-  CPDF_ParseOptions parse_options;
-  parse_options.m_bTextOnly = TRUE;
-  parse_options.m_bSeparateForm = FALSE;
-  text_page.ParseContent(&parse_options);
-
-  CPDF_TextStream stream(buffer, FALSE);
-  for (FX_POSITION pos = text_page.GetFirstObjectPosition(); pos;) {
-    CPDF_PageObject* pCurrent = text_page.GetNextObject(pos);
-    if (pCurrent->m_Type == CPDF_PageObject::TEXT &&
-        stream.ProcessObject((CPDF_TextObject*)pCurrent, TRUE)) {
-      break;
-    }
-  }
-  return buffer.GetWideString();
-}
diff --git a/core/src/fpdftext/text_int.h b/core/src/fpdftext/text_int.h
index c420bc7702..93c22736ef 100644
--- a/core/src/fpdftext/text_int.h
+++ b/core/src/fpdftext/text_int.h
@@ -237,10 +237,5 @@ class CPDF_LinkExtract : public IPDF_LinkExtract {
};
FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst);
-void NormalizeString(CFX_WideString& str);
-void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
-void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
- CPDF_PageObjectList* pPage,
- FX_BOOL bUseLF);
#endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_
diff --git a/core/src/fpdftext/txtproc.h b/core/src/fpdftext/txtproc.h
deleted file mode 100644
index 27cec5d0a6..0000000000
--- a/core/src/fpdftext/txtproc.h
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_SRC_FPDFTEXT_TXTPROC_H_
-#define CORE_SRC_FPDFTEXT_TXTPROC_H_
-
-class CTextColumn {  // One detected column: a cluster of text-box right edges (built by CTextPage::FindColumns()).
- public:
- FX_FLOAT m_AvgPos;  // running average of the right edges assigned to this column
- int m_Count;  // number of text boxes folded into m_AvgPos
- int m_TextPos;  // set to -1 on creation; not read anywhere in fpdf_text.cpp
-};
-class CTextBox {  // A run of text with its bounding edges, owned by a CTextBaseLine.
- public:
- CFX_WideString m_Text;  // decoded text of the box
- FX_FLOAT m_Left;  // left edge; CTextBaseLine keeps its list ordered by this value
- FX_FLOAT m_Right;  // right edge; used for column detection
- FX_FLOAT m_SpaceWidth;  // space width used when deciding whether to merge neighbouring boxes
- FX_FLOAT m_Top;
- FX_FLOAT m_Bottom;
- FX_FLOAT m_FontSizeV;  // vertical font size supplied when the box was inserted
- CTextColumn* m_pColumn;  // column the box is aligned to, or NULL; not owned
-};
-class CTextBaseLine {  // All text boxes sharing one baseline; owns the boxes in m_TextList.
- public:
- CTextBaseLine();
- ~CTextBaseLine();
- void InsertTextBox(FX_FLOAT leftx,  // adds a box in left-edge order and widens m_Top/m_Bottom/m_MaxFontSizeV
- FX_FLOAT rightx,
- FX_FLOAT topy,
- FX_FLOAT bottomy,
- FX_FLOAT spacew,
- FX_FLOAT fontsize_v,
- const CFX_WideString& str);
- FX_BOOL GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx);  // extent of the line ignoring single-space boxes at either end; FALSE if nothing else exists
- FX_BOOL CanMerge(CTextBaseLine* pOther);
- void Merge(CTextBaseLine* pOther);  // copies pOther's boxes into this line; pOther is not modified
- void MergeBoxes();  // joins neighbouring boxes closer than two space widths
- void CountChars(int& count, FX_FLOAT& width, int& minchars);  // adds to count/width, resets and fills minchars
- void WriteOutput(CFX_WideString& str,  // appends the line's text to str on a character grid
- FX_FLOAT leftx,
- FX_FLOAT width,
- int iWidth);
- FX_FLOAT m_BaseLine;  // not set by the constructor; assigned by CTextPage::InsertTextBox()
- FX_FLOAT m_Top;  // largest topy of any inserted box (starts at -100000)
- FX_FLOAT m_Bottom;  // smallest bottomy of any inserted box (starts at 100000)
- FX_FLOAT m_MaxFontSizeV;  // largest fontsize_v of any inserted box
- CFX_ArrayTemplate<CTextBox*> m_TextList;  // owned boxes, ordered by m_Left
-};
-class CPDF_PageObject;
-class CPDF_TextObject;
-class CTextPage {  // Collects the text objects of a page into baselines and renders them as lines of text.
- public:
- CTextPage();
- ~CTextPage();
- void ProcessObject(CPDF_PageObject* pObj);  // ignores objects whose type is not TEXT
- CTextBaseLine* InsertTextBox(CTextBaseLine* pBaseLine,  // returns the baseline that received the box, or NULL when str is empty
- FX_FLOAT basey,
- FX_FLOAT leftx,
- FX_FLOAT rightx,
- FX_FLOAT topy,
- FX_FLOAT bottomy,
- FX_FLOAT spacew,
- FX_FLOAT fontsize_v,
- CFX_ByteString& str,
- CPDF_Font* pFont);
- void WriteOutput(CFX_WideStringArray& lines, int iMinWidth);
- FX_BOOL m_bAutoWidth;  // read by WriteOutput(); the constructor does not initialize it
- FX_BOOL m_bKeepColumn;  // read by WriteOutput() to enable FindColumns(); the constructor does not initialize it
- FX_BOOL m_bBreakSpace;  // set by PDF_GetPageText_Unicode(); not read in fpdf_text.cpp
-
- private:
- void FindColumns();
- CTextColumn* FindColumn(FX_FLOAT xpos);  // first column whose average position is within one unit of xpos, else NULL
- void BreakSpace(CPDF_TextObject* pTextObj);  // defined with an empty body
-
- CFX_ArrayTemplate<CTextBaseLine*> m_BaseLines;  // owned; deleted in the destructor
- CFX_ArrayTemplate<CTextColumn*> m_TextColumns;  // owned; deleted in the destructor
-};
-
-#endif // CORE_SRC_FPDFTEXT_TXTPROC_H_