From 6b90e983a9e3e7aee0c637a7b0c3c51f0dfc1faf Mon Sep 17 00:00:00 2001 From: Tom Sepez Date: Thu, 11 Feb 2016 09:06:11 -0800 Subject: Remove unused NormalizeString() - NormalizeString() has 1 caller: - CTextBaseLine::WriteOutput(), which has only 1 caller: - CTextPage::WriteOutput(), which has only 1 caller: - PDF_GetPageText_Unicode(), which has only 1 caller: - PDF_GetPageText(), which has no callers. Removing this also makes CheckRotate(), NormalizeCompositeChar(), and CTextPage unused. Removing those makes CTextBaseline() unused. Removing that makes txtproc.h unused. PDF_GetTextStream_Unicode also unused. ... and so on until entire files can be removed. R=thestig@chromium.org Review URL: https://codereview.chromium.org/1689843002 . --- core/src/fpdftext/fpdf_text.cpp | 792 --------------------------------- core/src/fpdftext/fpdf_text_search.cpp | 292 ------------ core/src/fpdftext/text_int.h | 5 - core/src/fpdftext/txtproc.h | 84 ---- 4 files changed, 1173 deletions(-) delete mode 100644 core/src/fpdftext/fpdf_text.cpp delete mode 100644 core/src/fpdftext/fpdf_text_search.cpp delete mode 100644 core/src/fpdftext/txtproc.h (limited to 'core/src/fpdftext') diff --git a/core/src/fpdftext/fpdf_text.cpp b/core/src/fpdftext/fpdf_text.cpp deleted file mode 100644 index c052676a19..0000000000 --- a/core/src/fpdftext/fpdf_text.cpp +++ /dev/null @@ -1,792 +0,0 @@ -// Copyright 2014 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#include -#include -#include -#include - -#include "core/include/fpdfapi/fpdf_page.h" -#include "core/include/fpdfapi/fpdf_pageobj.h" -#include "core/include/fpdfapi/fpdf_resource.h" -#include "core/include/fpdftext/fpdf_text.h" -#include "core/include/fxcrt/fx_bidi.h" -#include "core/include/fxcrt/fx_ucd.h" -#include "core/src/fpdftext/text_int.h" -#include "core/src/fpdftext/txtproc.h" -#include "third_party/base/stl_util.h" - -CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, - int destcp, - const FX_CHAR* defchar) { - if (destcp == 0) { - if (unicode < 0x80) { - return CFX_ByteString((char)unicode); - } - const FX_CHAR* altstr = FCS_GetAltStr(unicode); - return CFX_ByteString(altstr ? altstr : defchar); - } - char buf[10]; - int iDef = 0; - int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, - NULL, &iDef); - if (ret && !iDef) { - return CFX_ByteString(buf, ret); - } - const FX_CHAR* altstr = FCS_GetAltStr(unicode); - return CFX_ByteString(altstr ? altstr : defchar); -} -CTextPage::CTextPage() {} -CTextPage::~CTextPage() { - int i; - for (i = 0; i < m_BaseLines.GetSize(); i++) { - delete m_BaseLines.GetAt(i); - } - for (i = 0; i < m_TextColumns.GetSize(); i++) { - delete m_TextColumns.GetAt(i); - } -} -void CTextPage::ProcessObject(CPDF_PageObject* pObject) { - if (pObject->m_Type != CPDF_PageObject::TEXT) { - return; - } - CPDF_TextObject* pText = (CPDF_TextObject*)pObject; - CPDF_Font* pFont = pText->m_TextState.GetFont(); - int count = pText->CountItems(); - FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2); - pText->CalcCharPos(pPosArray); - - FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); - FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); - FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); - FX_FLOAT spacew = 0; - if (space_charcode != -1) { - spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; - } - if (spacew == 0) { - spacew = fontsize_h / 4; - } - if (pText->m_TextState.GetBaselineAngle() != 0) { - int cc = 0; - CFX_Matrix matrix; - pText->GetTextMatrix(&matrix); - for (int i = 0; i < pText->m_nChars; i++) { - FX_DWORD charcode = pText->m_nChars == 1 - ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes - : pText->m_pCharCodes[i]; - if (charcode == (FX_DWORD)-1) { - continue; - } - FX_RECT char_box; - pFont->GetCharBBox(charcode, char_box); - FX_FLOAT char_left = - pPosArray ? pPosArray[cc * 2] - : char_box.left * pText->m_TextState.GetFontSize() / 1000; - FX_FLOAT char_right = - pPosArray ? pPosArray[cc * 2 + 1] - : char_box.right * pText->m_TextState.GetFontSize() / 1000; - FX_FLOAT char_top = - char_box.top * pText->m_TextState.GetFontSize() / 1000; - FX_FLOAT char_bottom = - char_box.bottom * pText->m_TextState.GetFontSize() / 1000; - cc++; - FX_FLOAT char_origx, char_origy; - matrix.Transform(char_left, 0, char_origx, char_origy); - matrix.TransformRect(char_left, char_right, char_top, char_bottom); - CFX_ByteString str; - pFont->AppendChar(str, charcode); - InsertTextBox(NULL, char_origy, char_left, char_right, char_top, - char_bottom, spacew, fontsize_v, str, pFont); - } - FX_Free(pPosArray); - return; - } - FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); - for (int ii = 0; ii < count * 2; ii++) { - pPosArray[ii] *= ratio_h; - } - FX_FLOAT baseline = pText->m_PosY; - CTextBaseLine* pBaseLine = NULL; - FX_FLOAT topy = pText->m_Top; - FX_FLOAT bottomy = pText->m_Bottom; - FX_FLOAT leftx = pText->m_Left; - int cc = 0; - CFX_ByteString segment; - int space_count = 0; - FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; - for (int i = 0; i < pText->m_nChars; i++) { - FX_DWORD charcode = pText->m_nChars == 1 - ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes - : pText->m_pCharCodes[i]; - if (charcode == (FX_DWORD)-1) { - continue; - } - FX_FLOAT char_left = pPosArray[cc * 2]; - FX_FLOAT char_right = pPosArray[cc * 2 + 1]; - cc++; - if (char_left < last_left || (char_left - last_right) > spacew / 2) { - pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, - leftx + segment_right, topy, bottomy, spacew, - fontsize_v, segment, pFont); - segment_left = char_left; - segment = ""; - } - if (space_count > 1) { - pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, - leftx + segment_right, topy, bottomy, spacew, - fontsize_v, segment, pFont); - segment = ""; - } else if (space_count == 1) { - pFont->AppendChar(segment, ' '); - } - if (segment.GetLength() == 0) { - segment_left = char_left; - } - segment_right = char_right; - pFont->AppendChar(segment, charcode); - space_count = 0; - last_left = char_left; - last_right = char_right; - } - if (segment.GetLength()) - pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, - leftx + segment_right, topy, bottomy, spacew, - fontsize_v, segment, pFont); - FX_Free(pPosArray); -} -CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, - FX_FLOAT basey, - FX_FLOAT leftx, - FX_FLOAT rightx, - FX_FLOAT topy, - FX_FLOAT bottomy, - FX_FLOAT spacew, - FX_FLOAT fontsize_v, - CFX_ByteString& str, - CPDF_Font* pFont) { - if (str.GetLength() == 0) { - return NULL; - } - if (!pBaseLine) { - int i; - for (i = 0; i < m_BaseLines.GetSize(); i++) { - CTextBaseLine* pExistLine = m_BaseLines.GetAt(i); - if (pExistLine->m_BaseLine == basey) { - pBaseLine = pExistLine; - break; - } - if (pExistLine->m_BaseLine < basey) { - break; - } - } - if (!pBaseLine) { - pBaseLine = new CTextBaseLine; - pBaseLine->m_BaseLine = basey; - m_BaseLines.InsertAt(i, pBaseLine); - } - } - CFX_WideString text; - const FX_CHAR* pStr = str; - int len = str.GetLength(), offset = 0; - while (offset < len) { - FX_DWORD ch = pFont->GetNextChar(pStr, len, offset); - CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); - if (unicode_str.IsEmpty()) { - text += (FX_WCHAR)ch; - } else { - text += unicode_str; - } - } - pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, - text); - return pBaseLine; -} -void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) { - FX_FLOAT lastheight = -1; - FX_FLOAT lastbaseline = -1; - FX_FLOAT MinLeftX = 1000000; - FX_FLOAT MaxRightX = 0; - int i; - for (i = 0; i < m_BaseLines.GetSize(); i++) { - CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); - FX_FLOAT leftx, rightx; - if (pBaseLine->GetWidth(leftx, rightx)) { - if (leftx < MinLeftX) { - MinLeftX = leftx; - } - if (rightx > MaxRightX) { - MaxRightX = rightx; - } - } - } - for (i = 0; i < m_BaseLines.GetSize(); i++) { - m_BaseLines.GetAt(i)->MergeBoxes(); - } - for (i = 1; i < m_BaseLines.GetSize(); i++) { - CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); - CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1); - if (pBaseLine->CanMerge(pPrevLine)) { - pPrevLine->Merge(pBaseLine); - delete pBaseLine; - m_BaseLines.RemoveAt(i); - i--; - } - } - if (m_bAutoWidth) { - int* widths = FX_Alloc(int, m_BaseLines.GetSize()); - for (i = 0; i < m_BaseLines.GetSize(); i++) { - widths[i] = 0; - CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); - int TotalChars = 0; - FX_FLOAT TotalWidth = 0; - int minchars; - pBaseLine->CountChars(TotalChars, TotalWidth, minchars); - if (TotalChars) { - FX_FLOAT charwidth = TotalWidth / TotalChars; - widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); - } - if (widths[i] > 1000) { - widths[i] = 1000; - } - if (widths[i] < minchars) { - widths[i] = minchars; - } - } - int AvgWidth = 0, widthcount = 0; - for (i = 0; i < m_BaseLines.GetSize(); i++) - if (widths[i]) { - AvgWidth += widths[i]; - widthcount++; - } - AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); - int MaxWidth = 0; - for (i = 0; i < m_BaseLines.GetSize(); i++) - if (MaxWidth < widths[i]) { - MaxWidth = widths[i]; - } - if (MaxWidth > AvgWidth * 6 / 5) { - MaxWidth = AvgWidth * 6 / 5; - } - FX_Free(widths); - if (iMinWidth < MaxWidth) { - iMinWidth = MaxWidth; - } - } - for (i = 0; i < m_BaseLines.GetSize(); i++) { - m_BaseLines.GetAt(i)->MergeBoxes(); - } - if (m_bKeepColumn) { - FindColumns(); - } - for (i = 0; i < m_BaseLines.GetSize(); i++) { - CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); - if (lastheight >= 0) { - FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; - if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { - lines.Add(L""); - } - } - lastheight = pBaseLine->m_MaxFontSizeV; - lastbaseline = pBaseLine->m_BaseLine; - CFX_WideString str; - pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); - lines.Add(str); - } -} -void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) { - wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); - FX_WCHAR* pDst = NULL; - FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); - if (nCount < 1) { - sDest += wChar; - return; - } - pDst = new FX_WCHAR[nCount]; - FX_Unicode_GetNormalization(wChar, pDst); - for (int nIndex = 0; nIndex < nCount; nIndex++) { - sDest += pDst[nIndex]; - } - delete[] pDst; -} - -void NormalizeString(CFX_WideString& str) { - if (str.GetLength() <= 0) { - return; - } - CFX_WideString sBuffer; - std::unique_ptr pBidiChar(new CFX_BidiChar); - std::vector order; - FX_BOOL bR2L = FALSE; - int32_t start = 0, count = 0, i = 0; - int nR2L = 0, nL2R = 0; - for (i = 0; i < str.GetLength(); i++) { - if (pBidiChar->AppendChar(str.GetAt(i))) { - CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); - order.push_back(start); - order.push_back(count); - order.push_back(ret); - if (!bR2L) { - if (ret == CFX_BidiChar::RIGHT) { - nR2L++; - } else if (ret == CFX_BidiChar::LEFT) { - nL2R++; - } - } - } - } - if (pBidiChar->EndChar()) { - CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); - order.push_back(start); - order.push_back(count); - order.push_back(ret); - if (!bR2L) { - if (ret == CFX_BidiChar::RIGHT) { - nR2L++; - } else if (ret == CFX_BidiChar::LEFT) { - nL2R++; - } - } - } - if (nR2L > 0 && nR2L >= nL2R) { - bR2L = TRUE; - } - if (bR2L) { - int count = pdfium::CollectionSize(order); - for (int j = count - 1; j > 0; j -= 3) { - int ret = order[j]; - int count1 = order[j - 1]; - int start = order[j - 2]; - if (ret == 2 || ret == 0) { - for (int i = start + count1 - 1; i >= start; i--) { - NormalizeCompositeChar(str[i], sBuffer); - } - } else { - i = j; - FX_BOOL bSymbol = FALSE; - while (i > 0 && order[i] != 2) { - bSymbol = !order[i]; - i -= 3; - } - int end = start + count1; - int n = 0; - if (bSymbol) { - n = i + 6; - } else { - n = i + 3; - } - if (n >= j) { - for (int m = start; m < end; m++) { - sBuffer += str[m]; - } - } else { - i = j; - j = n; - for (; n <= i; n += 3) { - int start = order[n - 2]; - int count1 = order[n - 1]; - int end = start + count1; - for (int m = start; m < end; m++) { - sBuffer += str[m]; - } - } - } - } - } - } else { - int count = pdfium::CollectionSize(order); - FX_BOOL bL2R = FALSE; - for (int j = 0; j < count; j += 3) { - int start = order[j]; - int count1 = order[j + 1]; - int ret = order[j + 2]; - if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) { - int i = j + 3; - while (bR2L && i < count) { - if (order[i + 2] == 1) { - break; - } else { - i += 3; - } - } - if (i == 3) { - j = -3; - bL2R = TRUE; - continue; - } - int end = str.GetLength() - 1; - if (i < count) { - end = order[i] - 1; - } - j = i - 3; - for (int n = end; n >= start; n--) { - NormalizeCompositeChar(str[i], sBuffer); - } - } else { - int end = start + count1; - for (int i = start; i < end; i++) { - sBuffer += str[i]; - } - } - } - } - str.Empty(); - str += sBuffer; -} -static FX_BOOL IsNumber(CFX_WideString& str) { - for (int i = 0; i < str.GetLength(); i++) { - FX_WCHAR ch = str[i]; - // TODO(dsinclair): --.+ +.-- should probably not be a number. - if (!std::iswdigit(ch) && ch != '-' && ch != '+' && ch != '.' && ch != ' ') - return FALSE; - } - return TRUE; -} -void CTextPage::FindColumns() { - int i; - for (i = 0; i < m_BaseLines.GetSize(); i++) { - CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); - for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { - CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); - CTextColumn* pColumn = FindColumn(pTextBox->m_Right); - if (pColumn) { - pColumn->m_AvgPos = - (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / - (pColumn->m_Count + 1); - pColumn->m_Count++; - } else { - pColumn = new CTextColumn; - pColumn->m_Count = 1; - pColumn->m_AvgPos = pTextBox->m_Right; - pColumn->m_TextPos = -1; - m_TextColumns.Add(pColumn); - } - } - } - int mincount = m_BaseLines.GetSize() / 4; - for (i = 0; i < m_TextColumns.GetSize(); i++) { - CTextColumn* pTextColumn = m_TextColumns.GetAt(i); - if (pTextColumn->m_Count >= mincount) { - continue; - } - delete pTextColumn; - m_TextColumns.RemoveAt(i); - i--; - } - for (i = 0; i < m_BaseLines.GetSize(); i++) { - CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); - for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { - CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); - if (IsNumber(pTextBox->m_Text)) { - pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); - } - } - } -} -CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) { - for (int i = 0; i < m_TextColumns.GetSize(); i++) { - CTextColumn* pColumn = m_TextColumns.GetAt(i); - if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { - return pColumn; - } - } - return NULL; -} -void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {} -CTextBaseLine::CTextBaseLine() { - m_Top = -100000; - m_Bottom = 100000; - m_MaxFontSizeV = 0; -} -CTextBaseLine::~CTextBaseLine() { - for (int i = 0; i < m_TextList.GetSize(); i++) { - delete m_TextList.GetAt(i); - } -} -void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, - FX_FLOAT rightx, - FX_FLOAT topy, - FX_FLOAT bottomy, - FX_FLOAT spacew, - FX_FLOAT fontsize_v, - const CFX_WideString& text) { - if (m_Top < topy) { - m_Top = topy; - } - if (m_Bottom > bottomy) { - m_Bottom = bottomy; - } - if (m_MaxFontSizeV < fontsize_v) { - m_MaxFontSizeV = fontsize_v; - } - int i; - for (i = 0; i < m_TextList.GetSize(); i++) { - CTextBox* pText = m_TextList.GetAt(i); - if (pText->m_Left > leftx) { - break; - } - } - CTextBox* pText = new CTextBox; - pText->m_Text = text; - pText->m_Left = leftx; - pText->m_Right = rightx; - pText->m_Top = topy; - pText->m_Bottom = bottomy; - pText->m_SpaceWidth = spacew; - pText->m_FontSizeV = fontsize_v; - pText->m_pColumn = NULL; - m_TextList.InsertAt(i, pText); -} -FX_BOOL GetIntersection(FX_FLOAT low1, - FX_FLOAT high1, - FX_FLOAT low2, - FX_FLOAT high2, - FX_FLOAT& interlow, - FX_FLOAT& interhigh); -FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) { - FX_FLOAT inter_top, inter_bottom; - if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, - inter_bottom, inter_top)) { - return FALSE; - } - FX_FLOAT inter_h = inter_top - inter_bottom; - if (inter_h < (m_Top - m_Bottom) / 2 && - inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { - return FALSE; - } - FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); - for (int i = 0; i < m_TextList.GetSize(); i++) { - CTextBox* pText = m_TextList.GetAt(i); - for (int j = 0; j < pOther->m_TextList.GetSize(); j++) { - CTextBox* pOtherText = pOther->m_TextList.GetAt(j); - FX_FLOAT inter_left, inter_right; - if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left, - pOtherText->m_Right, inter_left, inter_right)) { - continue; - } - FX_FLOAT inter_w = inter_right - inter_left; - if (inter_w < pText->m_SpaceWidth / 2 && - inter_w < pOtherText->m_SpaceWidth / 2) { - continue; - } - if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || - dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { - return FALSE; - } - } - } - return TRUE; -} -void CTextBaseLine::Merge(CTextBaseLine* pOther) { - for (int i = 0; i < pOther->m_TextList.GetSize(); i++) { - CTextBox* pText = pOther->m_TextList.GetAt(i); - InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom, - pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); - } -} -FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) { - int i; - for (i = 0; i < m_TextList.GetSize(); i++) { - CTextBox* pText = m_TextList.GetAt(i); - if (pText->m_Text != L" ") { - break; - } - } - if (i == m_TextList.GetSize()) { - return FALSE; - } - CTextBox* pText = m_TextList.GetAt(i); - leftx = pText->m_Left; - for (i = m_TextList.GetSize() - 1; i >= 0; i--) { - CTextBox* pText = m_TextList.GetAt(i); - if (pText->m_Text != L" ") { - break; - } - } - pText = m_TextList.GetAt(i); - rightx = pText->m_Right; - return TRUE; -} -void CTextBaseLine::MergeBoxes() { - int i = 0; - while (1) { - if (i >= m_TextList.GetSize() - 1) { - break; - } - CTextBox* pThisText = m_TextList.GetAt(i); - CTextBox* pNextText = m_TextList.GetAt(i + 1); - FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; - FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) - ? pNextText->m_SpaceWidth - : pThisText->m_SpaceWidth; - if (spacew > 0.0 && dx < spacew * 2) { - pThisText->m_Right = pNextText->m_Right; - if (dx > spacew * 1.5) { - pThisText->m_Text += L" "; - } else if (dx > spacew / 3) { - pThisText->m_Text += L' '; - } - pThisText->m_Text += pNextText->m_Text; - pThisText->m_SpaceWidth = - pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth; - m_TextList.RemoveAt(i + 1); - delete pNextText; - } else { - i++; - } - } -} -void CTextBaseLine::WriteOutput(CFX_WideString& str, - FX_FLOAT leftx, - FX_FLOAT pagewidth, - int iTextWidth) { - int lastpos = -1; - for (int i = 0; i < m_TextList.GetSize(); i++) { - CTextBox* pText = m_TextList.GetAt(i); - int xpos; - if (pText->m_pColumn) { - xpos = - (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + - 0.5); - xpos -= pText->m_Text.GetLength(); - } else { - xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); - } - if (xpos <= lastpos) { - xpos = lastpos + 1; - } - for (int j = lastpos + 1; j < xpos; j++) { - str += ' '; - } - CFX_WideString sSrc(pText->m_Text); - NormalizeString(sSrc); - str += sSrc; - str += ' '; - lastpos = xpos + pText->m_Text.GetLength(); - } -} -void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) { - minchars = 0; - for (int i = 0; i < m_TextList.GetSize(); i++) { - CTextBox* pText = m_TextList.GetAt(i); - if (pText->m_Right - pText->m_Left < 0.002) { - continue; - } - count += pText->m_Text.GetLength(); - width += pText->m_Right - pText->m_Left; - minchars += pText->m_Text.GetLength() + 1; - } -} -#define PI 3.1415926535897932384626433832795 -static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) { - int total_count = 0, rotated_count[3] = {0, 0, 0}; - FX_POSITION pos = page.GetFirstObjectPosition(); - while (pos) { - CPDF_PageObject* pObj = page.GetNextObject(pos); - if (pObj->m_Type != CPDF_PageObject::TEXT) { - continue; - } - total_count++; - CPDF_TextObject* pText = (CPDF_TextObject*)pObj; - FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); - if (angle == 0.0) { - continue; - } - int degree = (int)(angle * 180 / PI + 0.5); - if (degree % 90) { - continue; - } - if (degree < 0) { - degree += 360; - } - int index = degree / 90 % 3 - 1; - if (index < 0) { - continue; - } - rotated_count[index]++; - } - if (total_count == 0) { - return; - } - CFX_Matrix matrix; - if (rotated_count[0] > total_count * 2 / 3) { - matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); - } else if (rotated_count[1] > total_count * 2 / 3) { - matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); - } else if (rotated_count[2] > total_count * 2 / 3) { - matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); - } else { - return; - } - page.Transform(matrix); - page_bbox.Transform(&matrix); -} -void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, - CPDF_Document* pDoc, - CPDF_Dictionary* pPage, - int iMinWidth, - FX_DWORD flags) { - lines.RemoveAll(); - if (!pPage) { - return; - } - CPDF_Page page; - page.Load(pDoc, pPage); - CPDF_ParseOptions options; - options.m_bTextOnly = TRUE; - options.m_bSeparateForm = FALSE; - page.ParseContent(&options); - CFX_FloatRect page_bbox = page.GetPageBBox(); - if (flags & PDF2TXT_AUTO_ROTATE) { - CheckRotate(page, page_bbox); - } - CTextPage texts; - texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; - texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; - texts.m_bBreakSpace = TRUE; - FX_POSITION pos = page.GetFirstObjectPosition(); - while (pos) { - CPDF_PageObject* pObject = page.GetNextObject(pos); - if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { - CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, - pObject->m_Top); - if (!page_bbox.Contains(rect)) { - continue; - } - } - texts.ProcessObject(pObject); - } - texts.WriteOutput(lines, iMinWidth); -} -void PDF_GetPageText(CFX_ByteStringArray& lines, - CPDF_Document* pDoc, - CPDF_Dictionary* pPage, - int iMinWidth, - FX_DWORD flags) { - lines.RemoveAll(); - CFX_WideStringArray wlines; - PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); - for (int i = 0; i < wlines.GetSize(); i++) { - CFX_WideString wstr = wlines[i]; - CFX_ByteString str; - for (int c = 0; c < wstr.GetLength(); c++) { - str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); - } - lines.Add(str); - } -} -void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, - CPDF_Document* pDoc, - CPDF_Dictionary* pPage, - FX_DWORD flags) { - buffer.EstimateSize(0, 10240); - CPDF_Page page; - page.Load(pDoc, pPage); - CPDF_ParseOptions options; - options.m_bTextOnly = TRUE; - options.m_bSeparateForm = FALSE; - page.ParseContent(&options); - GetTextStream_Unicode(buffer, &page, TRUE); -} diff --git a/core/src/fpdftext/fpdf_text_search.cpp b/core/src/fpdftext/fpdf_text_search.cpp deleted file mode 100644 index 5bbfbd9198..0000000000 --- a/core/src/fpdftext/fpdf_text_search.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// Copyright 2014 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#include "core/include/fpdfapi/fpdf_page.h" -#include "core/include/fpdfapi/fpdf_pageobj.h" -#include "core/src/fpdftext/text_int.h" - -class CPDF_TextStream { - public: - CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF) - : m_Buffer(buffer), m_bUseLF(bUseLF), m_pLastObj(nullptr) {} - ~CPDF_TextStream() {} - FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine); - - CFX_WideTextBuf& m_Buffer; - FX_BOOL m_bUseLF; - const CPDF_TextObject* m_pLastObj; -}; -FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, - const CPDF_TextObject* pTextObj2) { - if (!pTextObj1 || !pTextObj2) { - return FALSE; - } - CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, - pTextObj2->m_Right, pTextObj2->m_Top); - CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, - pTextObj1->m_Right, pTextObj1->m_Top); - if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) { - return TRUE; - } - if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { - rcPreObj.Intersect(rcCurObj); - if (rcPreObj.IsEmpty()) { - return FALSE; - } - if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > - rcCurObj.Width() / 2) { - return FALSE; - } - if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { - return FALSE; - } - } - int nPreCount = pTextObj2->CountItems(); - int nCurCount = pTextObj1->CountItems(); - if (nPreCount != nCurCount) { - return FALSE; - } - for (int i = 0; i < nPreCount; i++) { - CPDF_TextObjectItem itemPer, itemCur; - pTextObj2->GetItemInfo(i, &itemPer); - pTextObj1->GetItemInfo(i, &itemCur); - if (itemCur.m_CharCode != itemPer.m_CharCode) { - return FALSE; - } - } - return TRUE; -} -int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) { - if (charCode == -1) { - return 0; - } - int w = pFont->GetCharWidthF(charCode); - if (w == 0) { - CFX_ByteString str; - pFont->AppendChar(str, charCode); - w = pFont->GetStringWidth(str, 1); - if (w == 0) { - FX_RECT BBox; - pFont->GetCharBBox(charCode, BBox); - w = BBox.right - BBox.left; - } - } - return w; -} -int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, - const CPDF_TextObject* pObj) { - if (FPDFText_IsSameTextObject(pPrevObj, pObj)) { - return -1; - } - CPDF_TextObjectItem item; - int nItem = pPrevObj->CountItems(); - pPrevObj->GetItemInfo(nItem - 1, &item); - FX_WCHAR preChar = 0, curChar = 0; - CFX_WideString wstr = - pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); - if (wstr.GetLength()) { - preChar = wstr.GetAt(0); - } - FX_FLOAT last_pos = item.m_OriginX; - int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont()); - FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000; - last_width = FXSYS_fabs(last_width); - pObj->GetItemInfo(0, &item); - wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); - if (wstr.GetLength()) { - curChar = wstr.GetAt(0); - } - int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); - FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; - this_width = FXSYS_fabs(this_width); - FX_FLOAT threshold = - last_width > this_width ? last_width / 4 : this_width / 4; - CFX_Matrix prev_matrix, prev_reverse; - pPrevObj->GetTextMatrix(&prev_matrix); - prev_reverse.SetReverse(prev_matrix); - FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY(); - prev_reverse.Transform(x, y); - if (FXSYS_fabs(y) > threshold * 2) { - return 2; - } - threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); - threshold = threshold > 400 - ? (threshold < 700 ? threshold / 4 : threshold / 5) - : (threshold / 2); - threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) - : FXSYS_fabs(pObj->GetFontSize()); - threshold /= 1000; - if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && - preChar != L' ') - if (curChar != L' ' && preChar != L' ') { - if ((x - last_pos - last_width) > threshold || - (last_pos - x - last_width) > threshold) { - return 1; - } - if (x < 0 && (last_pos - x - last_width) > threshold) { - return 1; - } - if ((x - last_pos - last_width) > this_width || - (x - last_pos - this_width) > last_width) { - return 1; - } - } - if (last_pos + last_width > x + this_width && curChar == L' ') { - return 3; - } - return 0; -} -FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, - FX_BOOL bFirstLine) { - CPDF_Font* pFont = pObj->GetFont(); - CFX_Matrix matrix; - pObj->GetTextMatrix(&matrix); - int item_index = 0; - if (m_pLastObj) { - int result = FPDFText_ProcessInterObj(m_pLastObj, pObj); - if (result == 2) { - int len = m_Buffer.GetLength(); - if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') { - m_Buffer.Delete(len - 1, 1); - } else { - if (bFirstLine) { - return TRUE; - } - if (m_bUseLF) { - m_Buffer.AppendChar(L'\r'); - m_Buffer.AppendChar(L'\n'); - } else { - m_Buffer.AppendChar(' '); - } - } - } else if (result == 1) { - m_Buffer.AppendChar(L' '); - } else if (result == -1) { - m_pLastObj = pObj; - return FALSE; - } else if (result == 3) { - item_index = 1; - } - } - m_pLastObj = pObj; - int nItems = pObj->CountItems(); - FX_FLOAT Ignorekerning = 0; - for (int i = 1; i < nItems - 1; i += 2) { - CPDF_TextObjectItem item; - pObj->GetItemInfo(i, &item); - if (item.m_CharCode == (FX_DWORD)-1) { - if (i == 1) { - Ignorekerning = item.m_OriginX; - } else if (Ignorekerning > item.m_OriginX) { - Ignorekerning = item.m_OriginX; - } - } else { - Ignorekerning = 0; - break; - } - } - FX_FLOAT spacing = 0; - for (; item_index < nItems; item_index++) { - CPDF_TextObjectItem item; - pObj->GetItemInfo(item_index, &item); - if (item.m_CharCode == (FX_DWORD)-1) { - CFX_WideString wstr = m_Buffer.GetWideString(); - if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') { - continue; - } - FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); - spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000; - continue; - } - FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace; - if (nItems > 3 && !spacing) { - charSpace = 0; - } - if ((spacing || charSpace) && item_index > 0) { - int last_width = 0; - FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); - FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); - FX_FLOAT threshold = 0; - if (space_charcode != -1) { - threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; - } - if (threshold > fontsize_h / 3) { - threshold = 0; - } else { - threshold /= 2; - } - if (threshold == 0) { - threshold = fontsize_h; - int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); - threshold = this_width > last_width ? (FX_FLOAT)this_width - : (FX_FLOAT)last_width; - int nDivide = 6; - if (threshold < 300) { - nDivide = 2; - } else if (threshold < 500) { - nDivide = 4; - } else if (threshold < 700) { - nDivide = 5; - } - threshold = threshold / nDivide; - threshold = fontsize_h * threshold / 1000; - } - if (charSpace > 0.001) { - spacing += matrix.TransformDistance(charSpace); - } else if (charSpace < -0.001) { - spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); - } - if (threshold && (spacing && spacing >= threshold)) { - m_Buffer.AppendChar(L' '); - } - if (item.m_CharCode == (FX_DWORD)-1) { - continue; - } - spacing = 0; - } - CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode); - if (unicode_str.IsEmpty()) { - m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode); - } else { - m_Buffer << unicode_str; - } - } - return FALSE; -} -void GetTextStream_Unicode(CFX_WideTextBuf& buffer, - CPDF_PageObjectList* pPage, - FX_BOOL bUseLF) { - CPDF_TextStream textstream(buffer, bUseLF); - FX_POSITION pos = pPage->GetFirstObjectPosition(); - while (pos) { - CPDF_PageObject* pObject = pPage->GetNextObject(pos); - if (pObject && pObject->m_Type == CPDF_PageObject::TEXT) - textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE); - } -} -CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, - CPDF_Dictionary* pPage) { - CFX_WideTextBuf buffer; - buffer.EstimateSize(0, 1024); - CPDF_Page page; - page.Load(pDoc, pPage); - CPDF_ParseOptions options; - options.m_bTextOnly = TRUE; - options.m_bSeparateForm = FALSE; - page.ParseContent(&options); - CPDF_TextStream textstream(buffer, FALSE); - FX_POSITION pos = page.GetFirstObjectPosition(); - while (pos) { - CPDF_PageObject* pObject = page.GetNextObject(pos); - if (pObject->m_Type != CPDF_PageObject::TEXT) { - continue; - } - if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) { - break; - } - } - return buffer.GetWideString(); -} diff --git a/core/src/fpdftext/text_int.h b/core/src/fpdftext/text_int.h index c420bc7702..93c22736ef 100644 --- a/core/src/fpdftext/text_int.h +++ b/core/src/fpdftext/text_int.h @@ -237,10 +237,5 @@ class CPDF_LinkExtract : public IPDF_LinkExtract { }; FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst); -void NormalizeString(CFX_WideString& str); -void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); -void GetTextStream_Unicode(CFX_WideTextBuf& buffer, - CPDF_PageObjectList* pPage, - FX_BOOL bUseLF); #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_ diff --git a/core/src/fpdftext/txtproc.h b/core/src/fpdftext/txtproc.h deleted file mode 100644 index 27cec5d0a6..0000000000 --- a/core/src/fpdftext/txtproc.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2014 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_SRC_FPDFTEXT_TXTPROC_H_ -#define CORE_SRC_FPDFTEXT_TXTPROC_H_ - -class CTextColumn { - public: - FX_FLOAT m_AvgPos; - int m_Count; - int m_TextPos; -}; -class CTextBox { - public: - CFX_WideString m_Text; - FX_FLOAT m_Left; - FX_FLOAT m_Right; - FX_FLOAT m_SpaceWidth; - FX_FLOAT m_Top; - FX_FLOAT m_Bottom; - FX_FLOAT m_FontSizeV; - CTextColumn* m_pColumn; -}; -class CTextBaseLine { - public: - CTextBaseLine(); - ~CTextBaseLine(); - void InsertTextBox(FX_FLOAT leftx, - FX_FLOAT rightx, - FX_FLOAT topy, - FX_FLOAT bottomy, - FX_FLOAT spacew, - FX_FLOAT fontsize_v, - const CFX_WideString& str); - FX_BOOL GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx); - FX_BOOL CanMerge(CTextBaseLine* pOther); - void Merge(CTextBaseLine* pOther); - void MergeBoxes(); - void CountChars(int& count, FX_FLOAT& width, int& minchars); - void WriteOutput(CFX_WideString& str, - FX_FLOAT leftx, - FX_FLOAT width, - int iWidth); - FX_FLOAT m_BaseLine; - FX_FLOAT m_Top; - FX_FLOAT m_Bottom; - FX_FLOAT m_MaxFontSizeV; - CFX_ArrayTemplate m_TextList; -}; -class CPDF_PageObject; -class CPDF_TextObject; -class CTextPage { - public: - CTextPage(); - ~CTextPage(); - void ProcessObject(CPDF_PageObject* pObj); - CTextBaseLine* InsertTextBox(CTextBaseLine* pBaseLine, - FX_FLOAT basey, - FX_FLOAT leftx, - FX_FLOAT rightx, - FX_FLOAT topy, - FX_FLOAT bottomy, - FX_FLOAT spacew, - FX_FLOAT fontsize_v, - CFX_ByteString& str, - CPDF_Font* pFont); - void WriteOutput(CFX_WideStringArray& lines, int iMinWidth); - FX_BOOL m_bAutoWidth; - FX_BOOL m_bKeepColumn; - FX_BOOL m_bBreakSpace; - - private: - void FindColumns(); - CTextColumn* FindColumn(FX_FLOAT xpos); - void BreakSpace(CPDF_TextObject* pTextObj); - - CFX_ArrayTemplate m_BaseLines; - CFX_ArrayTemplate m_TextColumns; -}; - -#endif // CORE_SRC_FPDFTEXT_TXTPROC_H_ -- cgit v1.2.3