summaryrefslogtreecommitdiff
path: root/core/src/fpdftext/fpdf_text.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'core/src/fpdftext/fpdf_text.cpp')
-rw-r--r--core/src/fpdftext/fpdf_text.cpp1582
1 files changed, 791 insertions, 791 deletions
diff --git a/core/src/fpdftext/fpdf_text.cpp b/core/src/fpdftext/fpdf_text.cpp
index 2576b757b6..048ebde2be 100644
--- a/core/src/fpdftext/fpdf_text.cpp
+++ b/core/src/fpdftext/fpdf_text.cpp
@@ -1,791 +1,791 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "../../include/fpdfapi/fpdf_page.h"
-#include "../../include/fpdfapi/fpdf_pageobj.h"
-#include "../../include/fpdftext/fpdf_text.h"
-#include "txtproc.h"
-#include "text_int.h"
-#if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_)
-extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
-CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
-{
- if (destcp == 0) {
- if (unicode < 0x80) {
- return CFX_ByteString((char)unicode);
- }
- FX_LPCSTR altstr = FCS_GetAltStr(unicode);
- if (altstr) {
- return CFX_ByteString(altstr, -1);
- }
- return CFX_ByteString(defchar, -1);
- }
- FX_BOOL bDef = FALSE;
- char buf[10];
- int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
- if (ret && !bDef) {
- return CFX_ByteString(buf, ret);
- }
- FX_LPCSTR altstr = FCS_GetAltStr(unicode);
- if (altstr) {
- return CFX_ByteString(altstr, -1);
- }
- return CFX_ByteString(defchar, -1);
-}
-CTextPage::CTextPage()
-{
-}
-CTextPage::~CTextPage()
-{
- int i;
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- delete pBaseLine;
- }
- for (i = 0; i < m_TextColumns.GetSize(); i ++) {
- CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
- delete pTextColumn;
- }
-}
-void CTextPage::ProcessObject(CPDF_PageObject* pObject)
-{
- if (pObject->m_Type != PDFPAGE_TEXT) {
- return;
- }
- CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
- CPDF_Font* pFont = pText->m_TextState.GetFont();
- int count = pText->CountItems();
- FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
- if (pPosArray) {
- pText->CalcCharPos(pPosArray);
- }
- FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
- FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
- FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
- FX_FLOAT spacew = 0;
- if (space_charcode != -1) {
- spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
- }
- if (spacew == 0) {
- spacew = fontsize_h / 4;
- }
- if (pText->m_TextState.GetBaselineAngle() != 0) {
- int cc = 0;
- CFX_AffineMatrix matrix;
- pText->GetTextMatrix(&matrix);
- for (int i = 0; i < pText->m_nChars; i ++) {
- FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
- if (charcode == (FX_DWORD) - 1) {
- continue;
- }
- FX_RECT char_box;
- pFont->GetCharBBox(charcode, char_box);
- FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
- FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
- FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
- FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
- cc ++;
- FX_FLOAT char_origx, char_origy;
- matrix.Transform(char_left, 0, char_origx, char_origy);
- matrix.TransformRect(char_left, char_right, char_top, char_bottom);
- CFX_ByteString str;
- pFont->AppendChar(str, charcode);
- InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
- char_bottom, spacew, fontsize_v, str, pFont);
- }
- if (pPosArray) {
- FX_Free(pPosArray);
- }
- return;
- }
- FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
- for (int ii = 0; ii < count * 2; ii ++) {
- pPosArray[ii] *= ratio_h;
- }
- FX_FLOAT baseline = pText->m_PosY;
- CTextBaseLine* pBaseLine = NULL;
- FX_FLOAT topy = pText->m_Top;
- FX_FLOAT bottomy = pText->m_Bottom;
- FX_FLOAT leftx = pText->m_Left;
- int cc = 0;
- CFX_ByteString segment;
- int space_count = 0;
- FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
- for (int i = 0; i < pText->m_nChars; i ++) {
- FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
- if (charcode == (FX_DWORD) - 1) {
- continue;
- }
- FX_FLOAT char_left = pPosArray[cc * 2];
- FX_FLOAT char_right = pPosArray[cc * 2 + 1];
- cc ++;
- if (char_left < last_left || (char_left - last_right) > spacew / 2) {
- pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
- topy, bottomy, spacew, fontsize_v, segment, pFont);
- segment_left = char_left;
- segment = "";
- }
- CFX_WideString wCh = pText->GetFont()->UnicodeFromCharCode(charcode);
- FX_DWORD ch = wCh.GetLength() > 0 ? wCh.GetAt(0) : charcode;
- if (space_count > 1) {
- pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
- topy, bottomy, spacew, fontsize_v, segment, pFont);
- segment = "";
- } else if (space_count == 1) {
- pFont->AppendChar(segment, ' ');
- }
- if (segment.GetLength() == 0) {
- segment_left = char_left;
- }
- segment_right = char_right;
- pFont->AppendChar(segment, charcode);
- space_count = 0;
- last_left = char_left;
- last_right = char_right;
- }
- if (segment.GetLength())
- pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
- topy, bottomy, spacew, fontsize_v, segment, pFont);
- FX_Free(pPosArray);
-}
-static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont);
-CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
- FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
- CFX_ByteString& str, CPDF_Font* pFont)
-{
- if (str.GetLength() == 0) {
- return NULL;
- }
- if (pBaseLine == NULL) {
- int i;
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- if (pExistLine->m_BaseLine == basey) {
- pBaseLine = pExistLine;
- break;
- }
- if (pExistLine->m_BaseLine < basey) {
- break;
- }
- }
- if (pBaseLine == NULL) {
- pBaseLine = FX_NEW CTextBaseLine;
- if (NULL == pBaseLine) {
- return NULL;
- }
- pBaseLine->m_BaseLine = basey;
- m_BaseLines.InsertAt(i, pBaseLine);
- }
- }
- CFX_WideString text;
- FX_LPCSTR pStr = str;
- int len = str.GetLength(), offset = 0;
- while (offset < len) {
- FX_DWORD ch = pFont->GetNextChar(pStr, offset);
- CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
- text += unicode_str;
- }
- pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
- return pBaseLine;
-}
-void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
-{
- FX_FLOAT lastheight = -1;
- FX_FLOAT lastbaseline = -1;
- FX_FLOAT MinLeftX = 1000000;
- FX_FLOAT MaxRightX = 0;
- int i;
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- FX_FLOAT leftx, rightx;
- if (pBaseLine->GetWidth(leftx, rightx)) {
- if (leftx < MinLeftX) {
- MinLeftX = leftx;
- }
- if (rightx > MaxRightX) {
- MaxRightX = rightx;
- }
- }
- }
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- pBaseLine->MergeBoxes();
- }
- for (i = 1; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
- if (pBaseLine->CanMerge(pPrevLine)) {
- pPrevLine->Merge(pBaseLine);
- delete pBaseLine;
- m_BaseLines.RemoveAt(i);
- i --;
- }
- }
- if (m_bAutoWidth) {
- int* widths = FX_Alloc(int, m_BaseLines.GetSize());
- if (widths) {
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- widths[i] = 0;
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- int TotalChars = 0;
- FX_FLOAT TotalWidth = 0;
- int minchars;
- pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
- if (TotalChars) {
- FX_FLOAT charwidth = TotalWidth / TotalChars;
- widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
- }
- if (widths[i] > 1000) {
- widths[i] = 1000;
- }
- if (widths[i] < minchars) {
- widths[i] = minchars;
- }
- }
- int AvgWidth = 0, widthcount = 0;
- for (i = 0; i < m_BaseLines.GetSize(); i ++)
- if (widths[i]) {
- AvgWidth += widths[i];
- widthcount ++;
- }
- AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
- int MaxWidth = 0;
- for (i = 0; i < m_BaseLines.GetSize(); i ++)
- if (MaxWidth < widths[i]) {
- MaxWidth = widths[i];
- }
- if (MaxWidth > AvgWidth * 6 / 5) {
- MaxWidth = AvgWidth * 6 / 5;
- }
- FX_Free(widths);
- if (iMinWidth < MaxWidth) {
- iMinWidth = MaxWidth;
- }
- }
- }
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- pBaseLine->MergeBoxes();
- }
- if (m_bKeepColumn) {
- FindColumns();
- }
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- if (lastheight >= 0) {
- FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
- if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
- lines.Add(L"");
- }
- }
- lastheight = pBaseLine->m_MaxFontSizeV;
- lastbaseline = pBaseLine->m_BaseLine;
- CFX_WideString str;
- pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
- lines.Add(str);
- }
-}
-void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
-{
- wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
- FX_LPWSTR pDst = NULL;
- FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
- if (nCount < 1 ) {
- sDest += wChar;
- return;
- }
- pDst = new FX_WCHAR[nCount];
- FX_Unicode_GetNormalization(wChar, pDst);
- for (int nIndex = 0; nIndex < nCount; nIndex++) {
- sDest += pDst[nIndex];
- }
- delete[] pDst;
-}
-void NormalizeString(CFX_WideString& str)
-{
- if (str.GetLength() <= 0) {
- return;
- }
- CFX_WideString sBuffer;
- IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
- if (NULL == BidiChar) {
- return;
- }
- CFX_WordArray order;
- FX_BOOL bR2L = FALSE;
- FX_INT32 start = 0, count = 0, i = 0;
- int nR2L = 0, nL2R = 0;
- for (i = 0; i < str.GetLength(); i++) {
- if(BidiChar->AppendChar(str.GetAt(i))) {
- FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
- order.Add(start);
- order.Add(count);
- order.Add(ret);
- if(!bR2L) {
- if(ret == 2) {
- nR2L++;
- } else if (ret == 1) {
- nL2R++;
- }
- }
- }
- }
- if(BidiChar->EndChar()) {
- FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
- order.Add(start);
- order.Add(count);
- order.Add(ret);
- if(!bR2L) {
- if(ret == 2) {
- nR2L++;
- } else if(ret == 1) {
- nL2R++;
- }
- }
- }
- if(nR2L > 0 && nR2L >= nL2R) {
- bR2L = TRUE;
- }
- if(bR2L) {
- int count = order.GetSize();
- for(int j = count - 1; j > 0; j -= 3) {
- int ret = order.GetAt(j);
- int start = order.GetAt(j - 2);
- int count1 = order.GetAt(j - 1);
- if(ret == 2 || ret == 0) {
- for(int i = start + count1 - 1; i >= start; i--) {
- NormalizeCompositeChar(str[i], sBuffer);
- }
- } else {
- i = j;
- FX_BOOL bSymbol = FALSE;
- while(i > 0 && order.GetAt(i) != 2) {
- bSymbol = !order.GetAt(i);
- i -= 3;
- }
- int end = start + count1 ;
- int n = 0;
- if(bSymbol) {
- n = i + 6;
- } else {
- n = i + 3;
- }
- if(n >= j) {
- for(int m = start; m < end; m++) {
- sBuffer += str[m];
- }
- } else {
- i = j;
- j = n;
- for(; n <= i; n += 3) {
- int ret = order.GetAt(n);
- int start = order.GetAt(n - 2);
- int count1 = order.GetAt(n - 1);
- int end = start + count1 ;
- for(int m = start; m < end; m++) {
- sBuffer += str[m];
- }
- }
- }
- }
- }
- } else {
- int count = order.GetSize();
- FX_BOOL bL2R = FALSE;
- for(int j = 0; j < count; j += 3) {
- int ret = order.GetAt(j + 2);
- int start = order.GetAt(j);
- int count1 = order.GetAt(j + 1);
- if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
- int i = j + 3;
- while(bR2L && i < count) {
- if(order.GetAt(i + 2) == 1) {
- break;
- } else {
- i += 3;
- }
- }
- if(i == 3) {
- j = -3;
- bL2R = TRUE;
- continue;
- }
- int end = str.GetLength() - 1;
- if(i < count) {
- end = order.GetAt(i) - 1;
- }
- j = i - 3;
- for(int n = end; n >= start; n--) {
- NormalizeCompositeChar(str[i], sBuffer);
- }
- } else {
- int end = start + count1 ;
- for(int i = start; i < end; i++) {
- sBuffer += str[i];
- }
- }
- }
- }
- str.Empty();
- str += sBuffer;
- BidiChar->Release();
-}
-static FX_BOOL IsNumber(CFX_WideString& str)
-{
- for (int i = 0; i < str.GetLength(); i ++) {
- FX_WCHAR ch = str[i];
- if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
- return FALSE;
- }
- }
- return TRUE;
-}
-void CTextPage::FindColumns()
-{
- int i;
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
- CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
- CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
- if (pColumn == NULL) {
- pColumn = FX_NEW CTextColumn;
- if (pColumn) {
- pColumn->m_Count = 1;
- pColumn->m_AvgPos = pTextBox->m_Right;
- pColumn->m_TextPos = -1;
- m_TextColumns.Add(pColumn);
- }
- } else {
- pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
- (pColumn->m_Count + 1);
- pColumn->m_Count ++;
- }
- }
- }
- int mincount = m_BaseLines.GetSize() / 4;
- for (i = 0; i < m_TextColumns.GetSize(); i ++) {
- CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
- if (pTextColumn->m_Count >= mincount) {
- continue;
- }
- delete pTextColumn;
- m_TextColumns.RemoveAt(i);
- i --;
- }
- for (i = 0; i < m_BaseLines.GetSize(); i ++) {
- CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
- for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
- CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
- if (IsNumber(pTextBox->m_Text)) {
- pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
- }
- }
- }
-}
-CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
-{
- for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
- CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
- if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
- return pColumn;
- }
- }
- return NULL;
-}
-void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
-{
-}
-CTextBaseLine::CTextBaseLine()
-{
- m_Top = -100000;
- m_Bottom = 100000;
- m_MaxFontSizeV = 0;
-}
-CTextBaseLine::~CTextBaseLine()
-{
- for (int i = 0; i < m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- delete pText;
- }
-}
-void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
- FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
-{
- if (m_Top < topy) {
- m_Top = topy;
- }
- if (m_Bottom > bottomy) {
- m_Bottom = bottomy;
- }
- if (m_MaxFontSizeV < fontsize_v) {
- m_MaxFontSizeV = fontsize_v;
- }
- int i;
- for (i = 0; i < m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- if (pText->m_Left > leftx) {
- break;
- }
- }
- CTextBox* pText = FX_NEW CTextBox;
- if (NULL == pText) {
- return;
- }
- pText->m_Text = text;
- pText->m_Left = leftx;
- pText->m_Right = rightx;
- pText->m_Top = topy;
- pText->m_Bottom = bottomy;
- pText->m_SpaceWidth = spacew;
- pText->m_FontSizeV = fontsize_v;
- pText->m_pColumn = NULL;
- m_TextList.InsertAt(i, pText);
-}
-FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
- FX_FLOAT& interlow, FX_FLOAT& interhigh);
-FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
-{
- FX_FLOAT inter_top, inter_bottom;
- if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
- inter_bottom, inter_top)) {
- return FALSE;
- }
- FX_FLOAT inter_h = inter_top - inter_bottom;
- if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
- return FALSE;
- }
- FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
- for (int i = 0; i < m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- FX_FLOAT width = pText->m_Right - pText->m_Left;
- for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
- CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
- FX_FLOAT inter_left, inter_right;
- if (!GetIntersection(pText->m_Left, pText->m_Right,
- pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
- continue;
- }
- FX_FLOAT inter_w = inter_right - inter_left;
- if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
- continue;
- }
- if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
- dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
- return FALSE;
- }
- }
- }
- return TRUE;
-}
-void CTextBaseLine::Merge(CTextBaseLine* pOther)
-{
- for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
- InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
- pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
- }
-}
-FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
-{
- int i;
- for (i = 0; i < m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- if (pText->m_Text != L" ") {
- break;
- }
- }
- if (i == m_TextList.GetSize()) {
- return FALSE;
- }
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- leftx = pText->m_Left;
- for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- if (pText->m_Text != L" ") {
- break;
- }
- }
- pText = (CTextBox*)m_TextList.GetAt(i);
- rightx = pText->m_Right;
- return TRUE;
-}
-void CTextBaseLine::MergeBoxes()
-{
- int i = 0;
- while (1) {
- if (i >= m_TextList.GetSize() - 1) {
- break;
- }
- CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
- CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
- FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
- FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
- pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
- if (spacew > 0.0 && dx < spacew * 2) {
- pThisText->m_Right = pNextText->m_Right;
- if (dx > spacew * 1.5) {
- pThisText->m_Text += L" ";
- } else if (dx > spacew / 3) {
- pThisText->m_Text += L' ';
- }
- pThisText->m_Text += pNextText->m_Text;
- pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
- spacew : pNextText->m_SpaceWidth;
- m_TextList.RemoveAt(i + 1);
- delete pNextText;
- } else {
- i ++;
- }
- }
-}
-void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
- int iTextWidth)
-{
- int lastpos = -1;
- for (int i = 0; i < m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- int xpos;
- if (pText->m_pColumn) {
- xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
- xpos -= pText->m_Text.GetLength();
- } else {
- xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
- }
- if (xpos <= lastpos) {
- xpos = lastpos + 1;
- }
- for (int j = lastpos + 1; j < xpos; j ++) {
- str += ' ';
- }
- CFX_WideString sSrc(pText->m_Text);
- NormalizeString(sSrc);
- str += sSrc;
- str += ' ';
- lastpos = xpos + pText->m_Text.GetLength();
- }
-}
-void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
-{
- minchars = 0;
- for (int i = 0; i < m_TextList.GetSize(); i ++) {
- CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
- if (pText->m_Right - pText->m_Left < 0.002) {
- continue;
- }
- count += pText->m_Text.GetLength();
- width += pText->m_Right - pText->m_Left;
- minchars += pText->m_Text.GetLength() + 1;
- }
-}
-#define PI 3.1415926535897932384626433832795
-static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
-{
- int total_count = 0, rotated_count[3] = {0, 0, 0};
- FX_POSITION pos = page.GetFirstObjectPosition();
- while (pos) {
- CPDF_PageObject* pObj = page.GetNextObject(pos);
- if (pObj->m_Type != PDFPAGE_TEXT) {
- continue;
- }
- total_count ++;
- CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
- FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
- if (angle == 0.0) {
- continue;
- }
- int degree = (int)(angle * 180 / PI + 0.5);
- if (degree % 90) {
- continue;
- }
- if (degree < 0) {
- degree += 360;
- }
- int index = degree / 90 % 3 - 1;
- if (index < 0) {
- continue;
- }
- rotated_count[index] ++;
- }
- if (total_count == 0) {
- return;
- }
- CFX_AffineMatrix matrix;
- if (rotated_count[0] > total_count * 2 / 3) {
- matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
- } else if (rotated_count[1] > total_count * 2 / 3) {
- matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
- } else if (rotated_count[2] > total_count * 2 / 3) {
- matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
- } else {
- return;
- }
- page.Transform(matrix);
- page_bbox.Transform(&matrix);
-}
-void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
- int iMinWidth, FX_DWORD flags)
-{
- lines.RemoveAll();
- if (pPage == NULL) {
- return;
- }
- CPDF_Page page;
- page.Load(pDoc, pPage);
- CPDF_ParseOptions options;
- options.m_bTextOnly = TRUE;
- options.m_bSeparateForm = FALSE;
- page.ParseContent(&options);
- CFX_FloatRect page_bbox = page.GetPageBBox();
- if (flags & PDF2TXT_AUTO_ROTATE) {
- CheckRotate(page, page_bbox);
- }
- CTextPage texts;
- texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
- texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
- texts.m_bBreakSpace = TRUE;
- FX_POSITION pos = page.GetFirstObjectPosition();
- while (pos) {
- CPDF_PageObject* pObject = page.GetNextObject(pos);
- if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
- CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
- if (!page_bbox.Contains(rect)) {
- continue;
- }
- }
- texts.ProcessObject(pObject);
- }
- texts.WriteOutput(lines, iMinWidth);
-}
-void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
- int iMinWidth, FX_DWORD flags)
-{
- lines.RemoveAll();
- CFX_WideStringArray wlines;
- PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
- for (int i = 0; i < wlines.GetSize(); i ++) {
- CFX_WideString wstr = wlines[i];
- CFX_ByteString str;
- for (int c = 0; c < wstr.GetLength(); c ++) {
- str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
- }
- lines.Add(str);
- }
-}
-#endif
-extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
- CFX_PtrArray* pObjArray);
-void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
-{
- buffer.EstimateSize(0, 10240);
- CPDF_Page page;
- page.Load(pDoc, pPage);
- CPDF_ParseOptions options;
- options.m_bTextOnly = TRUE;
- options.m_bSeparateForm = FALSE;
- page.ParseContent(&options);
- _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
-}
+// Copyright 2014 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "../../include/fpdfapi/fpdf_page.h"
+#include "../../include/fpdfapi/fpdf_pageobj.h"
+#include "../../include/fpdftext/fpdf_text.h"
+#include "txtproc.h"
+#include "text_int.h"
+#if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_)
+extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
+CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
+{
+ if (destcp == 0) {
+ if (unicode < 0x80) {
+ return CFX_ByteString((char)unicode);
+ }
+ FX_LPCSTR altstr = FCS_GetAltStr(unicode);
+ if (altstr) {
+ return CFX_ByteString(altstr, -1);
+ }
+ return CFX_ByteString(defchar, -1);
+ }
+ FX_BOOL bDef = FALSE;
+ char buf[10];
+ int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
+ if (ret && !bDef) {
+ return CFX_ByteString(buf, ret);
+ }
+ FX_LPCSTR altstr = FCS_GetAltStr(unicode);
+ if (altstr) {
+ return CFX_ByteString(altstr, -1);
+ }
+ return CFX_ByteString(defchar, -1);
+}
+CTextPage::CTextPage()
+{
+}
+CTextPage::~CTextPage()
+{
+ int i;
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ delete pBaseLine;
+ }
+ for (i = 0; i < m_TextColumns.GetSize(); i ++) {
+ CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
+ delete pTextColumn;
+ }
+}
+void CTextPage::ProcessObject(CPDF_PageObject* pObject)
+{
+ if (pObject->m_Type != PDFPAGE_TEXT) {
+ return;
+ }
+ CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
+ CPDF_Font* pFont = pText->m_TextState.GetFont();
+ int count = pText->CountItems();
+ FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
+ if (pPosArray) {
+ pText->CalcCharPos(pPosArray);
+ }
+ FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
+ FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
+ FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
+ FX_FLOAT spacew = 0;
+ if (space_charcode != -1) {
+ spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
+ }
+ if (spacew == 0) {
+ spacew = fontsize_h / 4;
+ }
+ if (pText->m_TextState.GetBaselineAngle() != 0) {
+ int cc = 0;
+ CFX_AffineMatrix matrix;
+ pText->GetTextMatrix(&matrix);
+ for (int i = 0; i < pText->m_nChars; i ++) {
+ FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
+ if (charcode == (FX_DWORD) - 1) {
+ continue;
+ }
+ FX_RECT char_box;
+ pFont->GetCharBBox(charcode, char_box);
+ FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
+ FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
+ FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
+ FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
+ cc ++;
+ FX_FLOAT char_origx, char_origy;
+ matrix.Transform(char_left, 0, char_origx, char_origy);
+ matrix.TransformRect(char_left, char_right, char_top, char_bottom);
+ CFX_ByteString str;
+ pFont->AppendChar(str, charcode);
+ InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
+ char_bottom, spacew, fontsize_v, str, pFont);
+ }
+ if (pPosArray) {
+ FX_Free(pPosArray);
+ }
+ return;
+ }
+ FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
+ for (int ii = 0; ii < count * 2; ii ++) {
+ pPosArray[ii] *= ratio_h;
+ }
+ FX_FLOAT baseline = pText->m_PosY;
+ CTextBaseLine* pBaseLine = NULL;
+ FX_FLOAT topy = pText->m_Top;
+ FX_FLOAT bottomy = pText->m_Bottom;
+ FX_FLOAT leftx = pText->m_Left;
+ int cc = 0;
+ CFX_ByteString segment;
+ int space_count = 0;
+ FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
+ for (int i = 0; i < pText->m_nChars; i ++) {
+ FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
+ if (charcode == (FX_DWORD) - 1) {
+ continue;
+ }
+ FX_FLOAT char_left = pPosArray[cc * 2];
+ FX_FLOAT char_right = pPosArray[cc * 2 + 1];
+ cc ++;
+ if (char_left < last_left || (char_left - last_right) > spacew / 2) {
+ pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
+ topy, bottomy, spacew, fontsize_v, segment, pFont);
+ segment_left = char_left;
+ segment = "";
+ }
+ CFX_WideString wCh = pText->GetFont()->UnicodeFromCharCode(charcode);
+ FX_DWORD ch = wCh.GetLength() > 0 ? wCh.GetAt(0) : charcode;
+ if (space_count > 1) {
+ pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
+ topy, bottomy, spacew, fontsize_v, segment, pFont);
+ segment = "";
+ } else if (space_count == 1) {
+ pFont->AppendChar(segment, ' ');
+ }
+ if (segment.GetLength() == 0) {
+ segment_left = char_left;
+ }
+ segment_right = char_right;
+ pFont->AppendChar(segment, charcode);
+ space_count = 0;
+ last_left = char_left;
+ last_right = char_right;
+ }
+ if (segment.GetLength())
+ pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
+ topy, bottomy, spacew, fontsize_v, segment, pFont);
+ FX_Free(pPosArray);
+}
+static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont);
+CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
+ FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
+ CFX_ByteString& str, CPDF_Font* pFont)
+{
+ if (str.GetLength() == 0) {
+ return NULL;
+ }
+ if (pBaseLine == NULL) {
+ int i;
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ if (pExistLine->m_BaseLine == basey) {
+ pBaseLine = pExistLine;
+ break;
+ }
+ if (pExistLine->m_BaseLine < basey) {
+ break;
+ }
+ }
+ if (pBaseLine == NULL) {
+ pBaseLine = FX_NEW CTextBaseLine;
+ if (NULL == pBaseLine) {
+ return NULL;
+ }
+ pBaseLine->m_BaseLine = basey;
+ m_BaseLines.InsertAt(i, pBaseLine);
+ }
+ }
+ CFX_WideString text;
+ FX_LPCSTR pStr = str;
+ int len = str.GetLength(), offset = 0;
+ while (offset < len) {
+ FX_DWORD ch = pFont->GetNextChar(pStr, offset);
+ CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
+ text += unicode_str;
+ }
+ pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
+ return pBaseLine;
+}
+void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
+{
+ FX_FLOAT lastheight = -1;
+ FX_FLOAT lastbaseline = -1;
+ FX_FLOAT MinLeftX = 1000000;
+ FX_FLOAT MaxRightX = 0;
+ int i;
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ FX_FLOAT leftx, rightx;
+ if (pBaseLine->GetWidth(leftx, rightx)) {
+ if (leftx < MinLeftX) {
+ MinLeftX = leftx;
+ }
+ if (rightx > MaxRightX) {
+ MaxRightX = rightx;
+ }
+ }
+ }
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ pBaseLine->MergeBoxes();
+ }
+ for (i = 1; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
+ if (pBaseLine->CanMerge(pPrevLine)) {
+ pPrevLine->Merge(pBaseLine);
+ delete pBaseLine;
+ m_BaseLines.RemoveAt(i);
+ i --;
+ }
+ }
+ if (m_bAutoWidth) {
+ int* widths = FX_Alloc(int, m_BaseLines.GetSize());
+ if (widths) {
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ widths[i] = 0;
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ int TotalChars = 0;
+ FX_FLOAT TotalWidth = 0;
+ int minchars;
+ pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
+ if (TotalChars) {
+ FX_FLOAT charwidth = TotalWidth / TotalChars;
+ widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
+ }
+ if (widths[i] > 1000) {
+ widths[i] = 1000;
+ }
+ if (widths[i] < minchars) {
+ widths[i] = minchars;
+ }
+ }
+ int AvgWidth = 0, widthcount = 0;
+ for (i = 0; i < m_BaseLines.GetSize(); i ++)
+ if (widths[i]) {
+ AvgWidth += widths[i];
+ widthcount ++;
+ }
+ AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
+ int MaxWidth = 0;
+ for (i = 0; i < m_BaseLines.GetSize(); i ++)
+ if (MaxWidth < widths[i]) {
+ MaxWidth = widths[i];
+ }
+ if (MaxWidth > AvgWidth * 6 / 5) {
+ MaxWidth = AvgWidth * 6 / 5;
+ }
+ FX_Free(widths);
+ if (iMinWidth < MaxWidth) {
+ iMinWidth = MaxWidth;
+ }
+ }
+ }
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ pBaseLine->MergeBoxes();
+ }
+ if (m_bKeepColumn) {
+ FindColumns();
+ }
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ if (lastheight >= 0) {
+ FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
+ if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
+ lines.Add(L"");
+ }
+ }
+ lastheight = pBaseLine->m_MaxFontSizeV;
+ lastbaseline = pBaseLine->m_BaseLine;
+ CFX_WideString str;
+ pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
+ lines.Add(str);
+ }
+}
+void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
+{
+ wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
+ FX_LPWSTR pDst = NULL;
+ FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
+ if (nCount < 1 ) {
+ sDest += wChar;
+ return;
+ }
+ pDst = new FX_WCHAR[nCount];
+ FX_Unicode_GetNormalization(wChar, pDst);
+ for (int nIndex = 0; nIndex < nCount; nIndex++) {
+ sDest += pDst[nIndex];
+ }
+ delete[] pDst;
+}
+void NormalizeString(CFX_WideString& str)
+{
+ if (str.GetLength() <= 0) {
+ return;
+ }
+ CFX_WideString sBuffer;
+ IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
+ if (NULL == BidiChar) {
+ return;
+ }
+ CFX_WordArray order;
+ FX_BOOL bR2L = FALSE;
+ FX_INT32 start = 0, count = 0, i = 0;
+ int nR2L = 0, nL2R = 0;
+ for (i = 0; i < str.GetLength(); i++) {
+ if(BidiChar->AppendChar(str.GetAt(i))) {
+ FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
+ order.Add(start);
+ order.Add(count);
+ order.Add(ret);
+ if(!bR2L) {
+ if(ret == 2) {
+ nR2L++;
+ } else if (ret == 1) {
+ nL2R++;
+ }
+ }
+ }
+ }
+ if(BidiChar->EndChar()) {
+ FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
+ order.Add(start);
+ order.Add(count);
+ order.Add(ret);
+ if(!bR2L) {
+ if(ret == 2) {
+ nR2L++;
+ } else if(ret == 1) {
+ nL2R++;
+ }
+ }
+ }
+ if(nR2L > 0 && nR2L >= nL2R) {
+ bR2L = TRUE;
+ }
+ if(bR2L) {
+ int count = order.GetSize();
+ for(int j = count - 1; j > 0; j -= 3) {
+ int ret = order.GetAt(j);
+ int start = order.GetAt(j - 2);
+ int count1 = order.GetAt(j - 1);
+ if(ret == 2 || ret == 0) {
+ for(int i = start + count1 - 1; i >= start; i--) {
+ NormalizeCompositeChar(str[i], sBuffer);
+ }
+ } else {
+ i = j;
+ FX_BOOL bSymbol = FALSE;
+ while(i > 0 && order.GetAt(i) != 2) {
+ bSymbol = !order.GetAt(i);
+ i -= 3;
+ }
+ int end = start + count1 ;
+ int n = 0;
+ if(bSymbol) {
+ n = i + 6;
+ } else {
+ n = i + 3;
+ }
+ if(n >= j) {
+ for(int m = start; m < end; m++) {
+ sBuffer += str[m];
+ }
+ } else {
+ i = j;
+ j = n;
+ for(; n <= i; n += 3) {
+ int ret = order.GetAt(n);
+ int start = order.GetAt(n - 2);
+ int count1 = order.GetAt(n - 1);
+ int end = start + count1 ;
+ for(int m = start; m < end; m++) {
+ sBuffer += str[m];
+ }
+ }
+ }
+ }
+ }
+ } else {
+ int count = order.GetSize();
+ FX_BOOL bL2R = FALSE;
+ for(int j = 0; j < count; j += 3) {
+ int ret = order.GetAt(j + 2);
+ int start = order.GetAt(j);
+ int count1 = order.GetAt(j + 1);
+ if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
+ int i = j + 3;
+ while(bR2L && i < count) {
+ if(order.GetAt(i + 2) == 1) {
+ break;
+ } else {
+ i += 3;
+ }
+ }
+ if(i == 3) {
+ j = -3;
+ bL2R = TRUE;
+ continue;
+ }
+ int end = str.GetLength() - 1;
+ if(i < count) {
+ end = order.GetAt(i) - 1;
+ }
+ j = i - 3;
+ for(int n = end; n >= start; n--) {
+ NormalizeCompositeChar(str[i], sBuffer);
+ }
+ } else {
+ int end = start + count1 ;
+ for(int i = start; i < end; i++) {
+ sBuffer += str[i];
+ }
+ }
+ }
+ }
+ str.Empty();
+ str += sBuffer;
+ BidiChar->Release();
+}
+static FX_BOOL IsNumber(CFX_WideString& str)
+{
+ for (int i = 0; i < str.GetLength(); i ++) {
+ FX_WCHAR ch = str[i];
+ if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
+ return FALSE;
+ }
+ }
+ return TRUE;
+}
+void CTextPage::FindColumns()
+{
+ int i;
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
+ CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
+ CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
+ if (pColumn == NULL) {
+ pColumn = FX_NEW CTextColumn;
+ if (pColumn) {
+ pColumn->m_Count = 1;
+ pColumn->m_AvgPos = pTextBox->m_Right;
+ pColumn->m_TextPos = -1;
+ m_TextColumns.Add(pColumn);
+ }
+ } else {
+ pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
+ (pColumn->m_Count + 1);
+ pColumn->m_Count ++;
+ }
+ }
+ }
+ int mincount = m_BaseLines.GetSize() / 4;
+ for (i = 0; i < m_TextColumns.GetSize(); i ++) {
+ CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
+ if (pTextColumn->m_Count >= mincount) {
+ continue;
+ }
+ delete pTextColumn;
+ m_TextColumns.RemoveAt(i);
+ i --;
+ }
+ for (i = 0; i < m_BaseLines.GetSize(); i ++) {
+ CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
+ for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
+ CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
+ if (IsNumber(pTextBox->m_Text)) {
+ pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
+ }
+ }
+ }
+}
+CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
+{
+ for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
+ CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
+ if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
+ return pColumn;
+ }
+ }
+ return NULL;
+}
+void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
+{
+}
+CTextBaseLine::CTextBaseLine()
+{
+ m_Top = -100000;
+ m_Bottom = 100000;
+ m_MaxFontSizeV = 0;
+}
+CTextBaseLine::~CTextBaseLine()
+{
+ for (int i = 0; i < m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ delete pText;
+ }
+}
+void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
+ FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
+{
+ if (m_Top < topy) {
+ m_Top = topy;
+ }
+ if (m_Bottom > bottomy) {
+ m_Bottom = bottomy;
+ }
+ if (m_MaxFontSizeV < fontsize_v) {
+ m_MaxFontSizeV = fontsize_v;
+ }
+ int i;
+ for (i = 0; i < m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ if (pText->m_Left > leftx) {
+ break;
+ }
+ }
+ CTextBox* pText = FX_NEW CTextBox;
+ if (NULL == pText) {
+ return;
+ }
+ pText->m_Text = text;
+ pText->m_Left = leftx;
+ pText->m_Right = rightx;
+ pText->m_Top = topy;
+ pText->m_Bottom = bottomy;
+ pText->m_SpaceWidth = spacew;
+ pText->m_FontSizeV = fontsize_v;
+ pText->m_pColumn = NULL;
+ m_TextList.InsertAt(i, pText);
+}
+FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
+ FX_FLOAT& interlow, FX_FLOAT& interhigh);
+FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
+{
+ FX_FLOAT inter_top, inter_bottom;
+ if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
+ inter_bottom, inter_top)) {
+ return FALSE;
+ }
+ FX_FLOAT inter_h = inter_top - inter_bottom;
+ if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
+ return FALSE;
+ }
+ FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
+ for (int i = 0; i < m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ FX_FLOAT width = pText->m_Right - pText->m_Left;
+ for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
+ CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
+ FX_FLOAT inter_left, inter_right;
+ if (!GetIntersection(pText->m_Left, pText->m_Right,
+ pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
+ continue;
+ }
+ FX_FLOAT inter_w = inter_right - inter_left;
+ if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
+ continue;
+ }
+ if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
+ dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
+ return FALSE;
+ }
+ }
+ }
+ return TRUE;
+}
+void CTextBaseLine::Merge(CTextBaseLine* pOther)
+{
+ for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
+ InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
+ pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
+ }
+}
+FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
+{
+ int i;
+ for (i = 0; i < m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ if (pText->m_Text != L" ") {
+ break;
+ }
+ }
+ if (i == m_TextList.GetSize()) {
+ return FALSE;
+ }
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ leftx = pText->m_Left;
+ for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ if (pText->m_Text != L" ") {
+ break;
+ }
+ }
+ pText = (CTextBox*)m_TextList.GetAt(i);
+ rightx = pText->m_Right;
+ return TRUE;
+}
+void CTextBaseLine::MergeBoxes()
+{
+ int i = 0;
+ while (1) {
+ if (i >= m_TextList.GetSize() - 1) {
+ break;
+ }
+ CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
+ CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
+ FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
+ FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
+ pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
+ if (spacew > 0.0 && dx < spacew * 2) {
+ pThisText->m_Right = pNextText->m_Right;
+ if (dx > spacew * 1.5) {
+ pThisText->m_Text += L" ";
+ } else if (dx > spacew / 3) {
+ pThisText->m_Text += L' ';
+ }
+ pThisText->m_Text += pNextText->m_Text;
+ pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
+ spacew : pNextText->m_SpaceWidth;
+ m_TextList.RemoveAt(i + 1);
+ delete pNextText;
+ } else {
+ i ++;
+ }
+ }
+}
+void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
+ int iTextWidth)
+{
+ int lastpos = -1;
+ for (int i = 0; i < m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ int xpos;
+ if (pText->m_pColumn) {
+ xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
+ xpos -= pText->m_Text.GetLength();
+ } else {
+ xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
+ }
+ if (xpos <= lastpos) {
+ xpos = lastpos + 1;
+ }
+ for (int j = lastpos + 1; j < xpos; j ++) {
+ str += ' ';
+ }
+ CFX_WideString sSrc(pText->m_Text);
+ NormalizeString(sSrc);
+ str += sSrc;
+ str += ' ';
+ lastpos = xpos + pText->m_Text.GetLength();
+ }
+}
+void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
+{
+ minchars = 0;
+ for (int i = 0; i < m_TextList.GetSize(); i ++) {
+ CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
+ if (pText->m_Right - pText->m_Left < 0.002) {
+ continue;
+ }
+ count += pText->m_Text.GetLength();
+ width += pText->m_Right - pText->m_Left;
+ minchars += pText->m_Text.GetLength() + 1;
+ }
+}
+#define PI 3.1415926535897932384626433832795
+static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
+{
+ int total_count = 0, rotated_count[3] = {0, 0, 0};
+ FX_POSITION pos = page.GetFirstObjectPosition();
+ while (pos) {
+ CPDF_PageObject* pObj = page.GetNextObject(pos);
+ if (pObj->m_Type != PDFPAGE_TEXT) {
+ continue;
+ }
+ total_count ++;
+ CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
+ FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
+ if (angle == 0.0) {
+ continue;
+ }
+ int degree = (int)(angle * 180 / PI + 0.5);
+ if (degree % 90) {
+ continue;
+ }
+ if (degree < 0) {
+ degree += 360;
+ }
+ int index = degree / 90 % 3 - 1;
+ if (index < 0) {
+ continue;
+ }
+ rotated_count[index] ++;
+ }
+ if (total_count == 0) {
+ return;
+ }
+ CFX_AffineMatrix matrix;
+ if (rotated_count[0] > total_count * 2 / 3) {
+ matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
+ } else if (rotated_count[1] > total_count * 2 / 3) {
+ matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
+ } else if (rotated_count[2] > total_count * 2 / 3) {
+ matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
+ } else {
+ return;
+ }
+ page.Transform(matrix);
+ page_bbox.Transform(&matrix);
+}
+void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
+ int iMinWidth, FX_DWORD flags)
+{
+ lines.RemoveAll();
+ if (pPage == NULL) {
+ return;
+ }
+ CPDF_Page page;
+ page.Load(pDoc, pPage);
+ CPDF_ParseOptions options;
+ options.m_bTextOnly = TRUE;
+ options.m_bSeparateForm = FALSE;
+ page.ParseContent(&options);
+ CFX_FloatRect page_bbox = page.GetPageBBox();
+ if (flags & PDF2TXT_AUTO_ROTATE) {
+ CheckRotate(page, page_bbox);
+ }
+ CTextPage texts;
+ texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
+ texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
+ texts.m_bBreakSpace = TRUE;
+ FX_POSITION pos = page.GetFirstObjectPosition();
+ while (pos) {
+ CPDF_PageObject* pObject = page.GetNextObject(pos);
+ if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
+ CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
+ if (!page_bbox.Contains(rect)) {
+ continue;
+ }
+ }
+ texts.ProcessObject(pObject);
+ }
+ texts.WriteOutput(lines, iMinWidth);
+}
+void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
+ int iMinWidth, FX_DWORD flags)
+{
+ lines.RemoveAll();
+ CFX_WideStringArray wlines;
+ PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
+ for (int i = 0; i < wlines.GetSize(); i ++) {
+ CFX_WideString wstr = wlines[i];
+ CFX_ByteString str;
+ for (int c = 0; c < wstr.GetLength(); c ++) {
+ str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
+ }
+ lines.Add(str);
+ }
+}
+#endif
+extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
+ CFX_PtrArray* pObjArray);
+void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
+{
+ buffer.EstimateSize(0, 10240);
+ CPDF_Page page;
+ page.Load(pDoc, pPage);
+ CPDF_ParseOptions options;
+ options.m_bTextOnly = TRUE;
+ options.m_bSeparateForm = FALSE;
+ page.ParseContent(&options);
+ _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
+}