author     Bo Xu <bo_xu@foxitsoftware.com>  2014-10-28 23:03:33 -0700
committer  Bo Xu <bo_xu@foxitsoftware.com>  2014-11-03 11:10:11 -0800
commit     fdc00a7042d912aafaabddae4d9c84199921ef23 (patch)
tree       32ab8ac91cc68d2cd15b9168782a71b3f3f5e7b9 /xfa_test/pdf/document_loader.cc
parent     e9b38fa38de2c95d8260be31c57d9272c4d127ed (diff)
Merge XFA to PDFium master at 4dc95e7 on 10/28/2014
Diffstat (limited to 'xfa_test/pdf/document_loader.cc')
-rw-r--r--  xfa_test/pdf/document_loader.cc  513
1 files changed, 513 insertions, 0 deletions
diff --git a/xfa_test/pdf/document_loader.cc b/xfa_test/pdf/document_loader.cc
new file mode 100644
index 0000000000..b2628a6271
--- /dev/null
+++ b/xfa_test/pdf/document_loader.cc
@@ -0,0 +1,513 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "pdf/document_loader.h"
+
+#include "base/logging.h"
+#include "base/strings/string_util.h"
+#include "net/http/http_util.h"
+#include "ppapi/c/pp_errors.h"
+#include "ppapi/cpp/url_loader.h"
+#include "ppapi/cpp/url_request_info.h"
+#include "ppapi/cpp/url_response_info.h"
+
+namespace chrome_pdf {
+
+// Documents below this size will be downloaded in one chunk.
+const uint32 kMinFileSize = 64*1024;
+
+DocumentLoader::DocumentLoader(Client* client)
+ : client_(client), partial_document_(false), request_pending_(false),
+ current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
+ document_size_(0), header_request_(true), is_multipart_(false) {
+ loader_factory_.Initialize(this);
+}
+
+DocumentLoader::~DocumentLoader() {
+}
+
+bool DocumentLoader::Init(const pp::URLLoader& loader,
+ const std::string& url,
+ const std::string& headers) {
+ DCHECK(url_.empty());
+ url_ = url;
+ loader_ = loader;
+
+ std::string response_headers;
+ if (!headers.empty()) {
+ response_headers = headers;
+ } else {
+ pp::URLResponseInfo response = loader_.GetResponseInfo();
+ pp::Var headers_var = response.GetHeaders();
+
+ if (headers_var.is_string()) {
+ response_headers = headers_var.AsString();
+ }
+ }
+
+ bool accept_ranges_bytes = false;
+ bool content_encoded = false;
+ uint32 content_length = 0;
+ std::string type;
+ std::string disposition;
+ if (!response_headers.empty()) {
+ net::HttpUtil::HeadersIterator it(response_headers.begin(),
+ response_headers.end(), "\n");
+ while (it.GetNext()) {
+ if (LowerCaseEqualsASCII(it.name(), "content-length")) {
+ content_length = atoi(it.values().c_str());
+ } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
+ accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
+ } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
+ content_encoded = true;
+ } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
+ type = it.values();
+ size_t semi_colon_pos = type.find(';');
+ if (semi_colon_pos != std::string::npos) {
+ type = type.substr(0, semi_colon_pos);
+ }
+ TrimWhitespace(type, base::TRIM_ALL, &type);
+ } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
+ disposition = it.values();
+ }
+ }
+ }
+ if (!type.empty() &&
+ !EndsWith(type, "/pdf", false) &&
+ !EndsWith(type, ".pdf", false) &&
+ !EndsWith(type, "/x-pdf", false) &&
+ !EndsWith(type, "/*", false) &&
+ !EndsWith(type, "/acrobat", false) &&
+ !EndsWith(type, "/unknown", false)) {
+ return false;
+ }
+ if (StartsWithASCII(disposition, "attachment", false)) {
+ return false;
+ }
+
+ if (content_length > 0)
+ chunk_stream_.Preallocate(content_length);
+
+ document_size_ = content_length;
+ requests_count_ = 0;
+
+  // Enable partial loading only if the file size is above the threshold;
+  // loading small files in a single request avoids the latency of issuing
+  // multiple requests.
+ if (content_length > kMinFileSize &&
+ accept_ranges_bytes &&
+ !content_encoded) {
+ LoadPartialDocument();
+ } else {
+ LoadFullDocument();
+ }
+ return true;
+}
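+
+// For illustration of Init() above, with hypothetical responses: a 500KB PDF
+// served with "Accept-Ranges: bytes" and no "Content-Encoding" header goes
+// through LoadPartialDocument(), while a 10KB file or a gzip-encoded response
+// falls back to LoadFullDocument().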
+
+void DocumentLoader::LoadPartialDocument() {
+ partial_document_ = true;
+ // Force the main request to be cancelled, since if we're a full-frame plugin
+ // there could be other references to the loader.
+ loader_.Close();
+ loader_ = pp::URLLoader();
+ // Download file header.
+ header_request_ = true;
+ RequestData(0, std::min(GetRequestSize(), document_size_));
+}
+
+void DocumentLoader::LoadFullDocument() {
+ partial_document_ = false;
+ chunk_buffer_.clear();
+ ReadMore();
+}
+
+bool DocumentLoader::IsDocumentComplete() const {
+ if (document_size_ == 0) // Document size unknown.
+ return false;
+ return IsDataAvailable(0, document_size_);
+}
+
+uint32 DocumentLoader::GetAvailableData() const {
+ if (document_size_ == 0) { // If document size is unknown.
+ return current_pos_;
+ }
+
+ std::vector<std::pair<size_t, size_t> > ranges;
+ chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
+ uint32 available = document_size_;
+ std::vector<std::pair<size_t, size_t> >::iterator it;
+ for (it = ranges.begin(); it != ranges.end(); ++it) {
+ available -= it->second;
+ }
+ return available;
+}
+
+void DocumentLoader::ClearPendingRequests() {
+ // The first item in the queue is pending (need to keep it in the queue).
+ if (pending_requests_.size() > 1) {
+ // Remove all elements except the first one.
+ pending_requests_.erase(++pending_requests_.begin(),
+ pending_requests_.end());
+ }
+}
+
+bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
+ return chunk_stream_.ReadData(position, size, buf);
+}
+
+bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
+ return chunk_stream_.IsRangeAvailable(position, size);
+}
+
+void DocumentLoader::RequestData(uint32 position, uint32 size) {
+ DCHECK(partial_document_);
+
+  // We sometimes get a spurious request from
+  // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after the
+  // document is already complete.
+  // This needs a fix in PDFium; the early return below is a workaround.
+ // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
+ // Test url:
+ // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
+ if (IsDocumentComplete())
+ return;
+
+ pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
+ DownloadPendingRequests();
+}
+
+void DocumentLoader::DownloadPendingRequests() {
+ if (request_pending_ || pending_requests_.empty())
+ return;
+
+  // Remove already completed requests.
+  // By design DownloadPendingRequests() should have at least 1 request in the
+  // queue. ReadComplete() will remove the last pending request from the queue.
+ while (pending_requests_.size() > 1) {
+ if (IsDataAvailable(pending_requests_.front().first,
+ pending_requests_.front().second)) {
+ pending_requests_.pop_front();
+ } else {
+ break;
+ }
+ }
+
+ uint32 pos = pending_requests_.front().first;
+ uint32 size = pending_requests_.front().second;
+ if (IsDataAvailable(pos, size)) {
+ ReadComplete();
+ return;
+ }
+
+  // If the current request has already been partially downloaded, split it
+  // into a few smaller requests covering only the missing ranges.
+ std::vector<std::pair<size_t, size_t> > ranges;
+ chunk_stream_.GetMissedRanges(pos, size, &ranges);
+ if (ranges.size() > 0) {
+ pending_requests_.pop_front();
+ pending_requests_.insert(pending_requests_.begin(),
+ ranges.begin(), ranges.end());
+ pos = pending_requests_.front().first;
+ size = pending_requests_.front().second;
+ }
+
+ uint32 cur_request_size = GetRequestSize();
+  // If the size is less than the default request size, try to expand the
+  // download range to make the download more efficient.
+ if (size < cur_request_size && partial_document_) {
+ // First, try to expand block towards the end of the file.
+ uint32 new_pos = pos;
+ uint32 new_size = cur_request_size;
+ if (pos + new_size > document_size_)
+ new_size = document_size_ - pos;
+
+ std::vector<std::pair<size_t, size_t> > ranges;
+ if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
+ new_pos = ranges[0].first;
+ new_size = ranges[0].second;
+ }
+
+ // Second, try to expand block towards the beginning of the file.
+ if (new_size < cur_request_size) {
+ uint32 block_end = new_pos + new_size;
+ if (block_end > cur_request_size) {
+ new_pos = block_end - cur_request_size;
+ } else {
+ new_pos = 0;
+ }
+ new_size = block_end - new_pos;
+
+ if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
+ new_pos = ranges.back().first;
+ new_size = ranges.back().second;
+ }
+ }
+ pos = new_pos;
+ size = new_size;
+ }
+
+ size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
+ size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
+ if (pos - last_byte_before < cur_request_size) {
+ size = pos + size - last_byte_before;
+ pos = last_byte_before;
+ }
+
+ if ((pos + size < first_byte_after) &&
+ (pos + size + cur_request_size >= first_byte_after))
+ size = first_byte_after - pos;
+
+ request_pending_ = true;
+
+ // Start downloading first pending request.
+ loader_.Close();
+ loader_ = client_->CreateURLLoader();
+ pp::CompletionCallback callback =
+ loader_factory_.NewCallback(&DocumentLoader::DidOpen);
+ pp::URLRequestInfo request = GetRequest(pos, size);
+ requests_count_++;
+ int rv = loader_.Open(request, callback);
+ if (rv != PP_OK_COMPLETIONPENDING)
+ callback.Run(rv);
+}
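+
+// For illustration, with hypothetical values: if GetRequestSize() is 32768 and
+// nothing has been downloaded near offset 100000 yet, a pending 1KB request at
+// that offset grows into a single 32KB request starting at offset 100000.
+// Gaps of up to about one request size between the request and neighbouring
+// already-downloaded data are folded into the request as well.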
+
+pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
+ uint32 size) const {
+ pp::URLRequestInfo request(client_->GetPluginInstance());
+ request.SetURL(url_.c_str());
+ request.SetMethod("GET");
+ request.SetFollowRedirects(true);
+
+ const size_t kBufSize = 100;
+ char buf[kBufSize];
+  // According to RFC 2616, a byte range specifies the positions of the first
+  // and last bytes of the requested range inclusively. Therefore we subtract 1
+  // from position + size to get the index of the last byte that needs to be
+  // downloaded.
+ base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
+ position + size - 1);
+ pp::Var header(buf);
+ request.SetHeaders(header);
+
+ return request;
+}
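+
+// For example, a hypothetical GetRequest(0, 32768) call produces the header
+// "Range: bytes=0-32767"; the end index is inclusive, hence position + size - 1.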
+
+void DocumentLoader::DidOpen(int32_t result) {
+ if (result != PP_OK) {
+ NOTREACHED();
+ return;
+ }
+
+ int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
+ if (http_code >= 400 && http_code < 500) {
+    // Error accessing the resource. A 4xx error indicates that subsequent
+    // requests will fail too, e.g. when the resource has been removed from
+    // the server while we were loading it.
+ // https://code.google.com/p/chromium/issues/detail?id=414827
+ return;
+ }
+
+ is_multipart_ = false;
+ current_chunk_size_ = 0;
+ current_chunk_read_ = 0;
+
+ pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
+ std::string headers;
+ if (headers_var.is_string())
+ headers = headers_var.AsString();
+
+ std::string boundary = GetMultiPartBoundary(headers);
+ if (boundary.size()) {
+    // Leave the position untouched for now; we'll get it when we read the data.
+ is_multipart_ = true;
+ multipart_boundary_ = boundary;
+ } else {
+    // Need to make sure that the server returned a byte range, since it's
+    // possible for a server to simply ignore our byte-range request and
+    // return the entire document even if it supports byte-range requests.
+    // E.g. sniff the response to
+    // http://www.act.org/compass/sample/pdf/geometry.pdf
+ current_pos_ = 0;
+ uint32 start_pos, end_pos;
+ if (GetByteRange(headers, &start_pos, &end_pos)) {
+ current_pos_ = start_pos;
+ if (end_pos && end_pos > start_pos)
+ current_chunk_size_ = end_pos - start_pos + 1;
+ }
+ }
+
+ ReadMore();
+}
+
+bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
+ uint32* end) {
+ net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
+ while (it.GetNext()) {
+ if (LowerCaseEqualsASCII(it.name(), "content-range")) {
+ std::string range = it.values().c_str();
+ if (StartsWithASCII(range, "bytes", false)) {
+ range = range.substr(strlen("bytes"));
+ std::string::size_type pos = range.find('-');
+ std::string range_end;
+ if (pos != std::string::npos)
+ range_end = range.substr(pos + 1);
+ TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
+ TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
+ *start = atoi(range.c_str());
+ *end = atoi(range_end.c_str());
+ return true;
+ }
+ }
+ }
+ return false;
+}
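+
+// For example, given the hypothetical header
+// "Content-Range: bytes 0-12287/513216", GetByteRange() sets *start to 0 and
+// *end to 12287; atoi() stops at the '/', so the total-size suffix is ignored.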
+
+std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
+ net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
+ while (it.GetNext()) {
+ if (LowerCaseEqualsASCII(it.name(), "content-type")) {
+ std::string type = base::StringToLowerASCII(it.values());
+ if (StartsWithASCII(type, "multipart/", true)) {
+ const char* boundary = strstr(type.c_str(), "boundary=");
+ if (!boundary) {
+ NOTREACHED();
+ break;
+ }
+
+ return std::string(boundary + 9);
+ }
+ }
+ }
+ return std::string();
+}
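+
+// For example, for the hypothetical header
+// "Content-Type: multipart/byteranges; boundary=SEPARATOR" this returns
+// "separator": the value comes from the lower-cased header, and everything
+// after "boundary=" (including any trailing parameters) is kept.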
+
+void DocumentLoader::ReadMore() {
+ pp::CompletionCallback callback =
+ loader_factory_.NewCallback(&DocumentLoader::DidRead);
+ int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
+ if (rv != PP_OK_COMPLETIONPENDING)
+ callback.Run(rv);
+}
+
+void DocumentLoader::DidRead(int32_t result) {
+ if (result > 0) {
+ char* start = buffer_;
+ size_t length = result;
+ if (is_multipart_ && result > 2) {
+ for (int i = 2; i < result; ++i) {
+ if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
+ (i >= 4 &&
+ buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
+ buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
+ uint32 start_pos, end_pos;
+ if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
+ current_pos_ = start_pos;
+ start += i;
+ length -= i;
+ if (end_pos && end_pos > start_pos)
+ current_chunk_size_ = end_pos - start_pos + 1;
+ }
+ break;
+ }
+ }
+
+ // Reset this flag so we don't look inside the buffer in future calls of
+ // DidRead for this response. Note that this code DOES NOT handle multi-
+ // part responses with more than one part (we don't issue them at the
+ // moment, so they shouldn't arrive).
+ is_multipart_ = false;
+ }
+
+ if (current_chunk_size_ &&
+ current_chunk_read_ + length > current_chunk_size_)
+ length = current_chunk_size_ - current_chunk_read_;
+
+ if (length) {
+ if (document_size_ > 0) {
+ chunk_stream_.WriteData(current_pos_, start, length);
+ } else {
+        // If we did not get a content-length in the response, we can't
+        // preallocate a buffer for the entire document. Resizing one array
+        // causes memory fragmentation and OOM exceptions on large files.
+        // To avoid this, we collect all chunks of the file in a list and
+        // concatenate them together after the request is complete.
+ chunk_buffer_.push_back(std::vector<unsigned char>());
+ chunk_buffer_.back().resize(length);
+ memcpy(&(chunk_buffer_.back()[0]), start, length);
+ }
+ current_pos_ += length;
+ current_chunk_read_ += length;
+ client_->OnNewDataAvailable();
+ }
+ ReadMore();
+ } else if (result == PP_OK) {
+ ReadComplete();
+ } else {
+ NOTREACHED();
+ }
+}
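+
+// For illustration: in a hypothetical multipart response, the first read starts
+// with the part headers (e.g. a "Content-Range: bytes ..." line) followed by a
+// blank line and then the PDF payload. DidRead() above locates the blank line,
+// takes the byte range from those headers, and skips past them so that only
+// the payload is written to the chunk stream.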
+
+void DocumentLoader::ReadComplete() {
+ if (!partial_document_) {
+ if (document_size_ == 0) {
+      // For a document with no 'content-length' specified we've already
+      // collected all the chunks. Allocate the final document buffer and copy
+      // them over.
+ chunk_stream_.Preallocate(current_pos_);
+ uint32 pos = 0;
+ std::list<std::vector<unsigned char> >::iterator it;
+ for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
+ chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
+ pos += it->size();
+ }
+ chunk_buffer_.clear();
+ }
+ document_size_ = current_pos_;
+ client_->OnDocumentComplete();
+ return;
+ }
+
+ request_pending_ = false;
+ pending_requests_.pop_front();
+
+  // If there are more pending requests, continue downloading.
+ if (!pending_requests_.empty()) {
+ DownloadPendingRequests();
+ return;
+ }
+
+ if (IsDocumentComplete()) {
+ client_->OnDocumentComplete();
+ return;
+ }
+
+ if (header_request_)
+ client_->OnPartialDocumentLoaded();
+ else
+ client_->OnPendingRequestComplete();
+ header_request_ = false;
+
+ // The OnPendingRequestComplete could have added more requests.
+ if (!pending_requests_.empty()) {
+ DownloadPendingRequests();
+ } else {
+    // The document is not complete and we have no outstanding requests.
+    // Let's keep downloading the PDF file in small chunks.
+ uint32 pos = chunk_stream_.GetFirstMissingByte();
+ std::vector<std::pair<size_t, size_t> > ranges;
+ chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
+ DCHECK(ranges.size() > 0);
+ RequestData(ranges[0].first, ranges[0].second);
+ }
+}
+
+uint32 DocumentLoader::GetRequestSize() const {
+  // Document loading strategy:
+  // For the first 10 requests we use 32k chunk sizes; for the next 10 requests
+  // we double the size (64k), and so on, until we cap the maximum request size
+  // at 2M for 71 or more requests.
+ uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u);
+ return 32*1024 * (1 << ((limited_count - 1) / 10u));
+}
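+
+// Illustrative progression of the formula above:
+//   requests_count_  0..10 -> 32KB    31..40 -> 256KB    61..70 -> 2MB
+//   requests_count_ 11..20 -> 64KB    41..50 -> 512KB    71+    -> 2MB (clamped)
+//   requests_count_ 21..30 -> 128KB   51..60 -> 1MB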
+
+} // namespace chrome_pdf