blob: 28a6ee4ebd74dfc643fd49759ddde124f7847d0b [file] [log] [blame]
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
#include "pdf/document_loader.h"

#include <stdlib.h>

#include "base/logging.h"
#include "base/strings/string_util.h"
#include "net/http/http_util.h"
#include "ppapi/c/pp_errors.h"
#include "ppapi/cpp/url_loader.h"
#include "ppapi/cpp/url_request_info.h"
#include "ppapi/cpp/url_response_info.h"
14
15namespace chrome_pdf {
16
thestig945cd0cb2015-05-28 01:58:0517namespace {
18
// Size threshold in bytes. Only documents strictly larger than this are
// eligible for partial (byte-range) loading; anything else is fetched with a
// single request.
const uint32_t kMinFileSize = 64u * 1024u;
21
22// If the headers have a byte-range response, writes the start and end
23// positions and returns true if at least the start position was parsed.
24// The end position will be set to 0 if it was not found or parsed from the
25// response.
26// Returns false if not even a start position could be parsed.
27bool GetByteRange(const std::string& headers, uint32_t* start, uint32_t* end) {
28 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
29 while (it.GetNext()) {
brettwbc17d2c82015-06-09 22:39:0830 if (base::LowerCaseEqualsASCII(it.name(), "content-range")) {
thestig945cd0cb2015-05-28 01:58:0531 std::string range = it.values().c_str();
brettw44ce0ec52015-06-12 01:57:5732 if (base::StartsWithASCII(range, "bytes", false)) {
thestig945cd0cb2015-05-28 01:58:0533 range = range.substr(strlen("bytes"));
34 std::string::size_type pos = range.find('-');
35 std::string range_end;
36 if (pos != std::string::npos)
37 range_end = range.substr(pos + 1);
38 TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
39 TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
40 *start = atoi(range.c_str());
41 *end = atoi(range_end.c_str());
42 return true;
43 }
44 }
45 }
46 return false;
47}
48
49// If the headers have a multi-part response, returns the boundary name.
50// Otherwise returns an empty string.
51std::string GetMultiPartBoundary(const std::string& headers) {
52 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
53 while (it.GetNext()) {
brettwbc17d2c82015-06-09 22:39:0854 if (base::LowerCaseEqualsASCII(it.name(), "content-type")) {
thestig945cd0cb2015-05-28 01:58:0555 std::string type = base::StringToLowerASCII(it.values());
brettw44ce0ec52015-06-12 01:57:5756 if (base::StartsWithASCII(type, "multipart/", true)) {
thestig945cd0cb2015-05-28 01:58:0557 const char* boundary = strstr(type.c_str(), "boundary=");
58 if (!boundary) {
59 NOTREACHED();
60 break;
61 }
62
63 return std::string(boundary + 9);
64 }
65 }
66 }
67 return std::string();
68}
69
thestig488102f2015-05-29 03:25:2670bool IsValidContentType(const std::string& type) {
brettwa7ff1b292015-07-16 17:49:2971 return (base::EndsWith(type, "/pdf", base::CompareCase::INSENSITIVE_ASCII) ||
72 base::EndsWith(type, ".pdf", base::CompareCase::INSENSITIVE_ASCII) ||
73 base::EndsWith(type, "/x-pdf",
74 base::CompareCase::INSENSITIVE_ASCII) ||
75 base::EndsWith(type, "/*", base::CompareCase::INSENSITIVE_ASCII) ||
76 base::EndsWith(type, "/acrobat",
77 base::CompareCase::INSENSITIVE_ASCII) ||
78 base::EndsWith(type, "/unknown",
79 base::CompareCase::INSENSITIVE_ASCII));
thestig488102f2015-05-29 03:25:2680}
81
thestig945cd0cb2015-05-28 01:58:0582} // namespace
83
84DocumentLoader::Client::~Client() {
85}
[email protected]1b1e9eff2014-05-20 01:56:4086
87DocumentLoader::DocumentLoader(Client* client)
88 : client_(client), partial_document_(false), request_pending_(false),
89 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
90 document_size_(0), header_request_(true), is_multipart_(false) {
91 loader_factory_.Initialize(this);
92}
93
94DocumentLoader::~DocumentLoader() {
95}
96
97bool DocumentLoader::Init(const pp::URLLoader& loader,
98 const std::string& url,
99 const std::string& headers) {
100 DCHECK(url_.empty());
101 url_ = url;
102 loader_ = loader;
103
104 std::string response_headers;
105 if (!headers.empty()) {
106 response_headers = headers;
107 } else {
108 pp::URLResponseInfo response = loader_.GetResponseInfo();
109 pp::Var headers_var = response.GetHeaders();
110
111 if (headers_var.is_string()) {
112 response_headers = headers_var.AsString();
113 }
114 }
115
116 bool accept_ranges_bytes = false;
117 bool content_encoded = false;
thestig945cd0cb2015-05-28 01:58:05118 uint32_t content_length = 0;
[email protected]1b1e9eff2014-05-20 01:56:40119 std::string type;
120 std::string disposition;
thestig488102f2015-05-29 03:25:26121
122 // This happens for PDFs not loaded from http(s) sources.
123 if (response_headers == "Content-Type: text/plain") {
brettw44ce0ec52015-06-12 01:57:57124 if (!base::StartsWithASCII(url, "http://", false) &&
125 !base::StartsWithASCII(url, "https://", false)) {
thestig488102f2015-05-29 03:25:26126 type = "application/pdf";
127 }
128 }
129 if (type.empty() && !response_headers.empty()) {
[email protected]1b1e9eff2014-05-20 01:56:40130 net::HttpUtil::HeadersIterator it(response_headers.begin(),
131 response_headers.end(), "\n");
132 while (it.GetNext()) {
brettwbc17d2c82015-06-09 22:39:08133 if (base::LowerCaseEqualsASCII(it.name(), "content-length")) {
[email protected]1b1e9eff2014-05-20 01:56:40134 content_length = atoi(it.values().c_str());
brettwbc17d2c82015-06-09 22:39:08135 } else if (base::LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
136 accept_ranges_bytes = base::LowerCaseEqualsASCII(it.values(), "bytes");
137 } else if (base::LowerCaseEqualsASCII(it.name(), "content-encoding")) {
[email protected]1b1e9eff2014-05-20 01:56:40138 content_encoded = true;
brettwbc17d2c82015-06-09 22:39:08139 } else if (base::LowerCaseEqualsASCII(it.name(), "content-type")) {
[email protected]1b1e9eff2014-05-20 01:56:40140 type = it.values();
141 size_t semi_colon_pos = type.find(';');
142 if (semi_colon_pos != std::string::npos) {
143 type = type.substr(0, semi_colon_pos);
144 }
145 TrimWhitespace(type, base::TRIM_ALL, &type);
brettwbc17d2c82015-06-09 22:39:08146 } else if (base::LowerCaseEqualsASCII(it.name(), "content-disposition")) {
[email protected]1b1e9eff2014-05-20 01:56:40147 disposition = it.values();
148 }
149 }
150 }
thestig488102f2015-05-29 03:25:26151 if (!type.empty() && !IsValidContentType(type))
[email protected]1b1e9eff2014-05-20 01:56:40152 return false;
brettw44ce0ec52015-06-12 01:57:57153 if (base::StartsWithASCII(disposition, "attachment", false))
[email protected]1b1e9eff2014-05-20 01:56:40154 return false;
[email protected]1b1e9eff2014-05-20 01:56:40155
156 if (content_length > 0)
157 chunk_stream_.Preallocate(content_length);
158
159 document_size_ = content_length;
160 requests_count_ = 0;
161
[email protected]1b1e9eff2014-05-20 01:56:40162 // Enable partial loading only if file size is above the threshold.
163 // It will allow avoiding latency for multiple requests.
164 if (content_length > kMinFileSize &&
165 accept_ranges_bytes &&
166 !content_encoded) {
167 LoadPartialDocument();
168 } else {
169 LoadFullDocument();
170 }
171 return true;
172}
173
174void DocumentLoader::LoadPartialDocument() {
175 partial_document_ = true;
176 // Force the main request to be cancelled, since if we're a full-frame plugin
177 // there could be other references to the loader.
178 loader_.Close();
179 loader_ = pp::URLLoader();
180 // Download file header.
181 header_request_ = true;
182 RequestData(0, std::min(GetRequestSize(), document_size_));
183}
184
185void DocumentLoader::LoadFullDocument() {
186 partial_document_ = false;
187 chunk_buffer_.clear();
188 ReadMore();
189}
190
191bool DocumentLoader::IsDocumentComplete() const {
192 if (document_size_ == 0) // Document size unknown.
193 return false;
194 return IsDataAvailable(0, document_size_);
195}
196
thestig945cd0cb2015-05-28 01:58:05197uint32_t DocumentLoader::GetAvailableData() const {
[email protected]1b1e9eff2014-05-20 01:56:40198 if (document_size_ == 0) { // If document size is unknown.
199 return current_pos_;
200 }
201
202 std::vector<std::pair<size_t, size_t> > ranges;
203 chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
thestig945cd0cb2015-05-28 01:58:05204 uint32_t available = document_size_;
205 for (const auto& range : ranges)
206 available -= range.second;
[email protected]1b1e9eff2014-05-20 01:56:40207 return available;
208}
209
210void DocumentLoader::ClearPendingRequests() {
211 // The first item in the queue is pending (need to keep it in the queue).
212 if (pending_requests_.size() > 1) {
213 // Remove all elements except the first one.
214 pending_requests_.erase(++pending_requests_.begin(),
215 pending_requests_.end());
216 }
217}
218
thestig945cd0cb2015-05-28 01:58:05219bool DocumentLoader::GetBlock(uint32_t position,
220 uint32_t size,
221 void* buf) const {
[email protected]1b1e9eff2014-05-20 01:56:40222 return chunk_stream_.ReadData(position, size, buf);
223}
224
thestig945cd0cb2015-05-28 01:58:05225bool DocumentLoader::IsDataAvailable(uint32_t position, uint32_t size) const {
[email protected]1b1e9eff2014-05-20 01:56:40226 return chunk_stream_.IsRangeAvailable(position, size);
227}
228
thestig945cd0cb2015-05-28 01:58:05229void DocumentLoader::RequestData(uint32_t position, uint32_t size) {
[email protected]1b1e9eff2014-05-20 01:56:40230 DCHECK(partial_document_);
231
232 // We have some artefact request from
233 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
234 // document is complete.
235 // We need this fix in PDFIum. Adding this as a work around.
236 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
237 // Test url:
238 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
239 if (IsDocumentComplete())
240 return;
241
242 pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
243 DownloadPendingRequests();
244}
245
246void DocumentLoader::DownloadPendingRequests() {
247 if (request_pending_ || pending_requests_.empty())
248 return;
249
250 // Remove already completed requests.
251 // By design DownloadPendingRequests() should have at least 1 request in the
252 // queue. ReadComplete() will remove the last pending comment from the queue.
253 while (pending_requests_.size() > 1) {
254 if (IsDataAvailable(pending_requests_.front().first,
255 pending_requests_.front().second)) {
256 pending_requests_.pop_front();
257 } else {
258 break;
259 }
260 }
261
thestig945cd0cb2015-05-28 01:58:05262 uint32_t pos = pending_requests_.front().first;
263 uint32_t size = pending_requests_.front().second;
[email protected]1b1e9eff2014-05-20 01:56:40264 if (IsDataAvailable(pos, size)) {
265 ReadComplete();
266 return;
267 }
268
269 // If current request has been partially downloaded already, split it into
270 // a few smaller requests.
271 std::vector<std::pair<size_t, size_t> > ranges;
272 chunk_stream_.GetMissedRanges(pos, size, &ranges);
thestig945cd0cb2015-05-28 01:58:05273 if (!ranges.empty()) {
[email protected]1b1e9eff2014-05-20 01:56:40274 pending_requests_.pop_front();
275 pending_requests_.insert(pending_requests_.begin(),
276 ranges.begin(), ranges.end());
277 pos = pending_requests_.front().first;
278 size = pending_requests_.front().second;
279 }
280
thestig945cd0cb2015-05-28 01:58:05281 uint32_t cur_request_size = GetRequestSize();
[email protected]1b1e9eff2014-05-20 01:56:40282 // If size is less than default request, try to expand download range for
283 // more optimal download.
284 if (size < cur_request_size && partial_document_) {
285 // First, try to expand block towards the end of the file.
thestig945cd0cb2015-05-28 01:58:05286 uint32_t new_pos = pos;
287 uint32_t new_size = cur_request_size;
[email protected]1b1e9eff2014-05-20 01:56:40288 if (pos + new_size > document_size_)
289 new_size = document_size_ - pos;
290
291 std::vector<std::pair<size_t, size_t> > ranges;
292 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
293 new_pos = ranges[0].first;
294 new_size = ranges[0].second;
295 }
296
297 // Second, try to expand block towards the beginning of the file.
298 if (new_size < cur_request_size) {
thestig945cd0cb2015-05-28 01:58:05299 uint32_t block_end = new_pos + new_size;
[email protected]1b1e9eff2014-05-20 01:56:40300 if (block_end > cur_request_size) {
301 new_pos = block_end - cur_request_size;
302 } else {
303 new_pos = 0;
304 }
305 new_size = block_end - new_pos;
306
307 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
308 new_pos = ranges.back().first;
309 new_size = ranges.back().second;
310 }
311 }
312 pos = new_pos;
313 size = new_size;
314 }
315
316 size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
317 size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
318 if (pos - last_byte_before < cur_request_size) {
319 size = pos + size - last_byte_before;
320 pos = last_byte_before;
321 }
322
323 if ((pos + size < first_byte_after) &&
324 (pos + size + cur_request_size >= first_byte_after))
325 size = first_byte_after - pos;
326
327 request_pending_ = true;
328
329 // Start downloading first pending request.
330 loader_.Close();
331 loader_ = client_->CreateURLLoader();
332 pp::CompletionCallback callback =
333 loader_factory_.NewCallback(&DocumentLoader::DidOpen);
334 pp::URLRequestInfo request = GetRequest(pos, size);
335 requests_count_++;
336 int rv = loader_.Open(request, callback);
337 if (rv != PP_OK_COMPLETIONPENDING)
338 callback.Run(rv);
339}
340
thestig945cd0cb2015-05-28 01:58:05341pp::URLRequestInfo DocumentLoader::GetRequest(uint32_t position,
342 uint32_t size) const {
[email protected]1b1e9eff2014-05-20 01:56:40343 pp::URLRequestInfo request(client_->GetPluginInstance());
thestiga9ceb722015-04-30 02:06:09344 request.SetURL(url_);
[email protected]1b1e9eff2014-05-20 01:56:40345 request.SetMethod("GET");
346 request.SetFollowRedirects(true);
thestiga9ceb722015-04-30 02:06:09347 request.SetCustomReferrerURL(url_);
[email protected]1b1e9eff2014-05-20 01:56:40348
349 const size_t kBufSize = 100;
350 char buf[kBufSize];
351 // According to rfc2616, byte range specifies position of the first and last
352 // bytes in the requested range inclusively. Therefore we should subtract 1
353 // from the position + size, to get index of the last byte that needs to be
354 // downloaded.
355 base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
356 position + size - 1);
357 pp::Var header(buf);
358 request.SetHeaders(header);
359
360 return request;
361}
362
363void DocumentLoader::DidOpen(int32_t result) {
364 if (result != PP_OK) {
365 NOTREACHED();
366 return;
367 }
368
gene7cafb2ce62014-10-24 00:56:53369 int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
370 if (http_code >= 400 && http_code < 500) {
371 // Error accessing resource. 4xx error indicate subsequent requests
372 // will fail too.
373 // E.g. resource has been removed from the server while loading it.
374 // https://code.google.com/p/chromium/issues/detail?id=414827
375 return;
376 }
377
[email protected]1b1e9eff2014-05-20 01:56:40378 is_multipart_ = false;
379 current_chunk_size_ = 0;
380 current_chunk_read_ = 0;
381
382 pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
383 std::string headers;
384 if (headers_var.is_string())
385 headers = headers_var.AsString();
386
387 std::string boundary = GetMultiPartBoundary(headers);
thestig945cd0cb2015-05-28 01:58:05388 if (!boundary.empty()) {
[email protected]1b1e9eff2014-05-20 01:56:40389 // Leave position untouched for now, when we read the data we'll get it.
390 is_multipart_ = true;
391 multipart_boundary_ = boundary;
392 } else {
393 // Need to make sure that the server returned a byte-range, since it's
394 // possible for a server to just ignore our bye-range request and just
395 // return the entire document even if it supports byte-range requests.
396 // i.e. sniff response to
397 // http://www.act.org/compass/sample/pdf/geometry.pdf
398 current_pos_ = 0;
thestig945cd0cb2015-05-28 01:58:05399 uint32_t start_pos, end_pos;
[email protected]1b1e9eff2014-05-20 01:56:40400 if (GetByteRange(headers, &start_pos, &end_pos)) {
401 current_pos_ = start_pos;
402 if (end_pos && end_pos > start_pos)
403 current_chunk_size_ = end_pos - start_pos + 1;
404 }
405 }
406
407 ReadMore();
408}
409
[email protected]1b1e9eff2014-05-20 01:56:40410void DocumentLoader::ReadMore() {
411 pp::CompletionCallback callback =
412 loader_factory_.NewCallback(&DocumentLoader::DidRead);
413 int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
414 if (rv != PP_OK_COMPLETIONPENDING)
415 callback.Run(rv);
416}
417
418void DocumentLoader::DidRead(int32_t result) {
419 if (result > 0) {
420 char* start = buffer_;
421 size_t length = result;
422 if (is_multipart_ && result > 2) {
423 for (int i = 2; i < result; ++i) {
424 if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
425 (i >= 4 &&
426 buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
427 buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
thestig945cd0cb2015-05-28 01:58:05428 uint32_t start_pos, end_pos;
[email protected]1b1e9eff2014-05-20 01:56:40429 if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
430 current_pos_ = start_pos;
431 start += i;
432 length -= i;
433 if (end_pos && end_pos > start_pos)
434 current_chunk_size_ = end_pos - start_pos + 1;
435 }
436 break;
437 }
438 }
439
440 // Reset this flag so we don't look inside the buffer in future calls of
441 // DidRead for this response. Note that this code DOES NOT handle multi-
442 // part responses with more than one part (we don't issue them at the
443 // moment, so they shouldn't arrive).
444 is_multipart_ = false;
445 }
446
447 if (current_chunk_size_ &&
448 current_chunk_read_ + length > current_chunk_size_)
449 length = current_chunk_size_ - current_chunk_read_;
450
451 if (length) {
452 if (document_size_ > 0) {
453 chunk_stream_.WriteData(current_pos_, start, length);
454 } else {
455 // If we did not get content-length in the response, we can't
456 // preallocate buffer for the entire document. Resizing array causing
457 // memory fragmentation issues on the large files and OOM exceptions.
458 // To fix this, we collect all chunks of the file to the list and
459 // concatenate them together after request is complete.
460 chunk_buffer_.push_back(std::vector<unsigned char>());
461 chunk_buffer_.back().resize(length);
462 memcpy(&(chunk_buffer_.back()[0]), start, length);
463 }
464 current_pos_ += length;
465 current_chunk_read_ += length;
466 client_->OnNewDataAvailable();
467 }
468 ReadMore();
469 } else if (result == PP_OK) {
470 ReadComplete();
471 } else {
472 NOTREACHED();
473 }
474}
475
476void DocumentLoader::ReadComplete() {
477 if (!partial_document_) {
478 if (document_size_ == 0) {
479 // For the document with no 'content-length" specified we've collected all
480 // the chunks already. Let's allocate final document buffer and copy them
481 // over.
482 chunk_stream_.Preallocate(current_pos_);
thestig945cd0cb2015-05-28 01:58:05483 uint32_t pos = 0;
484 for (auto& chunk : chunk_buffer_) {
485 chunk_stream_.WriteData(pos, &(chunk[0]), chunk.size());
486 pos += chunk.size();
[email protected]1b1e9eff2014-05-20 01:56:40487 }
488 chunk_buffer_.clear();
489 }
490 document_size_ = current_pos_;
491 client_->OnDocumentComplete();
492 return;
493 }
494
495 request_pending_ = false;
496 pending_requests_.pop_front();
497
498 // If there are more pending request - continue downloading.
499 if (!pending_requests_.empty()) {
500 DownloadPendingRequests();
501 return;
502 }
503
504 if (IsDocumentComplete()) {
505 client_->OnDocumentComplete();
506 return;
507 }
508
509 if (header_request_)
510 client_->OnPartialDocumentLoaded();
511 else
512 client_->OnPendingRequestComplete();
513 header_request_ = false;
514
515 // The OnPendingRequestComplete could have added more requests.
516 if (!pending_requests_.empty()) {
517 DownloadPendingRequests();
518 } else {
519 // Document is not complete and we have no outstanding requests.
520 // Let's keep downloading PDF file in small chunks.
thestig945cd0cb2015-05-28 01:58:05521 uint32_t pos = chunk_stream_.GetFirstMissingByte();
[email protected]1b1e9eff2014-05-20 01:56:40522 std::vector<std::pair<size_t, size_t> > ranges;
523 chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
thestig945cd0cb2015-05-28 01:58:05524 DCHECK(!ranges.empty());
[email protected]1b1e9eff2014-05-20 01:56:40525 RequestData(ranges[0].first, ranges[0].second);
526 }
527}
528
thestig945cd0cb2015-05-28 01:58:05529uint32_t DocumentLoader::GetRequestSize() const {
[email protected]a2c2d532014-05-29 20:41:30530 // Document loading strategy:
531 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
532 // double the size (64k), and so on, until we cap max request size at 2M for
533 // 71 or more requests.
thestig945cd0cb2015-05-28 01:58:05534 uint32_t limited_count = std::min(std::max(requests_count_, 10u), 70u);
535 return 32 * 1024 * (1 << ((limited_count - 1) / 10u));
[email protected]1b1e9eff2014-05-20 01:56:40536}
537
538} // namespace chrome_pdf