Bug 696129: Be more forgiving of broken files.

The PDF spec says that old-format xrefs should start with "xref\n<start> <len>". The example file in question instead has "xref <start> <len>" on a single line, which confuses our parsing code. Update the parsing code to avoid using fz_read_line, and to work at the character level instead. Also, downgrade the error given when the first object is not free to a warning; now that we do 'just in time' repair, we are probably better able to cope.
author: Robin Watts <robin.watts@artifex.com> 2015-10-01 18:05:46 +0100
committer: Robin Watts <robin.watts@artifex.com> 2015-10-02 15:53:19 +0100
commit: 1acaaf2b40614401378aa697de47093be9f390fe (patch)
tree: 884597d2a9345e93aa55f19aba40b9d2f40f8100 /source/pdf/pdf-xref.c
parent: 1cf9a6c5092adce610f3e6b33219e09b4383249f (diff)
download: mupdf-1acaaf2b40614401378aa697de47093be9f390fe.tar.xz
1 files changed, 32 insertions, 5 deletions
diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c
index 727dcda7..7c722c9c 100644
--- a/source/pdf/pdf-xref.c
+++ b/source/pdf/pdf-xref.c
@@ -587,6 +587,31 @@ pdf_read_start_xref(fz_context *ctx, pdf_document *doc)
 	fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
 }
 
+static void
+fz_skip_space(fz_context *ctx, fz_stream *stm)
+{
+	do /* consume bytes <= 32 from stm; stop at the first byte > 32 or at EOF */
+	{
+		int c = fz_peek_byte(ctx, stm);
+		if (c == EOF || c > 32) /* EOF is negative: without this test the loop never exits at end of stream */
+			return;
+		(void)fz_read_byte(ctx, stm); /* discard the byte we just peeked */
+	}
+	while (1);
+}
+
+static int fz_skip_string(fz_context *ctx, fz_stream *stm, const char *str)
+{ /* returns 0 once every char of str has been matched and consumed, 1 otherwise */
+	for (; *str != 0; str++)
+	{
+		int next = fz_peek_byte(ctx, stm);
+		if (next == EOF || next != *str)
+			return 1; /* mismatching byte is left unread; earlier matches stay consumed */
+		(void)fz_read_byte(ctx, stm);
+	}
+	return 0;
+}
+
 /*
  * trailer dictionary
  */
@@ -608,9 +633,10 @@ pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *b
 	/* Record the current file read offset so that we can reinstate it */
 	ofs = fz_tell(ctx, doc->file);
 
-	fz_read_line(ctx, doc->file, buf->scratch, buf->size);
-	if (strncmp(buf->scratch, "xref", 4) != 0)
+	fz_skip_space(ctx, doc->file);
+	if (fz_skip_string(ctx, doc->file, "xref"))
 		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
+	fz_skip_space(ctx, doc->file);
 
 	while (1)
 	{
@@ -752,9 +778,10 @@ pdf_read_old_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
 	int xref_len = pdf_xref_size_from_old_trailer(ctx, doc, buf);
 	pdf_xref_entry *table;
 
-	fz_read_line(ctx, file, buf->scratch, buf->size);
-	if (strncmp(buf->scratch, "xref", 4) != 0)
+	fz_skip_space(ctx, doc->file);
+	if (fz_skip_string(ctx, doc->file, "xref"))
 		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
+	fz_skip_space(ctx, doc->file);
 
 	while (1)
 	{
@@ -1151,7 +1178,7 @@ pdf_load_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
 	}
 	/* broken pdfs where first object is not free */
 	else if (entry->type != 'f')
-		fz_throw(ctx, FZ_ERROR_GENERIC, "first object in xref is not free");
+		fz_warn(ctx, "first object in xref is not free");
 
 	/* broken pdfs where object offsets are out of range */
 	xref_len = pdf_xref_len(ctx, doc);
author	Robin Watts <robin.watts@artifex.com>	2015-10-01 18:05:46 +0100
committer	Robin Watts <robin.watts@artifex.com>	2015-10-02 15:53:19 +0100
commit	1acaaf2b40614401378aa697de47093be9f390fe (patch)
tree	884597d2a9345e93aa55f19aba40b9d2f40f8100 /source/pdf/pdf-xref.c
parent	1cf9a6c5092adce610f3e6b33219e09b4383249f (diff)
download	mupdf-1acaaf2b40614401378aa697de47093be9f390fe.tar.xz