Bug 699308: Fix stream reading logic to better cope with incorrect Length values.

Always look for the "endstream" marker after a PDF stream to confirm that we have really reached its end, rather than trusting the declared Length. Also accept "endobj" as a terminator, to cope with producers that omit "endstream" entirely. To avoid slowing down well-formed files, only check for the end marker once the number of bytes specified by Length has been read.
author: Robin Watts <Robin.Watts@artifex.com> 2018-05-31 15:59:53 +0100
committer: Robin Watts <robin.watts@artifex.com> 2018-07-06 18:00:03 +0100
commit: 75c457ddd28a629a9e1d6e1a8fa313ffef1457cb (patch)
tree: 96099a85afbfcaa5af457aa230dfcb89fbd99825 /source
parent: 533684eb51bb72df871a63eb17e589e2ec9bb547 (diff)
download: mupdf-75c457ddd28a629a9e1d6e1a8fa313ffef1457cb.tar.xz
2 files changed, 109 insertions, 7 deletions
diff --git a/source/fitz/filter-basic.c b/source/fitz/filter-basic.c
index 5e03a176..6928ffff 100644
--- a/source/fitz/filter-basic.c
+++ b/source/fitz/filter-basic.c
@@ -8,9 +8,12 @@ struct null_filter
 {
 	fz_stream *chain;
 	fz_range *ranges;
+	int look_for_endstream;
 	int nranges;
 	int next_range;
 	size_t remain;
+	unsigned int extras;
+	unsigned int size;
 	int64_t offset;
 	unsigned char buffer[4096];
 };
@@ -19,7 +22,9 @@ static int
 next_null(fz_context *ctx, fz_stream *stm, size_t max)
 {
 	struct null_filter *state = stm->state;
-	size_t n;
+	size_t n, i, nbytes_in_buffer;
+	const char *rp;
+	unsigned int size;
 
 	while (state->remain == 0 && state->next_range < state->nranges)
 	{
@@ -29,7 +34,7 @@ next_null(fz_context *ctx, fz_stream *stm, size_t max)
 	}
 
 	if (state->remain == 0)
-		return EOF;
+		goto maybe_ended;
 	fz_seek(ctx, state->chain, state->offset, 0);
 	n = fz_available(ctx, state->chain, max);
 	if (n > state->remain)
@@ -40,12 +45,88 @@ next_null(fz_context *ctx, fz_stream *stm, size_t max)
 	stm->rp = state->buffer;
 	stm->wp = stm->rp + n;
 	if (n == 0)
-		return EOF;
+		goto maybe_ended;
 	state->chain->rp += n;
 	state->remain -= n;
 	state->offset += (int64_t)n;
 	stm->pos += (int64_t)n;
 	return *stm->rp++;
+
+maybe_ended:
+	if (state->look_for_endstream == 0)
+		return EOF;
+
+	/* We should distrust the stream length, and check for end
+	 * marker before terminating the stream - this is to cope
+	 * with files with duff "Length" values. */
+	fz_seek(ctx, state->chain, state->offset, 0);
+
+	/* Move any data left over in our buffer down to the start.
+	 * Ordinarily, there won't be any, but this allows for the
+	 * case where we were part way through matching a stream end
+	 * marker when the buffer filled before. */
+	nbytes_in_buffer = state->extras;
+	if (nbytes_in_buffer)
+		memmove(state->buffer, stm->rp, nbytes_in_buffer);
+	stm->rp = state->buffer;
+	stm->wp = stm->rp + nbytes_in_buffer;
+
+	/* In most sane files, we'll get "\nendstream" instantly. We
+	 * should only need (say) 32 bytes to be sure. For crap files
+	 * where we overread regularly, don't harm performance by
+	 * working in small chunks. */
+	state->size *= 2;
+	if (state->size > sizeof(state->buffer))
+		state->size = sizeof(state->buffer);
+#define END_CHECK_SIZE 32
+	size = state->size;
+	while (nbytes_in_buffer < size)
+	{
+		n = fz_available(ctx, state->chain, size - nbytes_in_buffer);
+		if (n == 0)
+			break;
+		if (n > size - nbytes_in_buffer)
+			n = size - nbytes_in_buffer;
+		memcpy(stm->wp, state->chain->rp, n);
+		stm->wp += n;
+		state->chain->rp += n;
+		nbytes_in_buffer += n;
+	}
+
+	*stm->wp = 0; /* Be friendly to strcmp */
+	rp = (char *)state->buffer;
+	n = 0;
+	/* If we don't have at least 11 bytes in the buffer, then we don't have
+	 * enough bytes for the worst case terminator. Also, we're dangerously
+	 * close to the end of the file. Don't risk overrunning the buffer. */
+	if (nbytes_in_buffer >= 11)
+		for (i = 0; i < nbytes_in_buffer - 11; )
+		{
+			n = i;
+			if (rp[i] == '\r')
+				i++;
+			if (rp[i] == '\n')
+				i++;
+			if (rp[i++] != 'e')
+				continue;
+			if (rp[i++] != 'n')
+				continue;
+			if (rp[i++] != 'd')
+				continue;
+			if (memcmp(&rp[i], "stream", 6) == 0 || (memcmp(&rp[i], "obj", 3) == 0))
+				break;
+			i++;
+		}
+
+	/* We have at least n bytes before we hit an end marker */
+	state->offset += (int64_t)nbytes_in_buffer - state->extras;
+	state->extras = nbytes_in_buffer - n;
+	stm->wp = stm->rp + n;
+	stm->pos += n;
+
+	if (n == 0)
+		return EOF;
+	return *stm->rp++;
 }
 
 static void
@@ -57,8 +138,8 @@ close_null(fz_context *ctx, void *state_)
 	fz_free(ctx, state);
 }
 
-fz_stream *
-fz_open_null_n(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges)
+static fz_stream *
+fz_open_null_n_terminator(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges, int terminator)
 {
 	struct null_filter *state = NULL;
 
@@ -69,10 +150,13 @@ fz_open_null_n(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges)
 		{
 			state->ranges = fz_calloc(ctx, nranges, sizeof(*ranges));
 			memcpy(state->ranges, ranges, nranges * sizeof(*ranges));
+			state->look_for_endstream = terminator;
 			state->nranges = nranges;
 			state->next_range = 1;
 			state->remain = ranges[0].len;
 			state->offset = ranges[0].offset;
+			state->extras = 0;
+			state->size = END_CHECK_SIZE>>1;
 		}
 		else
 		{
@@ -95,6 +179,12 @@ fz_open_null_n(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges)
 }
 
 fz_stream *
+fz_open_null_n(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges)
+{
+	return fz_open_null_n_terminator(ctx, chain, ranges, nranges, 0);
+}
+
+fz_stream *
 fz_open_null(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
 {
 	fz_range range;
@@ -104,9 +194,21 @@ fz_open_null(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
 
 	range.offset = offset;
 	range.len = len;
-	return fz_open_null_n(ctx, chain, &range, 1);
+	return fz_open_null_n_terminator(ctx, chain, &range, 1, 0);
 }
 
+fz_stream *
+fz_open_pdf_stream(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
+{
+	fz_range range;
+
+	if (len < 0)
+		len = 0;
+
+	range.offset = offset;
+	range.len = len;
+	return fz_open_null_n_terminator(ctx, chain, &range, 1, 1);
+}
 
 /* Concat filter concatenates several streams into one */
 
diff --git a/source/pdf/pdf-stream.c b/source/pdf/pdf-stream.c
index 04f1ed46..796e2490 100644
--- a/source/pdf/pdf-stream.c
+++ b/source/pdf/pdf-stream.c
@@ -298,7 +298,7 @@ pdf_open_raw_filter(fz_context *ctx, fz_stream *file_stm, pdf_document *doc, pdf
 
 	hascrypt = pdf_stream_has_crypt(ctx, stmobj);
 	len = pdf_dict_get_int(ctx, stmobj, PDF_NAME(Length));
-	null_stm = fz_open_null(ctx, file_stm, len, offset);
+	null_stm = fz_open_pdf_stream(ctx, file_stm, len, offset);
 	if (doc->crypt && !hascrypt)
 	{
 		fz_try(ctx)
author	Robin Watts <Robin.Watts@artifex.com>	2018-05-31 15:59:53 +0100
committer	Robin Watts <robin.watts@artifex.com>	2018-07-06 18:00:03 +0100
commit	75c457ddd28a629a9e1d6e1a8fa313ffef1457cb (patch)
tree	96099a85afbfcaa5af457aa230dfcb89fbd99825 /source
parent	533684eb51bb72df871a63eb17e589e2ec9bb547 (diff)
download	mupdf-75c457ddd28a629a9e1d6e1a8fa313ffef1457cb.tar.xz