Clean up null/range/endstream filter.

Use separate functions to keep the code simpler. Use memmem to simplify and optimize the search for the 'endstream' token. Do not look for 'endobj', since that could cause false positives in compressed object streams that have duff lengths.
author: Tor Andersson <tor.andersson@artifex.com> 2018-07-10 12:58:57 +0200
committer: Tor Andersson <tor.andersson@artifex.com> 2018-08-10 12:09:33 +0200
commit: 09f2e173850e011e6390c49a4f761e87dd87ffba (patch)
tree: 6026165eb2c859792e1c9c3578d470a4e6837f72
parent: 62876a7025e31897e7ccb92ff8d461d3fef6ddb4 (diff)
download: mupdf-09f2e173850e011e6390c49a4f761e87dd87ffba.tar.xz
7 files changed, 207 insertions, 127 deletions
diff --git a/include/mupdf/fitz/filter.h b/include/mupdf/fitz/filter.h
index bdce36e8..5dd136a1 100644
--- a/include/mupdf/fitz/filter.h
+++ b/include/mupdf/fitz/filter.h
@@ -12,12 +12,12 @@ typedef struct fz_jbig2_globals_s fz_jbig2_globals;
 typedef struct
 {
 	int64_t offset;
-	int len;
+	size_t length;
 } fz_range;
 
-fz_stream *fz_open_null_n(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges);
-fz_stream *fz_open_null(fz_context *ctx, fz_stream *chain, int len, int64_t offset);
-fz_stream *fz_open_pdf_stream(fz_context *ctx, fz_stream *chain, int len, int64_t offset);
+fz_stream *fz_open_null_filter(fz_context *ctx, fz_stream *chain, int len, int64_t offset);
+fz_stream *fz_open_range_filter(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges);
+fz_stream *fz_open_endstream_filter(fz_context *ctx, fz_stream *chain, int len, int64_t offset);
 fz_stream *fz_open_concat(fz_context *ctx, int max, int pad);
 void fz_concat_push_drop(fz_context *ctx, fz_stream *concat, fz_stream *chain); /* Ownership of chain is passed in */
 fz_stream *fz_open_arc4(fz_context *ctx, fz_stream *chain, unsigned char *key, unsigned keylen);
diff --git a/source/fitz/filter-basic.c b/source/fitz/filter-basic.c
index 6928ffff..0713a62e 100644
--- a/source/fitz/filter-basic.c
+++ b/source/fitz/filter-basic.c
@@ -2,39 +2,90 @@
 
 #include <string.h>
 
-/* Null filter copies a specified amount of data */
+/* The null filter reads a specified amount of data from the substream. */
 
 struct null_filter
 {
 	fz_stream *chain;
+	size_t remain;
+	int64_t offset;
+	unsigned char buffer[4096];
+};
+
+static int
+next_null(fz_context *ctx, fz_stream *stm, size_t max)
+{
+	struct null_filter *state = stm->state;
+	size_t n;
+
+	if (state->remain == 0)
+		return EOF;
+
+	fz_seek(ctx, state->chain, state->offset, 0);
+	n = fz_available(ctx, state->chain, max);
+	if (n == 0)
+		return EOF;
+	if (n > state->remain)
+		n = state->remain;
+	if (n > sizeof(state->buffer))
+		n = sizeof(state->buffer);
+
+	memcpy(state->buffer, state->chain->rp, n);
+	stm->rp = state->buffer;
+	stm->wp = stm->rp + n;
+	state->chain->rp += n;
+	state->remain -= n;
+	state->offset += n;
+	stm->pos += n;
+	return *stm->rp++;
+}
+
+static void
+close_null(fz_context *ctx, void *state_)
+{
+	struct null_filter *state = (struct null_filter *)state_;
+	fz_drop_stream(ctx, state->chain);
+	fz_free(ctx, state);
+}
+
+fz_stream *
+fz_open_null_filter(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
+{
+	struct null_filter *state = fz_malloc_struct(ctx, struct null_filter);
+	state->chain = fz_keep_stream(ctx, chain);
+	state->remain = len;
+	state->offset = offset;
+	return fz_new_stream(ctx, state, next_null, close_null);
+}
+
+/* The range filter copies data from specified ranges of the chained stream */
+
+struct range_filter
+{
+	fz_stream *chain;
 	fz_range *ranges;
-	int look_for_endstream;
 	int nranges;
 	int next_range;
 	size_t remain;
-	unsigned int extras;
-	unsigned int size;
 	int64_t offset;
 	unsigned char buffer[4096];
 };
 
 static int
-next_null(fz_context *ctx, fz_stream *stm, size_t max)
+next_range(fz_context *ctx, fz_stream *stm, size_t max)
 {
-	struct null_filter *state = stm->state;
-	size_t n, i, nbytes_in_buffer;
-	const char *rp;
-	unsigned int size;
+	struct range_filter *state = stm->state;
+	size_t n;
 
 	while (state->remain == 0 && state->next_range < state->nranges)
 	{
 		fz_range *range = &state->ranges[state->next_range++];
-		state->remain = range->len;
+		state->remain = range->length;
 		state->offset = range->offset;
 	}
 
 	if (state->remain == 0)
-		goto maybe_ended;
+		return EOF;
 	fz_seek(ctx, state->chain, state->offset, 0);
 	n = fz_available(ctx, state->chain, max);
 	if (n > state->remain)
@@ -45,21 +96,107 @@ next_null(fz_context *ctx, fz_stream *stm, size_t max)
 	stm->rp = state->buffer;
 	stm->wp = stm->rp + n;
 	if (n == 0)
-		goto maybe_ended;
+		return EOF;
 	state->chain->rp += n;
 	state->remain -= n;
-	state->offset += (int64_t)n;
-	stm->pos += (int64_t)n;
+	state->offset += n;
+	stm->pos += n;
 	return *stm->rp++;
+}
 
-maybe_ended:
-	if (state->look_for_endstream == 0)
+static void
+close_range(fz_context *ctx, void *state_)
+{
+	struct range_filter *state = (struct range_filter *)state_;
+	fz_drop_stream(ctx, state->chain);
+	fz_free(ctx, state->ranges);
+	fz_free(ctx, state);
+}
+
+fz_stream *
+fz_open_range_filter(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges)
+{
+	struct range_filter *state = NULL;
+
+	state = fz_malloc_struct(ctx, struct range_filter);
+	fz_try(ctx)
+	{
+		if (nranges > 0)
+		{
+			state->ranges = fz_calloc(ctx, nranges, sizeof(*ranges));
+			memcpy(state->ranges, ranges, nranges * sizeof(*ranges));
+			state->nranges = nranges;
+			state->next_range = 1;
+			state->remain = ranges[0].length;
+			state->offset = ranges[0].offset;
+		}
+		else
+		{
+			state->ranges = NULL;
+			state->nranges = 0;
+			state->next_range = 1;
+			state->remain = 0;
+			state->offset = 0;
+		}
+		state->chain = fz_keep_stream(ctx, chain);
+	}
+	fz_catch(ctx)
+	{
+		fz_free(ctx, state->ranges);
+		fz_free(ctx, state);
+		fz_rethrow(ctx);
+	}
+
+	return fz_new_stream(ctx, state, next_range, close_range);
+}
+
+/*
+ * The endstream filter reads a PDF substream, and starts to look for an 'endstream' token
+ * after the specified length.
+ */
+
+#define END_CHECK_SIZE 32
+
+struct endstream_filter
+{
+	fz_stream *chain;
+	size_t remain, extras, size;
+	int64_t offset;
+	int warned;
+	unsigned char buffer[4096];
+};
+
+static int
+next_endstream(fz_context *ctx, fz_stream *stm, size_t max)
+{
+	struct endstream_filter *state = stm->state;
+	size_t n, nbytes_in_buffer, size;
+	unsigned char *rp;
+
+	if (state->remain == 0)
+		goto look_for_endstream;
+
+	fz_seek(ctx, state->chain, state->offset, 0);
+	n = fz_available(ctx, state->chain, max);
+	if (n == 0)
 		return EOF;
+	if (n > state->remain)
+		n = state->remain;
+	if (n > sizeof(state->buffer))
+		n = sizeof(state->buffer);
+	memcpy(state->buffer, state->chain->rp, n);
+	stm->rp = state->buffer;
+	stm->wp = stm->rp + n;
+	state->chain->rp += n;
+	state->remain -= n;
+	state->offset += n;
+	stm->pos += n;
+	return *stm->rp++;
 
+look_for_endstream:
 	/* We should distrust the stream length, and check for end
 	 * marker before terminating the stream - this is to cope
 	 * with files with duff "Length" values. */
-	fz_seek(ctx, state->chain, state->offset, 0);
 
 	/* Move any data left over in our buffer down to the start.
 	 * Ordinarily, there won't be any, but this allows for the
@@ -75,11 +212,13 @@ maybe_ended:
 	 * should only need (say) 32 bytes to be sure. For crap files
 	 * where we overread regularly, don't harm performance by
 	 * working in small chunks. */
-	state->size *= 2;
-	if (state->size > sizeof(state->buffer))
-		state->size = sizeof(state->buffer);
-#define END_CHECK_SIZE 32
-	size = state->size;
+	size = state->size * 2;
+	if (size > sizeof(state->buffer))
+		size = sizeof(state->buffer);
+	state->size = size;
+
+	/* Read enough data into our buffer to start looking for the 'endstream' token. */
+	fz_seek(ctx, state->chain, state->offset, 0);
 	while (nbytes_in_buffer < size)
 	{
 		n = fz_available(ctx, state->chain, size - nbytes_in_buffer);
@@ -91,123 +230,64 @@ maybe_ended:
 		stm->wp += n;
 		state->chain->rp += n;
 		nbytes_in_buffer += n;
+		state->offset += n;
 	}
 
-	*stm->wp = 0; /* Be friendly to strcmp */
-	rp = (char *)state->buffer;
-	n = 0;
-	/* If we don't have at least 11 bytes in the buffer, then we don't have
-	 * enough bytes for the worst case terminator. Also, we're dangerously
-	 * close to the end of the file. Don't risk overrunning the buffer. */
-	if (nbytes_in_buffer >= 11)
-		for (i = 0; i < nbytes_in_buffer - 11; )
-		{
-			n = i;
-			if (rp[i] == '\r')
-				i++;
-			if (rp[i] == '\n')
-				i++;
-			if (rp[i++] != 'e')
-				continue;
-			if (rp[i++] != 'n')
-				continue;
-			if (rp[i++] != 'd')
-				continue;
-			if (memcmp(&rp[i], "stream", 6) == 0 || (memcmp(&rp[i], "obj", 3) == 0))
-				break;
-			i++;
-		}
+	/* Look for the 'endstream' token. */
+	rp = fz_memmem(state->buffer, nbytes_in_buffer, "endstream", 9);
+	if (rp)
+	{
+		/* Include newline (CR|LF|CRLF) before 'endstream' token */
+		if (rp > state->buffer && rp[-1] == '\n') --rp;
+		if (rp > state->buffer && rp[-1] == '\r') --rp;
+		n = rp - state->buffer;
+		stm->eof = 1; /* We're done, don't call us again! */
+	}
+	else if (nbytes_in_buffer > 11) /* 11 covers enough data to detect "\r?\n?endstream" */
+		n = nbytes_in_buffer - 11; /* no endstream, but there is more data */
+	else
+		n = nbytes_in_buffer; /* no endstream, but at the end of the file */
 
 	/* We have at least n bytes before we hit an end marker */
-	state->offset += (int64_t)nbytes_in_buffer - state->extras;
 	state->extras = nbytes_in_buffer - n;
 	stm->wp = stm->rp + n;
 	stm->pos += n;
 
 	if (n == 0)
 		return EOF;
+
+	if (!state->warned)
+	{
+		state->warned = 1;
+		fz_warn(ctx, "PDF stream Length incorrect");
+	}
 	return *stm->rp++;
 }
 
 static void
-close_null(fz_context *ctx, void *state_)
+close_endstream(fz_context *ctx, void *state_)
 {
-	struct null_filter *state = (struct null_filter *)state_;
+	struct endstream_filter *state = (struct endstream_filter *)state_;
 	fz_drop_stream(ctx, state->chain);
-	fz_free(ctx, state->ranges);
 	fz_free(ctx, state);
 }
 
-static fz_stream *
-fz_open_null_n_terminator(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges, int terminator)
-{
-	struct null_filter *state = NULL;
-
-	state = fz_malloc_struct(ctx, struct null_filter);
-	fz_try(ctx)
-	{
-		if (nranges > 0)
-		{
-			state->ranges = fz_calloc(ctx, nranges, sizeof(*ranges));
-			memcpy(state->ranges, ranges, nranges * sizeof(*ranges));
-			state->look_for_endstream = terminator;
-			state->nranges = nranges;
-			state->next_range = 1;
-			state->remain = ranges[0].len;
-			state->offset = ranges[0].offset;
-			state->extras = 0;
-			state->size = END_CHECK_SIZE>>1;
-		}
-		else
-		{
-			state->ranges = NULL;
-			state->nranges = 0;
-			state->next_range = 1;
-			state->remain = 0;
-			state->offset = 0;
-		}
-		state->chain = fz_keep_stream(ctx, chain);
-	}
-	fz_catch(ctx)
-	{
-		fz_free(ctx, state->ranges);
-		fz_free(ctx, state);
-		fz_rethrow(ctx);
-	}
-
-	return fz_new_stream(ctx, state, next_null, close_null);
-}
-
-fz_stream *
-fz_open_null_n(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges)
-{
-	return fz_open_null_n_terminator(ctx, chain, ranges, nranges, 0);
-}
-
 fz_stream *
-fz_open_null(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
+fz_open_endstream_filter(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
 {
-	fz_range range;
+	struct endstream_filter *state;
 
 	if (len < 0)
 		len = 0;
 
-	range.offset = offset;
-	range.len = len;
-	return fz_open_null_n_terminator(ctx, chain, &range, 1, 0);
-}
-
-fz_stream *
-fz_open_pdf_stream(fz_context *ctx, fz_stream *chain, int len, int64_t offset)
-{
-	fz_range range;
-
-	if (len < 0)
-		len = 0;
+	state = fz_malloc_struct(ctx, struct endstream_filter);
+	state->chain = fz_keep_stream(ctx, chain);
+	state->remain = len;
+	state->offset = offset;
+	state->extras = 0;
+	state->size = END_CHECK_SIZE >> 1; /* size is doubled first thing when used */
 
-	range.offset = offset;
-	range.len = len;
-	return fz_open_null_n_terminator(ctx, chain, &range, 1, 1);
+	return fz_new_stream(ctx, state, next_endstream, close_endstream);
 }
 
 /* Concat filter concatenates several streams into one */
@@ -238,7 +318,7 @@ next_concat(fz_context *ctx, fz_stream *stm, size_t max)
 		{
 			stm->rp = state->chain[state->current]->rp;
 			stm->wp = state->chain[state->current]->wp;
-			stm->pos += (int64_t)n;
+			stm->pos += n;
 			return *stm->rp++;
 		}
 		else
@@ -675,7 +755,7 @@ next_arc4(fz_context *ctx, fz_stream *stm, size_t max)
 	stm->wp = state->buffer + n;
 	fz_arc4_encrypt(&state->arc4, stm->rp, state->chain->rp, n);
 	state->chain->rp += n;
-	stm->pos += (int64_t)n;
+	stm->pos += n;
 
 	return *stm->rp++;
 }
diff --git a/source/fitz/untar.c b/source/fitz/untar.c
index 1588a8ac..9aac222b 100644
--- a/source/fitz/untar.c
+++ b/source/fitz/untar.c
@@ -118,7 +118,7 @@ static fz_stream *open_tar_entry(fz_context *ctx, fz_archive *arch, const char *
 		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find named tar archive entry");
 
 	fz_seek(ctx, file, ent->offset + 512, 0);
-	return fz_open_null(ctx, file, ent->size, fz_tell(ctx, file));
+	return fz_open_null_filter(ctx, file, ent->size, fz_tell(ctx, file));
 }
 
 static fz_buffer *read_tar_entry(fz_context *ctx, fz_archive *arch, const char *name)
diff --git a/source/fitz/unzip.c b/source/fitz/unzip.c
index 4eb90dda..dfe4fb6b 100644
--- a/source/fitz/unzip.c
+++ b/source/fitz/unzip.c
@@ -286,7 +286,7 @@ static fz_stream *open_zip_entry(fz_context *ctx, fz_archive *arch, const char *
 
 	method = read_zip_entry_header(ctx, zip, ent);
 	if (method == 0)
-		return fz_open_null(ctx, file, ent->usize, fz_tell(ctx, file));
+		return fz_open_null_filter(ctx, file, ent->usize, fz_tell(ctx, file));
 	if (method == 8)
 		return fz_open_flated(ctx, file, -15);
 	fz_throw(ctx, FZ_ERROR_GENERIC, "unknown zip method: %d", method);
diff --git a/source/pdf/pdf-form.c b/source/pdf/pdf-form.c
index d6969ce7..a838b4b8 100644
--- a/source/pdf/pdf-form.c
+++ b/source/pdf/pdf-form.c
@@ -1245,7 +1245,7 @@ int pdf_signature_widget_byte_range(fz_context *ctx, pdf_document *doc, pdf_widg
 		for (i = 0; i < n; i++)
 		{
 			byte_range[i].offset = pdf_array_get_int(ctx, br, 2*i);
-			byte_range[i].len = pdf_array_get_int(ctx, br, 2*i+1);
+			byte_range[i].length = pdf_array_get_int(ctx, br, 2*i+1);
 		}
 	}
 
@@ -1268,7 +1268,7 @@ fz_stream *pdf_signature_widget_hash_bytes(fz_context *ctx, pdf_document *doc, p
 			pdf_signature_widget_byte_range(ctx, doc, widget, byte_range);
 		}
 
-		bytes = fz_open_null_n(ctx, doc->file, byte_range, byte_range_len);
+		bytes = fz_open_range_filter(ctx, doc->file, byte_range, byte_range_len);
 	}
 	fz_always(ctx)
 	{
diff --git a/source/pdf/pdf-signature.c b/source/pdf/pdf-signature.c
index 4d62e7f1..cb7a807b 100644
--- a/source/pdf/pdf-signature.c
+++ b/source/pdf/pdf-signature.c
@@ -29,11 +29,11 @@ void pdf_write_digest(fz_context *ctx, fz_output *out, pdf_obj *byte_range, int
 		for (i = 0; i < brange_len; i++)
 		{
 			brange[i].offset = pdf_array_get_int(ctx, byte_range, 2*i);
-			brange[i].len = pdf_array_get_int(ctx, byte_range, 2*i+1);
+			brange[i].length = pdf_array_get_int(ctx, byte_range, 2*i+1);
 		}
 
 		stm = fz_stream_from_output(ctx, out);
-		in = fz_open_null_n(ctx, stm, brange, brange_len);
+		in = fz_open_range_filter(ctx, stm, brange, brange_len);
 
 		digest_len = (hexdigest_length - 2) / 2;
 		digest = fz_malloc(ctx, digest_len);
diff --git a/source/pdf/pdf-stream.c b/source/pdf/pdf-stream.c
index 796e2490..54d3d561 100644
--- a/source/pdf/pdf-stream.c
+++ b/source/pdf/pdf-stream.c
@@ -298,7 +298,7 @@ pdf_open_raw_filter(fz_context *ctx, fz_stream *file_stm, pdf_document *doc, pdf
 
 	hascrypt = pdf_stream_has_crypt(ctx, stmobj);
 	len = pdf_dict_get_int(ctx, stmobj, PDF_NAME(Length));
-	null_stm = fz_open_pdf_stream(ctx, file_stm, len, offset);
+	null_stm = fz_open_endstream_filter(ctx, file_stm, len, offset);
 	if (doc->crypt && !hascrypt)
 	{
 		fz_try(ctx)
@@ -359,7 +359,7 @@ pdf_open_inline_stream(fz_context *ctx, pdf_document *doc, pdf_obj *stmobj, int
 
 	if (imparams)
 		imparams->type = FZ_IMAGE_RAW;
-	return fz_open_null(ctx, file_stm, length, fz_tell(ctx, file_stm));
+	return fz_open_null_filter(ctx, file_stm, length, fz_tell(ctx, file_stm));
 }
 
 void
author	Tor Andersson <tor.andersson@artifex.com>	2018-07-10 12:58:57 +0200
committer	Tor Andersson <tor.andersson@artifex.com>	2018-08-10 12:09:33 +0200
commit	09f2e173850e011e6390c49a4f761e87dd87ffba (patch)
tree	6026165eb2c859792e1c9c3578d470a4e6837f72
parent	62876a7025e31897e7ccb92ff8d461d3fef6ddb4 (diff)
download	mupdf-09f2e173850e011e6390c49a4f761e87dd87ffba.tar.xz