From 65a18b200f998f9699b905414cf054caefef3fb6 Mon Sep 17 00:00:00 2001 From: Sylvain Bruyere Date: Fri, 31 May 2024 08:16:56 +0200 Subject: [PATCH] Improve TryReadStream with simplification & fix of Stream Invalid Length cutting off Streams (#838) * Improve TryReadStream with simplification & fix of Stream Invalid Length cutting off Streams - Fix of Stream invalid Length issue causing stream data to be cut off: fix https://github.com/UglyToad/PdfPig/issues/809 - Improve Stream Token read performance by: - simplifying TryReadStream(), avoiding use of MemoryStream, with the benefit of the already existing Memory Span of "inputBytes" - removing the unnecessary List<> * Add Stream with Invalid Length unit test * Use of Memory<> instead of direct Span to avoid byte array allocation .ToArray. Suggestion from (https://github.com/UglyToad/PdfPig/pull/838/files/4153e4a1b421aee6158799175ced081c9f533a13#r1619509165) --- .../Scanner/PdfTokenScannerTests.cs | 38 ++++ .../Tokenization/Scanner/PdfTokenScanner.cs | 190 ++++++++---------- 2 files changed, 123 insertions(+), 105 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs index 3ff8681f7..7fb9d8db5 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs @@ -256,6 +256,44 @@ 353 0 obj Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]); } + [Fact] + public void ReadsStreamObjectWithInvalidLength() + { + string invalidLengthStream = "ABCD" + new string('e', 3996); + + string s = $@" +352 0 obj +<< /S 1273 /Filter /FlateDecode /Length 353 0 R >> +stream +{invalidLengthStream} +endstream +endobj +353 0 obj +1479 +endobj"; + + var locationProvider = new TestObjectLocationProvider(); + // Mark location of "353 0 obj" + locationProvider.Offsets[new IndirectReference(353, 0)] = 1643; + + var scanner = 
GetScanner(s, locationProvider); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + var stream = Assert.IsType(tokens[0].Data); + + var data = stream.Data.ToArray(); + + var str = Encoding.UTF8.GetString(data); + + Assert.Equal(data.Length, invalidLengthStream.Length); + Assert.StartsWith("ABCDeeeee", str); + + Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]); + } + [Fact] public void ReadsSimpleStreamObject() { diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 56b8608e5..af8db9451 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -7,6 +7,7 @@ using System.Globalization; using System.IO; using System.Linq; + using System.Text; using System.Text.RegularExpressions; using Core; using Encryption; @@ -320,7 +321,7 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull int endStreamPosition = 0; int commonPartPosition = 0; - const string commonPart = "end"; + const string endWordPart = "end"; const string streamPart = "stream"; const string objPart = "obj"; @@ -330,150 +331,129 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull return true; } - // Track any 'endobj' or 'endstream' operators we see. - var observedEndLocations = new List(); + long streamDataStart = inputBytes.CurrentOffset; - // Begin reading the stream. - using (var memoryStream = new MemoryStream()) - using (var binaryWrite = new BinaryWriter(memoryStream)) + PossibleStreamEndLocation? possibleEndLocation = null; + + + while (inputBytes.MoveNext()) { - while (inputBytes.MoveNext()) + if (length.HasValue && read == length) { - if (length.HasValue && read == length) - { - // TODO: read ahead and check we're at the end... - // break; - } + // TODO: read ahead and check we're at the end... 
+ // break; + } - // We are reading 'end' (possibly). - if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition]) - { - commonPartPosition++; - } - else if (commonPartPosition == commonPart.Length) + // We are reading 'end' (possibly). + if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition]) + { + commonPartPosition++; + } + else if (commonPartPosition == endWordPart.Length) + { + // We are reading 'stream' after 'end' + if (inputBytes.CurrentByte == streamPart[endStreamPosition]) { - // We are reading 'stream' after 'end' - if (inputBytes.CurrentByte == streamPart[endStreamPosition]) - { - endObjPosition = 0; - endStreamPosition++; - - // We've finished reading 'endstream', add it to the end tokens we've seen. - if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))) - { - var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream); + endObjPosition = 0; + endStreamPosition++; - observedEndLocations.Add(token); + // We've finished reading 'endstream', add it to the end tokens we've seen. 
+ if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))) + { + var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream); - if (length.HasValue && read > length) - { - break; - } + possibleEndLocation = token; - endStreamPosition = 0; + if (length.HasValue && read > length) + { + break; } - } - else if (inputBytes.CurrentByte == objPart[endObjPosition]) - { - // We are reading 'obj' after 'end' endStreamPosition = 0; - endObjPosition++; + } + } + else if (inputBytes.CurrentByte == objPart[endObjPosition]) + { + // We are reading 'obj' after 'end' + + endStreamPosition = 0; + endObjPosition++; - // We have finished reading 'endobj'. - if (endObjPosition == objPart.Length) + // We have finished reading 'endobj'. + if (endObjPosition == objPart.Length) + { + // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now. + if (possibleEndLocation != null) { - // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now. - if (observedEndLocations.Count > 0) - { - var lastEndToken = observedEndLocations[observedEndLocations.Count - 1]; + var lastEndToken = possibleEndLocation.Value; - inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1); + inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1); - break; - } + break; + } - var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject); - observedEndLocations.Add(token); + var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject); - if (read > length) - { - break; - } - } - } - else - { - // We were reading 'end' but then we had a character mismatch. - // Reset all the counters. 
+ possibleEndLocation = token; - endStreamPosition = 0; - endObjPosition = 0; - commonPartPosition = 0; + if (read > length) + { + break; + } } } else { - // For safety reset every counter in case we had a partial read. + // We were reading 'end' but then we had a character mismatch. + // Reset all the counters. endStreamPosition = 0; endObjPosition = 0; - commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0; + commonPartPosition = 0; } - - binaryWrite.Write(inputBytes.CurrentByte); - - read++; } - - binaryWrite.Flush(); - - if (observedEndLocations.Count == 0) + else { - return false; + // For safety reset every counter in case we had a partial read. + + endStreamPosition = 0; + endObjPosition = 0; + commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0; } - memoryStream.Seek(0, SeekOrigin.Begin); - if (length.HasValue && memoryStream.Length >= length) - { - // Use the declared length to copy just the data we want. - byte[] data = new byte[length.Value]; + read++; + } - memoryStream.Read(data, 0, (int)length.Value); + long streamDataEnd = inputBytes.CurrentOffset + 1; - stream = new StreamToken(streamDictionaryToken, data); - } - else - { - // Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that. 
- var lastEnd = observedEndLocations[observedEndLocations.Count - 1]; + if (possibleEndLocation == null) + return false; - var dataLength = lastEnd.Offset - startDataOffset; + var lastEnd = possibleEndLocation; - var current = inputBytes.CurrentOffset; + var dataLength = lastEnd.Value.Offset - startDataOffset; - // 3 characters, 'e', '\n' and possibly '\r' - inputBytes.Seek(lastEnd.Offset - 3); - inputBytes.MoveNext(); + // 3 characters, 'e', '\n' and possibly '\r' + inputBytes.Seek(lastEnd.Value.Offset - 3); + inputBytes.MoveNext(); - if (inputBytes.CurrentByte == '\r') - { - dataLength -= 3; - } - else - { - dataLength -= 2; - } + if (inputBytes.CurrentByte == '\r') + { + dataLength -= 3; + } + else + { + dataLength -= 2; + } - inputBytes.Seek(current); + Memory data = new byte[dataLength]; - byte[] data = new byte[dataLength]; + inputBytes.Seek(streamDataStart); + inputBytes.Read(data.Span); - memoryStream.Read(data, 0, (int)dataLength); + inputBytes.Seek(streamDataEnd); - stream = new StreamToken(streamDictionaryToken, data); - } - } + stream = new StreamToken(streamDictionaryToken, data); return true; }