Skip to content

Commit

Permalink
Improve TryReadStream with simplification & fix of Stream Invalid Len…
Browse files Browse the repository at this point in the history
…gth cutting off Streams (#838)

* Improve TryReadStream with simplification & fix of Stream Invalid Length cutting off Streams

- Fix the invalid stream Length issue that caused stream data to be cut off: fix #809

- Improve Stream Token read performance by:
  -  simplifying TryReadStream(), avoiding the use of MemoryStream by taking advantage of the already existing memory span of "inputBytes"
  - removing the unnecessary List<>

* Add Stream with Invalid Length unit test

* Use Memory<> instead of a direct Span to avoid the byte array allocation caused by .ToArray().
Suggestion from (https://github.com/UglyToad/PdfPig/pull/838/files/4153e4a1b421aee6158799175ced081c9f533a13#r1619509165)
  • Loading branch information
sbruyere authored May 31, 2024
1 parent d7e434e commit 65a18b2
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 105 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,44 @@ 353 0 obj
Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
}

[Fact]
public void ReadsStreamObjectWithInvalidLength()
{
    // Regression test: the stream dictionary points its /Length at object 353, whose value (1479)
    // is smaller than the 4000 characters actually placed between 'stream' and 'endstream'.
    // The scanner must return the complete payload instead of cutting it off.
    string invalidLengthStream = "ABCD" + new string('e', 3996);

    string s = $@"
352 0 obj
<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
stream
{invalidLengthStream}
endstream
endobj
353 0 obj
1479
endobj";

    var locationProvider = new TestObjectLocationProvider();
    // Mark location of "353 0 obj"
    // NOTE(review): the 4000-character payload precedes "353 0 obj" in this input, so its real
    // offset is greater than 1643 - confirm whether this value is intentional.
    locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;

    var scanner = GetScanner(s, locationProvider);

    var tokens = ReadToEnd(scanner);

    // Both indirect objects (352 and 353) are expected to be tokenized.
    Assert.Equal(2, tokens.Count);

    var stream = Assert.IsType<StreamToken>(tokens[0].Data);

    var data = stream.Data.ToArray();

    var str = Encoding.UTF8.GetString(data);

    // xUnit's Assert.Equal takes (expected, actual): the expected size is the full payload length.
    Assert.Equal(invalidLengthStream.Length, data.Length);
    Assert.StartsWith("ABCDeeeee", str);

    // The scanner records where object 352 starts while reading it.
    Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
}

[Fact]
public void ReadsSimpleStreamObject()
{
Expand Down
190 changes: 85 additions & 105 deletions src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Core;
using Encryption;
Expand Down Expand Up @@ -320,7 +321,7 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
int endStreamPosition = 0;
int commonPartPosition = 0;

const string commonPart = "end";
const string endWordPart = "end";
const string streamPart = "stream";
const string objPart = "obj";

Expand All @@ -330,150 +331,129 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
return true;
}

// Track any 'endobj' or 'endstream' operators we see.
var observedEndLocations = new List<PossibleStreamEndLocation>();
long streamDataStart = inputBytes.CurrentOffset;

// Begin reading the stream.
using (var memoryStream = new MemoryStream())
using (var binaryWrite = new BinaryWriter(memoryStream))
PossibleStreamEndLocation? possibleEndLocation = null;


while (inputBytes.MoveNext())
{
while (inputBytes.MoveNext())
if (length.HasValue && read == length)
{
if (length.HasValue && read == length)
{
// TODO: read ahead and check we're at the end...
// break;
}
// TODO: read ahead and check we're at the end...
// break;
}

// We are reading 'end' (possibly).
if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
{
commonPartPosition++;
}
else if (commonPartPosition == commonPart.Length)
// We are reading 'end' (possibly).
if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
{
commonPartPosition++;
}
else if (commonPartPosition == endWordPart.Length)
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
{
endObjPosition = 0;
endStreamPosition++;

// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
endObjPosition = 0;
endStreamPosition++;

observedEndLocations.Add(token);
// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);

if (length.HasValue && read > length)
{
break;
}
possibleEndLocation = token;

endStreamPosition = 0;
if (length.HasValue && read > length)
{
break;
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'

endStreamPosition = 0;
endObjPosition++;
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'

endStreamPosition = 0;
endObjPosition++;

// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (possibleEndLocation != null)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (observedEndLocations.Count > 0)
{
var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
var lastEndToken = possibleEndLocation.Value;

inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);

break;
}
break;
}

var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
observedEndLocations.Add(token);
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);

if (read > length)
{
break;
}
}
}
else
{
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.
possibleEndLocation = token;

endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = 0;
if (read > length)
{
break;
}
}
}
else
{
// For safety reset every counter in case we had a partial read.
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.

endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0;
commonPartPosition = 0;
}

binaryWrite.Write(inputBytes.CurrentByte);

read++;
}

binaryWrite.Flush();

if (observedEndLocations.Count == 0)
else
{
return false;
// For safety reset every counter in case we had a partial read.

endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
}

memoryStream.Seek(0, SeekOrigin.Begin);
if (length.HasValue && memoryStream.Length >= length)
{
// Use the declared length to copy just the data we want.
byte[] data = new byte[length.Value];
read++;
}

memoryStream.Read(data, 0, (int)length.Value);
long streamDataEnd = inputBytes.CurrentOffset + 1;

stream = new StreamToken(streamDictionaryToken, data);
}
else
{
// Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
if (possibleEndLocation == null)
return false;

var dataLength = lastEnd.Offset - startDataOffset;
var lastEnd = possibleEndLocation;

var current = inputBytes.CurrentOffset;
var dataLength = lastEnd.Value.Offset - startDataOffset;

// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Offset - 3);
inputBytes.MoveNext();
// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Value.Offset - 3);
inputBytes.MoveNext();

if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}
if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}

inputBytes.Seek(current);
Memory<byte> data = new byte[dataLength];

byte[] data = new byte[dataLength];
inputBytes.Seek(streamDataStart);
inputBytes.Read(data.Span);

memoryStream.Read(data, 0, (int)dataLength);
inputBytes.Seek(streamDataEnd);

stream = new StreamToken(streamDictionaryToken, data);
}
}
stream = new StreamToken(streamDictionaryToken, data);

return true;
}
Expand Down

0 comments on commit 65a18b2

Please sign in to comment.