Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve GlyphList #967

Merged
merged 2 commits into from
Jan 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions src/UglyToad.PdfPig.Fonts/GlyphList.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,13 @@

private readonly Dictionary<string, string> oddNameToUnicodeCache = new Dictionary<string, string>();

private static readonly Lazy<GlyphList> LazyAdobeGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("glyphlist"));
private static readonly Lazy<GlyphList> LazyAdobeGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("glyphlist", "additional"));

/// <summary>
/// The Adobe Glyph List.
/// The Adobe Glyph List (includes an extension to the Adobe Glyph List).
/// </summary>
public static GlyphList AdobeGlyphList => LazyAdobeGlyphList.Value;

private static readonly Lazy<GlyphList> LazyAdditionalGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("additional"));

/// <summary>
/// An extension to the Adobe Glyph List.
/// </summary>
public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value;

private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats"));

/// <summary>
Expand Down Expand Up @@ -103,7 +96,7 @@
return result;
}

string unicode;
string? unicode;

Check warning on line 99 in src/UglyToad.PdfPig.Fonts/GlyphList.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 99 in src/UglyToad.PdfPig.Fonts/GlyphList.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 99 in src/UglyToad.PdfPig.Fonts/GlyphList.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 99 in src/UglyToad.PdfPig.Fonts/GlyphList.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.
// 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
if (name.IndexOf('.') > 0)
{
Expand Down
56 changes: 31 additions & 25 deletions src/UglyToad.PdfPig.Fonts/GlyphListFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,56 @@
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using Util;

internal class GlyphListFactory
internal static class GlyphListFactory
{
public static GlyphList Get(string listName)
#if NET
private const char Semicolon = ';';
#else
private static readonly char[] Semicolon = [';'];
#endif

public static GlyphList Get(params string[] listNames)
{
using (var resource =
typeof(GlyphListFactory).Assembly.GetManifestResourceStream(
$"UglyToad.PdfPig.Fonts.Resources.GlyphList.{listName}"))
var result = new Dictionary<string, string>(listNames.Any(n => string.Equals("glyphlist", n, StringComparison.OrdinalIgnoreCase)) ? 4300 : 0);

foreach (var listName in listNames)
{
if (resource == null)
using (var resource =
typeof(GlyphListFactory).Assembly.GetManifestResourceStream(
$"UglyToad.PdfPig.Fonts.Resources.GlyphList.{listName}"))
{
throw new ArgumentException($"No embedded glyph list resource was found with the name {listName}.");
}
if (resource == null)
{
throw new ArgumentException($"No embedded glyph list resource was found with the name {listName}.");
}

int? capacity = null;
// Prevent too much wasted memory capacity for Adobe GlyphList
if (string.Equals("glyphlist", listName, StringComparison.OrdinalIgnoreCase))
{
capacity = 4300;
ReadInternal(resource, result);
}

return ReadInternal(resource, capacity);
}

#if NET
result.TrimExcess();
#endif
return new GlyphList(result);
}

public static GlyphList Read(Stream stream)
{
return ReadInternal(stream);
var result = new Dictionary<string, string>();
ReadInternal(stream, result);
return new GlyphList(result);
}

private static readonly char[] Semicolon = [';'];

private static GlyphList ReadInternal(Stream stream, int? defaultDictionaryCapacity = 0)
private static void ReadInternal(Stream stream, Dictionary<string, string> result)
{
if (stream == null)
{
throw new ArgumentNullException(nameof(stream));
}

var result = defaultDictionaryCapacity.HasValue ? new Dictionary<string, string>(defaultDictionaryCapacity.Value) : [];


using (var reader = new StreamReader(stream))
{
while (!reader.EndOfStream)
Expand All @@ -62,7 +69,7 @@ private static GlyphList ReadInternal(Stream stream, int? defaultDictionaryCapac
{
continue;
}

var parts = line.Split(Semicolon, StringSplitOptions.RemoveEmptyEntries);

if (parts.Length != 2)
Expand All @@ -86,11 +93,10 @@ private static GlyphList ReadInternal(Stream stream, int? defaultDictionaryCapac
value += char.ConvertFromUtf32(code);
}

System.Diagnostics.Debug.Assert(!result.ContainsKey(key));
result[key] = value;
}
}

return new GlyphList(result);
}
}
}
42 changes: 42 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/AdditionalGlyphListTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    using System.Linq;

    /// <summary>
    /// Integration tests that open sample documents and assert that specific
    /// characters appear among the values of the letters extracted from a page.
    /// </summary>
    public class AdditionalGlyphListTests
    {
        /// <summary>
        /// Page 2 of "2108.11480" must yield a letter whose value is U+22C3.
        /// </summary>
        [Fact]
        public void Type1FontSimple1()
        {
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("2108.11480"));

            var letterValues = document.GetPage(2).Letters.Select(l => l.Value);

            Assert.Contains("\u22c3", letterValues);
        }

        /// <summary>
        /// Page 2 of "ICML03-081" must yield U+2211, U+220F and both square brackets.
        /// </summary>
        [Fact]
        public void Type1FontSimple2()
        {
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("ICML03-081"));

            // Deferred projection: each assertion below re-enumerates the page's letters.
            var letterValues = document.GetPage(2).Letters.Select(l => l.Value);

            Assert.Contains("\u2211", letterValues);
            Assert.Contains("\u220f", letterValues);
            Assert.Contains("[", letterValues);
            Assert.Contains("]", letterValues);
        }

        /// <summary>
        /// Page 4 of "Math119FakingData" must yield both parentheses and U+2211.
        /// </summary>
        [Fact]
        public void Type1FontSimple3()
        {
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Math119FakingData"));

            // Deferred projection: each assertion below re-enumerates the page's letters.
            var letterValues = document.GetPage(4).Letters.Select(l => l.Value);

            Assert.Contains("(", letterValues);
            Assert.Contains(")", letterValues);
            Assert.Contains("\u2211", letterValues);
        }
    }
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
55 changes: 55 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/ZapfDingbatsTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    using System.Linq;

    /// <summary>
    /// Integration tests that open sample documents and assert that specific
    /// symbol characters appear among the values of the letters extracted from a page.
    /// NOTE(review): presumably these cover ZapfDingbats glyph-name mapping, going by the class name - confirm.
    /// </summary>
    public class ZapfDingbatsTests
    {
        /// <summary>
        /// Page 2 of "TIKA-469-0" must yield the letter "●".
        /// </summary>
        [Fact]
        public void Type1Standard14Font1()
        {
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("TIKA-469-0"));

            var letterValues = document.GetPage(2).Letters.Select(l => l.Value);

            Assert.Contains("●", letterValues);
        }

        /// <summary>
        /// Page 1 of "MOZILLA-LINK-5251-1" must yield each of the six expected symbols.
        /// </summary>
        [Fact]
        public void Type1Standard14Font2()
        {
            // Per the original author's note: this document does not actually contain circular references.
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("MOZILLA-LINK-5251-1"));

            var letterValues = document.GetPage(1).Letters.Select(l => l.Value);

            // Checked in the same order as the individual assertions they replace.
            foreach (var expected in new[] { "✁", "✂", "✄", "☎", "✆", "✇" })
            {
                Assert.Contains(expected, letterValues);
            }
        }

        /// <summary>
        /// Page 11 of "MOZILLA-2775-1" must yield the letter "●".
        /// </summary>
        [Fact]
        public void Type1FontSimple1()
        {
            // Per the original author's note: this document does not actually contain circular references.
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("MOZILLA-2775-1"));

            var letterValues = document.GetPage(11).Letters.Select(l => l.Value);

            Assert.Contains("●", letterValues);
        }

        /// <summary>
        /// Page 1 of "PDFBOX-492-4.jar-8" must yield a letter whose value is U+25A0.
        /// </summary>
        [Fact]
        public void Type1FontSimple2()
        {
            // Per the original author's note: this document does not actually contain circular references.
            using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("PDFBOX-492-4.jar-8"));

            var letterValues = document.GetPage(1).Letters.Select(l => l.Value);

            Assert.Contains("\u25a0", letterValues);
        }
    }
}
6 changes: 4 additions & 2 deletions src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeSimpleFont.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ public TrueTypeSimpleFont(

Details = descriptor?.ToDetails(Name?.Data)
?? FontDetails.GetDefault(Name?.Data);

// Assumes a ZapfDingbats font cannot occur here; the behaviour needs to change if that turns out not to be the case.
System.Diagnostics.Debug.Assert(!(encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats")));
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
Expand Down Expand Up @@ -102,8 +105,7 @@ public bool TryGetUnicode(int characterCode, [NotNullWhen(true)] out string? val
// Look up the character name in the Adobe Glyph List or additional Glyph List.
try
{
value = GlyphList.AdobeGlyphList.NameToUnicode(encodedCharacterName)
?? GlyphList.AdditionalGlyphList.NameToUnicode(encodedCharacterName);
value = GlyphList.AdobeGlyphList.NameToUnicode(encodedCharacterName);
}
catch
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ public TrueTypeStandard14FallbackSimpleFont(NameToken name, AdobeFontMetrics fon
fontMetrics.Weight == "Bold",
fontMetrics.Weight == "Bold" ? 700 : FontDetails.DefaultWeight,
fontMetrics.ItalicAngle != 0);

// Assumes a ZapfDingbats font cannot occur here; the behaviour needs to change if that turns out not to be the case.
System.Diagnostics.Debug.Assert(!(encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats")));
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
Expand Down
11 changes: 11 additions & 0 deletions src/UglyToad.PdfPig/PdfFonts/Simple/Type1FontSimple.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ internal sealed class Type1FontSimple : IFont

private readonly TransformationMatrix fontMatrix;

private readonly bool isZapfDingbats;

public NameToken Name { get; }

public bool IsVertical { get; } = false;
Expand Down Expand Up @@ -80,6 +82,7 @@ public Type1FontSimple(
Name = name;
Details = fontDescriptor?.ToDetails(name?.Data)
?? FontDetails.GetDefault(name?.Data);
isZapfDingbats = encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats");
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
Expand Down Expand Up @@ -124,6 +127,14 @@ public bool TryGetUnicode(int characterCode, [NotNullWhen(true)] out string? val

try
{
if (isZapfDingbats)
{
value = GlyphList.ZapfDingbats.NameToUnicode(name);
if (value is not null)
{
return true;
}
}
value = GlyphList.AdobeGlyphList.NameToUnicode(name);
}
catch
Expand Down
38 changes: 18 additions & 20 deletions src/UglyToad.PdfPig/PdfFonts/Simple/Type1Standard14Font.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ internal sealed class Type1Standard14Font : IFont
{
private readonly AdobeFontMetrics standardFontMetrics;
private readonly Encoding encoding;
private readonly bool isZapfDingbats;

public NameToken Name { get; }

Expand All @@ -39,6 +40,7 @@ public Type1Standard14Font(AdobeFontMetrics standardFontMetrics, Encoding? overr
standardFontMetrics.Weight == "Bold",
standardFontMetrics.Weight == "Bold" ? 700 : FontDetails.DefaultWeight,
standardFontMetrics.ItalicAngle != 0);
isZapfDingbats = encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats");
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
Expand All @@ -49,39 +51,35 @@ public int ReadCharacterCode(IInputBytes bytes, out int codeLength)

public bool TryGetUnicode(int characterCode, [NotNullWhen(true)] out string? value)
{
value = null;

var name = encoding.GetName(characterCode);

if (string.Equals(name, GlyphList.NotDefined, StringComparison.OrdinalIgnoreCase))
{
value = null;
return false;
}

if (encoding is ZapfDingbatsEncoding)
try
{
var listed = GlyphList.ZapfDingbats.NameToUnicode(name);
if (isZapfDingbats)
{
value = GlyphList.ZapfDingbats.NameToUnicode(name);

value = listed;
if (value is not null)
{
return true;
}
}

return true;
value = GlyphList.AdobeGlyphList.NameToUnicode(name);
}

if (encoding is StandardEncoding || encoding is SymbolEncoding)
catch
{
var listed = GlyphList.AdobeGlyphList.NameToUnicode(name);

value = listed;

return true;
return false;
}
else
{
Debug.WriteLine($"Warning: Type1Standard14Font with unexpected encoding: '{encoding.EncodingName}' Expected: 'ZapfDingbatsEncoding','SymbolEncoding' or 'StandardEncoding' . Font: '{standardFontMetrics.FontName}'");
var listed = GlyphList.AdobeGlyphList.NameToUnicode(name);

value = listed;

return true;
}
return value is not null;
}

public CharacterBoundingBox GetBoundingBox(int characterCode)
Expand Down
3 changes: 3 additions & 0 deletions src/UglyToad.PdfPig/PdfFonts/Simple/Type3Font.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ public Type3Font(NameToken name, PdfRectangle boundingBox, TransformationMatrix
this.widths = widths;
this.toUnicodeCMap = new ToUnicodeCMap(toUnicodeCMap);
Details = FontDetails.GetDefault(name?.Data);

// Assumes a ZapfDingbats font cannot occur here; the behaviour needs to change if that turns out not to be the case.
System.Diagnostics.Debug.Assert(!(encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats")));
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
Expand Down
Loading