From 2f925ccd3c83c31fb35d2f1e421650dc56b88ac9 Mon Sep 17 00:00:00 2001
From: Michael R Sweet <msweet@msweet.org>
Date: Thu, 6 Mar 2025 12:40:19 -0500
Subject: [PATCH] Update documentation and pdf2text example (Issue #95)

---
 CHANGES.md          |   1 +
 doc/pdfio.3         | 239 ++++++++++++++++++++++++++++++++++++----
 doc/pdfio.html      | 211 ++++++++++++++++++++++++++++++++----
 doc/pdfio.md        | 257 ++++++++++++++++++++++++++++++++++++++++----
 examples/pdf2text.c |  62 ++++++++++-
 pdfio-content.c     |  16 ++-
 pdfio-stream.c      |  15 +--
 7 files changed, 726 insertions(+), 75 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index cf9ae67..1c0c061 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -12,6 +12,7 @@ v1.5.0 - YYYY-MM-DD
 - Added support for using libpng to embed PNG images in PDF output (Issue #90)
 - Added support for writing the PCLm subset of PDF (Issue #99)
 - Now support opening damaged PDF files (Issue #45)
+- Updated documentation (Issue #95)
 - Updated the pdf2txt example to support font encodings.
 - Fixed a potential heap overflow in the TrueType font code.
 
diff --git a/doc/pdfio.3 b/doc/pdfio.3
index 30aea17..0ec0417 100644
--- a/doc/pdfio.3
+++ b/doc/pdfio.3
@@ -1,4 +1,4 @@
-.TH pdfio 3 "pdf read/write library" "2025-02-20" "pdf read/write library"
+.TH pdfio 3 "pdf read/write library" "2025-03-06" "pdf read/write library"
 .SH NAME
 pdfio \- pdf read/write library
 .SH Introduction
@@ -34,7 +34,7 @@ PDFio is
 .I not
  concerned with rendering or viewing a PDF file, although a PDF RIP or viewer could be written using it.
 .PP
-PDFio is Copyright \[co] 2021\-2024 by Michael R Sweet and is licensed under the Apache License Version 2.0 with an (optional) exception to allow linking against GPL2/LGPL2 software. See the files "LICENSE" and "NOTICE" for more information.
+PDFio is Copyright \[co] 2021\-2025 by Michael R Sweet and is licensed under the Apache License Version 2.0 with an (optional) exception to allow linking against GPL2/LGPL2 software. See the files "LICENSE" and "NOTICE" for more information.
 .SS Requirements
 .PP
 PDFio requires the following to build the software:
@@ -52,9 +52,11 @@ A POSIX\-compliant sh program
 
 .IP \(bu 5
 .PP
-ZLIB (https://www.zlib.net) 1.0 or higher
+ZLIB (https://www.zlib.net/) 1.0 or higher
 
 
+.PP
+PDFio will also use libpng 1.6 or higher (https://www.libpng.org/) to provide enhanced PNG image support.
 .PP
 IDE files for Xcode (macOS/iOS) and Visual Studio (Windows) are also provided.
 .SS Installing PDFio
@@ -1097,28 +1099,83 @@ The pdfioinfo.c example program opens a PDF file and prints the title, author, c
 .fi
 .SS Extract Text from PDF File
 .PP
-The pdf2text.c example code extracts non\-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as appears on the page.
+The pdf2text.c example code extracts text from a PDF file and writes it to the standard output. Unlike some other PDF tools, it outputs the text in the order it is seen in each page stream so the output might appear "jumbled" if the PDF producer doesn't output text in reading order. The code is able to handle different font encodings and produces UTF\-8 output.
 .PP
-The pdfioStreamGetToken function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as\-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:
+The pdfioStreamGetToken function is used to read individual tokens from the page streams:
 .nf
 
     pdfio_stream_t *st;              // Page stream
+    char           buffer[1024],     // Token buffer
+                   *bufptr,          // Pointer into buffer
+                   name[256];        // Current (font) name
     bool           first = true;     // First string on line?
-    char           buffer[1024];     // Token buffer
+    int            encoding[256];    // Font encoding to Unicode
+    bool           in_array = false; // Are we in an array?
     
     // Read PDF tokens from the page stream...
     while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
     {
-      if (buffer[0] == '(')
+.fi
+.PP
+Justified text can be found inside arrays ("[ ... ]"), so we look for the array delimiter tokens and any (spacing) numbers inside an array. Experimentation has shown that numbers greater than 100 can be treated as whitespace:
+.nf
+
+      if (!strcmp(buffer, "["))
+      {
+        // Start of an array for justified text...
+        in_array = true;
+      }
+      else if (!strcmp(buffer, "]"))
+      {
+        // End of an array for justified text...
+        in_array = false;
+      }
+      else if (!first && in_array && (isdigit(buffer[0]) || buffer[0] == '\-') && fabs(atof(buffer)) > 100)
+      {
+        // Whitespace in a justified text block...
+        putchar(' ');
+      }
+.fi
+.PP
+Tokens starting with \'(' or \'<' are text fragments. 8\-bit text starting with \'(' needs to be mapped to Unicode using the current font encoding while hex strings starting with \'<' are UTF\-16 (Unicode) that need to be converted to UTF\-8:
+.nf
+
+      else if (buffer[0] == '(')
       {
         // Text string using an 8\-bit encoding
-        if (first)
-          first = false;
-        else if (buffer[1] != ' ')
-          putchar(' ');
+        first = false;
     
-        fputs(buffer + 1, stdout);
+        for (bufptr = buffer + 1; *bufptr; bufptr ++)
+          put_utf8(encoding[*bufptr & 255]);
       }
+      else if (buffer[0] == '<')
+      {
+        // Unicode text string
+        first = false;
+    
+        puts_utf16(buffer + 1);
+      }
+.fi
+.PP
+Simple (8\-bit) fonts include an encoding table that maps the 8\-bit characters to one of 1051 Unicode glyph names. Since each font can use a different encoding, we look for font names starting with \'/' and the "Tf" (set text font) operator token and load that font's encoding using the load_encoding function:
+.nf
+
+      else if (buffer[0] == '/')
+      {
+        // Save name...
+        strncpy(name, buffer + 1, sizeof(name) \- 1);
+        name[sizeof(name) \- 1] = '\\0';
+      }
+      else if (!strcmp(buffer, "Tf") && name[0])
+      {
+        // Set font...
+        load_encoding(obj, name, encoding);
+      }
+.fi
+.PP
+Finally, some text operators start a new line in a text block, so when we see their tokens we output a newline:
+.nf
+
       else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
                !strcmp(buffer, "\\'") || !strcmp(buffer, "\\""))
       {
@@ -1127,9 +1184,150 @@ The pdfioStreamGetToken function is used to read individual tokens from the page
         first = true;
       }
     }
+.fi
+.PP
+The load_encoding Function
+.PP
+The load_encoding function looks up the named font in the page's "Resources" dictionary. Every PDF simple font contains an "Encoding" dictionary with a base encoding ("WinANSI", "MacRoman", or "MacExpert") and a differences array that lists character indexes and glyph names for an 8\-bit font.
+.PP
+We start by initializing the encoding array to the default WinANSI encoding and looking up the font object for the named font:
+.nf
+
+    static void
+    load_encoding(
+        pdfio_obj_t   *page_obj,            // I \- Page object
+        const char    *name,                // I \- Font name
+        int           encoding[256])        // O \- Encoding table
+    {
+      size_t        i, j;                   // Looping vars
+      pdfio_dict_t  *page_dict,             // Page dictionary
+                    *resources_dict,        // Resources dictionary
+                    *font_dict;             // Font dictionary
+      pdfio_obj_t   *font_obj,              // Font object
+                    *encoding_obj;          // Encoding object
+      static int    win_ansi[32] =          // WinANSI characters from 128 to 159
+      {
+        ...
+      };
+      static int    mac_roman[128] =        // MacRoman characters from 128 to 255
+      {
+        ...
+      };
     
-    if (!first)
-      putchar('\\n');
+    
+      // Initialize the encoding to be the "standard" WinAnsi...
+      for (i = 0; i < 128; i ++)
+        encoding[i] = i;
+      for (i = 160; i < 256; i ++)
+        encoding[i] = i;
+      memcpy(encoding + 128, win_ansi, sizeof(win_ansi));
+    
+      // Find the named font...
+      if ((page_dict = pdfioObjGetDict(page_obj)) == NULL)
+        return;
+    
+      if ((resources_dict = pdfioDictGetDict(page_dict, "Resources")) == NULL)
+        return;
+    
+      if ((font_dict = pdfioDictGetDict(resources_dict, "Font")) == NULL)
+      {
+        // Font resources not a dictionary, see if it is an object...
+        if ((font_obj = pdfioDictGetObj(resources_dict, "Font")) != NULL)
+          font_dict = pdfioObjGetDict(font_obj);
+    
+        if (!font_dict)
+          return;
+      }
+    
+      if ((font_obj = pdfioDictGetObj(font_dict, name)) == NULL)
+        return;
+.fi
+.PP
+Once we have found the font we see if it has an "Encoding" dictionary:
+.nf
+
+      pdfio_dict_t  *encoding_dict;         // Encoding dictionary
+    
+      if ((encoding_obj = pdfioDictGetObj(pdfioObjGetDict(font_obj), "Encoding")) == NULL)
+        return;
+    
+      if ((encoding_dict = pdfioObjGetDict(encoding_obj)) == NULL)
+        return;
+.fi
+.PP
+Once we have the encoding dictionary we can get the "BaseEncoding" and "Differences" values:
+.nf
+
+      const char    *base_encoding;         // BaseEncoding name
+      pdfio_array_t *differences;           // Differences array
+    
+      // OK, have the encoding object, build the encoding using it...
+      base_encoding = pdfioDictGetName(encoding_dict, "BaseEncoding");
+      differences   = pdfioDictGetArray(encoding_dict, "Differences");
+.fi
+.PP
+If the base encoding is "MacRomanEncoding", we need to reset the upper 128 characters in the encoding array to match it:
+.nf
+
+      if (base_encoding && !strcmp(base_encoding, "MacRomanEncoding"))
+      {
+        // Map upper 128
+        memcpy(encoding + 128, mac_roman, sizeof(mac_roman));
+      }
+.fi
+.PP
+Then we loop through the differences array, keeping track of the current index within the encoding array. A number indicates a new index while a name is the Unicode glyph for the current index:
+.nf
+
+      typedef struct name_map_s
+      {
+        const char    *name;                // Character name
+        int           unicode;              // Unicode value
+      } name_map_t;
+    
+      static name_map_t unicode_map[1051];  // List of glyph names
+    
+      if (differences)
+      {
+        // Apply differences
+        size_t      count = pdfioArrayGetSize(differences);
+                                            // Number of differences
+        const char  *name;                  // Character name
+        size_t      idx = 0;                // Index in encoding array
+    
+        for (i = 0; i < count; i ++)
+        {
+          switch (pdfioArrayGetType(differences, i))
+          {
+            case PDFIO_VALTYPE_NUMBER :
+                // Get the index of the next character...
+                idx = (size_t)pdfioArrayGetNumber(differences, i);
+                break;
+    
+            case PDFIO_VALTYPE_NAME :
+                // Lookup name and apply to encoding...
+                if (idx < 0 || idx > 255)
+                  break;
+    
+                name = pdfioArrayGetName(differences, i);
+                for (j = 0; j < (sizeof(unicode_map) / sizeof(unicode_map[0])); j ++)
+                {
+                  if (!strcmp(name, unicode_map[j].name))
+                  {
+                    encoding[idx] = unicode_map[j].unicode;
+                    break;
+                  }
+                }
+                idx ++;
+                break;
+    
+            default :
+                // Do nothing for other values
+                break;
+          }
+        }
+      }
+    }
 .fi
 .SS Create a PDF File With Text and an Image
 .PP
@@ -4365,12 +4563,13 @@ bool  pdfioStreamGetToken (
 );
 .fi
 .PP
-This function reads a single PDF token from a stream.  Operator tokens,
-boolean values, and numbers are returned as-is in the provided string buffer.
-String values start with the opening parenthesis ('(') but have all escaping
-resolved and the terminating parenthesis removed.  Hexadecimal string values
-start with the opening angle bracket ('<') and have all whitespace and the
-terminating angle bracket removed.
+This function reads a single PDF token from a stream, skipping all whitespace
+and comments.  Operator tokens, boolean values, and numbers are returned
+as-is in the provided string buffer.  String values start with the opening
+parenthesis ('(') but have all escaping resolved and the terminating
+parenthesis removed.  Hexadecimal string values start with the opening angle
+bracket ('<') and have all whitespace and the terminating angle bracket
+removed.
 .SS pdfioStreamPeek
 Peek at data in a stream.
 .PP
diff --git a/doc/pdfio.html b/doc/pdfio.html
index d695b88..1ec01f5 100644
--- a/doc/pdfio.html
+++ b/doc/pdfio.html
@@ -525,7 +525,7 @@ span.string {
 </li>
 </ul>
 <p>PDFio is <em>not</em> concerned with rendering or viewing a PDF file, although a PDF RIP or viewer could be written using it.</p>
-<p>PDFio is Copyright © 2021-2024 by Michael R Sweet and is licensed under the Apache License Version 2.0 with an (optional) exception to allow linking against GPL2/LGPL2 software. See the files &quot;LICENSE&quot; and &quot;NOTICE&quot; for more information.</p>
+<p>PDFio is Copyright © 2021-2025 by Michael R Sweet and is licensed under the Apache License Version 2.0 with an (optional) exception to allow linking against GPL2/LGPL2 software. See the files &quot;LICENSE&quot; and &quot;NOTICE&quot; for more information.</p>
 <h3 class="title" id="requirements">Requirements</h3>
 <p>PDFio requires the following to build the software:</p>
 <ul>
@@ -535,9 +535,10 @@ span.string {
 </li>
 <li><p>A POSIX-compliant <code>sh</code> program</p>
 </li>
-<li><p>ZLIB (<a href="https://www.zlib.net">https://www.zlib.net</a>) 1.0 or higher</p>
+<li><p>ZLIB (<a href="https://www.zlib.net/">https://www.zlib.net/</a>) 1.0 or higher</p>
 </li>
 </ul>
+<p>PDFio will also use libpng 1.6 or higher (<a href="https://www.libpng.org/">https://www.libpng.org/</a>) to provide enhanced PNG image support.</p>
 <p>IDE files for Xcode (macOS/iOS) and Visual Studio (Windows) are also provided.</p>
 <h3 class="title" id="installing-pdfio">Installing PDFio</h3>
 <p>PDFio comes with a configure script that creates a portable makefile that will work on any POSIX-compliant system with ZLIB installed. To make it, run:</p>
@@ -1215,26 +1216,69 @@ main(<span class="reserved">int</span>  argc,                         <span clas
 }
 </code></pre>
 <h3 class="title" id="extract-text-from-pdf-file">Extract Text from PDF File</h3>
-<p>The <code>pdf2text.c</code> example code extracts non-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as appears on the page.</p>
-<p>The <a href="#pdfioStreamGetToken"><code>pdfioStreamGetToken</code></a> function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:</p>
+<p>The <code>pdf2text.c</code> example code extracts text from a PDF file and writes it to the standard output. Unlike some other PDF tools, it outputs the text in the order it is seen in each page stream so the output might appear &quot;jumbled&quot; if the PDF producer doesn't output text in reading order. The code is able to handle different font encodings and produces UTF-8 output.</p>
+<p>The <a href="#pdfioStreamGetToken"><code>pdfioStreamGetToken</code></a> function is used to read individual tokens from the page streams:</p>
 <pre><code class="language-c">pdfio_stream_t *st;              <span class="comment">// Page stream</span>
+<span class="reserved">char</span>           buffer[<span class="number">1024</span>],     <span class="comment">// Token buffer</span>
+               *bufptr,          <span class="comment">// Pointer into buffer</span>
+               name[<span class="number">256</span>];        <span class="comment">// Current (font) name</span>
 <span class="reserved">bool</span>           first = <span class="reserved">true</span>;     <span class="comment">// First string on line?</span>
-<span class="reserved">char</span>           buffer[<span class="number">1024</span>];     <span class="comment">// Token buffer</span>
+<span class="reserved">int</span>            encoding[<span class="number">256</span>];    <span class="comment">// Font encoding to Unicode</span>
+<span class="reserved">bool</span>           in_array = <span class="reserved">false</span>; <span class="comment">// Are we in an array?</span>
 
 <span class="comment">// Read PDF tokens from the page stream...</span>
 <span class="reserved">while</span> (pdfioStreamGetToken(st, buffer, <span class="reserved">sizeof</span>(buffer)))
 {
-  <span class="reserved">if</span> (buffer[<span class="number">0</span>] == <span class="string">'('</span>)
+</code></pre>
+<p>Justified text can be found inside arrays (&quot;[ ... ]&quot;), so we look for the array delimiter tokens and any (spacing) numbers inside an array. Experimentation has shown that numbers greater than 100 can be treated as whitespace:</p>
+<pre><code class="language-c">  <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;[&quot;</span>))
+  {
+    <span class="comment">// Start of an array for justified text...</span>
+    in_array = <span class="reserved">true</span>;
+  }
+  <span class="reserved">else</span> <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;]&quot;</span>))
+  {
+    <span class="comment">// End of an array for justified text...</span>
+    in_array = <span class="reserved">false</span>;
+  }
+  <span class="reserved">else</span> <span class="reserved">if</span> (!first &amp;&amp; in_array &amp;&amp; (isdigit(buffer[<span class="number">0</span>]) || buffer[<span class="number">0</span>] == <span class="string">'-'</span>) &amp;&amp; fabs(atof(buffer)) &gt; <span class="number">100</span>)
+  {
+    <span class="comment">// Whitespace in a justified text block...</span>
+    putchar(<span class="string">' '</span>);
+  }
+</code></pre>
+<p>Tokens starting with '(' or '&lt;' are text fragments. 8-bit text starting with '(' needs to be mapped to Unicode using the current font encoding while hex strings starting with '&lt;' are UTF-16 (Unicode) that need to be converted to UTF-8:</p>
+<pre><code class="language-c">  <span class="reserved">else</span> <span class="reserved">if</span> (buffer[<span class="number">0</span>] == <span class="string">'('</span>)
   {
     <span class="comment">// Text string using an 8-bit encoding</span>
-    <span class="reserved">if</span> (first)
-      first = <span class="reserved">false</span>;
-    <span class="reserved">else</span> <span class="reserved">if</span> (buffer[<span class="number">1</span>] != <span class="string">' '</span>)
-      putchar(<span class="string">' '</span>);
+    first = <span class="reserved">false</span>;
 
-    fputs(buffer + <span class="number">1</span>, stdout);
+    <span class="reserved">for</span> (bufptr = buffer + <span class="number">1</span>; *bufptr; bufptr ++)
+      put_utf8(encoding[*bufptr &amp; <span class="number">255</span>]);
   }
-  <span class="reserved">else</span> <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;Td&quot;</span>) || !strcmp(buffer, <span class="string">&quot;TD&quot;</span>) || !strcmp(buffer, <span class="string">&quot;T*&quot;</span>) ||
+  <span class="reserved">else</span> <span class="reserved">if</span> (buffer[<span class="number">0</span>] == <span class="string">'&lt;'</span>)
+  {
+    <span class="comment">// Unicode text string</span>
+    first = <span class="reserved">false</span>;
+
+    puts_utf16(buffer + <span class="number">1</span>);
+  }
+</code></pre>
+<p>Simple (8-bit) fonts include an encoding table that maps the 8-bit characters to one of 1051 Unicode glyph names. Since each font can use a different encoding, we look for font names starting with '/' and the &quot;Tf&quot; (set text font) operator token and load that font's encoding using the <a href="#the-loadencoding-function">load_encoding</a> function:</p>
+<pre><code class="language-c">  <span class="reserved">else</span> <span class="reserved">if</span> (buffer[<span class="number">0</span>] == <span class="string">'/'</span>)
+  {
+    <span class="comment">// Save name...</span>
+    strncpy(name, buffer + <span class="number">1</span>, <span class="reserved">sizeof</span>(name) - <span class="number">1</span>);
+    name[<span class="reserved">sizeof</span>(name) - <span class="number">1</span>] = <span class="string">'\0'</span>;
+  }
+  <span class="reserved">else</span> <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;Tf&quot;</span>) &amp;&amp; name[<span class="number">0</span>])
+  {
+    <span class="comment">// Set font...</span>
+    load_encoding(obj, name, encoding);
+  }
+</code></pre>
+<p>Finally, some text operators start a new line in a text block, so when we see their tokens we output a newline:</p>
+<pre><code class="language-c">  <span class="reserved">else</span> <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;Td&quot;</span>) || !strcmp(buffer, <span class="string">&quot;TD&quot;</span>) || !strcmp(buffer, <span class="string">&quot;T*&quot;</span>) ||
            !strcmp(buffer, <span class="string">&quot;\'&quot;</span>) || !strcmp(buffer, <span class="string">&quot;\&quot;&quot;</span>))
   {
     <span class="comment">// Text operators that advance to the next line in the block</span>
@@ -1242,9 +1286,133 @@ main(<span class="reserved">int</span>  argc,                         <span clas
     first = <span class="reserved">true</span>;
   }
 }
+</code></pre>
+<h4 id="the-loadencoding-function">The <code>load_encoding</code> Function</h4>
+<p>The <code>load_encoding</code> function looks up the named font in the page's &quot;Resources&quot; dictionary. Every PDF simple font contains an &quot;Encoding&quot; dictionary with a base encoding (&quot;WinANSI&quot;, &quot;MacRoman&quot;, or &quot;MacExpert&quot;) and a differences array that lists character indexes and glyph names for an 8-bit font.</p>
+<p>We start by initializing the encoding array to the default WinANSI encoding and looking up the font object for the named font:</p>
+<pre><code class="language-c"><span class="reserved">static</span> <span class="reserved">void</span>
+load_encoding(
+    pdfio_obj_t   *page_obj,            <span class="comment">// I - Page object</span>
+    <span class="reserved">const</span> <span class="reserved">char</span>    *name,                <span class="comment">// I - Font name</span>
+    <span class="reserved">int</span>           encoding[<span class="number">256</span>])        <span class="comment">// O - Encoding table</span>
+{
+  size_t        i, j;                   <span class="comment">// Looping vars</span>
+  pdfio_dict_t  *page_dict,             <span class="comment">// Page dictionary</span>
+                *resources_dict,        <span class="comment">// Resources dictionary</span>
+                *font_dict;             <span class="comment">// Font dictionary</span>
+  pdfio_obj_t   *font_obj,              <span class="comment">// Font object</span>
+                *encoding_obj;          <span class="comment">// Encoding object</span>
+  <span class="reserved">static</span> <span class="reserved">int</span>    win_ansi[<span class="number">32</span>] =          <span class="comment">// WinANSI characters from 128 to 159</span>
+  {
+    ...
+  };
+  <span class="reserved">static</span> <span class="reserved">int</span>    mac_roman[<span class="number">128</span>] =        <span class="comment">// MacRoman characters from 128 to 255</span>
+  {
+    ...
+  };
 
-<span class="reserved">if</span> (!first)
-  putchar(<span class="string">'\n'</span>);
+
+  <span class="comment">// Initialize the encoding to be the &quot;standard&quot; WinAnsi...</span>
+  <span class="reserved">for</span> (i = <span class="number">0</span>; i &lt; <span class="number">128</span>; i ++)
+    encoding[i] = i;
+  <span class="reserved">for</span> (i = <span class="number">160</span>; i &lt; <span class="number">256</span>; i ++)
+    encoding[i] = i;
+  memcpy(encoding + <span class="number">128</span>, win_ansi, <span class="reserved">sizeof</span>(win_ansi));
+
+  <span class="comment">// Find the named font...</span>
+  <span class="reserved">if</span> ((page_dict = pdfioObjGetDict(page_obj)) == NULL)
+    <span class="reserved">return</span>;
+
+  <span class="reserved">if</span> ((resources_dict = pdfioDictGetDict(page_dict, <span class="string">&quot;Resources&quot;</span>)) == NULL)
+    <span class="reserved">return</span>;
+
+  <span class="reserved">if</span> ((font_dict = pdfioDictGetDict(resources_dict, <span class="string">&quot;Font&quot;</span>)) == NULL)
+  {
+    <span class="comment">// Font resources not a dictionary, see if it is an object...</span>
+    <span class="reserved">if</span> ((font_obj = pdfioDictGetObj(resources_dict, <span class="string">&quot;Font&quot;</span>)) != NULL)
+      font_dict = pdfioObjGetDict(font_obj);
+
+    <span class="reserved">if</span> (!font_dict)
+      <span class="reserved">return</span>;
+  }
+
+  <span class="reserved">if</span> ((font_obj = pdfioDictGetObj(font_dict, name)) == NULL)
+    <span class="reserved">return</span>;
+</code></pre>
+<p>Once we have found the font we see if it has an &quot;Encoding&quot; dictionary:</p>
+<pre><code class="language-c">  pdfio_dict_t  *encoding_dict;         <span class="comment">// Encoding dictionary</span>
+
+  <span class="reserved">if</span> ((encoding_obj = pdfioDictGetObj(pdfioObjGetDict(font_obj), <span class="string">&quot;Encoding&quot;</span>)) == NULL)
+    <span class="reserved">return</span>;
+
+  <span class="reserved">if</span> ((encoding_dict = pdfioObjGetDict(encoding_obj)) == NULL)
+    <span class="reserved">return</span>;
+</code></pre>
+<p>Once we have the encoding dictionary we can get the &quot;BaseEncoding&quot; and &quot;Differences&quot; values:</p>
+<pre><code class="language-c">  <span class="reserved">const</span> <span class="reserved">char</span>    *base_encoding;         <span class="comment">// BaseEncoding name</span>
+  pdfio_array_t *differences;           <span class="comment">// Differences array</span>
+
+  <span class="comment">// OK, have the encoding object, build the encoding using it...</span>
+  base_encoding = pdfioDictGetName(encoding_dict, <span class="string">&quot;BaseEncoding&quot;</span>);
+  differences   = pdfioDictGetArray(encoding_dict, <span class="string">&quot;Differences&quot;</span>);
+</code></pre>
+<p>If the base encoding is &quot;MacRomanEncoding&quot;, we need to reset the upper 128 characters in the encoding array to match it:</p>
+<pre><code class="language-c">  <span class="reserved">if</span> (base_encoding &amp;&amp; !strcmp(base_encoding, <span class="string">&quot;MacRomanEncoding&quot;</span>))
+  {
+    <span class="comment">// Map upper 128</span>
+    memcpy(encoding + <span class="number">128</span>, mac_roman, <span class="reserved">sizeof</span>(mac_roman));
+  }
+</code></pre>
+<p>Then we loop through the differences array, keeping track of the current index within the encoding array. A number indicates a new index while a name is the Unicode glyph for the current index:</p>
+<pre><code class="language-c">  <span class="reserved">typedef</span> <span class="reserved">struct</span> name_map_s
+  {
+    <span class="reserved">const</span> <span class="reserved">char</span>    *name;                <span class="comment">// Character name</span>
+    <span class="reserved">int</span>           unicode;              <span class="comment">// Unicode value</span>
+  } name_map_t;
+
+  <span class="reserved">static</span> name_map_t unicode_map[<span class="number">1051</span>];  <span class="comment">// List of glyph names</span>
+
+  <span class="reserved">if</span> (differences)
+  {
+    <span class="comment">// Apply differences</span>
+    size_t      count = pdfioArrayGetSize(differences);
+                                        <span class="comment">// Number of differences</span>
+    <span class="reserved">const</span> <span class="reserved">char</span>  *name;                  <span class="comment">// Character name</span>
+    size_t      idx = <span class="number">0</span>;                <span class="comment">// Index in encoding array</span>
+
+    <span class="reserved">for</span> (i = <span class="number">0</span>; i &lt; count; i ++)
+    {
+      <span class="reserved">switch</span> (pdfioArrayGetType(differences, i))
+      {
+        <span class="reserved">case</span> PDFIO_VALTYPE_NUMBER :
+            <span class="comment">// Get the index of the next character...</span>
+            idx = (size_t)pdfioArrayGetNumber(differences, i);
+            <span class="reserved">break</span>;
+
+        <span class="reserved">case</span> PDFIO_VALTYPE_NAME :
+            <span class="comment">// Lookup name and apply to encoding...</span>
+            <span class="reserved">if</span> (idx &lt; <span class="number">0</span> || idx &gt; <span class="number">255</span>)
+              <span class="reserved">break</span>;
+
+            name = pdfioArrayGetName(differences, i);
+            <span class="reserved">for</span> (j = <span class="number">0</span>; j &lt; (<span class="reserved">sizeof</span>(unicode_map) / <span class="reserved">sizeof</span>(unicode_map[<span class="number">0</span>])); j ++)
+            {
+              <span class="reserved">if</span> (!strcmp(name, unicode_map[j].name))
+              {
+                encoding[idx] = unicode_map[j].unicode;
+                <span class="reserved">break</span>;
+              }
+            }
+            idx ++;
+            <span class="reserved">break</span>;
+
+        <span class="reserved">default</span> :
+            <span class="comment">// Do nothing for other values</span>
+            <span class="reserved">break</span>;
+      }
+    }
+  }
+}
 </code></pre>
 <h3 class="title" id="create-a-pdf-file-with-text-and-an-image">Create a PDF File With Text and an Image</h3>
 <p>The <code>image2pdf.c</code> example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. The <code>create_pdf_image_file</code> function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below:</p>
@@ -4874,14 +5042,15 @@ size_t pdfioPageGetNumStreams(<a href="#pdfio_obj_t">pdfio_obj_t</a> *page);</p>
 <td class="description">Size of string buffer</td></tr>
 </tbody></table>
 <h4 class="returnvalue">Return Value</h4>
-<p class="description"><code>true</code> on success, <code>false</code> on EOF</p>
+<p class="description"><code>true</code> on success, <code>false</code> on end-of-stream or error</p>
 <h4 class="discussion">Discussion</h4>
-<p class="discussion">This function reads a single PDF token from a stream.  Operator tokens,
-boolean values, and numbers are returned as-is in the provided string buffer.
-String values start with the opening parenthesis ('(') but have all escaping
-resolved and the terminating parenthesis removed.  Hexadecimal string values
-start with the opening angle bracket ('&lt;') and have all whitespace and the
-terminating angle bracket removed.</p>
+<p class="discussion">This function reads a single PDF token from a stream, skipping all whitespace
+and comments.  Operator tokens, boolean values, and numbers are returned
+as-is in the provided string buffer.  String values start with the opening
+parenthesis ('(') but have all escaping resolved and the terminating
+parenthesis removed.  Hexadecimal string values start with the opening angle
+bracket ('&lt;') and have all whitespace and the terminating angle bracket
+removed.</p>
 <h3 class="function"><a id="pdfioStreamPeek">pdfioStreamPeek</a></h3>
 <p class="description">Peek at data in a stream.</p>
 <p class="code">
diff --git a/doc/pdfio.md b/doc/pdfio.md
index d736a62..5072bfe 100644
--- a/doc/pdfio.md
+++ b/doc/pdfio.md
@@ -15,7 +15,7 @@ goals of PDFio are:
 PDFio is *not* concerned with rendering or viewing a PDF file, although a PDF
 RIP or viewer could be written using it.
 
-PDFio is Copyright © 2021-2024 by Michael R Sweet and is licensed under the
+PDFio is Copyright © 2021-2025 by Michael R Sweet and is licensed under the
 Apache License Version 2.0 with an (optional) exception to allow linking against
 GPL2/LGPL2 software.  See the files "LICENSE" and "NOTICE" for more information.
 
@@ -28,7 +28,10 @@ PDFio requires the following to build the software:
 - A C99 compiler such as Clang, GCC, or MS Visual C
 - A POSIX-compliant `make` program
 - A POSIX-compliant `sh` program
-- ZLIB (<https://www.zlib.net>) 1.0 or higher
+- ZLIB (<https://www.zlib.net/>) 1.0 or higher
+
+PDFio will also use libpng 1.6 or higher (<https://www.libpng.org/>) to provide
+enhanced PNG image support.
 
 IDE files for Xcode (macOS/iOS) and Visual Studio (Windows) are also provided.
 
@@ -941,37 +944,98 @@ main(int  argc,                         // I - Number of command-line arguments
 Extract Text from PDF File
 --------------------------
 
-The `pdf2text.c` example code extracts non-Unicode text from a PDF file by
-scanning each page for strings and text drawing commands.  Since it doesn't
-look at the font encoding or support Unicode text, it is really only useful to
-extract plain ASCII text from a PDF file.  And since it writes text in the order
-it appears in the page stream, it may not come out in the same order as appears
-on the page.
+The `pdf2text.c` example code extracts text from a PDF file and writes it to the
+standard output.  Unlike some other PDF tools, it outputs the text in the order
+it is seen in each page stream so the output might appear "jumbled" if the PDF
+producer doesn't output text in reading order.  The code is able to handle
+different font encodings and produces UTF-8 output.
 
 The [`pdfioStreamGetToken`](@@) function is used to read individual tokens from
-the page streams.  Tokens starting with the open parenthesis are text strings,
-while PDF operators are left as-is.  We use some simple logic to make sure that
-we include spaces between text strings and add newlines for the text operators
-that start a new line in a text block:
+the page streams:
 
 ```c
 pdfio_stream_t *st;              // Page stream
+char           buffer[1024],     // Token buffer
+               *bufptr,          // Pointer into buffer
+               name[256];        // Current (font) name
 bool           first = true;     // First string on line?
-char           buffer[1024];     // Token buffer
+int            encoding[256];    // Font encoding to Unicode
+bool           in_array = false; // Are we in an array?
 
 // Read PDF tokens from the page stream...
 while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
 {
-  if (buffer[0] == '(')
+```
+
+Justified text can be found inside arrays ("[ ... ]"), so we look for the array
+delimiter tokens and any (spacing) numbers inside an array.  Experimentation has
+shown that numbers greater than 100 can be treated as whitespace:
+
+```c
+  if (!strcmp(buffer, "["))
+  {
+    // Start of an array for justified text...
+    in_array = true;
+  }
+  else if (!strcmp(buffer, "]"))
+  {
+    // End of an array for justified text...
+    in_array = false;
+  }
+  else if (!first && in_array && (isdigit(buffer[0]) || buffer[0] == '-') && fabs(atof(buffer)) > 100)
+  {
+    // Whitespace in a justified text block...
+    putchar(' ');
+  }
+```
+
+Tokens starting with '(' or '<' are text fragments.  8-bit text starting with
+'(' needs to be mapped to Unicode using the current font encoding while hex
+strings starting with '<' are UTF-16 (Unicode) that need to be converted to
+UTF-8:
+
+```c
+  else if (buffer[0] == '(')
   {
     // Text string using an 8-bit encoding
-    if (first)
-      first = false;
-    else if (buffer[1] != ' ')
-      putchar(' ');
+    first = false;
 
-    fputs(buffer + 1, stdout);
+    for (bufptr = buffer + 1; *bufptr; bufptr ++)
+      put_utf8(encoding[*bufptr & 255]);
   }
+  else if (buffer[0] == '<')
+  {
+    // Unicode text string
+    first = false;
+
+    puts_utf16(buffer + 1);
+  }
+```
+
+Simple (8-bit) fonts include an encoding table that maps the 8-bit characters to
+one of 1051 Unicode glyph names.  Since each font can use a different encoding,
+we look for font names starting with '/' and the "Tf" (set text font) operator
+token and load that font's encoding using the
+[load_encoding](#the-loadencoding-function) function:
+
+```c
+  else if (buffer[0] == '/')
+  {
+    // Save name...
+    strncpy(name, buffer + 1, sizeof(name) - 1);
+    name[sizeof(name) - 1] = '\0';
+  }
+  else if (!strcmp(buffer, "Tf") && name[0])
+  {
+    // Set font...
+    load_encoding(obj, name, encoding);
+  }
+```
+
+Finally, some text operators start a new line in a text block, so when we see
+their tokens we output a newline:
+
+```c
   else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
            !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
   {
@@ -980,9 +1044,160 @@ while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
     first = true;
   }
 }
+```
 
-if (!first)
-  putchar('\n');
+
+### The `load_encoding` Function
+
+The `load_encoding` function looks up the named font in the page's "Resources"
+dictionary.  A PDF simple font can have an "Encoding" dictionary with a base
+encoding ("WinAnsiEncoding", "MacRomanEncoding", or "MacExpertEncoding") and a
+differences array listing character indexes and glyph names for an 8-bit font.
+
+We start by initializing the encoding array to the default WinANSI encoding and
+looking up the font object for the named font:
+
+```c
+static void
+load_encoding(
+    pdfio_obj_t   *page_obj,            // I - Page object
+    const char    *name,                // I - Font name
+    int           encoding[256])        // O - Encoding table
+{
+  size_t        i, j;                   // Looping vars
+  pdfio_dict_t  *page_dict,             // Page dictionary
+                *resources_dict,        // Resources dictionary
+                *font_dict;             // Font dictionary
+  pdfio_obj_t   *font_obj,              // Font object
+                *encoding_obj;          // Encoding object
+  static int    win_ansi[32] =          // WinANSI characters from 128 to 159
+  {
+    ...
+  };
+  static int    mac_roman[128] =        // MacRoman characters from 128 to 255
+  {
+    ...
+  };
+
+
+  // Initialize the encoding to be the "standard" WinAnsi...
+  for (i = 0; i < 128; i ++)
+    encoding[i] = i;
+  for (i = 160; i < 256; i ++)
+    encoding[i] = i;
+  memcpy(encoding + 128, win_ansi, sizeof(win_ansi));
+
+  // Find the named font...
+  if ((page_dict = pdfioObjGetDict(page_obj)) == NULL)
+    return;
+
+  if ((resources_dict = pdfioDictGetDict(page_dict, "Resources")) == NULL)
+    return;
+
+  if ((font_dict = pdfioDictGetDict(resources_dict, "Font")) == NULL)
+  {
+    // Font resources not a dictionary, see if it is an object...
+    if ((font_obj = pdfioDictGetObj(resources_dict, "Font")) != NULL)
+      font_dict = pdfioObjGetDict(font_obj);
+
+    if (!font_dict)
+      return;
+  }
+
+  if ((font_obj = pdfioDictGetObj(font_dict, name)) == NULL)
+    return;
+```
+
+Once we have found the font we see if it has an "Encoding" dictionary:
+
+```c
+  pdfio_dict_t  *encoding_dict;         // Encoding dictionary
+
+  if ((encoding_obj = pdfioDictGetObj(pdfioObjGetDict(font_obj), "Encoding")) == NULL)
+    return;
+
+  if ((encoding_dict = pdfioObjGetDict(encoding_obj)) == NULL)
+    return;
+```
+
+Once we have the encoding dictionary we can get the "BaseEncoding" and
+"Differences" values:
+
+```c
+  const char    *base_encoding;         // BaseEncoding name
+  pdfio_array_t *differences;           // Differences array
+
+  // OK, have the encoding object, build the encoding using it...
+  base_encoding = pdfioDictGetName(encoding_dict, "BaseEncoding");
+  differences   = pdfioDictGetArray(encoding_dict, "Differences");
+```
+
+If the base encoding is "MacRomanEncoding", we need to reset the upper 128
+characters in the encoding array to match it:
+
+```c
+  if (base_encoding && !strcmp(base_encoding, "MacRomanEncoding"))
+  {
+    // Map upper 128
+    memcpy(encoding + 128, mac_roman, sizeof(mac_roman));
+  }
+
+```
+
+Then we loop through the differences array, keeping track of the current index
+within the encoding array.  A number indicates a new index while a name is the
+Unicode glyph for the current index:
+
+```c
+  typedef struct name_map_s
+  {
+    const char    *name;                // Character name
+    int           unicode;              // Unicode value
+  } name_map_t;
+
+  static name_map_t unicode_map[1051];  // List of glyph names
+
+  if (differences)
+  {
+    // Apply differences
+    size_t      count = pdfioArrayGetSize(differences);
+                                        // Number of differences
+    const char  *name;                  // Character name
+    size_t      idx = 0;                // Index in encoding array
+
+    for (i = 0; i < count; i ++)
+    {
+      switch (pdfioArrayGetType(differences, i))
+      {
+        case PDFIO_VALTYPE_NUMBER :
+            // Get the index of the next character...
+            idx = (size_t)pdfioArrayGetNumber(differences, i);
+            break;
+
+        case PDFIO_VALTYPE_NAME :
+            // Lookup name and apply to encoding...
+            if (idx < 0 || idx > 255)
+              break;
+
+            name = pdfioArrayGetName(differences, i);
+            for (j = 0; j < (sizeof(unicode_map) / sizeof(unicode_map[0])); j ++)
+            {
+              if (!strcmp(name, unicode_map[j].name))
+              {
+                encoding[idx] = unicode_map[j].unicode;
+                break;
+              }
+            }
+            idx ++;
+            break;
+
+        default :
+            // Do nothing for other values
+            break;
+      }
+    }
+  }
+}
 ```
 
 
diff --git a/examples/pdf2text.c b/examples/pdf2text.c
index 7e69aad..151ca83 100644
--- a/examples/pdf2text.c
+++ b/examples/pdf2text.c
@@ -1089,6 +1089,7 @@ static name_map_t	unicode_map[] =
 
 static void	load_encoding(pdfio_obj_t *page_obj, const char *name, int encoding[256]);
 static void	put_utf8(int ch);
+static void	puts_utf16(const char *s);
 
 
 //
@@ -1154,7 +1155,7 @@ main(int  argc,				// I - Number of command-line arguments
           // End of an array for justified text...
           in_array = false;
         }
-        else if (!first && (isdigit(buffer[0]) || buffer[0] == '-') && fabs(atof(buffer)) > 100)
+        else if (!first && in_array && (isdigit(buffer[0]) || buffer[0] == '-') && fabs(atof(buffer)) > 100)
         {
           // Whitespace in a justified text block...
           putchar(' ');
@@ -1162,12 +1163,18 @@ main(int  argc,				// I - Number of command-line arguments
 	else if (buffer[0] == '(')
 	{
           // Text string using an 8-bit encoding
-	  if (first)
-	    first = false;
+	  first = false;
 
           for (bufptr = buffer + 1; *bufptr; bufptr ++)
             put_utf8(encoding[*bufptr & 255]);
 	}
+	else if (buffer[0] == '<')
+	{
+          // Unicode text string
+	  first = false;
+
+          puts_utf16(buffer + 1);
+	}
 	else if (buffer[0] == '/')
 	{
 	  // Save name...
@@ -1267,6 +1274,7 @@ load_encoding(
 
   if ((font_dict = pdfioDictGetDict(resources_dict, "Font")) == NULL)
   {
+    // Font resources not a dictionary, see if it is an object...
     if ((font_obj = pdfioDictGetObj(resources_dict, "Font")) != NULL)
       font_dict = pdfioObjGetDict(font_obj);
 
@@ -1362,3 +1370,51 @@ put_utf8(int ch)			// I - Character
     putchar(0x80 | (ch & 0x3f));
   }
 }
+
+
+//
+// 'puts_utf16()' - Output a hex-encoded UTF-16 string.
+//
+
+static void
+puts_utf16(const char *s)		// I - Hex string
+{
+  size_t	length = strlen(s) / 4;	// Number of 16-bit code units (4 hex digits each)
+  int		ch;			// Character
+  char		temp[5];		// Hex characters
+
+
+  temp[4] = '\0';
+
+  while (length > 0)
+  {
+    // Get the next UTF-16 code unit (4 hex digits)...
+    temp[0] = *s++;
+    temp[1] = *s++;
+    temp[2] = *s++;
+    temp[3] = *s++;
+    length --;
+
+    if ((ch = strtol(temp, NULL, 16)) < 0)
+      break;
+
+    if (ch >= 0xd800 && ch <= 0xdbff && length > 0)
+    {
+      // High surrogate, combine with the low surrogate that follows...
+      int lch;			// Lower bits
+
+      temp[0] = *s++;
+      temp[1] = *s++;
+      temp[2] = *s++;
+      temp[3] = *s++;
+      length --;
+      // Low surrogates span 0xdc00 through 0xdfff inclusive, stop on anything else
+      if ((lch = strtol(temp, NULL, 16)) < 0 || lch < 0xdc00 || lch > 0xdfff)
+	break;
+
+      ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000;
+    }
+
+    put_utf8(ch);
+  }
+}
\ No newline at end of file
diff --git a/pdfio-content.c b/pdfio-content.c
index 9888a53..5557a51 100644
--- a/pdfio-content.c
+++ b/pdfio-content.c
@@ -3797,9 +3797,19 @@ write_string(pdfio_stream_t *st,	// I - Stream
 
     if (unicode)
     {
-      // Write a two-byte character...
-      if (!pdfioStreamPrintf(st, "%04X", ch))
-	return (false);
+      // Write UTF-16 in hex...
+      if (ch < 0x10000)
+      {
+        // Two-byte UTF-16 (BMP code point written as-is)
+	if (!pdfioStreamPrintf(st, "%04X", ch))
+	  return (false);
+      }
+      else
+      {
+        // Four-byte UTF-16 (surrogate pair encodes ch - 0x10000 as 10 + 10 bits)
+	if (!pdfioStreamPrintf(st, "%04X%04X", 0xd800 | (((ch - 0x10000) >> 10) & 0x03ff), 0xdc00 | (ch & 0x03ff)))
+	  return (false);
+      }
     }
     else
     {
diff --git a/pdfio-stream.c b/pdfio-stream.c
index 1306eb5..4200d9a 100644
--- a/pdfio-stream.c
+++ b/pdfio-stream.c
@@ -377,15 +377,16 @@ pdfioStreamConsume(pdfio_stream_t *st,	// I - Stream
 //
 // 'pdfioStreamGetToken()' - Read a single PDF token from a stream.
 //
-// This function reads a single PDF token from a stream.  Operator tokens,
-// boolean values, and numbers are returned as-is in the provided string buffer.
-// String values start with the opening parenthesis ('(') but have all escaping
-// resolved and the terminating parenthesis removed.  Hexadecimal string values
-// start with the opening angle bracket ('<') and have all whitespace and the
-// terminating angle bracket removed.
+// This function reads a single PDF token from a stream, skipping all whitespace
+// and comments.  Operator tokens, boolean values, and numbers are returned
+// as-is in the provided string buffer.  String values start with the opening
+// parenthesis ('(') but have all escaping resolved and the terminating
+// parenthesis removed.  Hexadecimal string values start with the opening angle
+// bracket ('<') and have all whitespace and the terminating angle bracket
+// removed.
 //
 
-bool					// O - `true` on success, `false` on EOF
+bool					// O - `true` on success, `false` on end-of-stream or error
 pdfioStreamGetToken(
     pdfio_stream_t *st,			// I - Stream
     char           *buffer,		// I - String buffer