2025-07-12 14:04:31 +02:00
9 changed files with 35 additions and 172 deletions
--- a/.gitignore
+++ b/.gitignore
@ -14,7 +14,6 @@
 /examples/code128
 /examples/image2pdf
 /examples/md2pdf
-/examples/pdf2text
 /examples/pdfioinfo
 /Makefile
 /packages
@ -22,6 +21,7 @@
 /pdfio.xcodeproj/xcshareddata
 /pdfio-*.tar.gz*
 /pdfio-*.zip*
+/pdfiototext
 /testpdfio
 /testpdfio-*.pdf
 /testttf
--- a/Makefile.in
+++ b/Makefile.in
@ -103,33 +103,15 @@ LIBOBJS		=	\
 			ttf.o
 OBJS		=	\
 			$(LIBOBJS) \
+			pdfiototext.o \
 			testpdfio.o \
 			testttf.o
 TARGETS		=	\
 			$(LIBPDFIO) \
 			$(LIBPDFIO_STATIC) \
+			pdfiototext \
 			testpdfio \
 			testttf
-DOCFILES	=	\
-			doc/pdfio.html \
-			doc/pdfio-512.png \
-			LICENSE \
-			NOTICE
-EXAMPLES	=	\
-			examples/Makefile \
-			examples/Roboto-Bold.ttf \
-			examples/Roboto-Italic.ttf \
-			examples/Roboto-Regular.ttf \
-			examples/RobotoMono-Regular.ttf \
-			examples/code128.c \
-			examples/code128.ttf \
-			examples/image2pdf.c \
-			examples/md2pdf.c \
-			examples/md2pdf.md \
-			examples/mmd.c \
-			examples/mmd.h \
-			examples/pdf2text.c \
-			examples/pdfioinfo.c


 # Make everything
@ -170,14 +152,9 @@ install:	$(TARGETS)
 	$(INSTALL) -c -m 644 pdfio.pc $(BUILDROOT)$(libdir)/pkgconfig
 	echo Installing documentation to $(BUILDROOT)$(datadir)/doc/pdfio...
 	$(INSTALL) -d -m 755 $(BUILDROOT)$(datadir)/doc/pdfio
-	for file in $(DOCFILES); do \
+	for file in doc/pdfio.html doc/pdfio-512.png LICENSE NOTICE; do \
 		$(INSTALL) -c -m 644  $$file $(BUILDROOT)$(datadir)/doc/pdfio; \
 	done
-	echo Installing examples to $(BUILDROOT)$(datadir)/doc/pdfio/examples...
-	$(INSTALL) -d -m 755 $(BUILDROOT)$(datadir)/doc/pdfio/examples
-	for file in $(EXAMPLES); do \
-		$(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio/examples; \
-	done
 	echo Installing man page to $(BUILDROOT)$(mandir)/man3...
 	$(INSTALL) -d -m 755 $(BUILDROOT)$(mandir)/man3
 	$(INSTALL) -c -m 644 doc/pdfio.3 $(BUILDROOT)$(mandir)/man3
@ -224,6 +201,12 @@ pdfio1.def: $(LIBOBJS) Makefile
 		grep -v '^_ttf' | sed -e '1,$$s/^_//' | sort >>$@


+# pdfio text extraction (demo, doesn't handle a lot of things yet)
+pdfiototext:		pdfiototext.o libpdfio.a
+	echo Linking $@...
+	$(CC) $(LDFLAGS) -o $@ pdfiototext.o libpdfio.a $(LIBS)
+
+
 # pdfio test program
 testpdfio:		testpdfio.o libpdfio.a
 	echo Linking $@...
--- a/doc/pdfio.3
+++ b/doc/pdfio.3
@ -1081,43 +1081,7 @@ The pdfioinfo.c example program opens a PDF file and prints the title, author, c
      return (0);
    }
 .fi
-.SS Extract Text from PDF File
-.PP
-The pdf2text.c example code extracts non\-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as appears on the page.
-.PP
-The pdfioStreamGetToken function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as\-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:
-.nf
-
-    pdfio_stream_t *st;              // Page stream
-    bool           first = true;     // First string on line?
-    char           buffer[1024];     // Token buffer
-    
-    // Read PDF tokens from the page stream...
-    while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
-    {
-      if (buffer[0] == '(')
-      {
-        // Text string using an 8\-bit encoding
-        if (first)
-          first = false;
-        else if (buffer[1] != ' ')
-          putchar(' ');
-    
-        fputs(buffer + 1, stdout);
-      }
-      else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
-               !strcmp(buffer, "\\'") || !strcmp(buffer, "\\""))
-      {
-        // Text operators that advance to the next line in the block
-        putchar('\\n');
-        first = true;
-      }
-    }
-    
-    if (!first)
-      putchar('\\n');
-.fi
-.SS Create a PDF File With Text and an Image
+.SS Create PDF File With Text and Image
 .PP
 The image2pdf.c example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. The create_pdf_image_file function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below:
 .nf
@ -2074,7 +2038,7 @@ We then loops through the fragments for the current line, drawing checkboxes, im
        char  targetlink[129];        // Targeted link
    
        targetlink[0] = '#';
-        make_target_name(targetlink + 1, frag\->text, sNzeof(targetlink) \- 1);
+        make_target_name(targetlink + 1, frag\->text, sizeof(targetlink) \- 1);
    
        l\->url = pdfioStringCreate(dd\->pdf, targetlink);
      }
@ -2135,7 +2099,7 @@ Then it formats each cell using the format_block function described previously.
    
    for (col = 0; col < num_cols; col ++)
    {
-      dd\->y = row_y;
+      dd\->y = row_y;
    
      format_block(dd, row\->cells[col], deffont, SIZE_TABLE, cols[col].left,
                   cols[col].right, /*leader*/NULL);
--- a/doc/pdfio.html
+++ b/doc/pdfio.html
@ -276,8 +276,7 @@ span.string {
 </ul></li>
 <li><a href="#examples">Examples</a><ul class="subcontents">
 <li><a href="#read-pdf-metadata">Read PDF Metadata</a></li>
-<li><a href="#extract-text-from-pdf-file">Extract Text from PDF File</a></li>
-<li><a href="#create-a-pdf-file-with-text-and-an-image">Create a PDF File With Text and an Image</a></li>
+<li><a href="#create-pdf-file-with-text-and-image">Create PDF File With Text and Image</a></li>
 <li><a href="#generate-a-code-128-barcode">Generate a Code 128 Barcode</a></li>
 <li><a href="#convert-markdown-to-pdf">Convert Markdown to PDF</a></li>
 </ul></li>
@ -1198,39 +1197,7 @@ main(<span class="reserved">int</span>  argc,                         <span clas
  <span class="reserved">return</span> (<span class="number">0</span>);
 }
 </code></pre>
-<h3 class="title" id="extract-text-from-pdf-file">Extract Text from PDF File</h3>
-<p>The <code>pdf2text.c</code> example code extracts non-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as appears on the page.</p>
-<p>The <a href="#pdfioStreamGetToken"><code>pdfioStreamGetToken</code></a> function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:</p>
-<pre><code class="language-c">pdfio_stream_t *st;              <span class="comment">// Page stream</span>
-<span class="reserved">bool</span>           first = <span class="reserved">true</span>;     <span class="comment">// First string on line?</span>
-<span class="reserved">char</span>           buffer[<span class="number">1024</span>];     <span class="comment">// Token buffer</span>
-
-<span class="comment">// Read PDF tokens from the page stream...</span>
-<span class="reserved">while</span> (pdfioStreamGetToken(st, buffer, <span class="reserved">sizeof</span>(buffer)))
-{
-  <span class="reserved">if</span> (buffer[<span class="number">0</span>] == <span class="string">'('</span>)
-  {
-    <span class="comment">// Text string using an 8-bit encoding</span>
-    <span class="reserved">if</span> (first)
-      first = <span class="reserved">false</span>;
-    <span class="reserved">else</span> <span class="reserved">if</span> (buffer[<span class="number">1</span>] != <span class="string">' '</span>)
-      putchar(<span class="string">' '</span>);
-
-    fputs(buffer + <span class="number">1</span>, stdout);
-  }
-  <span class="reserved">else</span> <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;Td&quot;</span>) || !strcmp(buffer, <span class="string">&quot;TD&quot;</span>) || !strcmp(buffer, <span class="string">&quot;T*&quot;</span>) ||
-           !strcmp(buffer, <span class="string">&quot;\'&quot;</span>) || !strcmp(buffer, <span class="string">&quot;\&quot;&quot;</span>))
-  {
-    <span class="comment">// Text operators that advance to the next line in the block</span>
-    putchar(<span class="string">'\n'</span>);
-    first = <span class="reserved">true</span>;
-  }
-}
-
-<span class="reserved">if</span> (!first)
-  putchar(<span class="string">'\n'</span>);
-</code></pre>
-<h3 class="title" id="create-a-pdf-file-with-text-and-an-image">Create a PDF File With Text and an Image</h3>
+<h3 class="title" id="create-pdf-file-with-text-and-image">Create PDF File With Text and Image</h3>
 <p>The <code>image2pdf.c</code> example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. The <code>create_pdf_image_file</code> function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below:</p>
 <pre><code class="language-c"><span class="directive">#include &lt;pdfio.h&gt;</span>
 <span class="directive">#include &lt;pdfio-content.h&gt;</span>
@ -2033,7 +2000,7 @@ dd-&gt;y -= margin_top + lineheight;
    <span class="reserved">char</span>  targetlink[<span class="number">129</span>];        <span class="comment">// Targeted link</span>

    targetlink[<span class="number">0</span>] = <span class="string">'#'</span>;
-    make_target_name(targetlink + <span class="number">1</span>, frag-&gt;text, s¾zeof(targetlink) - <span class="number">1</span>);
+    make_target_name(targetlink + <span class="number">1</span>, frag-&gt;text, <span class="reserved">sizeof</span>(targetlink) - <span class="number">1</span>);

    l-&gt;url = pdfioStringCreate(dd-&gt;pdf, targetlink);
  }
@ -2086,7 +2053,7 @@ dd-&gt;y -= margin_top + lineheight;

 <span class="reserved">for</span> (col = <span class="number">0</span>; col &lt; num_cols; col ++)
 {
-  dd-&gt;y = row_y;
+  dd-&gt;y = row_y;

  format_block(dd, row-&gt;cells[col], deffont, SIZE_TABLE, cols[col].left,
               cols[col].right, <span class="comment">/*leader*/</span>NULL);
--- a/doc/pdfio.md
+++ b/doc/pdfio.md
@ -922,56 +922,8 @@ main(int  argc,                         // I - Number of command-line arguments
 ```


-Extract Text from PDF File
--------------------------
-
-The `pdf2text.c` example code extracts non-Unicode text from a PDF file by
-scanning each page for strings and text drawing commands.  Since it doesn't
-look at the font encoding or support Unicode text, it is really only useful to
-extract plain ASCII text from a PDF file.  And since it writes text in the order
-it appears in the page stream, it may not come out in the same order as appears
-on the page.
-
-The [`pdfioStreamGetToken`](@@) function is used to read individual tokens from
-the page streams.  Tokens starting with the open parenthesis are text strings,
-while PDF operators are left as-is.  We use some simple logic to make sure that
-we include spaces between text strings and add newlines for the text operators
-that start a new line in a text block:
-
-```c
-pdfio_stream_t *st;              // Page stream
-bool           first = true;     // First string on line?
-char           buffer[1024];     // Token buffer
-
-// Read PDF tokens from the page stream...
-while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
-{
-  if (buffer[0] == '(')
-  {
-    // Text string using an 8-bit encoding
-    if (first)
-      first = false;
-    else if (buffer[1] != ' ')
-      putchar(' ');
-
-    fputs(buffer + 1, stdout);
-  }
-  else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
-           !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
-  {
-    // Text operators that advance to the next line in the block
-    putchar('\n');
-    first = true;
-  }
-}
-
-if (!first)
-  putchar('\n');
-```
-
-
-Create a PDF File With Text and an Image
----------------------------------------
+Create PDF File With Text and Image
+-----------------------------------

 The `image2pdf.c` example code creates a PDF file containing a JPEG or PNG
 image file and optional caption on a single page.  The `create_pdf_image_file`
--- a/examples/Makefile
+++ b/examples/Makefile
@ -23,7 +23,6 @@ TARGETS		=	\
 			code128 \
 			image2pdf \
 			md2pdf \
-			pdf2text \
 			pdfioinfo


@ -51,11 +50,6 @@ md2pdf:		md2pdf.c mmd.c mmd.h
 	$(CC) $(CFLAGS) -o $@ md2pdf.c mmd.c $(LIBS)


-# pdfio text extraction (demo, doesn't handle a lot of things yet)
-pdf2text:	pdf2text.c
-	$(CC) $(CFLAGS) -o $@ pdf2text.c $(LIBS)
-
-
 # pdfioinfo
 pdfioinfo:	pdfioinfo.c
 	$(CC) $(CFLAGS) -o $@ pdfioinfo.c $(LIBS)
--- a/pdfio_native.nuspec
+++ b/pdfio_native.nuspec
@ -3,7 +3,7 @@
    <metadata>
        <id>pdfio_native</id>
        <title>PDFio Library for VS2019+</title>
-        <version>1.4.0</version>
+        <version>1.3.2</version>
        <authors>Michael R Sweet</authors>
        <owners>michaelrsweet</owners>
        <projectUrl>https://github.com/michaelrsweet/pappl</projectUrl>
@ -16,7 +16,7 @@
        <copyright>Copyright © 2019-2024 by Michael R Sweet</copyright>
        <tags>pdf file native</tags>
 	<dependencies>
-	    <dependency id="pdfio_native.redist" version="1.4.0" />
+	    <dependency id="pdfio_native.redist" version="1.3.2" />
 	    <dependency id="zlib_native.redist" version="1.2.11" />
 	</dependencies>
    </metadata>
--- a/pdfio_native.redist.nuspec
+++ b/pdfio_native.redist.nuspec
@ -3,7 +3,7 @@
    <metadata>
        <id>pdfio_native.redist</id>
        <title>PDFio Library for VS2019+</title>
-        <version>1.4.0</version>
+        <version>1.3.2</version>
        <authors>Michael R Sweet</authors>
        <owners>michaelrsweet</owners>
        <projectUrl>https://github.com/michaelrsweet/pappl</projectUrl>
--- a/examples/pdf2text.c
+++ b/examples/pdf2text.c
@ -1,17 +1,17 @@
 //
 // PDF to text program for PDFio.
 //
-// Copyright © 2022-2024 by Michael R Sweet.
+// Copyright © 2022 by Michael R Sweet.
 //
 // Licensed under Apache License v2.0.  See the file "LICENSE" for more
 // information.
 //
 // Usage:
 //
-//   ./pdf2text FILENAME.pdf > FILENAME.txt
+//   ./pdfiototext FILENAME.pdf > FILENAME.txt
 //

-#include <pdfio.h>
+#include "pdfio.h"
 #include <string.h>


@ -36,14 +36,16 @@ main(int  argc,				// I - Number of command-line arguments
  // Verify command-line arguments...
  if (argc != 2)
  {
-    puts("Usage: pdf2text FILENAME.pdf > FILENAME.txt");
+    puts("Usage: pdfiototext FILENAME.pdf > FILENAME.txt");
    return (1);
  }

  // Open the PDF file...
-  if ((file = pdfioFileOpen(argv[1], /*password_cb*/NULL, /*password_data*/NULL, /*error_cb*/NULL, /*error_data*/NULL)) == NULL)
+  if ((file = pdfioFileOpen(argv[1], NULL, NULL, NULL, NULL)) == NULL)
    return (1);

+//  printf("%s: %u pages\n", argv[1], (unsigned)pdfioFileGetNumPages(file));
+
  // Try grabbing content from all of the pages...
  for (i = 0, num_pages = pdfioFileGetNumPages(file); i < num_pages; i ++)
  {
@ -52,28 +54,29 @@ main(int  argc,				// I - Number of command-line arguments

    num_streams = pdfioPageGetNumStreams(obj);

+//    printf("%s: page%u=%p, num_streams=%u\n", argv[1], (unsigned)i, obj, (unsigned)num_streams);
+
    for (j = 0; j < num_streams; j ++)
    {
      if ((st = pdfioPageOpenStream(obj, j, true)) == NULL)
 	continue;

-      // Read PDF tokens from the page stream...
+//      printf("%s: page%u st%u=%p\n", argv[1], (unsigned)i, (unsigned)j, st);
+
      first = true;
      while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
      {
 	if (buffer[0] == '(')
 	{
-          // Text string using an 8-bit encoding
 	  if (first)
 	    first = false;
-	  else if (buffer[1] != ' ')
+	  else
 	    putchar(' ');

 	  fputs(buffer + 1, stdout);
 	}
 	else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") || !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
 	{
-	  // Text operators that advance to the next line in the block
 	  putchar('\n');
 	  first = true;
 	}