From fd8427d68a5a6b96f8a3d79a189b80b99425396a Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Sun, 22 Dec 2024 21:29:32 -0500 Subject: [PATCH] Add pdf2text example docos, install examples to doc directory. --- Makefile.in | 29 +++++++++++++++++++++++-- doc/pdfio.3 | 42 +++++++++++++++++++++++++++++++++--- doc/pdfio.html | 41 +++++++++++++++++++++++++++++++---- doc/pdfio.md | 52 +++++++++++++++++++++++++++++++++++++++++++-- examples/pdf2text.c | 3 +++ 5 files changed, 156 insertions(+), 11 deletions(-) diff --git a/Makefile.in b/Makefile.in index 6f49c84..52f3698 100644 --- a/Makefile.in +++ b/Makefile.in @@ -110,6 +110,26 @@ TARGETS = \ $(LIBPDFIO_STATIC) \ testpdfio \ testttf +DOCFILES = \ + doc/pdfio.html \ + doc/pdfio-512.png \ + LICENSE \ + NOTICE +EXAMPLES = \ + examples/Makefile \ + examples/Roboto-Bold.ttf \ + examples/Roboto-Italic.ttf \ + examples/Roboto-Regular.ttf \ + examples/RobotoMono-Regular.ttf \ + examples/code128.c \ + examples/code128.ttf \ + examples/image2pdf.c \ + examples/md2pdf.c \ + examples/md2pdf.md \ + examples/mmd.c \ + examples/mmd.h \ + examples/pdf2text.c \ + examples/pdfioinfo.c # Make everything @@ -150,8 +170,13 @@ install: $(TARGETS) $(INSTALL) -c -m 644 pdfio.pc $(BUILDROOT)$(libdir)/pkgconfig echo Installing documentation to $(BUILDROOT)$(datadir)/doc/pdfio... $(INSTALL) -d -m 755 $(BUILDROOT)$(datadir)/doc/pdfio - for file in doc/pdfio.html doc/pdfio-512.png LICENSE NOTICE; do \ - $(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio; \ + for file in $(DOCFILES); do \ + $(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio; \ + done + echo Installing examples to $(BUILDROOT)$(datadir)/doc/pdfio/examples... + $(INSTALL) -d -m 755 $(BUILDROOT)$(datadir)/doc/pdfio/examples + for file in $(EXAMPLES); do \ + $(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio/examples; \ done echo Installing man page to $(BUILDROOT)$(mandir)/man3... 
$(INSTALL) -d -m 755 $(BUILDROOT)$(mandir)/man3 diff --git a/doc/pdfio.3 b/doc/pdfio.3 index 064ff35..defe5f4 100644 --- a/doc/pdfio.3 +++ b/doc/pdfio.3 @@ -1081,7 +1081,43 @@ The pdfioinfo.c example program opens a PDF file and prints the title, author, c return (0); } .fi -.SS Create PDF File With Text and Image +.SS Extract Text from PDF File +.PP +The pdf2text.c example code extracts non\-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as appears on the page. +.PP +The pdfioStreamGetToken function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as\-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block: +.nf + + pdfio_stream_t *st; // Page stream + bool first = true; // First string on line? + char buffer[1024]; // Token buffer + + // Read PDF tokens from the page stream... + while (pdfioStreamGetToken(st, buffer, sizeof(buffer))) + { + if (buffer[0] == '(') + { + // Text string using an 8\-bit encoding + if (first) + first = false; + else if (buffer[1] != ' ') + putchar(' '); + + fputs(buffer + 1, stdout); + } + else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") || + !strcmp(buffer, "\\'") || !strcmp(buffer, "\\"")) + { + // Text operators that advance to the next line in the block + putchar('\\n'); + first = true; + } + } + + if (!first) + putchar('\\n'); +.fi +.SS Create a PDF File With Text and an Image .PP The image2pdf.c example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. 
The create_pdf_image_file function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below: .nf @@ -2038,7 +2074,7 @@ We then loops through the fragments for the current line, drawing checkboxes, im char targetlink[129]; // Targeted link targetlink[0] = '#'; - make_target_name(targetlink + 1, frag\->text, sizeof(targetlink) \- 1); + make_target_name(targetlink + 1, frag\->text, sizeof(targetlink) \- 1); l\->url = pdfioStringCreate(dd\->pdf, targetlink); } @@ -2099,7 +2135,7 @@ Then it formats each cell using the format_block function described previously. for (col = 0; col < num_cols; col ++) { - dd|>y = row_y; + dd\->y = row_y; format_block(dd, row\->cells[col], deffont, SIZE_TABLE, cols[col].left, cols[col].right, /*leader*/NULL); diff --git a/doc/pdfio.html b/doc/pdfio.html index 8a1bf10..9427a95 100644 --- a/doc/pdfio.html +++ b/doc/pdfio.html @@ -276,7 +276,8 @@ span.string {
  • Examples
  • @@ -1197,7 +1198,39 @@ main(int argc, return (0); } -

    Create PDF File With Text and Image

    +

    Extract Text from PDF File

    +

    The pdf2text.c example code extracts non-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as it appears on the page.

    +

    The pdfioStreamGetToken function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:

    +
    pdfio_stream_t *st;              // Page stream
    +bool           first = true;     // First string on line?
    +char           buffer[1024];     // Token buffer
    +
    +// Read PDF tokens from the page stream...
    +while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
    +{
    +  if (buffer[0] == '(')
    +  {
    +    // Text string using an 8-bit encoding
    +    if (first)
    +      first = false;
    +    else if (buffer[1] != ' ')
    +      putchar(' ');
    +
    +    fputs(buffer + 1, stdout);
    +  }
    +  else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
    +           !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
    +  {
    +    // Text operators that advance to the next line in the block
    +    putchar('\n');
    +    first = true;
    +  }
    +}
    +
    +if (!first)
    +  putchar('\n');
    +
    +

    Create a PDF File With Text and an Image

    The image2pdf.c example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. The create_pdf_image_file function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below:

    #include <pdfio.h>
     #include <pdfio-content.h>
    @@ -2000,7 +2033,7 @@ dd->y -= margin_top + lineheight;
         char  targetlink[129];        // Targeted link
     
         targetlink[0] = '#';
    -    make_target_name(targetlink + 1, frag->text, sizeof(targetlink) - 1);
    +    make_target_name(targetlink + 1, frag->text, sizeof(targetlink) - 1);
     
         l->url = pdfioStringCreate(dd->pdf, targetlink);
       }
    @@ -2053,7 +2086,7 @@ dd->y -= margin_top + lineheight;
     
     for (col = 0; col < num_cols; col ++)
     {
    -  ddì>y = row_y;
    +  dd->y = row_y;
     
       format_block(dd, row->cells[col], deffont, SIZE_TABLE, cols[col].left,
                    cols[col].right, /*leader*/NULL);
    diff --git a/doc/pdfio.md b/doc/pdfio.md
    index aad6da6..6e75a6d 100644
    --- a/doc/pdfio.md
    +++ b/doc/pdfio.md
    @@ -922,8 +922,56 @@ main(int  argc,                         // I - Number of command-line arguments
     ```
     
     
    -Create PDF File With Text and Image
    ------------------------------------
    +Extract Text from PDF File
    +--------------------------
    +
    +The `pdf2text.c` example code extracts non-Unicode text from a PDF file by
    +scanning each page for strings and text drawing commands.  Since it doesn't
    +look at the font encoding or support Unicode text, it is really only useful to
    +extract plain ASCII text from a PDF file.  And since it writes text in the order
    +it appears in the page stream, it may not come out in the same order as it
    +appears on the page.
    +
    +The [`pdfioStreamGetToken`](@@) function is used to read individual tokens from
    +the page streams.  Tokens starting with the open parenthesis are text strings,
    +while PDF operators are left as-is.  We use some simple logic to make sure that
    +we include spaces between text strings and add newlines for the text operators
    +that start a new line in a text block:
    +
    +```c
    +pdfio_stream_t *st;              // Page stream
    +bool           first = true;     // First string on line?
    +char           buffer[1024];     // Token buffer
    +
    +// Read PDF tokens from the page stream...
    +while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
    +{
    +  if (buffer[0] == '(')
    +  {
    +    // Text string using an 8-bit encoding
    +    if (first)
    +      first = false;
    +    else if (buffer[1] != ' ')
    +      putchar(' ');
    +
    +    fputs(buffer + 1, stdout);
    +  }
    +  else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
    +           !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
    +  {
    +    // Text operators that advance to the next line in the block
    +    putchar('\n');
    +    first = true;
    +  }
    +}
    +
    +if (!first)
    +  putchar('\n');
    +```
    +
    +
    +Create a PDF File With Text and an Image
    +----------------------------------------
     
     The `image2pdf.c` example code creates a PDF file containing a JPEG or PNG
     image file and optional caption on a single page.  The `create_pdf_image_file`
    diff --git a/examples/pdf2text.c b/examples/pdf2text.c
    index 4bad097..c1a8f40 100644
    --- a/examples/pdf2text.c
    +++ b/examples/pdf2text.c
    @@ -57,11 +57,13 @@ main(int  argc,				// I - Number of command-line arguments
           if ((st = pdfioPageOpenStream(obj, j, true)) == NULL)
     	continue;
     
    +      // Read PDF tokens from the page stream...
           first = true;
           while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
           {
     	if (buffer[0] == '(')
     	{
    +          // Text string using an 8-bit encoding
     	  if (first)
     	    first = false;
     	  else if (buffer[1] != ' ')
    @@ -71,6 +73,7 @@ main(int  argc,				// I - Number of command-line arguments
     	}
     	else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") || !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
     	{
    +	  // Text operators that advance to the next line in the block
     	  putchar('\n');
     	  first = true;
     	}