From f7f2969e3aacd1c3f92ec453dd49e2990d6d1e24 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Tue, 1 Mar 2022 09:18:56 -0500 Subject: [PATCH] Fix pdfioStreamGetToken implementation (wasn't flushing input), update pdfiototext code to better handle different text operators that affect the location of the text. --- CHANGES.md | 3 ++- pdfio-stream.c | 8 ++++++-- pdfiototext.c | 17 +++++++++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 21fa4af..de9231c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,7 +6,8 @@ v1.0.1 (Month DD, YYYY) ----------------------- - Added missing `pdfioPageGetNumStreams` and `pdfioPageOpenStream` functions. -- Added pdfiototext demo utility. +- Added demo pdfiototext utility. +- Fixed bug in `pdfioStreamGetToken`. v1.0.0 (December 14, 2021) diff --git a/pdfio-stream.c b/pdfio-stream.c index 59c25fc..7e35652 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -1,7 +1,7 @@ // // PDF stream functions for PDFio. // -// Copyright © 2021 by Michael R Sweet. +// Copyright © 2021-2022 by Michael R Sweet. // // Licensed under Apache License v2.0. See the file "LICENSE" for more // information. @@ -372,6 +372,7 @@ pdfioStreamGetToken( size_t bufsize) // I - Size of string buffer { _pdfio_token_t tb; // Token buffer/stack + bool ret; // Return value // Range check input... @@ -381,7 +382,10 @@ pdfioStreamGetToken( // Read using the token engine... 
_pdfioTokenInit(&tb, st->pdf, (_pdfio_tconsume_cb_t)pdfioStreamConsume, (_pdfio_tpeek_cb_t)pdfioStreamPeek, st); - return (_pdfioTokenRead(&tb, buffer, bufsize)); + ret = _pdfioTokenRead(&tb, buffer, bufsize); + _pdfioTokenFlush(&tb); + + return (ret); } diff --git a/pdfiototext.c b/pdfiototext.c index 13c9d4d..d791739 100644 --- a/pdfiototext.c +++ b/pdfiototext.c @@ -12,6 +12,7 @@ // #include "pdfio.h" +#include <string.h> // @@ -43,7 +44,7 @@ main(int argc, // I - Number of command-line arguments if ((file = pdfioFileOpen(argv[1], NULL, NULL, NULL, NULL)) == NULL) return (1); - printf("%s: %u pages\n", argv[1], (unsigned)pdfioFileGetNumPages(file)); +// printf("%s: %u pages\n", argv[1], (unsigned)pdfioFileGetNumPages(file)); // Try grabbing content from all of the pages... for (i = 0, num_pages = pdfioFileGetNumPages(file); i < num_pages; i ++) @@ -53,14 +54,14 @@ main(int argc, // I - Number of command-line arguments num_streams = pdfioPageGetNumStreams(obj); - printf("%s: page%u=%p, num_streams=%u\n", argv[1], (unsigned)i, obj, (unsigned)num_streams); +// printf("%s: page%u=%p, num_streams=%u\n", argv[1], (unsigned)i, obj, (unsigned)num_streams); for (j = 0; j < num_streams; j ++) { if ((st = pdfioPageOpenStream(obj, j, true)) == NULL) continue; - printf("%s: page%u st%u=%p\n", argv[1], (unsigned)i, (unsigned)j, st); +// printf("%s: page%u st%u=%p\n", argv[1], (unsigned)i, (unsigned)j, st); first = true; while (pdfioStreamGetToken(st, buffer, sizeof(buffer))) @@ -68,13 +69,17 @@ main(int argc, // I - Number of command-line arguments if (buffer[0] == '(') { if (first) - { - putchar(' '); first = false; - } + else + putchar(' '); fputs(buffer + 1, stdout); } + else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") || !strcmp(buffer, "\'") || !strcmp(buffer, "\"")) + { + putchar('\n'); + first = true; + } } if (!first)