Add missing pdfioPageGetNumStreams and pdfioPageOpenStream functions.

Add initial version of pdfiototext text extraction utility.
This commit is contained in:
Michael R Sweet 2022-02-28 15:00:25 -05:00
parent fa20982e5d
commit 93a3fcea6c
No known key found for this signature in database
GPG Key ID: 999559A027815955
5 changed files with 187 additions and 4 deletions

1
.gitignore vendored
View File

@ -8,6 +8,7 @@
/doc/pdfio.epub
/packages
/pdfio.xcodeproj/xcshareddata
/pdfiototext
/testpdfio
/testpdfio-*.pdf
/x64

View File

@ -2,6 +2,13 @@ Changes in PDFio
================
v1.0.1 (Month DD, YYYY)
-----------------------
- Added missing `pdfioPageGetNumStreams` and `pdfioPageOpenStream` functions.
- Added pdfiototext demo utility.
v1.0.0 (December 14, 2021)
--------------------------

View File

@ -1,7 +1,7 @@
#
# Makefile for PDFio.
#
# Copyright © 2021 by Michael R Sweet.
# Copyright © 2021-2022 by Michael R Sweet.
#
# Licensed under Apache License v2.0. See the file "LICENSE" for more
# information.
@ -26,7 +26,7 @@ DSONAME =
LDFLAGS =
LIBS = -lm -lz
RANLIB = ranlib
VERSION = 1.0.0
VERSION = 1.0.1
prefix = /usr/local
@ -62,10 +62,12 @@ LIBOBJS = \
ttf.o
OBJS = \
$(LIBOBJS) \
pdfiototext.o \
testpdfio.o
TARGETS = \
$(DSONAME) \
libpdfio.a \
pdfiototext \
testpdfio
@ -154,6 +156,11 @@ pdfio1.def: $(LIBOBJS) Makefile
grep -v '^_ttf' | sed -e '1,$$s/^_//' | sort >>$@
# pdfio text extraction demo
pdfiototext: pdfiototext.o libpdfio.a
$(CC) $(LDFLAGS) $(COMMONFLAGS) -o $@ pdfiototext.o libpdfio.a $(LIBS)
# pdfio test program
testpdfio: testpdfio.o libpdfio.a
$(CC) $(LDFLAGS) $(COMMONFLAGS) -o $@ testpdfio.o libpdfio.a $(LIBS)
@ -167,7 +174,7 @@ ttf.o: ttf.h
# Make documentation using Codedoc <https://www.msweet.org/codedoc>
DOCFLAGS = \
--author "Michael R Sweet" \
--copyright "Copyright (c) 2021 by Michael R Sweet" \
--copyright "Copyright (c) 2021-2022 by Michael R Sweet" \
--docversion $(VERSION)
.PHONY: doc

View File

@ -1,7 +1,7 @@
//
// PDF page functions for PDFio.
//
// Copyright © 2021 by Michael R Sweet.
// Copyright © 2021-2022 by Michael R Sweet.
//
// Licensed under Apache License v2.0. See the file "LICENSE" for more
// information.
@ -14,6 +14,13 @@
#include "pdfio-private.h"
//
// Local functions...
//
static _pdfio_value_t *get_contents(pdfio_obj_t *page);
//
// 'pdfioPageCopy()' - Copy a page to a PDF file.
//
@ -47,3 +54,74 @@ pdfioPageCopy(pdfio_file_t *pdf, // I - PDF file
else
return (_pdfioFileAddPage(pdf, dstpage));
}
//
// 'pdfioPageGetNumStreams()' - Get the number of content streams for a page object.
//
size_t // O - Number of streams
pdfioPageGetNumStreams(
pdfio_obj_t *page) // I - Page object
{
_pdfio_value_t *contents = get_contents(page);
// Contents value
if (!contents)
return (0);
else if (contents->type == PDFIO_VALTYPE_ARRAY)
return (pdfioArrayGetSize(contents->value.array));
else
return (1);
}
//
// 'pdfioPageOpenStream()' - Open a content stream for a page.
//
pdfio_stream_t * // O - Stream
pdfioPageOpenStream(
pdfio_obj_t *page, // I - Page object
size_t n, // I - Stream index (0-based)
bool decode) // I - `true` to decode/decompress stream
{
_pdfio_value_t *contents = get_contents(page);
// Contents value
if (!contents)
return (NULL);
else if (contents->type == PDFIO_VALTYPE_ARRAY && n < pdfioArrayGetSize(contents->value.array))
return (pdfioObjOpenStream(pdfioArrayGetObj(contents->value.array, n), decode));
else if (n)
return (NULL);
else
return (pdfioObjOpenStream(pdfioFileFindObj(page->pdf, contents->value.indirect.number), decode));
}
//
// 'get_contents()' - Get a page's Contents value.
//
static _pdfio_value_t * // O - Value or NULL on error
get_contents(pdfio_obj_t *page) // I - Page object
{
// Range check input...
if (!page)
return (NULL);
// Load the page object as needed...
if (page->value.type == PDFIO_VALTYPE_NONE)
{
if (!_pdfioObjLoad(page))
return (NULL);
}
if (page->value.type != PDFIO_VALTYPE_DICT)
return (NULL);
return (_pdfioDictGetValue(page->value.value.dict, "Contents"));
}

90
pdfiototext.c Normal file
View File

@ -0,0 +1,90 @@
//
// PDF to text program for PDFio.
//
// Copyright © 2022 by Michael R Sweet.
//
// Licensed under Apache License v2.0. See the file "LICENSE" for more
// information.
//
// Usage:
//
// ./pdfiototext FILENAME.pdf > FILENAME.txt
//
#include "pdfio.h"
//
// 'main()' - Main entry.
//
int // O - Exit status
main(int argc, // I - Number of command-line arguments
char *argv[]) // I - Command-line arguments
{
pdfio_file_t *file; // PDF file
size_t i, j, // Looping vars
num_pages, // Number of pages
num_streams; // Number of streams for page
pdfio_obj_t *obj; // Current page object
pdfio_stream_t *st; // Current page content stream
char buffer[1024]; // String buffer
bool first; // First string token?
// Verify command-line arguments...
if (argc != 2)
{
puts("Usage: pdfiototext FILENAME.pdf > FILENAME.txt");
return (1);
}
// Open the PDF file...
if ((file = pdfioFileOpen(argv[1], NULL, NULL, NULL, NULL)) == NULL)
return (1);
printf("%s: %u pages\n", argv[1], (unsigned)pdfioFileGetNumPages(file));
// Try grabbing content from all of the pages...
for (i = 0, num_pages = pdfioFileGetNumPages(file); i < num_pages; i ++)
{
if ((obj = pdfioFileGetPage(file, i)) == NULL)
continue;
num_streams = pdfioPageGetNumStreams(obj);
printf("%s: page%u=%p, num_streams=%u\n", argv[1], (unsigned)i, obj, (unsigned)num_streams);
for (j = 0; j < num_streams; j ++)
{
if ((st = pdfioPageOpenStream(obj, j, true)) == NULL)
continue;
printf("%s: page%u st%u=%p\n", argv[1], (unsigned)i, (unsigned)j, st);
first = true;
while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
{
if (buffer[0] == '(')
{
if (first)
{
putchar(' ');
first = false;
}
fputs(buffer + 1, stdout);
}
}
if (!first)
putchar('\n');
pdfioStreamClose(st);
}
}
pdfioFileClose(file);
return (0);
}