From e107b94c8387f4d78b203eb9799a2b673cf13c31 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Sat, 1 May 2021 17:50:52 -0400 Subject: [PATCH] Work on token parsing. --- Makefile | 1 + pdfio-common.c | 91 +++++++++++-- pdfio-dict.c | 25 ++++ pdfio-file.c | 232 +++++++++++++++++++------------- pdfio-private.h | 14 +- pdfio-stream.c | 17 ++- pdfio-token.c | 62 +++++++++ pdfio.h | 2 + pdfio.xcodeproj/project.pbxproj | 4 + 9 files changed, 338 insertions(+), 110 deletions(-) create mode 100644 pdfio-token.c diff --git a/Makefile b/Makefile index 7f3adf3..4082b37 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,7 @@ LIBOBJS = \ pdfio-page.o \ pdfio-stream.o \ pdfio-string.o \ + pdfio-token.o \ pdfio-value.o OBJS = \ $(LIBOBJS) \ diff --git a/pdfio-common.c b/pdfio-common.c index f69049b..9f08037 100644 --- a/pdfio-common.c +++ b/pdfio-common.c @@ -20,7 +20,7 @@ static bool fill_buffer(pdfio_file_t *pdf); static ssize_t read_buffer(pdfio_file_t *pdf, char *buffer, size_t bytes); -static bool write_buffer(pdfio_file_t *pdf, const char *buffer, size_t bytes); +static bool write_buffer(pdfio_file_t *pdf, const void *buffer, size_t bytes); // @@ -105,6 +105,19 @@ _pdfioFileGetChar(pdfio_file_t *pdf) // I - PDF file } +// +// '_pdfioFileGetToken()' - Get a token from a PDF file. +// + +bool // O - `true` on success, `false` on failure +_pdfioFileGetToken(pdfio_file_t *pdf, // I - PDF file + char *buffer,// I - String buffer + size_t bufsize)// I - Size of string buffer +{ + return (_pdfioTokenRead(buffer, bufsize, (_pdfio_token_cb_t)_pdfioFilePeek, (_pdfio_token_cb_t)_pdfioFileRead, pdf)); +} + + // // '_pdfioFileGets()' - Read a line from a PDF file. // @@ -159,6 +172,62 @@ _pdfioFileGets(pdfio_file_t *pdf, // I - PDF file } +// +// '_pdfioFilePeek()' - Peek at upcoming data in a PDF file. 
+// + +ssize_t // O - Number of bytes returned +_pdfioFilePeek(pdfio_file_t *pdf, // I - PDF file + void *buffer, // I - Buffer + size_t bytes) // I - Size of buffer +{ + ssize_t total; // Total bytes available + + + // See how much data is buffered up... + if (pdf->bufptr >= pdf->bufend) + { + // Fill the buffer... + if (!fill_buffer(pdf)) + return (-1); + } + + if ((total = pdf->bufend - pdf->bufptr) < (ssize_t)bytes && total < (ssize_t)(sizeof(pdf->buffer) / 2)) + { + // Not enough data buffered, try reading more... + ssize_t rbytes; // Bytes read + + memmove(pdf->buffer, pdf->bufptr, total); + pdf->bufpos += pdf->bufptr - pdf->buffer; + pdf->bufptr = pdf->buffer; + pdf->bufend = pdf->buffer + total; + + // Read until we have bytes or a non-recoverable error... + while ((rbytes = read(pdf->fd, pdf->bufend, sizeof(pdf->buffer) - (size_t)total)) < 0) + { + if (errno != EINTR && errno != EAGAIN) + break; + } + + if (rbytes > 0) + { + // Expand the buffer... + pdf->bufend += rbytes; + total += rbytes; + } + } + + // Copy anything we have to the buffer... + if (total > (ssize_t)bytes) + total = (ssize_t)bytes; + + if (total > 0) + memcpy(buffer, pdf->bufptr, total); + + return (total); +} + + // // '_pdfioFilePrintf()' - Write a formatted string to a PDF file. // @@ -201,15 +270,17 @@ _pdfioFilePuts(pdfio_file_t *pdf, // I - PDF file ssize_t // O - Number of bytes read or `-1` on error _pdfioFileRead(pdfio_file_t *pdf, // I - PDF file - char *buffer, // I - Read buffer + void *buffer, // I - Read buffer size_t bytes) // I - Number of bytes to read { + char *bufptr = (char *)buffer; + // Pointer into buffer ssize_t total, // Total bytes read rbytes; // Bytes read this time // Loop until we have read all of the requested bytes or hit an error... - for (total = 0; bytes > 0; total += rbytes, bytes -= (size_t)rbytes, buffer += rbytes) + for (total = 0; bytes > 0; total += rbytes, bytes -= (size_t)rbytes, bufptr += rbytes) { // First read from the file buffer... 
if ((rbytes = pdf->bufend - pdf->bufptr) > 0) @@ -217,7 +288,7 @@ _pdfioFileRead(pdfio_file_t *pdf, // I - PDF file if ((size_t)rbytes > bytes) rbytes = (ssize_t)bytes; - memcpy(buffer, pdf->bufptr, rbytes); + memcpy(bufptr, pdf->bufptr, rbytes); pdf->bufptr += rbytes; continue; } @@ -226,7 +297,7 @@ _pdfioFileRead(pdfio_file_t *pdf, // I - PDF file if (bytes > 1024) { // Read directly from the file... - if ((rbytes = read_buffer(pdf, buffer, bytes)) > 0) + if ((rbytes = read_buffer(pdf, bufptr, bytes)) > 0) { pdf->bufpos += rbytes; continue; @@ -320,7 +391,7 @@ _pdfioFileTell(pdfio_file_t *pdf) // I - PDF file bool // O - `true` on success and `false` on error _pdfioFileWrite(pdfio_file_t *pdf, // I - PDF file - const char *buffer, // I - Write buffer + const void *buffer, // I - Write buffer size_t bytes) // I - Bytes to write { // See if the data will fit in the write buffer... @@ -417,16 +488,18 @@ read_buffer(pdfio_file_t *pdf, // I - PDF file static bool // O - `true` on success and `false` on error write_buffer(pdfio_file_t *pdf, // I - PDF file - const char *buffer, // I - Write buffer + const void *buffer, // I - Write buffer size_t bytes) // I - Bytes to write { + const char *bufptr = (const char *)buffer; + // Pointer into buffer ssize_t wbytes; // Bytes written... // Write to the file... while (bytes > 0) { - while ((wbytes = write(pdf->fd, buffer, bytes)) < 0) + while ((wbytes = write(pdf->fd, bufptr, bytes)) < 0) { // Stop if we have an error that shouldn't be retried... if (errno != EINTR && errno != EAGAIN) @@ -440,7 +513,7 @@ write_buffer(pdfio_file_t *pdf, // I - PDF file return (false); } - buffer += wbytes; + bufptr += wbytes; bytes -= (size_t)wbytes; } diff --git a/pdfio-dict.c b/pdfio-dict.c index 30bd31d..28f0f19 100644 --- a/pdfio-dict.c +++ b/pdfio-dict.c @@ -332,6 +332,31 @@ _pdfioDictGetValue(pdfio_dict_t *dict, // I - Dictionary } +// +// '_pdfioDictRead()' - Read a dictionary from a PDF file. 
+// +// At this point we've seen the initial "<<"... +// + +pdfio_dict_t * // O - New dictionary +_pdfioDictRead(pdfio_file_t *pdf) // I - PDF file +{ + pdfio_dict_t *dict; // New dictionary + char token[8192], // Token buffer + key[256]; // Dictionary key + _pdfio_value_t value; // Dictionary value + + + (void)pdf; + (void)dict; + (void)token; + (void)key; + (void)value; + + return (NULL); +} + + // // 'pdfioDictSetArray()' - Set a key array in a dictionary. // diff --git a/pdfio-file.c b/pdfio-file.c index 8f8c5d1..5f1273b 100644 --- a/pdfio-file.c +++ b/pdfio-file.c @@ -22,6 +22,7 @@ // static pdfio_obj_t *add_object(pdfio_file_t *pdf); +static bool load_xref(pdfio_file_t *pdf, off_t xref_offset); static bool write_trailer(pdfio_file_t *pdf); @@ -220,6 +221,17 @@ pdfioFileCreatePage(pdfio_file_t *pdf, // I - PDF file } +// +// 'pdfioFileGetID()' - Get the PDF file's ID strings. +// + +pdfio_array_t * // O - Array with binary strings +pdfioFileGetID(pdfio_file_t *pdf) // I - PDF file +{ + return (pdf ? pdfioDictGetArray(pdf->trailer, "ID") : NULL); +} + + // // 'pdfioFileGetName()' - Get a PDF's filename. // @@ -285,7 +297,7 @@ pdfioFileGetPage(pdfio_file_t *pdf, // I - PDF file // -// '()' - Get the PDF version number for a PDF file. +// 'pdfioFileGetVersion()' - Get the PDF version number for a PDF file. 
// const char * // O - Version number or `NULL` @@ -379,104 +391,9 @@ pdfioFileOpen( xref_offset = (off_t)strtol(ptr + 9, NULL, 10); - if (_pdfioFileSeek(pdf, xref_offset, SEEK_SET) != xref_offset) - { - _pdfioFileError(pdf, "Unable to seek to start of xref table."); + if (!load_xref(pdf, xref_offset)) goto error; - } - if (!_pdfioFileGets(pdf, line, sizeof(line))) - { - _pdfioFileError(pdf, "Unable to read start of xref table."); - goto error; - } - - if (strcmp(line, "xref")) - { - _pdfioFileError(pdf, "Bad xref table header '%s'.", line); - goto error; - } - - // Read the xref tables - while (_pdfioFileGets(pdf, line, sizeof(line))) - { - intmax_t number, // Object number - num_objects; // Number of objects - - if (!strcmp(line, "trailer")) - break; - - if (sscanf(line, "%jd%jd", &number, &num_objects) != 2) - { - _pdfioFileError(pdf, "Malformed xref table section '%s'.", line); - goto error; - } - - // Read this group of objects... - for (; num_objects > 0; num_objects --, number ++) - { - intmax_t offset; // Offset in file - int generation; // Generation number - pdfio_obj_t *obj; // Object - - // Read a line from the file and validate it... 
- if (_pdfioFileRead(pdf, line, 20) != 20) - goto error; - line[20] = '\0'; - - if (strcmp(line + 18, "\r\n") && strcmp(line + 18, " \n") && strcmp(line + 18, " \r")) - { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - goto error; - } - line[18] = '\0'; - - // Parse the line - if ((offset = strtoimax(line, &ptr, 10)) < 0) - { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - goto error; - } - - if ((generation = (int)strtol(ptr, &ptr, 10)) < 0 || generation > 65535) - { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - goto error; - } - - if (*ptr != ' ') - { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - goto error; - } - - ptr ++; - if (*ptr != 'f' && *ptr != 'n') - { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - goto error; - } - - if (*ptr == 'f') - continue; // Don't care about free objects... - - // Create a placeholder for the object in memory... - if ((obj = add_object(pdf)) == NULL) - goto error; - - obj->number = (size_t)number; - obj->generation = (unsigned short)generation; - obj->offset = offset; - } - } - - if (strcmp(line, "trailer")) - { - _pdfioFileError(pdf, "Missing trailer."); - goto error; - } - - // TODO: Read trailer dict... return (pdf); @@ -528,6 +445,127 @@ add_object(pdfio_file_t *pdf) // I - PDF file } +// +// 'load_xref()' - Load an XREF table... +// + +static bool // O - `true` on success, `false` on failure +load_xref(pdfio_file_t *pdf, // I - PDF file + off_t xref_offset) // I - Offset to xref +{ + bool done = false; // Are we done? 
+ char line[1024], // Line from file + *ptr; // Pointer into line + + + while (!done) + { + if (_pdfioFileSeek(pdf, xref_offset, SEEK_SET) != xref_offset) + { + _pdfioFileError(pdf, "Unable to seek to start of xref table."); + return (false); + } + + if (!_pdfioFileGets(pdf, line, sizeof(line))) + { + _pdfioFileError(pdf, "Unable to read start of xref table."); + return (false); + } + + if (strcmp(line, "xref")) + { + _pdfioFileError(pdf, "Bad xref table header '%s'.", line); + return (false); + } + + // Read the xref tables + while (_pdfioFileGets(pdf, line, sizeof(line))) + { + intmax_t number, // Object number + num_objects; // Number of objects + + if (!strcmp(line, "trailer")) + break; + + if (sscanf(line, "%jd%jd", &number, &num_objects) != 2) + { + _pdfioFileError(pdf, "Malformed xref table section '%s'.", line); + return (false); + } + + // Read this group of objects... + for (; num_objects > 0; num_objects --, number ++) + { + intmax_t offset; // Offset in file + int generation; // Generation number + pdfio_obj_t *obj; // Object + + // Read a line from the file and validate it... 
+ if (_pdfioFileRead(pdf, line, 20) != 20) + return (false); + + line[20] = '\0'; + + if (strcmp(line + 18, "\r\n") && strcmp(line + 18, " \n") && strcmp(line + 18, " \r")) + { + _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); + return (false); + } + line[18] = '\0'; + + // Parse the line + if ((offset = strtoimax(line, &ptr, 10)) < 0) + { + _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); + return (false); + } + + if ((generation = (int)strtol(ptr, &ptr, 10)) < 0 || generation > 65535) + { + _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); + return (false); + } + + if (*ptr != ' ') + { + _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); + return (false); + } + + ptr ++; + if (*ptr != 'f' && *ptr != 'n') + { + _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); + return (false); + } + + if (*ptr == 'f') + continue; // Don't care about free objects... + + // Create a placeholder for the object in memory... + if ((obj = add_object(pdf)) == NULL) + return (false); + + obj->number = (size_t)number; + obj->generation = (unsigned short)generation; + obj->offset = offset; + } + } + + if (strcmp(line, "trailer")) + { + _pdfioFileError(pdf, "Missing trailer."); + return (false); + } + + // TODO: Read trailer dict... + done = true; + } + + return (true); +} + + // // 'write_trailer()' - Write the PDF catalog object, xref table, and trailer. 
// diff --git a/pdfio-private.h b/pdfio-private.h index c0d228f..74e18b8 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -120,6 +120,9 @@ struct _pdfio_file_s // PDF file structure *bufptr, // Pointer into buffer *bufend; // End of buffer off_t bufpos; // Position in file for start of buffer + pdfio_dict_t *trailer; // Trailer dictionary + pdfio_obj_t *root; // Root object/dictionary + pdfio_obj_t *info; // Information object/dictionary // Allocated data elements size_t num_arrays, // Number of arrays @@ -162,6 +165,8 @@ struct _pdfio_stream_s // Stream z_stream flate; // Flate filter state }; +typedef ssize_t (*_pdfio_token_cb_t)(void *data, void *buffer, size_t bufsize); + // // Functions... @@ -173,6 +178,7 @@ extern bool _pdfioArrayWrite(pdfio_array_t *a) PDFIO_INTERNAL; extern void _pdfioDictDelete(pdfio_dict_t *dict) PDFIO_INTERNAL; extern _pdfio_value_t *_pdfioDictGetValue(pdfio_dict_t *dict, const char *key) PDFIO_INTERNAL; +extern pdfio_dict_t *_pdfioDictRead(pdfio_file_t *pdf) PDFIO_INTERNAL; extern bool _pdfioDictSetValue(pdfio_dict_t *dict, const char *key, _pdfio_value_t *value) PDFIO_INTERNAL; extern bool _pdfioDictWrite(pdfio_dict_t *dict, off_t *length) PDFIO_INTERNAL; @@ -180,13 +186,15 @@ extern bool _pdfioFileDefaultError(pdfio_file_t *pdf, const char *message, void extern bool _pdfioFileError(pdfio_file_t *pdf, const char *format, ...) PDFIO_FORMAT(2,3) PDFIO_INTERNAL; extern bool _pdfioFileFlush(pdfio_file_t *pdf) PDFIO_INTERNAL; extern int _pdfioFileGetChar(pdfio_file_t *pdf) PDFIO_INTERNAL; +extern bool _pdfioFileGetToken(pdfio_file_t *pdf, char *buffer, size_t bufsize) PDFIO_INTERNAL; extern bool _pdfioFileGets(pdfio_file_t *pdf, char *buffer, size_t bufsize) PDFIO_INTERNAL; +extern ssize_t _pdfioFilePeek(pdfio_file_t *pdf, void *buffer, size_t bytes) PDFIO_INTERNAL; extern bool _pdfioFilePrintf(pdfio_file_t *pdf, const char *format, ...) 
PDFIO_FORMAT(2,3) PDFIO_INTERNAL; extern bool _pdfioFilePuts(pdfio_file_t *pdf, const char *s) PDFIO_INTERNAL; -extern ssize_t _pdfioFileRead(pdfio_file_t *pdf, char *buffer, size_t bytes) PDFIO_INTERNAL; +extern ssize_t _pdfioFileRead(pdfio_file_t *pdf, void *buffer, size_t bytes) PDFIO_INTERNAL; extern off_t _pdfioFileSeek(pdfio_file_t *pdf, off_t offset, int whence) PDFIO_INTERNAL; extern off_t _pdfioFileTell(pdfio_file_t *pdf) PDFIO_INTERNAL; -extern bool _pdfioFileWrite(pdfio_file_t *pdf, const char *buffer, size_t bytes) PDFIO_INTERNAL; +extern bool _pdfioFileWrite(pdfio_file_t *pdf, const void *buffer, size_t bytes) PDFIO_INTERNAL; extern void _pdfioObjDelete(pdfio_obj_t *obj) PDFIO_INTERNAL; @@ -194,6 +202,8 @@ extern void _pdfioStreamDelete(pdfio_stream_t *st) PDFIO_INTERNAL; extern bool _pdfioStringIsAllocated(pdfio_file_t *pdf, const char *s) PDFIO_INTERNAL; +extern bool _pdfioTokenRead(char *buffer, size_t bufsize, _pdfio_token_cb_t peek_cb, _pdfio_token_cb_t read_cb, void *data); + extern _pdfio_value_t *_pdfioValueCopy(pdfio_file_t *pdfdst, _pdfio_value_t *vdst, pdfio_file_t *pdfsrc, _pdfio_value_t *vsrc) PDFIO_INTERNAL; extern void _pdfioValueDelete(_pdfio_value_t *v) PDFIO_INTERNAL; extern bool _pdfioValueWrite(pdfio_file_t *pdf, _pdfio_value_t *v) PDFIO_INTERNAL; diff --git a/pdfio-stream.c b/pdfio-stream.c index 6264f31..e3e6408 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -56,13 +56,26 @@ pdfioStreamGetToken( pdfio_stream_t *st, // I - Stream char *buffer, // I - String buffer size_t bufsize) // I - Size of string buffer +{ + return (_pdfioTokenRead(buffer, bufsize, (_pdfio_token_cb_t)pdfioStreamPeek, (_pdfio_token_cb_t)pdfioStreamRead, st)); +} + + +// +// 'pdfioStreamPeek()' - Peek at data in a stream. 
+// + +ssize_t // O - Bytes returned or `-1` on error +pdfioStreamPeek(pdfio_stream_t *st, // I - Stream + void *buffer, // I - Buffer + size_t bytes) // I - Size of buffer { // TODO: Implement me (void)st; (void)buffer; - (void)bufsize; + (void)bytes; - return (false); + return (-1); } diff --git a/pdfio-token.c b/pdfio-token.c new file mode 100644 index 0000000..25a6b71 --- /dev/null +++ b/pdfio-token.c @@ -0,0 +1,62 @@ +// +// PDF token parsing functions for pdfio. +// +// Copyright © 2021 by Michael R Sweet. +// +// Licensed under Apache License v2.0. See the file "LICENSE" for more +// information. +// + +// +// Include necessary headers... +// + +#include "pdfio-private.h" + + +// +// This file parses PDF language syntax: +// +// << dict >> "<<" and ">>" delimit a dictionary +// (string) "(" and ")" delimit a string +// [array] "[" and "]" delimit an array +// <hex-string> "<" and ">" delimit a hex string +// {...} "{" and "}" delimit a PostScript calculator (Type 4 function) procedure +// /name "/" starts a name with any special characters +// quoted as "#HH" where HH is the byte value in hex. +// %comment "%" starts a comment to the end of a line +// keyword A keyword consists of upper/lowercase letters +// [-+]?[0-9]*(.[0-9]*)? A number optionally starts with "+" or "-". +// +// Newlines are CR, LF, or CR LF. +// +// Strings and names are returned with the leading delimiter ("(string", +// "