From 278ddb7fa7cfa2b52f7506671de072bdc6b30897 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Wed, 23 Apr 2025 14:43:14 -0400 Subject: [PATCH] Clarify error callback API, and actually use the return value. Improve repair implementation. --- doc/pdfio.3 | 35 ++++++++---- doc/pdfio.html | 30 +++++++---- doc/pdfio.md | 12 +++-- pdfio-common.c | 4 +- pdfio-dict.c | 6 ++- pdfio-file.c | 141 +++++++++++++++++++++++++++++++++++++------------ testpdfio.c | 2 +- 7 files changed, 168 insertions(+), 62 deletions(-) diff --git a/doc/pdfio.3 b/doc/pdfio.3 index a0f1b17..9f64cd0 100644 --- a/doc/pdfio.3 +++ b/doc/pdfio.3 @@ -1,4 +1,4 @@ -.TH pdfio 3 "pdf read/write library" "2025-04-13" "pdf read/write library" +.TH pdfio 3 "pdf read/write library" "2025-04-23" "pdf read/write library" .SH NAME pdfio \- pdf read/write library .SH Introduction @@ -325,7 +325,7 @@ where the five arguments to the function are the filename ("myinputfile.pdf"), a } .fi .PP -The error callback is called for both errors and warnings and accepts the pdfio_file_t pointer, a message string, and the callback pointer value, for example: +The error callback is called for both errors and warnings and accepts the pdfio_file_t pointer, a message string, and the callback pointer value. It returns true to continue processing the file or false to stop, for example: .nf bool @@ -335,12 +335,15 @@ The error callback is called for both errors and warnings and accepts the pdfio_ fprintf(stderr, "%s: %s\\n", pdfioFileGetName(pdf), message); - // Return false to treat warnings as errors - return (false); + // Return true for warning messages (continue) and false for errors (stop) + return (!strncmp(message, "WARNING:", 8)); } .fi .PP The default error callback (NULL) does the equivalent of the above. +.PP +Note: Many errors are unrecoverable, so PDFio ignores the return value from the error callback and always stops processing the PDF file. 
Warning messages start with the prefix "WARNING:" while errors have no prefix. + .PP Each PDF file contains one or more pages. The pdfioFileGetNumPages function returns the number of pages in the file while the pdfioFileGetPage function gets the specified page in the PDF file: .nf @@ -3910,8 +3913,9 @@ CropBox for pages in the PDF file - if \fBNULL\fR then a default "Universal" siz of 8.27x11in (the intersection of US Letter and ISO A4) is used. .PP The "error_cb" and "error_cbdata" arguments specify an error handler callback -and its data pointer - if \fBNULL\fR the default error handler is used that -writes error messages to \fBstderr\fR. +and its data pointer - if \fBNULL\fR then the default error handler is used that +writes error messages to \fBstderr\fR. The error handler callback should return +\fBtrue\fR to continue writing the PDF file or \fBfalse\fR to stop. .SS pdfioFileCreateArrayObj Create a new object in a PDF file containing an array. .PP @@ -4152,8 +4156,9 @@ CropBox for pages in the PDF file - if \fBNULL\fR then a default "Universal" siz of 8.27x11in (the intersection of US Letter and ISO A4) is used. .PP The "error_cb" and "error_cbdata" arguments specify an error handler callback -and its data pointer - if \fBNULL\fR the default error handler is used that -writes error messages to \fBstderr\fR. +and its data pointer - if \fBNULL\fR then the default error handler is used that +writes error messages to \fBstderr\fR. The error handler callback should return +\fBtrue\fR to continue writing the PDF file or \fBfalse\fR to stop. .PP .IP 5 \fINote\fR: Files created using this API are slightly larger than those @@ -4392,8 +4397,18 @@ cancel the open. If \fBNULL\fR is specified for the callback function and the PDF file requires a password, the open will always fail. 
.PP The "error_cb" and "error_cbdata" arguments specify an error handler callback -and its data pointer - if \fBNULL\fR the default error handler is used that -writes error messages to \fBstderr\fR. +and its data pointer - if \fBNULL\fR then the default error handler is used that +writes error messages to \fBstderr\fR. The error handler callback should return +\fBtrue\fR to continue reading the PDF file or \fBfalse\fR to stop. +.PP +.IP 5 +Note: Error messages starting with "WARNING:" are actually warning +.IP 5 +messages - the callback should normally return \fBtrue\fR to allow PDFio to +.IP 5 +try to resolve the issue. In addition, some errors are unrecoverable and +.IP 5 +ignore the return value of the error callback. .SS pdfioFileSetAuthor Set the author for a PDF file. .PP diff --git a/doc/pdfio.html b/doc/pdfio.html index a586517..3285b28 100644 --- a/doc/pdfio.html +++ b/doc/pdfio.html @@ -732,7 +732,7 @@ password_cb(void *data, con return ("Password42"); } -

The error callback is called for both errors and warnings and accepts the pdfio_file_t pointer, a message string, and the callback pointer value, for example:

+

The error callback is called for both errors and warnings and accepts the pdfio_file_t pointer, a message string, and the callback pointer value. It returns true to continue processing the file or false to stop, for example:

bool
 error_cb(pdfio_file_t *pdf, const char *message, void *data)
 {
@@ -740,11 +740,14 @@ error_cb(pdfio_file_t *pdf, const "%s: %s\n", pdfioFileGetName(pdf), message);
 
-  // Return false to treat warnings as errors
-  return (false);
+  // Return true for warning messages (continue) and false for errors (stop)
+  return (!strncmp(message, "WARNING:", 8));
 }
 

The default error callback (NULL) does the equivalent of the above.

+
+

Note: Many errors are unrecoverable; for those errors PDFio ignores the return value from the error callback and always stops processing the PDF file. Warning messages start with the prefix "WARNING:" while errors have no prefix.

+

Each PDF file contains one or more pages. The pdfioFileGetNumPages function returns the number of pages in the file while the pdfioFileGetPage function gets the specified page in the PDF file:

pdfio_file_t *pdf;   // PDF file
 size_t       i;      // Looping var
@@ -4129,8 +4132,9 @@ CropBox for pages in the PDF file - if NULL then a default "Un
 of 8.27x11in (the intersection of US Letter and ISO A4) is used.

The "error_cb" and "error_cbdata" arguments specify an error handler callback -and its data pointer - if NULL the default error handler is used that -writes error messages to stderr.

+and its data pointer - if NULL then the default error handler is used that +writes error messages to stderr. The error handler callback should return +true to continue writing the PDF file or false to stop.

pdfioFileCreateArrayObj

Create a new object in a PDF file containing an array.

@@ -4434,8 +4438,9 @@ CropBox for pages in the PDF file - if NULL then a default "Un of 8.27x11in (the intersection of US Letter and ISO A4) is used.

The "error_cb" and "error_cbdata" arguments specify an error handler callback -and its data pointer - if NULL the default error handler is used that -writes error messages to stderr.
+and its data pointer - if NULL then the default error handler is used that +writes error messages to stderr. The error handler callback should return +true to continue writing the PDF file or false to stop.

Note: Files created using this API are slightly larger than those @@ -4772,8 +4777,15 @@ cancel the open. If NULL is specified for the callback function an PDF file requires a password, the open will always fail.

The "error_cb" and "error_cbdata" arguments specify an error handler callback -and its data pointer - if NULL the default error handler is used that -writes error messages to stderr.

+and its data pointer - if NULL then the default error handler is used that +writes error messages to stderr. The error handler callback should return +true to continue reading the PDF file or false to stop.
+
+

+Note: Error messages starting with "WARNING:" are actually warning +messages - the callback should normally return true to allow PDFio to +try to resolve the issue. In addition, some errors are unrecoverable, in which +case PDFio ignores the return value of the error callback.

pdfioFileSetAuthor

Set the author for a PDF file.

diff --git a/doc/pdfio.md b/doc/pdfio.md index 711c16d..8709d33 100644 --- a/doc/pdfio.md +++ b/doc/pdfio.md @@ -343,8 +343,8 @@ password_cb(void *data, const char *filename) ``` The error callback is called for both errors and warnings and accepts the -`pdfio_file_t` pointer, a message string, and the callback pointer value, for -example: +`pdfio_file_t` pointer, a message string, and the callback pointer value. It +returns `true` to continue processing the file or `false` to stop, for example: ```c bool @@ -354,13 +354,17 @@ error_cb(pdfio_file_t *pdf, const char *message, void *data) fprintf(stderr, "%s: %s\n", pdfioFileGetName(pdf), message); - // Return false to treat warnings as errors - return (false); + // Return true for warning messages (continue) and false for errors (stop) + return (!strncmp(message, "WARNING:", 8)); } ``` The default error callback (`NULL`) does the equivalent of the above. +> Note: Many errors are unrecoverable, so PDFio ignores the return value from +> the error callback and always stops processing the PDF file. Warning messages +> start with the prefix "WARNING:" while errors have no prefix. + Each PDF file contains one or more pages. The [`pdfioFileGetNumPages`](@@) function returns the number of pages in the file while the [`pdfioFileGetPage`](@@) function gets the specified page in the PDF file: diff --git a/pdfio-common.c b/pdfio-common.c index 3eead49..f173669 100644 --- a/pdfio-common.c +++ b/pdfio-common.c @@ -47,7 +47,7 @@ _pdfioFileConsume(pdfio_file_t *pdf, // I - PDF file // `false` to halt. 
// -bool // O - `false` to stop +bool // O - `false` to stop, `true` to continue _pdfioFileDefaultError( pdfio_file_t *pdf, // I - PDF file const char *message, // I - Error message @@ -57,7 +57,7 @@ _pdfioFileDefaultError( fprintf(stderr, "%s: %s\n", pdf->filename, message); - return (false); + return (!strncmp(message, "WARNING:", 8)); } diff --git a/pdfio-dict.c b/pdfio-dict.c index a81df4b..642871c 100644 --- a/pdfio-dict.c +++ b/pdfio-dict.c @@ -643,9 +643,11 @@ _pdfioDictRead(pdfio_file_t *pdf, // I - PDF file { // Issue 118: Discard duplicate key/value pairs, in the future this will // be a warning message... - _pdfioFileError(pdf, "WARNING: Discarding value for duplicate dictionary key '%s'.", key + 1); _pdfioValueDelete(&value); - continue; + if (_pdfioFileError(pdf, "WARNING: Discarding value for duplicate dictionary key '%s'.", key + 1)) + continue; + else + break; } else if (!_pdfioDictSetValue(dict, pdfioStringCreate(pdf, key + 1), &value)) break; diff --git a/pdfio-file.c b/pdfio-file.c index de2a379..ac0fb48 100644 --- a/pdfio-file.c +++ b/pdfio-file.c @@ -188,8 +188,9 @@ pdfioFileClose(pdfio_file_t *pdf) // I - PDF file // of 8.27x11in (the intersection of US Letter and ISO A4) is used. // // The "error_cb" and "error_cbdata" arguments specify an error handler callback -// and its data pointer - if `NULL` the default error handler is used that -// writes error messages to `stderr`. +// and its data pointer - if `NULL` then the default error handler is used that +// writes error messages to `stderr`. The error handler callback should return +// `true` to continue writing the PDF file or `false` to stop. // pdfio_file_t * // O - PDF file or `NULL` on error @@ -426,8 +427,9 @@ _pdfioFileCreateObj( // of 8.27x11in (the intersection of US Letter and ISO A4) is used. 
// // The "error_cb" and "error_cbdata" arguments specify an error handler callback -// and its data pointer - if `NULL` the default error handler is used that -// writes error messages to `stderr`. +// and its data pointer - if `NULL` then the default error handler is used that +// writes error messages to `stderr`. The error handler callback should return +// `true` to continue writing the PDF file or `false` to stop. // // > *Note*: Files created using this API are slightly larger than those // > created using the @link pdfioFileCreate@ function since stream lengths are @@ -1019,8 +1021,14 @@ pdfioFileGetVersion( // PDF file requires a password, the open will always fail. // // The "error_cb" and "error_cbdata" arguments specify an error handler callback -// and its data pointer - if `NULL` the default error handler is used that -// writes error messages to `stderr`. +// and its data pointer - if `NULL` then the default error handler is used that +// writes error messages to `stderr`. The error handler callback should return +// `true` to continue reading the PDF file or `false` to stop. +// +// > Note: Error messages starting with "WARNING:" are actually warning +// > messages - the callback should normally return `true` to allow PDFio to +// > try to resolve the issue. In addition, some errors are unrecoverable and +// > ignore the return value of the error callback. 
// pdfio_file_t * // O - PDF file @@ -1119,16 +1127,20 @@ pdfioFileOpen( if (ptr >= end) { _pdfioFileError(pdf, "Unable to find start of xref table."); - goto error; - } - xref_offset = (off_t)strtol(ptr + 9, NULL, 10); - - if (!load_xref(pdf, xref_offset, password_cb, password_cbdata)) - { if (!repair_xref(pdf, password_cb, password_cbdata)) goto error; } + else + { + xref_offset = (off_t)strtol(ptr + 9, NULL, 10); + + if (!load_xref(pdf, xref_offset, password_cb, password_cbdata)) + { + if (!repair_xref(pdf, password_cb, password_cbdata)) + goto error; + } + } return (pdf); @@ -1755,7 +1767,10 @@ load_pages(pdfio_file_t *pdf, // I - PDF file } if ((type = pdfioDictGetName(dict, "Type")) == NULL || (strcmp(type, "Pages") && strcmp(type, "Page"))) - return (false); + { + if (!_pdfioFileError(pdf, "WARNING: No Type value for pages object.")) + return (false); + } // If there is a Kids array, then this is a parent node and we have to look // at the child objects... @@ -1948,7 +1963,9 @@ load_xref( w_2 = w[0]; w_3 = w[0] + w[1]; - if (w[1] == 0 || w[2] > 4 || w[0] > sizeof(buffer) || w[1] > sizeof(buffer) || w[2] > sizeof(buffer) || w_total > sizeof(buffer)) + PDFIO_DEBUG("W=[%u %u %u], w_total=%u\n", (unsigned)w[0], (unsigned)w[1], (unsigned)w[2], (unsigned)w_total); + + if (pdfioArrayGetSize(w_array) > 3 || w[1] == 0 || w[2] > 4 || w[0] > sizeof(buffer) || w[1] > sizeof(buffer) || w[2] > sizeof(buffer) || w_total > sizeof(buffer)) { _pdfioFileError(pdf, "Cross-reference stream has invalid W key [%u %u %u].", (unsigned)w[0], (unsigned)w[1], (unsigned)w[2]); return (false); @@ -1977,7 +1994,20 @@ load_xref( { count --; - PDFIO_DEBUG("load_xref: number=%u %02X%02X%02X%02X%02X\n", (unsigned)number, buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]); +#ifdef DEBUG + if (w_total > 5) + PDFIO_DEBUG("load_xref: number=%u %02X%02X%02X%02X%02X...\n", (unsigned)number, buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]); + else if (w_total == 5) + 
PDFIO_DEBUG("load_xref: number=%u %02X%02X%02X%02X%02X\n", (unsigned)number, buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]); + else if (w_total == 4) + PDFIO_DEBUG("load_xref: number=%u %02X%02X%02X%02X\n", (unsigned)number, buffer[0], buffer[1], buffer[2], buffer[3]); + else if (w_total == 3) + PDFIO_DEBUG("load_xref: number=%u %02X%02X%02X\n", (unsigned)number, buffer[0], buffer[1], buffer[2]); + else if (w_total == 2) + PDFIO_DEBUG("load_xref: number=%u %02X%02X\n", (unsigned)number, buffer[0], buffer[1]); + else + PDFIO_DEBUG("load_xref: number=%u %02X\n", (unsigned)number, buffer[0]); +#endif // DEBUG // Check whether this is an object definition... if (w[0] > 0) @@ -2288,7 +2318,7 @@ repair_xref( pdfio_password_cb_t password_cb, // I - Password callback or `NULL` for none void *password_data) // I - Password callback data, if any { - char line[16384], // Line from file + char line[65536], // Line from file *ptr; // Pointer into line off_t line_offset; // Offset in file intmax_t number; // Object number @@ -2296,10 +2326,13 @@ repair_xref( size_t i; // Looping var size_t num_sobjs = 0; // Number of object streams pdfio_obj_t *sobjs[16384]; // Object streams to load + pdfio_dict_t *backup_trailer = NULL; // Backup trailer dictionary + pdfio_obj_t *pages_obj; // Pages object // Let caller know something is wrong... - _pdfioFileError(pdf, "WARNING: Cross-reference table is damaged, attempting to rebuild."); + if (!_pdfioFileError(pdf, "WARNING: Cross-reference table is damaged, attempting to rebuild.")) + return (false); // Read from the beginning of the file, looking for if ((line_offset = _pdfioFileSeek(pdf, 0, SEEK_SET)) < 0) @@ -2311,6 +2344,8 @@ repair_xref( if (line[0] >= '1' && line[0] <= '9') { // Maybe, look some more... 
+ PDFIO_DEBUG("repair_xref: line=\"%s\"\n", line); + if ((number = strtoimax(line, &ptr, 10)) >= 1 && (generation = (int)strtol(ptr, &ptr, 10)) >= 0 && generation < 65536) { while (isspace(*ptr & 255)) @@ -2322,7 +2357,7 @@ repair_xref( pdfio_obj_t *obj; // Object _pdfio_token_t tb; // Token buffer/stack - PDFIO_DEBUG("OBJECT %ld %d at offset %ld\n", (long)number, generation, (long)line_offset); + PDFIO_DEBUG("repair_xref: OBJECT %ld %d at offset %ld\n", (long)number, generation, (long)line_offset); if ((obj = add_obj(pdf, (size_t)number, (unsigned short)generation, line_offset)) == NULL) { @@ -2338,27 +2373,46 @@ repair_xref( return (false); } - if (_pdfioTokenGet(&tb, line, sizeof(line)) && strcmp(line, "stream")) + if (_pdfioTokenGet(&tb, line, sizeof(line))) { const char *type = pdfioObjGetType(obj); // Object type _pdfioTokenFlush(&tb); - obj->stream_offset = _pdfioFileTell(pdf); - if (type && !strcmp(type, "ObjStm") && num_sobjs < (sizeof(sobjs) / sizeof(sobjs[0]))) - { - sobjs[num_sobjs] = obj; - num_sobjs ++; - } - - if (type && !strcmp(type, "XRef") && !pdf->trailer_dict) + if (type && !strcmp(line, "stream")) { - // Save the trailer dictionary... - pdf->trailer_dict = pdfioObjGetDict(obj); - pdf->encrypt_obj = pdfioDictGetObj(pdf->trailer_dict, "Encrypt"); - pdf->id_array = pdfioDictGetArray(pdf->trailer_dict, "ID"); - } + // Possible object or XRef stream... + obj->stream_offset = _pdfioFileTell(pdf); + + if (!strcmp(type, "ObjStm") && num_sobjs < (sizeof(sobjs) / sizeof(sobjs[0]))) + { + PDFIO_DEBUG("repair_xref: Object stream...\n"); + sobjs[num_sobjs] = obj; + num_sobjs ++; + } + + if (!strcmp(type, "XRef") && !pdf->trailer_dict) + { + // Save the trailer dictionary... 
+ PDFIO_DEBUG("repair_xref: XRef stream...\n"); + pdf->trailer_dict = pdfioObjGetDict(obj); + pdf->encrypt_obj = pdfioDictGetObj(pdf->trailer_dict, "Encrypt"); + pdf->id_array = pdfioDictGetArray(pdf->trailer_dict, "ID"); + } + } + else if (type && !strcmp(line, "endobj")) + { + // Possible catalog or pages object... + if (!strcmp(type, "Catalog")) + { + PDFIO_DEBUG("repair_xref: Catalog (root) object...\n"); + if (!backup_trailer) + backup_trailer = pdfioDictCreate(pdf); + + pdfioDictSetObj(backup_trailer, "Root", obj); + } + } } } } @@ -2369,6 +2423,8 @@ repair_xref( _pdfio_token_t tb; // Token buffer/stack _pdfio_value_t trailer; // Trailer + PDFIO_DEBUG("repair_xref: line=\"%s\"\n", line); + if (line[7]) { // Probably the start of the trailer dictionary, rewind the file so @@ -2376,7 +2432,7 @@ repair_xref( _pdfioFileSeek(pdf, line_offset + 7, SEEK_SET); } - PDFIO_DEBUG("TRAILER at offset %ld\n", (long)line_offset); + PDFIO_DEBUG("repair_xref: TRAILER at offset %ld\n", (long)line_offset); _pdfioTokenInit(&tb, pdf, (_pdfio_tconsume_cb_t)_pdfioFileConsume, (_pdfio_tpeek_cb_t)_pdfioFilePeek, pdf); if (!_pdfioValueRead(pdf, NULL, &tb, &trailer, 0)) @@ -2396,6 +2452,8 @@ repair_xref( { // Save the trailer dictionary and grab the root (catalog) and info // objects... + PDFIO_DEBUG("repair_xref: Using this trailer dictionary.\n"); + pdf->trailer_dict = trailer.value.dict; pdf->encrypt_obj = pdfioDictGetObj(pdf->trailer_dict, "Encrypt"); pdf->id_array = pdfioDictGetArray(pdf->trailer_dict, "ID"); @@ -2406,11 +2464,18 @@ repair_xref( line_offset = _pdfioFileTell(pdf); } + PDFIO_DEBUG("repair_xref: Stopped at line_offset=%lu\n", (unsigned long)line_offset); + + if (!pdf->trailer_dict && backup_trailer) + pdf->trailer_dict = backup_trailer; + // If the trailer contains an Encrypt key, try unlocking the file... if (pdf->encrypt_obj && !_pdfioCryptoUnlock(pdf, password_cb, password_data)) return (false); // Load any stream objects... 
+ PDFIO_DEBUG("repair_xref: Found %lu stream objects.\n", (unsigned long)num_sobjs); + for (i = 0; i < num_sobjs; i ++) { if (!load_obj_stream(sobjs[i])) @@ -2429,8 +2494,16 @@ repair_xref( PDFIO_DEBUG("repair_xref: Root=%p(%lu)\n", pdf->root_obj, (unsigned long)pdf->root_obj->number); + if ((pages_obj = pdfioDictGetObj(pdfioObjGetDict(pdf->root_obj), "Pages")) == NULL) + { + _pdfioFileError(pdf, "Missing Pages object."); + return (false); + } + + PDFIO_DEBUG("repair_xref: Pages=%p(%lu)\n", pages_obj, (unsigned long)pages_obj->number); + // Load pages... - return (load_pages(pdf, pdfioDictGetObj(pdfioObjGetDict(pdf->root_obj), "Pages"), 0)); + return (load_pages(pdf, pages_obj, 0)); } diff --git a/testpdfio.c b/testpdfio.c index ae29084..70b8a08 100644 --- a/testpdfio.c +++ b/testpdfio.c @@ -1333,7 +1333,7 @@ error_cb(pdfio_file_t *pdf, // I - PDF file testMessage("%s", message); // Continue to catch more errors... - return (false); + return (true); }