From cad8f450ab0d267db33e47c6a171bb0c372a87d8 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Thu, 24 Apr 2025 11:09:54 -0400 Subject: [PATCH] Multiple fixes to allow PDFio to read more edge-case PDFs. - Update _pdfioFileGets to allow for really long lines where it doesn't matter if we lose the end of the line. - Update "startxref" detection at the end of the file. - Refactor repair logic so that you just get a single WARNING about the repair (debug messages available for testing) - Allow whitespace after the "obj" in the object header. - Make sure to close xref stream on error. - Update predictor code to support Colors <= 32 (some implementations set Colors to the number of bytes per record in the xref stream, which prevents the predictor from doing anything...) - Allow CR CR in xref table. - Clear old trailer/root/pages/etc. objects when repairing, update existing objects that were already found in load_xref. - Don't set current object in pdfioObjCreateStream/pdfioObjOpenStream if the stream can't be created/opened. --- pdfio-common.c | 11 +-- pdfio-file.c | 188 +++++++++++++++++++++++++++++------------------- pdfio-object.c | 17 +++-- pdfio-private.h | 2 +- pdfio-stream.c | 4 +- test-corpus.sh | 10 +++ 6 files changed, 148 insertions(+), 84 deletions(-) diff --git a/pdfio-common.c b/pdfio-common.c index f173669..e88b081 100644 --- a/pdfio-common.c +++ b/pdfio-common.c @@ -134,19 +134,20 @@ _pdfioFileGetChar(pdfio_file_t *pdf) // I - PDF file bool // O - `true` on success, `false` on error _pdfioFileGets(pdfio_file_t *pdf, // I - PDF file char *buffer, // I - Line buffer - size_t bufsize) // I - Size of line buffer + size_t bufsize, // I - Size of line buffer + bool discard) // I - OK to discard excess line chars? { bool eol = false; // End of line?
char *bufptr = buffer, // Pointer into buffer *bufend = buffer + bufsize - 1; // Pointer to end of buffer - PDFIO_DEBUG("_pdfioFileGets(pdf=%p, buffer=%p, bufsize=%lu) bufpos=%ld, buffer=%p, bufptr=%p, bufend=%p, offset=%lu\n", pdf, buffer, (unsigned long)bufsize, (long)pdf->bufpos, pdf->buffer, pdf->bufptr, pdf->bufend, (unsigned long)(pdf->bufpos + (pdf->bufptr - pdf->buffer))); + PDFIO_DEBUG("_pdfioFileGets(pdf=%p, buffer=%p, bufsize=%lu, discard=%s) bufpos=%ld, buffer=%p, bufptr=%p, bufend=%p, offset=%lu\n", pdf, buffer, (unsigned long)bufsize, discard ? "true" : "false", (long)pdf->bufpos, pdf->buffer, pdf->bufptr, pdf->bufend, (unsigned long)(pdf->bufpos + (pdf->bufptr - pdf->buffer))); while (!eol) { // If there are characters ready in the buffer, use them... - while (!eol && pdf->bufptr < pdf->bufend && bufptr < bufend) + while (!eol && pdf->bufptr < pdf->bufend) { char ch = *(pdf->bufptr++); // Next character in buffer @@ -168,8 +169,10 @@ _pdfioFileGets(pdfio_file_t *pdf, // I - PDF file pdf->bufptr ++; } } - else + else if (bufptr < bufend) *bufptr++ = ch; + else if (!discard) + break; } // Fill the read buffer as needed... diff --git a/pdfio-file.c b/pdfio-file.c index ac0fb48..7d7f27a 100644 --- a/pdfio-file.c +++ b/pdfio-file.c @@ -1089,7 +1089,7 @@ pdfioFileOpen( } // Read the header from the first line... - if (!_pdfioFileGets(pdf, line, sizeof(line))) + if (!_pdfioFileGets(pdf, line, sizeof(line), true)) goto error; if ((strncmp(line, "%PDF-1.", 7) && strncmp(line, "%PDF-2.", 7)) || !isdigit(line[7] & 255)) @@ -1103,7 +1103,7 @@ pdfioFileOpen( pdf->version = strdup(line + 5); // Grab the last 1k of the file to find the start of the xref table... 
- if (_pdfioFileSeek(pdf, -1024, SEEK_END) < 0) + if (_pdfioFileSeek(pdf, 1 - (off_t)sizeof(line), SEEK_END) < 0) { _pdfioFileError(pdf, "Unable to read startxref data."); goto error; @@ -1115,31 +1115,35 @@ pdfioFileOpen( goto error; } + PDFIO_DEBUG("pdfioFileOpen: Read %d bytes at end of file.\n", (int)bytes); + line[bytes] = '\0'; end = line + bytes - 9; for (ptr = line; ptr < end; ptr ++) { - if (!memcmp(ptr, "startxref", 9)) + if (!strncmp(ptr, "startxref", 9) && !strstr(ptr + 9, "startxref") && strtol(ptr + 9, NULL, 10) > 0) break; } if (ptr >= end) { - _pdfioFileError(pdf, "Unable to find start of xref table."); + if (!_pdfioFileError(pdf, "WARNING: Unable to find start of cross-reference table, will attempt to rebuild.")) + goto error; if (!repair_xref(pdf, password_cb, password_cbdata)) goto error; } else { + PDFIO_DEBUG("pdfioFileOpen: line=%p,ptr=%p(\"%s\")\n", line, ptr, ptr); + xref_offset = (off_t)strtol(ptr + 9, NULL, 10); + PDFIO_DEBUG("pdfioFileOpen: xref_offset=%lu\n", (unsigned long)xref_offset); + if (!load_xref(pdf, xref_offset, password_cb, password_cbdata)) - { - if (!repair_xref(pdf, password_cb, password_cbdata)) - goto error; - } + goto error; } return (pdf); @@ -1837,31 +1841,32 @@ load_xref( int generation; // Generation number _pdfio_token_t tb; // Token buffer/stack off_t line_offset; // Offset to start of line + pdfio_obj_t *pages_obj; // Pages object while (!done) { if (_pdfioFileSeek(pdf, xref_offset, SEEK_SET) != xref_offset) { - _pdfioFileError(pdf, "Unable to seek to start of xref table."); - return (false); + PDFIO_DEBUG("load_xref: Unable to seek to %lu.\n", (unsigned long)xref_offset); + goto repair; } do { line_offset = _pdfioFileTell(pdf); - if (!_pdfioFileGets(pdf, line, sizeof(line))) + if (!_pdfioFileGets(pdf, line, sizeof(line), true)) { - _pdfioFileError(pdf, "Unable to read start of xref table."); - return (false); + PDFIO_DEBUG("load_xref: Unable to read line at offset %lu.\n", (unsigned long)line_offset); + goto repair; } }
while (!line[0]); PDFIO_DEBUG("load_xref: line_offset=%lu, line='%s'\n", (unsigned long)line_offset, line); - if (isdigit(line[0] & 255) && strlen(line) > 4 && (!strcmp(line + strlen(line) - 4, " obj") || ((ptr = strstr(line, " obj")) != NULL && ptr[4] == '<'))) + if (isdigit(line[0] & 255) && strlen(line) > 4 && (!strcmp(line + strlen(line) - 4, " obj") || ((ptr = strstr(line, " obj")) != NULL && (ptr[4] == '<' || isspace(ptr[4]))))) { // Cross-reference stream pdfio_obj_t *obj; // Object @@ -1883,14 +1888,14 @@ load_xref( if ((number = strtoimax(line, &ptr, 10)) < 1) { - _pdfioFileError(pdf, "Bad xref table header '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Unable to scan object number.\n"); + goto repair; } if ((generation = (int)strtol(ptr, &ptr, 10)) < 0 || (generation > 65535 && number != 0)) { - _pdfioFileError(pdf, "Bad xref table header '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Unable to scan generation number (%u).\n", (unsigned)generation); + goto repair; } while (isspace(*ptr & 255)) @@ -1898,14 +1903,14 @@ load_xref( if (strncmp(ptr, "obj", 3)) { - _pdfioFileError(pdf, "Bad xref table header '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: No 'obj' after object number and generation (saw '%s').\n", ptr); + goto repair; } if (_pdfioFileSeek(pdf, line_offset + (off_t)(ptr + 3 - line), SEEK_SET) < 0) { - _pdfioFileError(pdf, "Unable to seek to xref object %lu %u.", (unsigned long)number, (unsigned)generation); - return (false); + PDFIO_DEBUG("load_xref: Unable to seek to start of cross-reference object dictionary.\n"); + goto repair; } PDFIO_DEBUG("load_xref: Loading object %lu %u.\n", (unsigned long)number, (unsigned)generation); @@ -1920,21 +1925,21 @@ load_xref( if (!_pdfioValueRead(pdf, obj, &tb, &trailer, 0)) { - _pdfioFileError(pdf, "Unable to read cross-reference stream dictionary."); - return (false); + PDFIO_DEBUG("load_xref: Unable to read cross-reference object dictionary.\n"); + goto repair; } else if 
(trailer.type != PDFIO_VALTYPE_DICT) { - _pdfioFileError(pdf, "Cross-reference stream does not have a dictionary."); - return (false); + PDFIO_DEBUG("load_xref: Expected dictionary for cross-reference object (type=%d).\n", trailer.type); + goto repair; } obj->value = trailer; if (!_pdfioTokenGet(&tb, line, sizeof(line)) || strcmp(line, "stream")) { - _pdfioFileError(pdf, "Unable to get stream after xref dictionary."); - return (false); + PDFIO_DEBUG("load_xref: No stream token after dictionary (got '%s').\n", line); + goto repair; } PDFIO_DEBUG("load_xref: tb.bufptr=%p, tb.bufend=%p, tb.bufptr[0]=0x%02x, tb.bufptr[0]=0x%02x\n", tb.bufptr, tb.bufend, tb.bufptr[0], tb.bufptr[1]); @@ -1952,8 +1957,8 @@ load_xref( if ((w_array = pdfioDictGetArray(trailer.value.dict, "W")) == NULL) { - _pdfioFileError(pdf, "Cross-reference stream does not have required W key."); - return (false); + PDFIO_DEBUG("load_xref: Missing W array in cross-reference object dictionary.\n"); + goto repair; } w[0] = (size_t)pdfioArrayGetNumber(w_array, 0); @@ -1967,14 +1972,14 @@ load_xref( if (pdfioArrayGetSize(w_array) > 3 || w[1] == 0 || w[2] > 4 || w[0] > sizeof(buffer) || w[1] > sizeof(buffer) || w[2] > sizeof(buffer) || w_total > sizeof(buffer)) { - _pdfioFileError(pdf, "Cross-reference stream has invalid W key [%u %u %u].", (unsigned)w[0], (unsigned)w[1], (unsigned)w[2]); - return (false); + PDFIO_DEBUG("load_xref: Bad W array in cross-reference object dictionary.\n"); + goto repair; } if ((st = pdfioObjOpenStream(obj, true)) == NULL) { - _pdfioFileError(pdf, "Unable to open cross-reference stream."); - return (false); + PDFIO_DEBUG("load_xref: Unable to open cross-reference stream.\n"); + goto repair; } for (index_n = 0; index_n < index_count; index_n += 2) @@ -2089,6 +2094,7 @@ load_xref( else { _pdfioFileError(pdf, "Too many object streams."); + pdfioStreamClose(st); return (false); } } @@ -2097,7 +2103,10 @@ load_xref( { // Add this object...
if (!add_obj(pdf, (size_t)number, (unsigned short)generation, (off_t)offset)) + { + pdfioStreamClose(st); return (false); + } } number ++; @@ -2145,7 +2154,7 @@ load_xref( // Offset of current line PDFIO_DEBUG("load_xref: Reading xref table starting at offset %lu\n", (unsigned long)trailer_offset); - while (_pdfioFileGets(pdf, line, sizeof(line))) + while (_pdfioFileGets(pdf, line, sizeof(line), false)) { PDFIO_DEBUG("load_xref: '%s' at offset %lu\n", line, (unsigned long)trailer_offset); @@ -2170,8 +2179,8 @@ load_xref( if (sscanf(line, "%jd%jd", &number, &num_objects) != 2) { - _pdfioFileError(pdf, "Malformed xref table section '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Unable to scan START COUNT from line.\n"); + goto repair; } // Read this group of objects... @@ -2179,41 +2188,45 @@ load_xref( { // Read a line from the file and validate it... if (_pdfioFileRead(pdf, line, 20) != 20) - return (false); + { + PDFIO_DEBUG("load_xref: Unable to read 20 byte xref record.\n"); + goto repair; + } line[20] = '\0'; - if (strcmp(line + 18, "\r\n") && strcmp(line + 18, " \n") && strcmp(line + 18, " \r")) + if (strcmp(line + 18, "\r\n") && strcmp(line + 18, "\r\r") && strcmp(line + 18, " \n") && strcmp(line + 18, " \r")) { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Bad end-of-line <%02X%02X>\n", line[18], line[19]); + goto repair; } + line[18] = '\0'; // Parse the line if ((offset = strtoimax(line, &ptr, 10)) < 0) { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Unable to scan offset.\n"); + goto repair; } if ((generation = (int)strtol(ptr, &ptr, 10)) < 0 || (generation > 65535 && offset != 0)) { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Unable to scan generation (%u).\n", (unsigned)generation); + goto repair; } if (*ptr != ' ') { - _pdfioFileError(pdf, "Malformed 
xref table entry '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Missing space before type.\n"); + goto repair; } ptr ++; if (*ptr != 'f' && *ptr != 'n') { - _pdfioFileError(pdf, "Malformed xref table entry '%s'.", line); - return (false); + PDFIO_DEBUG("load_xref: Bad type '%c'.\n", *ptr); + goto repair; } if (*ptr == 'f') @@ -2232,21 +2245,21 @@ load_xref( if (strncmp(line, "trailer", 7)) { - _pdfioFileError(pdf, "Missing trailer."); - return (false); + PDFIO_DEBUG("load_xref: No trailer after xref table.\n"); + goto repair; } _pdfioTokenInit(&tb, pdf, (_pdfio_tconsume_cb_t)_pdfioFileConsume, (_pdfio_tpeek_cb_t)_pdfioFilePeek, pdf); if (!_pdfioValueRead(pdf, NULL, &tb, &trailer, 0)) { - _pdfioFileError(pdf, "Unable to read trailer dictionary."); - return (false); + PDFIO_DEBUG("load_xref: Unable to read trailer dictionary.\n"); + goto repair; } else if (trailer.type != PDFIO_VALTYPE_DICT) { - _pdfioFileError(pdf, "Trailer is not a dictionary."); - return (false); + PDFIO_DEBUG("load_xref: Trailer not a dictionary (type=%d).\n", trailer.type); + goto repair; } PDFIO_DEBUG("load_xref: Got trailer dict.\n"); @@ -2268,8 +2281,7 @@ load_xref( } else { - _pdfioFileError(pdf, "Bad xref table header '%s'.", line); - return (false); + goto repair; } PDFIO_DEBUG("load_xref: Contents of trailer dictionary:\n"); @@ -2298,13 +2310,31 @@ load_xref( if ((pdf->root_obj = pdfioDictGetObj(pdf->trailer_dict, "Root")) == NULL) { - _pdfioFileError(pdf, "Missing Root object."); - return (false); + PDFIO_DEBUG("load_xref: Missing Root object.\n"); + goto repair; } PDFIO_DEBUG("load_xref: Root=%p(%lu)\n", pdf->root_obj, (unsigned long)pdf->root_obj->number); - return (load_pages(pdf, pdfioDictGetObj(pdfioObjGetDict(pdf->root_obj), "Pages"), 0)); + if ((pages_obj = pdfioDictGetObj(pdfioObjGetDict(pdf->root_obj), "Pages")) == NULL) + { + PDFIO_DEBUG("load_xref: Missing Pages object.\n"); + goto repair; + } + + PDFIO_DEBUG("load_xref: Pages=%p(%lu)\n", pages_obj, (unsigned
long)pages_obj->number); + + return (load_pages(pdf, pages_obj, 0)); + + // If we get here the cross-reference table is busted - try repairing if the + // error callback says to proceed... + + repair: + + if (_pdfioFileError(pdf, "WARNING: Cross-reference is damaged, will attempt to rebuild.")) + return (repair_xref(pdf, password_cb, password_data)); + else + return (false); } @@ -2318,7 +2348,7 @@ repair_xref( pdfio_password_cb_t password_cb, // I - Password callback or `NULL` for none void *password_data) // I - Password callback data, if any { - char line[65536], // Line from file + char line[1024], // Line from file *ptr; // Pointer into line off_t line_offset; // Offset in file intmax_t number; // Object number @@ -2330,22 +2360,23 @@ repair_xref( pdfio_obj_t *pages_obj; // Pages object - // Let caller know something is wrong... - if (!_pdfioFileError(pdf, "WARNING: Cross-reference table is damaged, attempting to rebuild.")) - return (false); + // Clear trailer data... + pdf->trailer_dict = NULL; + pdf->root_obj = NULL; + pdf->info_obj = NULL; + pdf->pages_obj = NULL; + pdf->encrypt_obj = NULL; - // Read from the beginning of the file, looking for + // Read from the beginning of the file, looking for objects... if ((line_offset = _pdfioFileSeek(pdf, 0, SEEK_SET)) < 0) return (false); - while (_pdfioFileGets(pdf, line, sizeof(line))) + while (_pdfioFileGets(pdf, line, sizeof(line), true)) { // See if this is the start of an object... if (line[0] >= '1' && line[0] <= '9') { // Maybe, look some more...
- PDFIO_DEBUG("repair_xref: line=\"%s\"\n", line); - if ((number = strtoimax(line, &ptr, 10)) >= 1 && (generation = (int)strtol(ptr, &ptr, 10)) >= 0 && generation < 65536) { while (isspace(*ptr & 255)) @@ -2359,18 +2390,31 @@ repair_xref( PDFIO_DEBUG("repair_xref: OBJECT %ld %d at offset %ld\n", (long)number, generation, (long)line_offset); - if ((obj = add_obj(pdf, (size_t)number, (unsigned short)generation, line_offset)) == NULL) + if ((obj = pdfioFileFindObj(pdf, (size_t)number)) != NULL) + { + obj->offset = line_offset; + } + else if ((obj = add_obj(pdf, (size_t)number, (unsigned short)generation, line_offset)) == NULL) { _pdfioFileError(pdf, "Unable to allocate memory for object."); return (false); } + if (ptr[3]) + { + // Probably the start of the object dictionary, rewind the file so + // we can read it... + _pdfioFileSeek(pdf, line_offset + (ptr - line + 3), SEEK_SET); + } + _pdfioTokenInit(&tb, pdf, (_pdfio_tconsume_cb_t)_pdfioFileConsume, (_pdfio_tpeek_cb_t)_pdfioFilePeek, pdf); if (!_pdfioValueRead(pdf, obj, &tb, &obj->value, 0)) { - _pdfioFileError(pdf, "Unable to read cross-reference stream dictionary."); - return (false); + if (!_pdfioFileError(pdf, "WARNING: Unable to read object dictionary/value.")) + return (false); + else + continue; } if (_pdfioTokenGet(&tb, line, sizeof(line))) @@ -2448,7 +2492,7 @@ repair_xref( _pdfioTokenFlush(&tb); - if (!pdf->trailer_dict) + if (_pdfioDictGetValue(trailer.value.dict, "Root")) { // Save the trailer dictionary and grab the root (catalog) and info // objects... 
diff --git a/pdfio-object.c b/pdfio-object.c index 0598abc..58a977e 100644 --- a/pdfio-object.c +++ b/pdfio-object.c @@ -141,6 +141,7 @@ pdfioObjCreateStream( pdfio_obj_t *obj, // I - Object pdfio_filter_t filter) // I - Type of compression to apply { + pdfio_stream_t *st; // Stream pdfio_obj_t *length_obj = NULL; // Length object, if any @@ -194,11 +195,13 @@ pdfioObjCreateStream( if (!_pdfioFilePuts(obj->pdf, "stream\n")) return (NULL); - obj->stream_offset = _pdfioFileTell(obj->pdf); - obj->pdf->current_obj = obj; + obj->stream_offset = _pdfioFileTell(obj->pdf); // Return the new stream... - return (_pdfioStreamCreate(obj, length_obj, 0, filter)); + if ((st = _pdfioStreamCreate(obj, length_obj, 0, filter)) != NULL) + obj->pdf->current_obj = obj; + + return (st); } @@ -534,6 +537,9 @@ pdfio_stream_t * // O - Stream or `NULL` on error pdfioObjOpenStream(pdfio_obj_t *obj, // I - Object bool decode) // I - Decode/decompress data? { + pdfio_stream_t *st; // Stream + + // Range check input... if (!obj) return (NULL); @@ -556,9 +562,10 @@ pdfioObjOpenStream(pdfio_obj_t *obj, // I - Object return (NULL); // Open the stream... - obj->pdf->current_obj = obj; + if ((st = _pdfioStreamOpen(obj, decode)) != NULL) + obj->pdf->current_obj = obj; - return (_pdfioStreamOpen(obj, decode)); + return (st); } diff --git a/pdfio-private.h b/pdfio-private.h index a4b6998..890c780 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -385,7 +385,7 @@ extern bool _pdfioFileError(pdfio_file_t *pdf, const char *format, ...) 
_PDFIO_ extern pdfio_obj_t *_pdfioFileFindMappedObj(pdfio_file_t *pdf, pdfio_file_t *src_pdf, size_t src_number) _PDFIO_INTERNAL; extern bool _pdfioFileFlush(pdfio_file_t *pdf) _PDFIO_INTERNAL; extern int _pdfioFileGetChar(pdfio_file_t *pdf) _PDFIO_INTERNAL; -extern bool _pdfioFileGets(pdfio_file_t *pdf, char *buffer, size_t bufsize) _PDFIO_INTERNAL; +extern bool _pdfioFileGets(pdfio_file_t *pdf, char *buffer, size_t bufsize, bool discard) _PDFIO_INTERNAL; extern ssize_t _pdfioFilePeek(pdfio_file_t *pdf, void *buffer, size_t bytes) _PDFIO_INTERNAL; extern bool _pdfioFilePrintf(pdfio_file_t *pdf, const char *format, ...) _PDFIO_INTERNAL; extern bool _pdfioFilePuts(pdfio_file_t *pdf, const char *s) _PDFIO_INTERNAL; diff --git a/pdfio-stream.c b/pdfio-stream.c index 3c9285b..e4e6703 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -259,7 +259,7 @@ _pdfioStreamCreate( { colors = 1; } - else if (colors < 0 || colors > 4) + else if (colors < 0 || colors > 32) { _pdfioFileError(st->pdf, "Unsupported Colors value %d.", colors); free(st); @@ -532,7 +532,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object { colors = 1; } - else if (colors < 0 || colors > 4) + else if (colors < 0 || colors > 32) { _pdfioFileError(st->pdf, "Unsupported Colors value %d.", colors); goto error; diff --git a/test-corpus.sh b/test-corpus.sh index ff33086..2ef0b05 100755 --- a/test-corpus.sh +++ b/test-corpus.sh @@ -18,12 +18,22 @@ if test $# = 0; then fi for file in $(find "$@" -name \*.pdf -print); do + # Don't worry about test files containing MIME garbage... + (head -4 $file | grep -q Content-Type) && continue; + + # Or test files containing MacBinary garbage... + (file $file | grep -q MacBinary) && continue; + + # Don't worry about test files that Xpdf can't handle... pdfinfo $file >/dev/null 2>&1 || continue; + # Run testpdfio to test loading the file... ./testpdfio $file >$file.log 2>&1 if test $? 
= 0; then + # Passed rm -f $file.log else + # Failed, preserve log and write filename to stdout... echo $file fi done