From 65098b5509fd25480b77c77145ddaa80b9e02421 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Fri, 16 Jan 2026 15:57:43 -0500 Subject: [PATCH] Fix implementation of LZWDecode filter to account for the EarlyChange parameter (somewhat buried, very frustrating...) Add some debugging and update the test suite to find page metadata in any parent page object. --- pdfio-lzw.c | 29 ++++++++++++++++++++--------- pdfio-page.c | 16 +++++++++++++++- pdfio-private.h | 5 +++-- pdfio-stream.c | 20 ++++++++++++++++---- testpdfio.c | 30 ++++++++++++++++++++---------- 5 files changed, 74 insertions(+), 26 deletions(-) diff --git a/pdfio-lzw.c b/pdfio-lzw.c index 50243ec..beb4a13 100644 --- a/pdfio-lzw.c +++ b/pdfio-lzw.c @@ -28,7 +28,8 @@ static int lzw_get_code(_pdfio_lzw_t *lzw); // _pdfio_lzw_t * // O - LZW state -_pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF) +_pdfioLZWCreate(int code_size, // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF) + int early) // I - Number of early codes { _pdfio_lzw_t *lzw; // LZW state @@ -38,6 +39,7 @@ _pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for P lzw->def_code_size = code_size + 1; lzw->clear_code = (short)(1 << code_size); lzw->eod_code = lzw->clear_code + 1; + lzw->early = early; lzw_clear(lzw); } @@ -81,6 +83,7 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state { *(lzw->next_out++) = *(--lzw->stptr); lzw->avail_out --; + PDFIO_DEBUG("_pdfioLZWInflate: Unrolled value %d, stptr=%p(%ld), avail_out=%u\n", *(lzw->stptr), (void *)lzw->stptr, lzw->stptr - lzw->stack, (unsigned)lzw->avail_out); } // Loop as long as we have room in the output buffer and data in the input @@ -121,19 +124,20 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state continue; } - PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d.\n", in_code); + PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d, old_code=%d.\n", in_code, lzw->old_code); cur_code = in_code; if (cur_code >= lzw->next_code) { + PDFIO_DEBUG("_pdfioLZWInflate: New cur_code=%d, next_code=%d\n", cur_code, lzw->next_code); *(lzw->stptr++) = lzw->first_code; cur_code = lzw->old_code; } while (cur_code >= lzw->clear_code) { - PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d\n", cur_code); + PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d (%d,%d)\n", cur_code, lzw->table[cur_code].prefix_code, lzw->table[cur_code].suffix); // Protect against overflow/loops... if (lzw->stptr >= (lzw->stack + sizeof(lzw->stack) / sizeof(lzw->stack[0]))) @@ -168,16 +172,17 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state if ((cur_code = lzw->next_code) < 4096) { - PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d)\n", cur_code, lzw->old_code, lzw->first_code); + PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d), next_size_code=%d\n", cur_code, lzw->old_code, lzw->first_code, lzw->next_size_code); lzw->table[cur_code].prefix_code = lzw->old_code; lzw->table[cur_code].suffix = lzw->first_code; lzw->next_code ++; - if (lzw->next_code >= lzw->next_size_code && lzw->next_size_code < 4096) + if (lzw->next_code >= lzw->next_size_code && lzw->cur_code_size < 12) { - lzw->next_size_code *= 2; lzw->cur_code_size ++; + lzw->next_size_code = (1 << lzw->cur_code_size) - lzw->early; + PDFIO_DEBUG("_pdfioLZWInflate: Increased code size to %u, next_size_code=%u\n", lzw->cur_code_size, lzw->next_size_code); } } @@ -187,6 +192,7 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state { *(lzw->next_out++) = *(--lzw->stptr); lzw->avail_out --; + PDFIO_DEBUG("_pdfioLZWInflate: Unrolled value %d, stptr=%p(%ld), avail_out=%u\n", *(lzw->stptr), (void *)lzw->stptr, lzw->stptr - lzw->stack, (unsigned)lzw->avail_out); } } @@ -208,7 +214,7 @@ lzw_clear(_pdfio_lzw_t *lzw) // I - LZW state lzw->cur_code_size = lzw->def_code_size; lzw->next_code = lzw->clear_code + 2; - lzw->next_size_code = 2 * lzw->clear_code; + lzw->next_size_code = (1 << lzw->cur_code_size) - lzw->early; lzw->first_code = 0xffff; lzw->old_code = 0xffff; @@ -281,7 +287,7 @@ lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state } } - PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->cur_code_size); + PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->in_bytes[lzw->in_bit / 8 + 2], lzw->cur_code_size); // Now extract the code from the buffer... for (code = 0, in_bit = lzw->in_bit, remaining = lzw->cur_code_size; remaining > 0; in_bit += bits, remaining -= bits) @@ -303,7 +309,12 @@ lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state // Save the updated position in the input buffer and return the code... lzw->in_bit = in_bit; - PDFIO_DEBUG("lzw_get_code: Returning %u.\n", code); +#ifdef DEBUG + if (code >= 0x20 && code < 0x7f) + PDFIO_DEBUG("lzw_get_code: Returning %u('%c').\n", code, code); + else + PDFIO_DEBUG("lzw_get_code: Returning %u.\n", code); +#endif // DEBUG return ((int)code); } diff --git a/pdfio-page.c b/pdfio-page.c index 94e6ff8..a7b45de 100644 --- a/pdfio-page.c +++ b/pdfio-page.c @@ -1,7 +1,7 @@ // // PDF page functions for PDFio. // -// Copyright © 2021-2022 by Michael R Sweet. +// Copyright © 2021-2026 by Michael R Sweet. // // Licensed under Apache License v2.0. See the file "LICENSE" for more // information. @@ -87,14 +87,28 @@ pdfioPageOpenStream( // Contents value + PDFIO_DEBUG("pdfioPageOpenStream(page=%p(%lu), n=%lu, decode=%s)\n", (void *)page, page ? (unsigned long)page->number : 0, (unsigned long)n, decode ? "true" : "false"); + if (!contents) + { + PDFIO_DEBUG("pdfioPageOpenStream: No contents.\n"); return (NULL); + } else if (contents->type == PDFIO_VALTYPE_ARRAY && n < pdfioArrayGetSize(contents->value.array)) + { + PDFIO_DEBUG("pdfioPageOpenStream: Contents is array, opening numbered content stream.\n"); return (pdfioObjOpenStream(pdfioArrayGetObj(contents->value.array, n), decode)); + } else if (n) + { + PDFIO_DEBUG("pdfioPageOpenStream: Numbered stream does not exist.\n"); return (NULL); + } else + { + PDFIO_DEBUG("pdfioPageOpenStream: Opening single content stream.\n"); return (pdfioObjOpenStream(pdfioFileFindObj(page->pdf, contents->value.indirect.number), decode)); + } } diff --git a/pdfio-private.h b/pdfio-private.h index 3a2a618..95bd087 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -227,7 +227,8 @@ typedef struct _pdfio_lzw_s // LZW state uint8_t *next_out; // Next output byte size_t avail_out; // Available output bytes uint8_t cur_code_size, // Current code size - def_code_size; // Initial/default code size + def_code_size, // Initial/default code size + early; // Early code change offset uint16_t clear_code, // Clear code eod_code, // End code next_code, // Next code to be used @@ -451,7 +452,7 @@ extern off_t _pdfioFileSeek(pdfio_file_t *pdf, off_t offset, int whence) _PDFIO extern off_t _pdfioFileTell(pdfio_file_t *pdf) _PDFIO_INTERNAL; extern bool _pdfioFileWrite(pdfio_file_t *pdf, const void *buffer, size_t bytes) _PDFIO_INTERNAL; -extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size) _PDFIO_INTERNAL; +extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size, int early) _PDFIO_INTERNAL; extern void _pdfioLZWDelete(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL; extern bool _pdfioLZWInflate(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL; diff --git a/pdfio-stream.c b/pdfio-stream.c index f075eec..99541e8 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -610,7 +610,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object st->cbsize = 4096; if ((st->cbuffer = malloc(st->cbsize)) == NULL) { - _pdfioFileError(st->pdf, "Unable to allocate %lu bytes for Flate compression buffer.", (unsigned long)st->cbsize); + _pdfioFileError(st->pdf, "Unable to allocate %lu bytes for FlateDecode decompression buffer.", (unsigned long)st->cbsize); goto error; } @@ -633,16 +633,28 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object if ((status = inflateInit(&(st->flate))) != Z_OK) { - _pdfioFileError(st->pdf, "Unable to start Flate filter: %s", zstrerror(status)); + _pdfioFileError(st->pdf, "Unable to start FlateDecode filter: %s", zstrerror(status)); goto error; } } else { // LZW decompression... - if ((st->lzw = _pdfioLZWCreate(/*code_size*/8)) == NULL) + int early = 1; + + if (pdfioDictGetType(params, "EarlyChange") == PDFIO_VALTYPE_NUMBER) { - _pdfioFileError(st->pdf, "Unable to initialize LZW filter: %s", strerror(errno)); + early = (int)pdfioDictGetNumber(params, "EarlyChange"); + if (early < 0 || early > 100) + { + _pdfioFileError(st->pdf, "Bad EarlyChange value %d for LZWDecode filter.", early); + goto error; + } + } + + if ((st->lzw = _pdfioLZWCreate(/*code_size*/8, early)) == NULL) + { + _pdfioFileError(st->pdf, "Unable to initialize LZWDecode filter: %s", strerror(errno)); goto error; } diff --git a/testpdfio.c b/testpdfio.c index 0c95fae..9028d0d 100644 --- a/testpdfio.c +++ b/testpdfio.c @@ -12,6 +12,7 @@ // // Options: // +// --decode Decode object stream // --help Show help // --password PASSWORD Set access password // --verbose Be verbose @@ -34,7 +35,7 @@ static int do_crypto_tests(void); static int do_lzw_tests(void); static int do_pdfa_tests(void); -static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool verbose); +static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool decode, bool verbose); static int do_unit_tests(void); static int draw_image(pdfio_stream_t *st, const char *name, double x, double y, double w, double h, const char *label); static bool error_cb(pdfio_file_t *pdf, const char *message, bool *error); @@ -75,11 +76,16 @@ main(int argc, // I - Number of command-line arguments { int i; // Looping var const char *password = NULL; // Password - bool verbose = false; // Be verbose? + bool decode = false, // Decode object stream? + verbose = false; // Be verbose? for (i = 1; i < argc; i ++) { - if (!strcmp(argv[i], "--help")) + if (!strcmp(argv[i], "--decode")) + { + decode = true; + } + else if (!strcmp(argv[i], "--help")) { return (usage(stdout)); } @@ -108,14 +114,14 @@ main(int argc, // I - Number of command-line arguments else if ((i + 1) < argc && isdigit(argv[i + 1][0] & 255)) { // filename.pdf object-number - if (do_test_file(argv[i], /*outfile*/NULL, atoi(argv[i + 1]), password, verbose)) + if (do_test_file(argv[i], /*outfile*/NULL, atoi(argv[i + 1]), password, decode, verbose)) ret = 1; i ++; } else { - if (do_test_file(argv[i], argv[i + 1], /*objnum*/0, password, verbose)) + if (do_test_file(argv[i], argv[i + 1], /*objnum*/0, password, decode, verbose)) ret = 1; if (argv[i + 1]) @@ -405,7 +411,7 @@ do_lzw_tests(void) testBegin("_pdfioLZWCreate(8)"); - testEnd((lzw = _pdfioLZWCreate(/*code_size*/8)) != NULL); + testEnd((lzw = _pdfioLZWCreate(/*code_size*/8, /*early*/1)) != NULL); if (!lzw) return (1); @@ -538,6 +544,7 @@ do_test_file(const char *filename, // I - PDF filename const char *outfile, // I - Output filename, if any int objnum, // I - Object number to dump, if any const char *password, // I - Password for file + bool decode, // I - Decode object? bool verbose) // I - Be verbose? { int status = 0; // Exit status @@ -586,7 +593,7 @@ do_test_file(const char *filename, // I - PDF filename filter = pdfioDictGetName(dict, "Filter"); - if ((st = pdfioObjOpenStream(obj, filter && !strcmp(filter, "FlateDecode"))) == NULL) + if ((st = pdfioObjOpenStream(obj, decode || (filter && !strcmp(filter, "FlateDecode")))) == NULL) { _pdfioValueDebug(&obj->value, stdout); putchar('\n'); @@ -655,10 +662,13 @@ do_test_file(const char *filename, // I - PDF filename if (!pdfioDictGetRect(dict, "MediaBox", &media_box)) { - if ((obj = pdfioDictGetObj(dict, "Parent")) != NULL) + pdfio_obj_t *parent; // Parent object + + while ((parent = pdfioDictGetObj(dict, "Parent")) != NULL) { - dict = pdfioObjGetDict(obj); - pdfioDictGetRect(dict, "MediaBox", &media_box); + dict = pdfioObjGetDict(parent); + if (pdfioDictGetRect(dict, "MediaBox", &media_box)) + break; } }