diff --git a/pdfio-lzw.c b/pdfio-lzw.c index 50243ec..beb4a13 100644 --- a/pdfio-lzw.c +++ b/pdfio-lzw.c @@ -28,7 +28,8 @@ static int lzw_get_code(_pdfio_lzw_t *lzw); // _pdfio_lzw_t * // O - LZW state -_pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF) +_pdfioLZWCreate(int code_size, // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF) + int early) // I - Number of early codes { _pdfio_lzw_t *lzw; // LZW state @@ -38,6 +39,7 @@ _pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for P lzw->def_code_size = code_size + 1; lzw->clear_code = (short)(1 << code_size); lzw->eod_code = lzw->clear_code + 1; + lzw->early = early; lzw_clear(lzw); } @@ -81,6 +83,7 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state { *(lzw->next_out++) = *(--lzw->stptr); lzw->avail_out --; + PDFIO_DEBUG("_pdfioLZWInflate: Unrolled value %d, stptr=%p(%ld), avail_out=%u\n", *(lzw->stptr), (void *)lzw->stptr, lzw->stptr - lzw->stack, (unsigned)lzw->avail_out); } // Loop as long as we have room in the output buffer and data in the input @@ -121,19 +124,20 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state continue; } - PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d.\n", in_code); + PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d, old_code=%d.\n", in_code, lzw->old_code); cur_code = in_code; if (cur_code >= lzw->next_code) { + PDFIO_DEBUG("_pdfioLZWInflate: New cur_code=%d, next_code=%d\n", cur_code, lzw->next_code); *(lzw->stptr++) = lzw->first_code; cur_code = lzw->old_code; } while (cur_code >= lzw->clear_code) { - PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d\n", cur_code); + PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d (%d,%d)\n", cur_code, lzw->table[cur_code].prefix_code, lzw->table[cur_code].suffix); // Protect against overflow/loops... if (lzw->stptr >= (lzw->stack + sizeof(lzw->stack) / sizeof(lzw->stack[0]))) @@ -168,16 +172,17 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state if ((cur_code = lzw->next_code) < 4096) { - PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d)\n", cur_code, lzw->old_code, lzw->first_code); + PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d), next_size_code=%d\n", cur_code, lzw->old_code, lzw->first_code, lzw->next_size_code); lzw->table[cur_code].prefix_code = lzw->old_code; lzw->table[cur_code].suffix = lzw->first_code; lzw->next_code ++; - if (lzw->next_code >= lzw->next_size_code && lzw->next_size_code < 4096) + if (lzw->next_code >= lzw->next_size_code && lzw->cur_code_size < 12) { - lzw->next_size_code *= 2; lzw->cur_code_size ++; + lzw->next_size_code = (1 << lzw->cur_code_size) - lzw->early; + PDFIO_DEBUG("_pdfioLZWInflate: Increased code size to %u, next_size_code=%u\n", lzw->cur_code_size, lzw->next_size_code); } } @@ -187,6 +192,7 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state { *(lzw->next_out++) = *(--lzw->stptr); lzw->avail_out --; + PDFIO_DEBUG("_pdfioLZWInflate: Unrolled value %d, stptr=%p(%ld), avail_out=%u\n", *(lzw->stptr), (void *)lzw->stptr, lzw->stptr - lzw->stack, (unsigned)lzw->avail_out); } } @@ -208,7 +214,7 @@ lzw_clear(_pdfio_lzw_t *lzw) // I - LZW state lzw->cur_code_size = lzw->def_code_size; lzw->next_code = lzw->clear_code + 2; - lzw->next_size_code = 2 * lzw->clear_code; + lzw->next_size_code = (1 << lzw->cur_code_size) - lzw->early; lzw->first_code = 0xffff; lzw->old_code = 0xffff; @@ -281,7 +287,7 @@ lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state } } - PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->cur_code_size); + PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->in_bytes[lzw->in_bit / 8 + 2], lzw->cur_code_size); // Now extract the code from the buffer... for (code = 0, in_bit = lzw->in_bit, remaining = lzw->cur_code_size; remaining > 0; in_bit += bits, remaining -= bits) @@ -303,7 +309,12 @@ lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state // Save the updated position in the input buffer and return the code... lzw->in_bit = in_bit; - PDFIO_DEBUG("lzw_get_code: Returning %u.\n", code); +#ifdef DEBUG + if (code >= 0x20 && code < 0x7f) + PDFIO_DEBUG("lzw_get_code: Returning %u('%c').\n", code, code); + else + PDFIO_DEBUG("lzw_get_code: Returning %u.\n", code); +#endif // DEBUG return ((int)code); } diff --git a/pdfio-page.c b/pdfio-page.c index 94e6ff8..a7b45de 100644 --- a/pdfio-page.c +++ b/pdfio-page.c @@ -1,7 +1,7 @@ // // PDF page functions for PDFio. // -// Copyright © 2021-2022 by Michael R Sweet. +// Copyright © 2021-2026 by Michael R Sweet. // // Licensed under Apache License v2.0. See the file "LICENSE" for more // information. @@ -87,14 +87,28 @@ pdfioPageOpenStream( // Contents value + PDFIO_DEBUG("pdfioPageOpenStream(page=%p(%lu), n=%lu, decode=%s)\n", (void *)page, page ? (unsigned long)page->number : 0, (unsigned long)n, decode ? "true" : "false"); + if (!contents) + { + PDFIO_DEBUG("pdfioPageOpenStream: No contents.\n"); return (NULL); + } else if (contents->type == PDFIO_VALTYPE_ARRAY && n < pdfioArrayGetSize(contents->value.array)) + { + PDFIO_DEBUG("pdfioPageOpenStream: Contents is array, opening numbered content stream.\n"); return (pdfioObjOpenStream(pdfioArrayGetObj(contents->value.array, n), decode)); + } else if (n) + { + PDFIO_DEBUG("pdfioPageOpenStream: Numbered stream does not exist.\n"); return (NULL); + } else + { + PDFIO_DEBUG("pdfioPageOpenStream: Opening single content stream.\n"); return (pdfioObjOpenStream(pdfioFileFindObj(page->pdf, contents->value.indirect.number), decode)); + } } diff --git a/pdfio-private.h b/pdfio-private.h index 3a2a618..95bd087 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -227,7 +227,8 @@ typedef struct _pdfio_lzw_s // LZW state uint8_t *next_out; // Next output byte size_t avail_out; // Available output bytes uint8_t cur_code_size, // Current code size - def_code_size; // Initial/default code size + def_code_size, // Initial/default code size + early; // Early code change offset uint16_t clear_code, // Clear code eod_code, // End code next_code, // Next code to be used @@ -451,7 +452,7 @@ extern off_t _pdfioFileSeek(pdfio_file_t *pdf, off_t offset, int whence) _PDFIO extern off_t _pdfioFileTell(pdfio_file_t *pdf) _PDFIO_INTERNAL; extern bool _pdfioFileWrite(pdfio_file_t *pdf, const void *buffer, size_t bytes) _PDFIO_INTERNAL; -extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size) _PDFIO_INTERNAL; +extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size, int early) _PDFIO_INTERNAL; extern void _pdfioLZWDelete(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL; extern bool _pdfioLZWInflate(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL; diff --git a/pdfio-stream.c b/pdfio-stream.c index f075eec..99541e8 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -610,7 +610,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object st->cbsize = 4096; if ((st->cbuffer = malloc(st->cbsize)) == NULL) { - _pdfioFileError(st->pdf, "Unable to allocate %lu bytes for Flate compression buffer.", (unsigned long)st->cbsize); + _pdfioFileError(st->pdf, "Unable to allocate %lu bytes for FlateDecode decompression buffer.", (unsigned long)st->cbsize); goto error; } @@ -633,16 +633,28 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object if ((status = inflateInit(&(st->flate))) != Z_OK) { - _pdfioFileError(st->pdf, "Unable to start Flate filter: %s", zstrerror(status)); + _pdfioFileError(st->pdf, "Unable to start FlateDecode filter: %s", zstrerror(status)); goto error; } } else { // LZW decompression... - if ((st->lzw = _pdfioLZWCreate(/*code_size*/8)) == NULL) + int early = 1; + + if (pdfioDictGetType(params, "EarlyChange") == PDFIO_VALTYPE_NUMBER) { - _pdfioFileError(st->pdf, "Unable to initialize LZW filter: %s", strerror(errno)); + early = (int)pdfioDictGetNumber(params, "EarlyChange"); + if (early < 0 || early > 100) + { + _pdfioFileError(st->pdf, "Bad EarlyChange value %d for LZWDecode filter.", early); + goto error; + } + } + + if ((st->lzw = _pdfioLZWCreate(/*code_size*/8, early)) == NULL) + { + _pdfioFileError(st->pdf, "Unable to initialize LZWDecode filter: %s", strerror(errno)); goto error; } diff --git a/testpdfio.c b/testpdfio.c index 0c95fae..9028d0d 100644 --- a/testpdfio.c +++ b/testpdfio.c @@ -12,6 +12,7 @@ // // Options: // +// --decode Decode object stream // --help Show help // --password PASSWORD Set access password // --verbose Be verbose @@ -34,7 +35,7 @@ static int do_crypto_tests(void); static int do_lzw_tests(void); static int do_pdfa_tests(void); -static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool verbose); +static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool decode, bool verbose); static int do_unit_tests(void); static int draw_image(pdfio_stream_t *st, const char *name, double x, double y, double w, double h, const char *label); static bool error_cb(pdfio_file_t *pdf, const char *message, bool *error); @@ -75,11 +76,16 @@ main(int argc, // I - Number of command-line arguments { int i; // Looping var const char *password = NULL; // Password - bool verbose = false; // Be verbose? + bool decode = false, // Decode object stream? + verbose = false; // Be verbose? for (i = 1; i < argc; i ++) { - if (!strcmp(argv[i], "--help")) + if (!strcmp(argv[i], "--decode")) + { + decode = true; + } + else if (!strcmp(argv[i], "--help")) { return (usage(stdout)); } @@ -108,14 +114,14 @@ main(int argc, // I - Number of command-line arguments else if ((i + 1) < argc && isdigit(argv[i + 1][0] & 255)) { // filename.pdf object-number - if (do_test_file(argv[i], /*outfile*/NULL, atoi(argv[i + 1]), password, verbose)) + if (do_test_file(argv[i], /*outfile*/NULL, atoi(argv[i + 1]), password, decode, verbose)) ret = 1; i ++; } else { - if (do_test_file(argv[i], argv[i + 1], /*objnum*/0, password, verbose)) + if (do_test_file(argv[i], argv[i + 1], /*objnum*/0, password, decode, verbose)) ret = 1; if (argv[i + 1]) @@ -405,7 +411,7 @@ do_lzw_tests(void) testBegin("_pdfioLZWCreate(8)"); - testEnd((lzw = _pdfioLZWCreate(/*code_size*/8)) != NULL); + testEnd((lzw = _pdfioLZWCreate(/*code_size*/8, /*early*/1)) != NULL); if (!lzw) return (1); @@ -538,6 +544,7 @@ do_test_file(const char *filename, // I - PDF filename const char *outfile, // I - Output filename, if any int objnum, // I - Object number to dump, if any const char *password, // I - Password for file + bool decode, // I - Decode object? bool verbose) // I - Be verbose? { int status = 0; // Exit status @@ -586,7 +593,7 @@ do_test_file(const char *filename, // I - PDF filename filter = pdfioDictGetName(dict, "Filter"); - if ((st = pdfioObjOpenStream(obj, filter && !strcmp(filter, "FlateDecode"))) == NULL) + if ((st = pdfioObjOpenStream(obj, decode || (filter && !strcmp(filter, "FlateDecode")))) == NULL) { _pdfioValueDebug(&obj->value, stdout); putchar('\n'); @@ -655,10 +662,13 @@ do_test_file(const char *filename, // I - PDF filename if (!pdfioDictGetRect(dict, "MediaBox", &media_box)) { - if ((obj = pdfioDictGetObj(dict, "Parent")) != NULL) + pdfio_obj_t *parent; // Parent object + + while ((parent = pdfioDictGetObj(dict, "Parent")) != NULL) { - dict = pdfioObjGetDict(obj); - pdfioDictGetRect(dict, "MediaBox", &media_box); + dict = pdfioObjGetDict(parent); + if (pdfioDictGetRect(dict, "MediaBox", &media_box)) + break; } }