Fix implementation of LZWDecode filter to account for the EarlyChange parameter

(the parameter is somewhat buried and was very frustrating to find...)

Add some debugging output and update the test suite to look for page metadata
(such as the MediaBox) in any ancestor page object, not just the immediate parent.
This commit is contained in:
Michael R Sweet
2026-01-16 15:57:43 -05:00
parent e6e0b84dfc
commit 65098b5509
5 changed files with 74 additions and 26 deletions

View File

@@ -28,7 +28,8 @@ static int lzw_get_code(_pdfio_lzw_t *lzw);
//
_pdfio_lzw_t * // O - LZW state
_pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF)
_pdfioLZWCreate(int code_size, // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF)
int early) // I - Number of early codes
{
_pdfio_lzw_t *lzw; // LZW state
@@ -38,6 +39,7 @@ _pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for P
lzw->def_code_size = code_size + 1;
lzw->clear_code = (short)(1 << code_size);
lzw->eod_code = lzw->clear_code + 1;
lzw->early = early;
lzw_clear(lzw);
}
@@ -81,6 +83,7 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state
{
*(lzw->next_out++) = *(--lzw->stptr);
lzw->avail_out --;
PDFIO_DEBUG("_pdfioLZWInflate: Unrolled value %d, stptr=%p(%ld), avail_out=%u\n", *(lzw->stptr), (void *)lzw->stptr, lzw->stptr - lzw->stack, (unsigned)lzw->avail_out);
}
// Loop as long as we have room in the output buffer and data in the input
@@ -121,19 +124,20 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state
continue;
}
PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d.\n", in_code);
PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d, old_code=%d.\n", in_code, lzw->old_code);
cur_code = in_code;
if (cur_code >= lzw->next_code)
{
PDFIO_DEBUG("_pdfioLZWInflate: New cur_code=%d, next_code=%d\n", cur_code, lzw->next_code);
*(lzw->stptr++) = lzw->first_code;
cur_code = lzw->old_code;
}
while (cur_code >= lzw->clear_code)
{
PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d\n", cur_code);
PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d (%d,%d)\n", cur_code, lzw->table[cur_code].prefix_code, lzw->table[cur_code].suffix);
// Protect against overflow/loops...
if (lzw->stptr >= (lzw->stack + sizeof(lzw->stack) / sizeof(lzw->stack[0])))
@@ -168,16 +172,17 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state
if ((cur_code = lzw->next_code) < 4096)
{
PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d)\n", cur_code, lzw->old_code, lzw->first_code);
PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d), next_size_code=%d\n", cur_code, lzw->old_code, lzw->first_code, lzw->next_size_code);
lzw->table[cur_code].prefix_code = lzw->old_code;
lzw->table[cur_code].suffix = lzw->first_code;
lzw->next_code ++;
if (lzw->next_code >= lzw->next_size_code && lzw->next_size_code < 4096)
if (lzw->next_code >= lzw->next_size_code && lzw->cur_code_size < 12)
{
lzw->next_size_code *= 2;
lzw->cur_code_size ++;
lzw->next_size_code = (1 << lzw->cur_code_size) - lzw->early;
PDFIO_DEBUG("_pdfioLZWInflate: Increased code size to %u, next_size_code=%u\n", lzw->cur_code_size, lzw->next_size_code);
}
}
@@ -187,6 +192,7 @@ _pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state
{
*(lzw->next_out++) = *(--lzw->stptr);
lzw->avail_out --;
PDFIO_DEBUG("_pdfioLZWInflate: Unrolled value %d, stptr=%p(%ld), avail_out=%u\n", *(lzw->stptr), (void *)lzw->stptr, lzw->stptr - lzw->stack, (unsigned)lzw->avail_out);
}
}
@@ -208,7 +214,7 @@ lzw_clear(_pdfio_lzw_t *lzw) // I - LZW state
lzw->cur_code_size = lzw->def_code_size;
lzw->next_code = lzw->clear_code + 2;
lzw->next_size_code = 2 * lzw->clear_code;
lzw->next_size_code = (1 << lzw->cur_code_size) - lzw->early;
lzw->first_code = 0xffff;
lzw->old_code = 0xffff;
@@ -281,7 +287,7 @@ lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state
}
}
PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->cur_code_size);
PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->in_bytes[lzw->in_bit / 8 + 2], lzw->cur_code_size);
// Now extract the code from the buffer...
for (code = 0, in_bit = lzw->in_bit, remaining = lzw->cur_code_size; remaining > 0; in_bit += bits, remaining -= bits)
@@ -303,7 +309,12 @@ lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state
// Save the updated position in the input buffer and return the code...
lzw->in_bit = in_bit;
#ifdef DEBUG
if (code >= 0x20 && code < 0x7f)
PDFIO_DEBUG("lzw_get_code: Returning %u('%c').\n", code, code);
else
PDFIO_DEBUG("lzw_get_code: Returning %u.\n", code);
#endif // DEBUG
return ((int)code);
}

View File

@@ -1,7 +1,7 @@
//
// PDF page functions for PDFio.
//
// Copyright © 2021-2022 by Michael R Sweet.
// Copyright © 2021-2026 by Michael R Sweet.
//
// Licensed under Apache License v2.0. See the file "LICENSE" for more
// information.
@@ -87,14 +87,28 @@ pdfioPageOpenStream(
// Contents value
PDFIO_DEBUG("pdfioPageOpenStream(page=%p(%lu), n=%lu, decode=%s)\n", (void *)page, page ? (unsigned long)page->number : 0, (unsigned long)n, decode ? "true" : "false");
if (!contents)
{
PDFIO_DEBUG("pdfioPageOpenStream: No contents.\n");
return (NULL);
}
else if (contents->type == PDFIO_VALTYPE_ARRAY && n < pdfioArrayGetSize(contents->value.array))
{
PDFIO_DEBUG("pdfioPageOpenStream: Contents is array, opening numbered content stream.\n");
return (pdfioObjOpenStream(pdfioArrayGetObj(contents->value.array, n), decode));
}
else if (n)
{
PDFIO_DEBUG("pdfioPageOpenStream: Numbered stream does not exist.\n");
return (NULL);
}
else
{
PDFIO_DEBUG("pdfioPageOpenStream: Opening single content stream.\n");
return (pdfioObjOpenStream(pdfioFileFindObj(page->pdf, contents->value.indirect.number), decode));
}
}

View File

@@ -227,7 +227,8 @@ typedef struct _pdfio_lzw_s // LZW state
uint8_t *next_out; // Next output byte
size_t avail_out; // Available output bytes
uint8_t cur_code_size, // Current code size
def_code_size; // Initial/default code size
def_code_size, // Initial/default code size
early; // Early code change offset
uint16_t clear_code, // Clear code
eod_code, // End code
next_code, // Next code to be used
@@ -451,7 +452,7 @@ extern off_t _pdfioFileSeek(pdfio_file_t *pdf, off_t offset, int whence) _PDFIO
extern off_t _pdfioFileTell(pdfio_file_t *pdf) _PDFIO_INTERNAL;
extern bool _pdfioFileWrite(pdfio_file_t *pdf, const void *buffer, size_t bytes) _PDFIO_INTERNAL;
extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size) _PDFIO_INTERNAL;
extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size, int early) _PDFIO_INTERNAL;
extern void _pdfioLZWDelete(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL;
extern bool _pdfioLZWInflate(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL;

View File

@@ -610,7 +610,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object
st->cbsize = 4096;
if ((st->cbuffer = malloc(st->cbsize)) == NULL)
{
_pdfioFileError(st->pdf, "Unable to allocate %lu bytes for Flate compression buffer.", (unsigned long)st->cbsize);
_pdfioFileError(st->pdf, "Unable to allocate %lu bytes for FlateDecode decompression buffer.", (unsigned long)st->cbsize);
goto error;
}
@@ -633,16 +633,28 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object
if ((status = inflateInit(&(st->flate))) != Z_OK)
{
_pdfioFileError(st->pdf, "Unable to start Flate filter: %s", zstrerror(status));
_pdfioFileError(st->pdf, "Unable to start FlateDecode filter: %s", zstrerror(status));
goto error;
}
}
else
{
// LZW decompression...
if ((st->lzw = _pdfioLZWCreate(/*code_size*/8)) == NULL)
int early = 1;
if (pdfioDictGetType(params, "EarlyChange") == PDFIO_VALTYPE_NUMBER)
{
_pdfioFileError(st->pdf, "Unable to initialize LZW filter: %s", strerror(errno));
early = (int)pdfioDictGetNumber(params, "EarlyChange");
if (early < 0 || early > 100)
{
_pdfioFileError(st->pdf, "Bad EarlyChange value %d for LZWDecode filter.", early);
goto error;
}
}
if ((st->lzw = _pdfioLZWCreate(/*code_size*/8, early)) == NULL)
{
_pdfioFileError(st->pdf, "Unable to initialize LZWDecode filter: %s", strerror(errno));
goto error;
}

View File

@@ -12,6 +12,7 @@
//
// Options:
//
// --decode Decode object stream
// --help Show help
// --password PASSWORD Set access password
// --verbose Be verbose
@@ -34,7 +35,7 @@
static int do_crypto_tests(void);
static int do_lzw_tests(void);
static int do_pdfa_tests(void);
static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool verbose);
static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool decode, bool verbose);
static int do_unit_tests(void);
static int draw_image(pdfio_stream_t *st, const char *name, double x, double y, double w, double h, const char *label);
static bool error_cb(pdfio_file_t *pdf, const char *message, bool *error);
@@ -75,11 +76,16 @@ main(int argc, // I - Number of command-line arguments
{
int i; // Looping var
const char *password = NULL; // Password
bool verbose = false; // Be verbose?
bool decode = false, // Decode object stream?
verbose = false; // Be verbose?
for (i = 1; i < argc; i ++)
{
if (!strcmp(argv[i], "--help"))
if (!strcmp(argv[i], "--decode"))
{
decode = true;
}
else if (!strcmp(argv[i], "--help"))
{
return (usage(stdout));
}
@@ -108,14 +114,14 @@ main(int argc, // I - Number of command-line arguments
else if ((i + 1) < argc && isdigit(argv[i + 1][0] & 255))
{
// filename.pdf object-number
if (do_test_file(argv[i], /*outfile*/NULL, atoi(argv[i + 1]), password, verbose))
if (do_test_file(argv[i], /*outfile*/NULL, atoi(argv[i + 1]), password, decode, verbose))
ret = 1;
i ++;
}
else
{
if (do_test_file(argv[i], argv[i + 1], /*objnum*/0, password, verbose))
if (do_test_file(argv[i], argv[i + 1], /*objnum*/0, password, decode, verbose))
ret = 1;
if (argv[i + 1])
@@ -405,7 +411,7 @@ do_lzw_tests(void)
testBegin("_pdfioLZWCreate(8)");
testEnd((lzw = _pdfioLZWCreate(/*code_size*/8)) != NULL);
testEnd((lzw = _pdfioLZWCreate(/*code_size*/8, /*early*/1)) != NULL);
if (!lzw)
return (1);
@@ -538,6 +544,7 @@ do_test_file(const char *filename, // I - PDF filename
const char *outfile, // I - Output filename, if any
int objnum, // I - Object number to dump, if any
const char *password, // I - Password for file
bool decode, // I - Decode object?
bool verbose) // I - Be verbose?
{
int status = 0; // Exit status
@@ -586,7 +593,7 @@ do_test_file(const char *filename, // I - PDF filename
filter = pdfioDictGetName(dict, "Filter");
if ((st = pdfioObjOpenStream(obj, filter && !strcmp(filter, "FlateDecode"))) == NULL)
if ((st = pdfioObjOpenStream(obj, decode || (filter && !strcmp(filter, "FlateDecode")))) == NULL)
{
_pdfioValueDebug(&obj->value, stdout);
putchar('\n');
@@ -655,10 +662,13 @@ do_test_file(const char *filename, // I - PDF filename
if (!pdfioDictGetRect(dict, "MediaBox", &media_box))
{
if ((obj = pdfioDictGetObj(dict, "Parent")) != NULL)
pdfio_obj_t *parent; // Parent object
while ((parent = pdfioDictGetObj(dict, "Parent")) != NULL)
{
dict = pdfioObjGetDict(obj);
pdfioDictGetRect(dict, "MediaBox", &media_box);
dict = pdfioObjGetDict(parent);
if (pdfioDictGetRect(dict, "MediaBox", &media_box))
break;
}
}