From 09520d250f36f785d7de5c4710d6d66df47b9607 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Fri, 16 Jan 2026 09:53:51 -0500 Subject: [PATCH] Add support for LZWDecode filter, needs more testing (Issue #11) --- CHANGES.md | 1 + Makefile.in | 3 +- pdfio-lzw.c | 309 ++++++++++++++++++++++++++++++++++++++++++++++++ pdfio-private.h | 37 +++++- pdfio-stream.c | 293 ++++++++++++++++++++++++--------------------- pdfio.h | 6 +- pdfio.vcxproj | 1 + test.h | 10 ++ testpdfio.c | 101 ++++++++++++++++ 9 files changed, 620 insertions(+), 141 deletions(-) create mode 100644 pdfio-lzw.c diff --git a/CHANGES.md b/CHANGES.md index 4dbc0c5..ce2b071 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,7 @@ v1.7.0 - YYYY-MM-DD - Now use TTF 1.1 or later for font support. - Added support for basic compound stream filters for ASCII85Decode support (Issue #11) +- Added support for LZWDecode filters (Issue #11) - Fixed a buffer overflow in the (still not enabled) AES-256 code. diff --git a/Makefile.in b/Makefile.in index 867e6ca..5bc1842 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,7 +1,7 @@ # # Makefile for PDFio. # -# Copyright © 2021-2025 by Michael R Sweet. +# Copyright © 2021-2026 by Michael R Sweet. # # Licensed under Apache License v2.0. See the file "LICENSE" for more # information. @@ -91,6 +91,7 @@ PUBOBJS = \ pdfio-crypto.o \ pdfio-dict.o \ pdfio-file.o \ + pdfio-lzw.o \ pdfio-md5.o \ pdfio-object.o \ pdfio-page.o \ diff --git a/pdfio-lzw.c b/pdfio-lzw.c new file mode 100644 index 0000000..50243ec --- /dev/null +++ b/pdfio-lzw.c @@ -0,0 +1,309 @@ +// +// LZW decoding functions for PDFio. +// +// This code is used to support (legacy) PDF object streams using the LZWDecode +// filter as well as when embedding (legacy) GIF images. None of this is public +// API and we only support reading (decoding) since FlateDecode is superior in +// every way. +// +// Copyright © 2026 by Michael R Sweet. +// +// Licensed under Apache License v2.0. 
See the file "LICENSE" for more +// information. +// + +#include "pdfio-private.h" + + +// +// Local functions... +// + +static void lzw_clear(_pdfio_lzw_t *lzw); +static int lzw_get_code(_pdfio_lzw_t *lzw); + + +// +// '_pdfioLZWCreate()' - Create an LZW decompressor. +// + +_pdfio_lzw_t * // O - LZW state +_pdfioLZWCreate(int code_size) // I - Data code size in bits (typically 8 for PDF, 2-8 for GIF) +{ + _pdfio_lzw_t *lzw; // LZW state + + + if ((lzw = (_pdfio_lzw_t *)calloc(1, sizeof(_pdfio_lzw_t))) != NULL) + { + lzw->def_code_size = code_size + 1; + lzw->clear_code = (short)(1 << code_size); + lzw->eod_code = lzw->clear_code + 1; + + lzw_clear(lzw); + } + + return (lzw); +} + + +// +// '_pdfioLZWDelete()' - Delete an LZW decompressor. +// + +void +_pdfioLZWDelete(_pdfio_lzw_t *lzw) // I - LZW state +{ + free(lzw); +} + + +// +// '_pdfioLZWInflate()' - Decompress pending input data. +// + +bool // O - `true` on success, `false` on error +_pdfioLZWInflate(_pdfio_lzw_t *lzw) // I - LZW state +{ + int cur_code, // Current code + in_code; // Input code + + + // Stop if we already saw the "end of data" code... + if (lzw->saw_eod) + { + PDFIO_DEBUG("_pdfioLZWInflate: EOD, returning false.\n"); + lzw->error = "End of data."; + return (false); + } + + // Copy pending decompressed data to the output buffer... + while (lzw->stptr > lzw->stack && lzw->avail_out > 0) + { + *(lzw->next_out++) = *(--lzw->stptr); + lzw->avail_out --; + } + + // Loop as long as we have room in the output buffer and data in the input + // buffer... + while (lzw->avail_out > 0) + { + if ((in_code = lzw_get_code(lzw)) < 0) + { + // Out of data, stop now... + PDFIO_DEBUG("_pdfioLZWInflate: Out of data.\n"); + break; + } + else if (in_code == lzw->clear_code) + { + // Clear the compression tables and reset... + lzw_clear(lzw); + PDFIO_DEBUG("_pdfioLZWInflate: Clear.\n"); + continue; + } + else if (in_code == lzw->eod_code) + { + // End of data... 
+ lzw->saw_eod = true; + PDFIO_DEBUG("_pdfioLZWInflate: EOD.\n"); + break; + } + + // If we get this far we have something to write to the output buffer and/or + // stack... + if (lzw->first_code == 0xffff) + { + // First code... + lzw->first_code = lzw->old_code = in_code; + *(lzw->next_out++) = in_code; + lzw->avail_out --; + + PDFIO_DEBUG("_pdfioLZWInflate: first_code=%d.\n", in_code); + continue; + } + + PDFIO_DEBUG("_pdfioLZWInflate: in_code=%d.\n", in_code); + + cur_code = in_code; + + if (cur_code >= lzw->next_code) + { + *(lzw->stptr++) = lzw->first_code; + cur_code = lzw->old_code; + } + + while (cur_code >= lzw->clear_code) + { + PDFIO_DEBUG("_pdfioLZWInflate: cur_code=%d\n", cur_code); + + // Protect against overflow/loops... + if (lzw->stptr >= (lzw->stack + sizeof(lzw->stack) / sizeof(lzw->stack[0]))) + { + PDFIO_DEBUG("_pdfioLZWInflate: Stack overflow, returning false.\n"); + lzw->error = "Output overflow."; + return (false); + } + + // Add this character to the output stack and move to the next character + // in the sequence... 
+ *(lzw->stptr++) = lzw->table[cur_code].suffix; + + if (cur_code == lzw->table[cur_code].prefix_code) + { + PDFIO_DEBUG("_pdfioLZWInflate: Table loop on code %d, returning false.\n", cur_code); + lzw->error = "Table loop detected."; + return (false); + } + + cur_code = lzw->table[cur_code].prefix_code; + } + + if (lzw->stptr >= (lzw->stack + sizeof(lzw->stack) / sizeof(lzw->stack[0]))) + { + PDFIO_DEBUG("_pdfioLZWInflate: Stack overflow, returning false.\n"); + lzw->error = "Output overflow."; + return (false); + } + + *(lzw->stptr++) = lzw->first_code = lzw->table[cur_code].suffix; + + if ((cur_code = lzw->next_code) < 4096) + { + PDFIO_DEBUG("_pdfioLZWInflate: Adding code %d (%d,%d)\n", cur_code, lzw->old_code, lzw->first_code); + + lzw->table[cur_code].prefix_code = lzw->old_code; + lzw->table[cur_code].suffix = lzw->first_code; + lzw->next_code ++; + + if (lzw->next_code >= lzw->next_size_code && lzw->next_size_code < 4096) + { + lzw->next_size_code *= 2; + lzw->cur_code_size ++; + } + } + + lzw->old_code = (uint16_t)in_code; + + while (lzw->stptr > lzw->stack && lzw->avail_out > 0) + { + *(lzw->next_out++) = *(--lzw->stptr); + lzw->avail_out --; + } + } + + PDFIO_DEBUG("_pdfioLZWInflate: Returning true, avail_in=%u, avail_out=%u.\n", (unsigned)lzw->avail_in, (unsigned)lzw->avail_out); + + return (true); +} + + +// +// 'lzw_clear()' - Clear the compression table. +// + +static void +lzw_clear(_pdfio_lzw_t *lzw) // I - LZW state +{ + uint16_t i; // Looping var + + + lzw->cur_code_size = lzw->def_code_size; + lzw->next_code = lzw->clear_code + 2; + lzw->next_size_code = 2 * lzw->clear_code; + lzw->first_code = 0xffff; + lzw->old_code = 0xffff; + + memset(lzw->table, 0, sizeof(lzw->table)); + + for (i = 0; i < lzw->clear_code; i ++) + lzw->table[i].suffix = i; + + lzw->stptr = lzw->stack; +} + + +// +// 'lzw_get_code()' - Get a code from the input buffer. 
+// + +static int // O - Code or -1 if there is not enough data available +lzw_get_code(_pdfio_lzw_t *lzw) // I - LZW state +{ + uint16_t code, // Code + in_bit; // Bit offset in buffer + uint8_t bits, // Bits in current byte + boff, // Bit offset in current byte + byte, // Current byte + remaining; // Remaining bits for code + static uint8_t mask[8] = // Value mask + { + 0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f + }; + + + // Fill input bytes as needed... + if ((lzw->in_bit + lzw->cur_code_size) > lzw->in_bits) + { + uint16_t in_used = lzw->in_bits / 8, + // Number of input bytes + in_offset = lzw->in_bit / 8, + // Offset to current input + in_add; // Number of bytes to "read" + + + if (lzw->avail_in == 0) + { + // No more data + PDFIO_DEBUG("lzw_get_code: No data, returning -1.\n"); + return (-1); + } + + if (in_offset > 0) + { + // Make room in the input buffer + memmove(lzw->in_bytes, lzw->in_bytes + in_offset, in_used - in_offset); + in_used -= in_offset; + lzw->in_bit &= 7; + } + + if ((in_add = sizeof(lzw->in_bytes) - in_used) > lzw->avail_in) + in_add = lzw->avail_in; + + memcpy(lzw->in_bytes + in_used, lzw->next_in, in_add); + lzw->next_in += in_add; + lzw->avail_in -= in_add; + lzw->in_bits = 8 * (in_used + in_add); + + if ((lzw->in_bit + lzw->cur_code_size) > lzw->in_bits) + { + // Not enough data + PDFIO_DEBUG("lzw_get_code: Not enough data, returning -1.\n"); + return (-1); + } + } + + PDFIO_DEBUG("lzw_get_code: in_bit=%u, in_bits=%u, in_bytes=<...%02X%02X...>, cur_code_size=%u\n", lzw->in_bit, lzw->in_bits, lzw->in_bytes[lzw->in_bit / 8], lzw->in_bytes[lzw->in_bit / 8 + 1], lzw->cur_code_size); + + // Now extract the code from the buffer... + for (code = 0, in_bit = lzw->in_bit, remaining = lzw->cur_code_size; remaining > 0; in_bit += bits, remaining -= bits) + { + // See how many bits we can extract from the current byte... 
+ boff = (in_bit & 7); + byte = lzw->in_bytes[in_bit / 8]; + bits = 8 - boff; + if (bits > remaining) + bits = remaining; + + // Get those bits + if (bits == 8) // Full byte from buffer + code = (code << 8) | byte; + else // Partial byte from buffer + code = (code << bits) | ((byte >> (8 - bits - boff)) & mask[bits]); + } + + // Save the updated position in the input buffer and return the code... + lzw->in_bit = in_bit; + + PDFIO_DEBUG("lzw_get_code: Returning %u.\n", code); + + return ((int)code); +} diff --git a/pdfio-private.h b/pdfio-private.h index 4549b63..3a2a618 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -211,6 +211,36 @@ typedef union _pdfio_crypto_ctx_u // Cryptographic contexts } _pdfio_crypto_ctx_t; typedef size_t (*_pdfio_crypto_cb_t)(_pdfio_crypto_ctx_t *ctx, uint8_t *outbuffer, const uint8_t *inbuffer, size_t len); +typedef struct _pdfio_lzws_s // LZW string table +{ + uint16_t prefix_code, // Prefix code + suffix; // Suffix (character) +} _pdfio_lzws_t; + +typedef struct _pdfio_lzw_s // LZW state +{ + uint8_t *next_in; // Next input byte + size_t avail_in; // Available input bytes + uint8_t in_bytes[256]; // Current input bytes + uint16_t in_bit, // Current input bit + in_bits; // Total input bits + uint8_t *next_out; // Next output byte + size_t avail_out; // Available output bytes + uint8_t cur_code_size, // Current code size + def_code_size; // Initial/default code size + uint16_t clear_code, // Clear code + eod_code, // End code + next_code, // Next code to be used + next_size_code, // Code where we need to increase the code size + first_code, // First code in sequence + old_code, // Previous code in sequence + stack[8192], // Output stack + *stptr; // Current stack pointer + _pdfio_lzws_t table[4096]; // String table + bool saw_eod; // Saw end-of-data code? 
+ const char *error; // Error, if any +} _pdfio_lzw_t; + struct _pdfio_array_s { pdfio_file_t *pdf; // PDF file @@ -349,11 +379,12 @@ struct _pdfio_stream_s // Stream *a85decptr, // Pointer into decoded characters *a85decend; // Last decoded character z_stream flate; // Flate filter state + _pdfio_lzw_t *lzw; // LZW filter state _pdfio_predictor_t predictor; // Predictor function, if any size_t pbpixel, // Size of a pixel in bytes pbsize, // Predictor buffer size, if any cbsize; // Compressed data buffer size - unsigned char *cbuffer, // Compressed data buffer + uint8_t *cbuffer, // Compressed data buffer *prbuffer, // Raw buffer (previous line), as needed *psbuffer; // PNG filter buffer, as needed _pdfio_crypto_cb_t crypto_cb; // Encryption/descryption callback, if any @@ -420,6 +451,10 @@ extern off_t _pdfioFileSeek(pdfio_file_t *pdf, off_t offset, int whence) _PDFIO extern off_t _pdfioFileTell(pdfio_file_t *pdf) _PDFIO_INTERNAL; extern bool _pdfioFileWrite(pdfio_file_t *pdf, const void *buffer, size_t bytes) _PDFIO_INTERNAL; +extern _pdfio_lzw_t *_pdfioLZWCreate(int def_code_size) _PDFIO_INTERNAL; +extern void _pdfioLZWDelete(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL; +extern bool _pdfioLZWInflate(_pdfio_lzw_t *lzw) _PDFIO_INTERNAL; + extern void _pdfioObjDelete(pdfio_obj_t *obj) _PDFIO_INTERNAL; extern void *_pdfioObjGetExtension(pdfio_obj_t *obj) _PDFIO_INTERNAL; extern bool _pdfioObjLoad(pdfio_obj_t *obj) _PDFIO_INTERNAL; diff --git a/pdfio-stream.c b/pdfio-stream.c index 8c397e3..3dfeb58 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -15,6 +15,7 @@ // static ssize_t stream_get_bytes(pdfio_stream_t *st, void *buffer, size_t bytes); +static ssize_t stream_inflate(pdfio_stream_t *st, uint8_t *buffer, size_t bytes, bool exactly); static unsigned char stream_paeth(unsigned char a, unsigned char b, unsigned char c); static ssize_t stream_read(pdfio_stream_t *st, char *buffer, size_t bytes); static bool stream_write(pdfio_stream_t *st, const void *buffer, size_t 
bytes); @@ -40,6 +41,8 @@ pdfioStreamClose(pdfio_stream_t *st) // I - Stream { if (st->filter == PDFIO_FILTER_FLATE) inflateEnd(&(st->flate)); + else if (st->filter == PDFIO_FILTER_LZW) + _pdfioLZWDelete(st->lzw); } else { @@ -523,9 +526,9 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object // No filter, read as-is... st->filter = PDFIO_FILTER_NONE; } - else if (!strcmp(filter, "FlateDecode")) + else if (!strcmp(filter, "FlateDecode") || !strcmp(filter, "LZWDecode")) { - // Flate compression + // Flate or LZW compression pdfio_dict_t *params = pdfioDictGetDict(dict, "DecodeParms"); // Decoding parameters int bpc = (int)pdfioDictGetNumber(params, "BitsPerComponent"); @@ -536,12 +539,11 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object // Number of columns int predictor = (int)pdfioDictGetNumber(params, "Predictor"); // Predictory value, if any - int status; // ZLIB status ssize_t rbytes; // Bytes read - PDFIO_DEBUG("_pdfioStreamOpen: FlateDecode - BitsPerComponent=%d, Colors=%d, Columns=%d, Predictor=%d\n", bpc, colors, columns, predictor); + PDFIO_DEBUG("_pdfioStreamOpen: %s - BitsPerComponent=%d, Colors=%d, Columns=%d, Predictor=%d\n", filter, bpc, colors, columns, predictor); - st->filter = PDFIO_FILTER_FLATE; + st->filter = !strcmp(filter, "FlateDecode") ? 
PDFIO_FILTER_FLATE : PDFIO_FILTER_LZW; if (bpc == 0) { @@ -613,40 +615,41 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object } PDFIO_DEBUG("_pdfioStreamOpen: pos=%ld\n", (long)_pdfioFileTell(st->pdf)); - if (st->cbsize > st->remaining) - rbytes = _pdfioFileRead(st->pdf, st->cbuffer, st->remaining); - else - rbytes = _pdfioFileRead(st->pdf, st->cbuffer, st->cbsize); - - if (rbytes <= 0) + if ((rbytes = stream_get_bytes(st, st->cbuffer, st->cbsize)) <= 0) { _pdfioFileError(st->pdf, "Unable to read bytes for stream."); goto error; } - if (st->crypto_cb) - rbytes = (ssize_t)(st->crypto_cb)(&st->crypto_ctx, st->cbuffer, st->cbuffer, (size_t)rbytes); - - st->flate.next_in = (Bytef *)st->cbuffer; - st->flate.avail_in = (uInt)rbytes; - - PDFIO_DEBUG("_pdfioStreamOpen: avail_in=%u, cbuffer=<%02X%02X%02X%02X%02X%02X%02X%02X...>\n", st->flate.avail_in, st->cbuffer[0], st->cbuffer[1], st->cbuffer[2], st->cbuffer[3], st->cbuffer[4], st->cbuffer[5], st->cbuffer[6], st->cbuffer[7]); - - if ((status = inflateInit(&(st->flate))) != Z_OK) + if (st->filter == PDFIO_FILTER_FLATE) { - _pdfioFileError(st->pdf, "Unable to start Flate filter: %s", zstrerror(status)); - goto error; - } + // Flate decompression... + int status; // ZLIB status - st->remaining -= st->flate.avail_in; + st->flate.next_in = (Bytef *)st->cbuffer; + st->flate.avail_in = (uInt)rbytes; + + PDFIO_DEBUG("_pdfioStreamOpen: avail_in=%u, cbuffer=<%02X%02X%02X%02X%02X%02X%02X%02X...>\n", st->flate.avail_in, st->cbuffer[0], st->cbuffer[1], st->cbuffer[2], st->cbuffer[3], st->cbuffer[4], st->cbuffer[5], st->cbuffer[6], st->cbuffer[7]); + + if ((status = inflateInit(&(st->flate))) != Z_OK) + { + _pdfioFileError(st->pdf, "Unable to start Flate filter: %s", zstrerror(status)); + goto error; + } + } + else + { + // LZW decompression... 
+ if ((st->lzw = _pdfioLZWCreate(/*code_size*/8)) == NULL) + { + _pdfioFileError(st->pdf, "Unable to initialize LZW filter: %s", strerror(errno)); + goto error; + } + + st->lzw->next_in = st->cbuffer; + st->lzw->avail_in = (size_t)rbytes; + } } -#if 0 // TODO: Implement LZWDecode filter - else if (!strcmp(filter, "LZWDecode")) - { - // LZW compression - st->filter = PDFIO_FILTER_LZW; - } -#endif // 0 else { // Something else we don't support @@ -1143,28 +1146,44 @@ stream_get_bytes( a85val = a85val * 85 + a85ch - '!'; count ++; } + else if (a85ch == 'z' && count == 0) + { + // 'z' == 0's + a85val = 0; + count = 5; + + a85bufptr++; + } + else if (a85ch == '~') + { + break; + } else if (!isspace(a85ch & 255)) { // Invalid ASCII85Decode character... - _pdfioFileError(st->pdf, "Invalid ASCII85Decode character in stream."); + _pdfioFileError(st->pdf, "Invalid ASCII85Decode character '%c' in stream.", a85ch); return (-1); } } + st->a85bufptr = a85bufptr; + + if (*a85bufptr == '~') + break; + if (count < 2) { // Need at least 2 characters to decode a single byte... - _pdfioFileError(st->pdf, "Invalid ASCII85Decode character in stream."); + _pdfioFileError(st->pdf, "Invalid ASCII85Decode sequence in stream."); return (-1); } - st->a85bufptr = a85bufptr; - declen = count - 1; + declen = count - 1; - // Add zero rounds to properly align the decoded value... + // Add rounds to properly align the decoded value... 
while (count < 5) { - a85val *= 85; + a85val = a85val * 85 + 84; count ++; } @@ -1178,6 +1197,8 @@ stream_get_bytes( st->a85decend = st->a85decode + declen; } + PDFIO_DEBUG("stream_get_bytes: Returning %ld ASCII85 bytes for stream.\n", (long)rbytes); + return (rbytes); } else @@ -1196,11 +1217,110 @@ stream_get_bytes( (st->crypto_cb)(&st->crypto_ctx, (uint8_t *)buffer, (uint8_t *)buffer, (size_t)rbytes); } + PDFIO_DEBUG("stream_get_bytes: Returning %ld raw bytes for stream.\n", (long)rbytes); + return (rbytes); } } +// +// 'stream_inflate()' - Decompress bytes from a stream (Flate or LZW) into the specified buffer. +// + +static ssize_t // O - Number of bytes decompressed or -1 on error/early end of data +stream_inflate(pdfio_stream_t *st, // I - Stream + uint8_t *buffer, // I - Output buffer + size_t bytes, // I - Number of bytes + bool exactly) // I - Require exactly the number of bytes +{ + ssize_t rbytes; // Bytes read + + + // Setup decompression to the output buffer... + if (st->filter == PDFIO_FILTER_FLATE) + { + st->flate.next_out = (Bytef *)buffer; + st->flate.avail_out = (uInt)bytes; + } + else + { + st->lzw->next_out = buffer; + st->lzw->avail_out = bytes; + } + + // Loop to get the bytes... + do + { + if (st->filter == PDFIO_FILTER_FLATE) + { + // Flate decompress + int status; // Status of decompression + + PDFIO_DEBUG("stream_inflate: avail_in=%u, avail_out=%u\n", st->flate.avail_in, st->flate.avail_out); + + if (st->flate.avail_in == 0) + { + // Read more from the file... + if ((rbytes = stream_get_bytes(st, st->cbuffer, st->cbsize)) <= 0) + return (-1); // End of file... 
+ + st->flate.next_in = (Bytef *)st->cbuffer; + st->flate.avail_in = (uInt)rbytes; + } + + if ((status = inflate(&(st->flate), Z_NO_FLUSH)) < Z_OK) + { + PDFIO_DEBUG("stream_inflate: inflate() returned %d\n", status); + _pdfioFileError(st->pdf, "Unable to decompress stream data for object %ld: %s", (long)st->obj->number, zstrerror(status)); + return (-1); + } + + bytes = (size_t)st->flate.avail_out; + + // A status other than Z_OK (Z_STREAM_END, Z_NEED_DICT) means no more output + // can be produced, so stop instead of looping forever on leftover input... + if (status != Z_OK) + break; + } + else + { + // LZW decompress + if (st->lzw->avail_in == 0) + { + // Read more from the file... + if ((rbytes = stream_get_bytes(st, st->cbuffer, st->cbsize)) <= 0) + return (-1); // End of file... + + st->lzw->next_in = st->cbuffer; + st->lzw->avail_in = (size_t)rbytes; + } + + if (!_pdfioLZWInflate(st->lzw) && !st->lzw->saw_eod) + { + _pdfioFileError(st->pdf, "Unable to decompress stream data for object %ld: %s", (long)st->obj->number, st->lzw->error); + return (-1); + } + + bytes = st->lzw->avail_out; + + // Nothing more is produced after the end-of-data code, so stop instead of + // looping forever on leftover input... + if (st->lzw->saw_eod) + break; + } + } + while (bytes > 0 && exactly); + + if (exactly && bytes > 0) + return (-1); + else if (st->filter == PDFIO_FILTER_FLATE) + return (st->flate.next_out - (Bytef *)buffer); + else + return (st->lzw->next_out - (uint8_t *)buffer); +} + + // // 'stream_paeth()' - PaethPredictor function for PNG decompression filter. // @@ -1228,47 +1348,20 @@ stream_read(pdfio_stream_t *st, // I - Stream char *buffer, // I - Buffer size_t bytes) // I - Number of bytes to read { - uInt avail_in, avail_out; // Previous flate values - - if (st->filter == PDFIO_FILTER_NONE) { // No filtering... 
return (stream_get_bytes(st, buffer, bytes)); } - else if (st->filter == PDFIO_FILTER_FLATE) + else if (st->filter == PDFIO_FILTER_FLATE || st->filter == PDFIO_FILTER_LZW) { - // Deflate compression... - int status; // Status of decompression - + // Flate or LZW compression... if (st->predictor == _PDFIO_PREDICTOR_NONE) { // Decompress into the buffer... PDFIO_DEBUG("stream_read: No predictor.\n"); - if (st->flate.avail_in == 0) - { - // Read more from the file... 
- ssize_t rbytes = stream_get_bytes(st, st->cbuffer, st->cbsize); - // Bytes read - - if (rbytes <= 0) - return (-1); // End of file... - - st->flate.next_in = (Bytef *)st->cbuffer; - st->flate.avail_in = (uInt)rbytes; - } - - st->flate.next_out = (Bytef *)buffer; - st->flate.avail_out = (uInt)bytes; - - if ((status = inflate(&(st->flate), Z_NO_FLUSH)) < Z_OK) - { - _pdfioFileError(st->pdf, "Unable to decompress stream data for object %ld: %s", (long)st->obj->number, zstrerror(status)); - return (-1); - } - - return (st->flate.next_out - (Bytef *)buffer); + return (stream_inflate(st, (uint8_t *)buffer, bytes, /*exactly*/false)); } else if (st->predictor == _PDFIO_PREDICTOR_TIFF2) { @@ -1276,9 +1359,9 @@ stream_read(pdfio_stream_t *st, // I - Stream // Size of pixel in bytes remaining = st->pbsize; // Remaining bytes - unsigned char *bufptr = (unsigned char *)buffer, + uint8_t *bufptr = (uint8_t *)buffer, // Pointer into buffer - *bufsecond = (unsigned char *)buffer + pbpixel, + *bufsecond = (uint8_t *)buffer + pbpixel, // Pointer to second pixel in buffer *sptr = st->psbuffer; // Current (raw) line @@ -1291,37 +1374,7 @@ stream_read(pdfio_stream_t *st, // I - Stream return (-1); } - st->flate.next_out = (Bytef *)sptr; - st->flate.avail_out = (uInt)st->pbsize; - - while (st->flate.avail_out > 0) - { - if (st->flate.avail_in == 0) - { - // Read more from the file... - ssize_t rbytes = stream_get_bytes(st, st->cbuffer, st->cbsize); - // Bytes read - - if (rbytes <= 0) - return (-1); // End of file... 
- - st->flate.next_in = (Bytef *)st->cbuffer; - st->flate.avail_in = (uInt)rbytes; - } - - avail_in = st->flate.avail_in; - avail_out = st->flate.avail_out; - - if ((status = inflate(&(st->flate), Z_NO_FLUSH)) < Z_OK) - { - _pdfioFileError(st->pdf, "Unable to decompress stream data for object %ld: %s", (long)st->obj->number, zstrerror(status)); - return (-1); - } - else if (status == Z_STREAM_END || (avail_in == st->flate.avail_in && avail_out == st->flate.avail_out)) - break; - } - - if (st->flate.avail_out > 0) + if (stream_inflate(st, sptr, st->pbsize, /*exactly*/true) < 0) return (-1); // Early end of stream for (; bufptr < bufsecond; remaining --, sptr ++) @@ -1338,9 +1391,9 @@ stream_read(pdfio_stream_t *st, // I - Stream // Size of pixel in bytes remaining = st->pbsize - 1; // Remaining bytes - unsigned char *bufptr = (unsigned char *)buffer, + uint8_t *bufptr = (uint8_t *)buffer, // Pointer into buffer - *bufsecond = (unsigned char *)buffer + pbpixel, + *bufsecond = (uint8_t *)buffer + pbpixel, // Pointer to second pixel in buffer *sptr = st->psbuffer + 1, // Current (raw) line @@ -1355,40 +1408,10 @@ stream_read(pdfio_stream_t *st, // I - Stream return (-1); } - st->flate.next_out = (Bytef *)sptr - 1; - st->flate.avail_out = (uInt)st->pbsize; - - while (st->flate.avail_out > 0) - { - if (st->flate.avail_in == 0) - { - // Read more from the file... - ssize_t rbytes = stream_get_bytes(st, st->cbuffer, st->cbsize); - // Bytes read - - if (rbytes <= 0) - return (-1); // End of file... 
- - st->flate.next_in = (Bytef *)st->cbuffer; - st->flate.avail_in = (uInt)rbytes; - } - - avail_in = st->flate.avail_in; - avail_out = st->flate.avail_out; - - if ((status = inflate(&(st->flate), Z_NO_FLUSH)) < Z_OK) - { - _pdfioFileError(st->pdf, "Unable to decompress stream data for object %ld: %s", (long)st->obj->number, zstrerror(status)); - return (-1); - } - else if (status == Z_STREAM_END || (avail_in == st->flate.avail_in && avail_out == st->flate.avail_out)) - break; - } - - if (st->flate.avail_out > 0) + if (stream_inflate(st, sptr - 1, st->pbsize, /*exactly*/true) < 0) { // Early end of stream - PDFIO_DEBUG("stream_read: Early EOF (remaining=%u, avail_in=%d, avail_out=%d, data_type=%d, next_in=<%02X%02X%02X%02X...>).\n", (unsigned)st->remaining, st->flate.avail_in, st->flate.avail_out, st->flate.data_type, st->flate.next_in[0], st->flate.next_in[1], st->flate.next_in[2], st->flate.next_in[3]); + PDFIO_DEBUG("stream_read: Early EOF (remaining=%u).\n", (unsigned)st->remaining); return (-1); } @@ -1491,8 +1514,6 @@ stream_write(pdfio_stream_t *st, // I - Stream outbytes = cbytes; } -// fprintf(stderr, "stream_write: bytes=%u, outbytes=%u\n", (unsigned)bytes, (unsigned)outbytes); - if (!_pdfioFileWrite(st->pdf, st->cbuffer, outbytes)) return (false); diff --git a/pdfio.h b/pdfio.h index f7895f5..41255a0 100644 --- a/pdfio.h +++ b/pdfio.h @@ -1,7 +1,7 @@ // // Public header file for PDFio. // -// Copyright © 2021-2025 by Michael R Sweet. +// Copyright © 2021-2026 by Michael R Sweet. // // Licensed under Apache License v2.0. See the file "LICENSE" for more // information. 
@@ -72,11 +72,11 @@ typedef enum pdfio_filter_e // Compression/decompression filters for streams PDFIO_FILTER_NONE, // No filter PDFIO_FILTER_ASCIIHEX, // ASCIIHexDecode filter (reading only) PDFIO_FILTER_ASCII85, // ASCII85Decode filter (reading only) - PDFIO_FILTER_CCITTFAX, // CCITTFaxDecode filter + PDFIO_FILTER_CCITTFAX, // CCITTFaxDecode filter (reading only) PDFIO_FILTER_CRYPT, // Encryption filter PDFIO_FILTER_DCT, // DCTDecode (JPEG) filter PDFIO_FILTER_FLATE, // FlateDecode filter - PDFIO_FILTER_JBIG2, // JBIG2Decode filter + PDFIO_FILTER_JBIG2, // JBIG2Decode filter (reading only) PDFIO_FILTER_JPX, // JPXDecode filter (reading only) PDFIO_FILTER_LZW, // LZWDecode filter (reading only) PDFIO_FILTER_RUNLENGTH, // RunLengthDecode filter (reading only) diff --git a/pdfio.vcxproj b/pdfio.vcxproj index 2943025..f6de726 100644 --- a/pdfio.vcxproj +++ b/pdfio.vcxproj @@ -160,6 +160,7 @@ + diff --git a/test.h b/test.h index 454a76c..86f8e87 100644 --- a/test.h +++ b/test.h @@ -96,6 +96,16 @@ static int test_progress; // Current progress static char test_title[1024] = ""; // Current test title +// Add printf syntax checking on supported compilers... +#if defined(__has_extension) || defined(__GNUC__) +# define TEST_FORMAT(a,b) __attribute__ ((__format__(__printf__,a,b))) +static inline void testBegin(const char *title, ...) TEST_FORMAT(1,2); +static inline void testEndMessage(bool pass, const char *message, ...) TEST_FORMAT(2,3); +static inline void testError(const char *error, ...) TEST_FORMAT(1,2); +static inline void testMessage(const char *error, ...) TEST_FORMAT(1,2); +#endif // __has_extension || __GNUC__ + + // Start a test static inline void testBegin(const char *title, ...) 
// I - printf-style title string diff --git a/testpdfio.c b/testpdfio.c index cca0501..0c95fae 100644 --- a/testpdfio.c +++ b/testpdfio.c @@ -32,6 +32,7 @@ // static int do_crypto_tests(void); +static int do_lzw_tests(void); static int do_pdfa_tests(void); static int do_test_file(const char *filename, const char *outfile, int objnum, const char *password, bool verbose); static int do_unit_tests(void); @@ -382,6 +383,102 @@ do_crypto_tests(void) } +// +// 'do_lzw_tests()' - Test the various LZW functions in PDFio. +// + +static int // O - Exit status +do_lzw_tests(void) +{ + int status = 0; // Exit status + _pdfio_lzw_t *lzw; // LZW state + uint8_t buffer[8192]; // Output buffer + size_t bytes; // Output bytes + static uint8_t iso32000_in[] = // ISO-32000-2 test case input + { + 0x80, 0x0B, 0x60, 0x50, 0x22, 0x0C, 0x0C, 0x85, 0x01 + }; + static uint8_t iso32000_out[] = // ISO-32000-2 test case output + { + 45, 45, 45, 45, 45, 65, 45, 45, 45, 66 + }; + + + testBegin("_pdfioLZWCreate(8)"); + testEnd((lzw = _pdfioLZWCreate(/*code_size*/8)) != NULL); + if (!lzw) + return (1); + + testBegin("_pdfioLZWInflate(ISO 32000-2 test case)"); + + lzw->avail_in = sizeof(iso32000_in); + lzw->next_in = iso32000_in; + + lzw->avail_out = sizeof(buffer); + lzw->next_out = buffer; + + if (!_pdfioLZWInflate(lzw)) + { + testEndMessage(false, "returned false"); + status = 1; + } + else if ((bytes = sizeof(buffer) - lzw->avail_out) != sizeof(iso32000_out)) + { + testEndMessage(false, "got %u bytes, expected %u bytes", (unsigned)bytes, (unsigned)sizeof(iso32000_out)); + status = 1; + } + else if (memcmp(buffer, iso32000_out, bytes)) + { + size_t i; // Looping var + + testEndMessage(false, "got incorrect output"); + + testMessage(" EXPECTED %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X", iso32000_out[0], iso32000_out[1], iso32000_out[2], iso32000_out[3], iso32000_out[4], iso32000_out[5], iso32000_out[6], iso32000_out[7], iso32000_out[8], iso32000_out[9]); + + for (i = 0; i < bytes; i += 
8) + { + switch (bytes - i) + { + case 1 : + testMessage(" %s %02X", i == 0 ? "GOT" : " ", buffer[i + 0]); + break; + case 2 : + testMessage(" %s %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1]); + break; + case 3 : + testMessage(" %s %02X %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1], buffer[i + 2]); + break; + case 4 : + testMessage(" %s %02X %02X %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1], buffer[i + 2], buffer[i + 3]); + break; + case 5 : + testMessage(" %s %02X %02X %02X %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1], buffer[i + 2], buffer[i + 3], buffer[i + 4]); + break; + case 6 : + testMessage(" %s %02X %02X %02X %02X %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1], buffer[i + 2], buffer[i + 3], buffer[i + 4], buffer[i + 5]); + break; + case 7 : + testMessage(" %s %02X %02X %02X %02X %02X %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1], buffer[i + 2], buffer[i + 3], buffer[i + 4], buffer[i + 5], buffer[i + 6]); + break; + default : + testMessage(" %s %02X %02X %02X %02X %02X %02X %02X %02X", i == 0 ? "GOT" : " ", buffer[i + 0], buffer[i + 1], buffer[i + 2], buffer[i + 3], buffer[i + 4], buffer[i + 5], buffer[i + 6], buffer[i + 7]); + break; + } + } + + status = 1; + } + else + { + testEnd(true); + } + + _pdfioLZWDelete(lzw); + + return (status); +} + + // // 'do_pdfa_tests()' - Run PDF/A generation and compliance tests. // @@ -1180,6 +1277,10 @@ do_unit_tests(void) if (do_crypto_tests()) return (1); + // Do LZW tests... + if (do_lzw_tests()) + return (1); + // Create a new PDF file... testBegin("pdfioFileCreate(\"testpdfio-out.pdf\", ...)"); if ((outpdf = pdfioFileCreate("testpdfio-out.pdf", /*version*/"1.7", /*media_box*/NULL, /*crop_box*/NULL, (pdfio_error_cb_t)error_cb, &error)) != NULL)