From 44325ce2d9c42d1d139168127bbbe58a78cd95ea Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Fri, 7 May 2021 08:47:49 -0400 Subject: [PATCH] Save work on streams - still need to implement predictors --- TODO.md | 3 + pdfio-file.c | 89 ++++++++++++++++- pdfio-object.c | 20 +++- pdfio-private.h | 18 +++- pdfio-stream.c | 249 ++++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 355 insertions(+), 24 deletions(-) diff --git a/TODO.md b/TODO.md index 6139bb5..eb6c9a0 100644 --- a/TODO.md +++ b/TODO.md @@ -10,6 +10,9 @@ To-Do List one PDF to another, there are a bunch of resources that also need to be copied. A dictionary with an object reference can't be copied directly as the object number in the new PDF will likely be different than the old one. + - Add _pdfio_map_t with original pdfio_file_t * and object numbers + - Add _pdfioObjCopy function + - Add _pdfioFileGetMappedObject function to get the new object number - Security handlers (RC4 + AES, MD5 + SHA-256) for reading encrypted documents. 
- Signature generation/validation code - Documentation diff --git a/pdfio-file.c b/pdfio-file.c index ab5dc21..6b4cd32 100644 --- a/pdfio-file.c +++ b/pdfio-file.c @@ -537,14 +537,19 @@ load_xref(pdfio_file_t *pdf, // I - PDF file return (false); } + PDFIO_DEBUG("load_xref: xref_offset=%lu, line='%s'\n", (unsigned long)xref_offset, line); + if (isdigit(line[0] & 255) && strlen(line) > 4 && !strcmp(line + strlen(line) - 4, " obj")) { // Cross-reference stream pdfio_obj_t *obj; // Object + size_t i; // Looping var pdfio_array_t *w_array; // W array size_t w[3]; // Size of each cross-reference field + size_t w_2, // Offset to second field + w_3; // Offset to third field size_t w_total; // Total length - pdfio_stream_t *st; // Stream with + pdfio_stream_t *st; // Stream unsigned char buffer[32]; // Read buffer if ((number = strtoimax(line, &ptr, 10)) < 1) @@ -568,6 +573,8 @@ load_xref(pdfio_file_t *pdf, // I - PDF file return (false); } + PDFIO_DEBUG("load_xref: Loading object %lu %u.\n", (unsigned long)number, (unsigned)generation); + if ((obj = add_obj(pdf, (size_t)number, (unsigned short)generation, xref_offset)) == NULL) { _pdfioFileError(pdf, "Unable to allocate memory for object."); @@ -587,7 +594,85 @@ load_xref(pdfio_file_t *pdf, // I - PDF file obj->value = trailer; - // TODO: read stream + if (!_pdfioFileGetToken(pdf, line, sizeof(line)) || strcmp(line, "stream")) + { + _pdfioFileError(pdf, "Unable to get stream after xref dictionary."); + return (false); + } + + obj->stream_offset = _pdfioFileTell(pdf); + + if ((w_array = pdfioDictGetArray(trailer.value.dict, "W")) == NULL) + { + _pdfioFileError(pdf, "Cross-reference stream does not have required W key."); + return (false); + } + + w[0] = (size_t)pdfioArrayGetNumber(w_array, 0); + w[1] = (size_t)pdfioArrayGetNumber(w_array, 1); + w[2] = (size_t)pdfioArrayGetNumber(w_array, 2); + w_total = w[0] + w[1] + w[2]; + w_2 = w[0]; + w_3 = w[0] + w[1]; + + if (w[1] == 0 || w[2] > 2 || w_total > sizeof(buffer)) + { + 
_pdfioFileError(pdf, "Cross-reference stream has invalid W key."); + return (false); + } + + if ((st = pdfioObjOpenStream(obj, true)) == NULL) + { + _pdfioFileError(pdf, "Unable to open cross-reference stream."); + return (false); + } + + while (pdfioStreamRead(st, buffer, w_total) > 0) + { + PDFIO_DEBUG("load_xref: %02X%02X%02X%02X%02X\n", buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]); + + // Check whether this is an object definition... + if (w[0] > 0) + { + if (buffer[0] == 0) + { + // Ignore free objects... + continue; + } + else if (buffer[0] == 2) + { + // TODO: Add support for compressed object streams... + // Compressed object... + _pdfioFileError(pdf, "PDF file contains compressed object streams which are not currently supported."); + continue; + } + } + + for (i = 1, offset = buffer[w_2]; i < w[1]; i ++) + offset = (offset << 8) | buffer[w_2 + i]; + + switch (w[2]) + { + default : + generation = 0; + break; + case 1 : + generation = buffer[w_3]; + break; + case 2 : + generation = (buffer[w_3] << 8) | buffer[w_3 + 1]; + break; + } + + // Create a placeholder for the object in memory... + if (pdfioFileFindObject(pdf, (size_t)number)) + continue; // Don't replace newer object... + + if (!add_obj(pdf, (size_t)number, (unsigned short)generation, offset)) + return (false); + } + + pdfioStreamClose(st); } else if (!strcmp(line, "xref")) { diff --git a/pdfio-object.c b/pdfio-object.c index 6a7c834..a9146bf 100644 --- a/pdfio-object.c +++ b/pdfio-object.c @@ -200,9 +200,21 @@ pdfio_stream_t * // O - Stream or `NULL` on error pdfioObjOpenStream(pdfio_obj_t *obj, // I - Object bool decode) // I - Decode/decompress data? { - // TODO: Implement me - (void)obj; - (void)decode; + // Range check input... + if (!obj) + return (NULL); - return (NULL); + // Make sure we've loaded the object dictionary... + if (!obj->value.type) + { + if (!_pdfioObjLoad(obj)) + return (NULL); + } + + // No stream if there is no dict or offset to a stream... 
+ if (obj->value.type != PDFIO_VALTYPE_DICT || !obj->stream_offset) + return (NULL); + + // Open the stream... + return (_pdfioStreamOpen(obj, decode)); } diff --git a/pdfio-private.h b/pdfio-private.h index e7f79c9..aa62575 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -69,6 +69,17 @@ typedef enum _pdfio_mode_e // Read/write mode _PDFIO_MODE_WRITE // Write a PDF file } _pdfio_mode_t; +typedef enum _pdfio_predictor_e // Predictor constants (TIFF and PNG) +{ + _PDFIO_PREDICTOR_NONE = 1, // No predictor (default) + _PDFIO_PREDICTOR_TIFF2 = 2, // TIFF Predictor 2 + _PDFIO_PREDICTOR_PNG_NONE = 10, // PNG None predictor (same as `_PDFIO_PREDICTOR_NONE`) + _PDFIO_PREDICTOR_PNG_SUB = 11, // PNG Sub predictor + _PDFIO_PREDICTOR_PNG_UP = 12, // PNG Up predictor + _PDFIO_PREDICTOR_PNG_AVERAGE = 13, // PNG Average predictor + _PDFIO_PREDICTOR_PNG_PAETH = 14 // PNG Paeth predictor +} _pdfio_predictor_t; + typedef struct _pdfio_value_s // Value structure { pdfio_valtype_t type; // Type of value @@ -174,9 +185,12 @@ struct _pdfio_stream_s // Stream pdfio_file_t *pdf; // PDF file pdfio_obj_t *obj; // Object pdfio_filter_t filter; // Compression/decompression filter - char buffer[8192]; // Read/write buffer - size_t bufused; // Number of bytes in buffer + size_t remaining; // Remaining bytes in stream + char buffer[8192], // Read/write buffer + *bufptr, // Current position in buffer + *bufend; // End of buffer z_stream flate; // Flate filter state + char cbuffer[4096]; // Compressed data buffer }; typedef ssize_t (*_pdfio_tconsume_cb_t)(void *data, size_t bytes); diff --git a/pdfio-stream.c b/pdfio-stream.c index 0f0715d..d921e80 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -14,6 +14,13 @@ #include "pdfio-private.h" +// +// Local functions... +// + +static ssize_t stream_read(pdfio_stream_t *st, char *buffer, size_t bytes); + + // // 'pdfioStreamClose()' - Close a (data) stream in a PDF file. 
// @@ -54,10 +61,35 @@ bool // O - `true` on success, `false` on EOF pdfioStreamConsume(pdfio_stream_t *st, // I - Stream size_t bytes)// I - Number of bytes to consume { - // TODO: Implement me - (void)st; - (void)bytes; - return (false); + size_t remaining; // Remaining bytes in buffer + ssize_t rbytes; // Bytes read + + + // Range check input... + if (!st || st->pdf->mode != _PDFIO_MODE_READ || !bytes) + return (false); + + // Skip bytes in the stream buffer until we've consumed the requested number + // or get to the end of the stream... + while ((remaining = (size_t)(st->bufend - st->bufptr)) < bytes) + { + bytes -= remaining; + + if ((rbytes = stream_read(st, st->buffer, sizeof(st->buffer))) > 0) + { + st->bufptr = st->buffer; + st->bufend = st->buffer + rbytes; + } + else + { + st->bufptr = st->bufend = st->buffer; + return (false); + } + } + + st->bufptr += bytes; + + return (true); } @@ -107,6 +139,9 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object bool decode) // I - Decode/decompress the stream? { pdfio_stream_t *st; // Stream + pdfio_dict_t *dict = pdfioObjGetDict(obj); + // Object dictionary + size_t length; // Length of stream // Allocate a new stream object... @@ -121,11 +156,36 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object _pdfioFileSeek(st->pdf, obj->stream_offset, SEEK_SET); + if ((length = (size_t)pdfioDictGetNumber(dict, "Length")) == 0) + { + // Length must be an indirect reference... 
+ pdfio_obj_t *lenobj; // Length object + + if ((lenobj = pdfioDictGetObject(dict, "Length")) == NULL) + { + _pdfioFileError(obj->pdf, "Unable to get length of stream."); + free(st); + return (NULL); + } + + if (lenobj->value.type == PDFIO_VALTYPE_NONE) + _pdfioObjLoad(lenobj); + + if (lenobj->value.type != PDFIO_VALTYPE_NUMBER || lenobj->value.value.number <= 0.0f) + { + _pdfioFileError(obj->pdf, "Unable to get length of stream."); + free(st); + return (NULL); + } + + length = (size_t)lenobj->value.value.number; + } + + st->remaining = length; + if (decode) { // Try to decode/decompress the contents of this object... - pdfio_dict_t *dict = pdfioObjGetDict(obj); - // Object dictionary const char *filter = pdfioDictGetName(dict, "Filter"); // Filter value @@ -146,6 +206,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object else if (!strcmp(filter, "FlateDecode")) { // Flate compression +#if 0 // TODO: Determine whether we need to implement support for predictors int bpc = (int)pdfioDictGetNumber(dict, "BitsPerComponent"); // Bits per component int colors = (int)pdfioDictGetNumber(dict, "Colors"); @@ -154,8 +215,26 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object // Number of columns int predictor = (int)pdfioDictGetNumber(dict, "Predictor"); // Predictory value, if any +#endif // 0 st->filter = PDFIO_FILTER_FLATE; + + st->flate.zalloc = (alloc_func)0; + st->flate.zfree = (free_func)0; + st->flate.opaque = (voidpf)0; + st->flate.next_in = (Bytef *)st->cbuffer; + st->flate.next_out = NULL; + st->flate.avail_in = (uInt)_pdfioFileRead(st->pdf, st->cbuffer, sizeof(st->cbuffer)); + st->flate.avail_out = 0; + + if (inflateInit(&(st->flate)) != Z_OK) + { + _pdfioFileError(st->pdf, "Unable to start Flate filter."); + free(st); + return (NULL); + } + + st->remaining -= st->flate.avail_in; } else if (!strcmp(filter, "LZWDecode")) { @@ -189,12 +268,40 @@ pdfioStreamPeek(pdfio_stream_t *st, // I - Stream void *buffer, // I - Buffer size_t bytes) // I - Size of buffer { - // 
TODO: Implement me - (void)st; - (void)buffer; - (void)bytes; + size_t remaining; // Remaining bytes in buffer - return (-1); + + // Range check input... + if (!st || st->pdf->mode != _PDFIO_MODE_READ || !buffer || !bytes) + return (-1); + + // See if we have enough bytes in the buffer... + if ((remaining = (size_t)(st->bufend - st->bufptr)) < bytes) + { + // No, shift the buffer and read more + ssize_t rbytes; // Bytes read + + if (remaining > 0) + memmove(st->buffer, st->bufptr, remaining); + + st->bufptr = st->buffer; + st->bufend = st->buffer + remaining; + + if ((rbytes = stream_read(st, st->bufptr, sizeof(st->buffer) - remaining)) > 0) + { + st->bufend += rbytes; + remaining += (size_t)rbytes; + } + } + + // Copy bytes from the buffer... + if (bytes > remaining) + bytes = remaining; + + memcpy(buffer, st->bufptr, bytes); + + // Return the number of bytes that were copied... + return ((ssize_t)bytes); } @@ -247,12 +354,57 @@ pdfioStreamRead( void *buffer, // I - Buffer size_t bytes) // I - Bytes to read { - // TODO: Implement me - (void)st; - (void)buffer; - (void)bytes; + char *bufptr = (char *)buffer; + // Pointer into buffer + size_t remaining; // Remaining bytes in buffer + ssize_t rbytes; // Bytes read - return (-1); + + // Range check input... + if (!st || st->pdf->mode != _PDFIO_MODE_READ || !buffer || !bytes) + return (-1); + + // Loop until we have the requested bytes or hit the end of the stream... + while ((remaining = (size_t)(st->bufend - st->bufptr)) < bytes) + { + memcpy(bufptr, st->bufptr, remaining); + bufptr += remaining; + bytes -= remaining; + + if (bytes >= sizeof(st->buffer)) + { + // Read large amounts directly to caller's buffer... 
+ if ((rbytes = stream_read(st, bufptr, bytes)) > 0) + { + bufptr += rbytes; + bytes = 0; + } + + st->bufptr = st->bufend = st->buffer; + break; + } + else if ((rbytes = stream_read(st, st->buffer, sizeof(st->buffer))) > 0) + { + st->bufptr = st->buffer; + st->bufend = st->buffer + rbytes; + } + else + { + st->bufptr = st->bufend = st->buffer; + break; + } + } + + // Copy any remaining bytes from the stream buffer... + if (bytes > 0) + { + memcpy(bufptr, st->bufptr, bytes); + bufptr += bytes; + st->bufptr += bytes; + } + + // Return the number of bytes that were read... + return (bufptr - (char *)buffer); } @@ -273,3 +425,68 @@ pdfioStreamWrite( return (false); } + + +// +// 'stream_read()' - Read data from a stream, including filters. +// + +static ssize_t // O - Number of bytes read or `-1` on error +stream_read(pdfio_stream_t *st, // I - Stream + char *buffer, // I - Buffer + size_t bytes) // I - Number of bytes to read +{ + ssize_t rbytes; // Bytes read + + + if (st->filter == PDFIO_FILTER_NONE) + { + // No filtering, but limit reads to the length of the stream... + if (bytes > st->remaining) + rbytes = _pdfioFileRead(st->pdf, buffer, st->remaining); + else + rbytes = _pdfioFileRead(st->pdf, buffer, bytes); + + if (rbytes > 0) + st->remaining -= (size_t)rbytes; + + return (rbytes); + } + else if (st->filter == PDFIO_FILTER_FLATE) + { + // Deflate compression... + int status; // Status of decompression + + if (st->flate.avail_in == 0) + { + // Read more from the file... + if (sizeof(st->cbuffer) > st->remaining) + rbytes = _pdfioFileRead(st->pdf, st->cbuffer, st->remaining); + else + rbytes = _pdfioFileRead(st->pdf, st->cbuffer, sizeof(st->cbuffer)); + + if (rbytes <= 0) + return (-1); // End of file... + + st->remaining -= (size_t)rbytes; + st->flate.next_in = (Bytef *)st->cbuffer; + st->flate.avail_in = (uInt)rbytes; + } + + // Decompress into the buffer... 
+ st->flate.next_out = (Bytef *)buffer; + st->flate.avail_out = (uInt)bytes; + + if ((status = inflate(&(st->flate), Z_NO_FLUSH)) < Z_OK) + { + _pdfioFileError(st->pdf, "Unable to decompress stream data: %d", status); + return (-1); + } + + return (st->flate.next_out - (Bytef *)buffer); + } + + // If we get here something bad happened... + return (-1); +} +