From 44325ce2d9c42d1d139168127bbbe58a78cd95ea Mon Sep 17 00:00:00 2001
From: Michael R Sweet <michael.r.sweet@gmail.com>
Date: Fri, 7 May 2021 08:47:49 -0400
Subject: [PATCH] Safe work on streams - still need to implement predictors

---
 TODO.md         |   3 +
 pdfio-file.c    |  89 ++++++++++++++++-
 pdfio-object.c  |  20 +++-
 pdfio-private.h |  18 +++-
 pdfio-stream.c  | 249 ++++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 355 insertions(+), 24 deletions(-)

diff --git a/TODO.md b/TODO.md
index 6139bb5..eb6c9a0 100644
--- a/TODO.md
+++ b/TODO.md
@@ -10,6 +10,9 @@ To-Do List
   one PDF to another, there are a bunch of resources that also need to be
   copied. A dictionary with an object reference can't be copied directly as the
   object number in the new PDF will likely be different than the old one.
+    - Add _pdfio_map_t with original pdfio_file_t * and object numbers
+    - Add _pdfioObjCopy function
+    - Add _pdfioFileGetMappedObject function to get the new object number
 - Security handlers (RC4 + AES, MD5 + SHA-256) for reading encrypted documents.
 - Signature generation/validation code
 - Documentation
diff --git a/pdfio-file.c b/pdfio-file.c
index ab5dc21..6b4cd32 100644
--- a/pdfio-file.c
+++ b/pdfio-file.c
@@ -537,14 +537,19 @@ load_xref(pdfio_file_t *pdf,		// I - PDF file
       return (false);
     }
 
+    PDFIO_DEBUG("load_xref: xref_offset=%lu, line='%s'\n", (unsigned long)xref_offset, line);
+
     if (isdigit(line[0] & 255) && strlen(line) > 4 && !strcmp(line + strlen(line) - 4, " obj"))
     {
       // Cross-reference stream
       pdfio_obj_t	*obj;		// Object
+      size_t		i;		// Looping var
       pdfio_array_t	*w_array;	// W array
       size_t		w[3];		// Size of each cross-reference field
+      size_t		w_2,		// Offset to second field
+			w_3;		// Offset to third field
       size_t		w_total;	// Total length
-      pdfio_stream_t	*st;		// Stream with
+      pdfio_stream_t	*st;		// Stream
       unsigned char	buffer[32];	// Read buffer
 
       if ((number = strtoimax(line, &ptr, 10)) < 1)
@@ -568,6 +573,8 @@ load_xref(pdfio_file_t *pdf,		// I - PDF file
 	return (false);
       }
 
+      PDFIO_DEBUG("load_xref: Loading object %lu %u.\n", (unsigned long)number, (unsigned)generation);
+
       if ((obj = add_obj(pdf, (size_t)number, (unsigned short)generation, xref_offset)) == NULL)
       {
         _pdfioFileError(pdf, "Unable to allocate memory for object.");
@@ -587,7 +594,85 @@ load_xref(pdfio_file_t *pdf,		// I - PDF file
 
       obj->value = trailer;
 
-      // TODO: read stream
+      if (!_pdfioFileGetToken(pdf, line, sizeof(line)) || strcmp(line, "stream"))
+      {
+        _pdfioFileError(pdf, "Unable to get stream after xref dictionary.");
+        return (false);
+      }
+
+      obj->stream_offset = _pdfioFileTell(pdf);
+
+      if ((w_array = pdfioDictGetArray(trailer.value.dict, "W")) == NULL)
+      {
+	_pdfioFileError(pdf, "Cross-reference stream does not have required W key.");
+	return (false);
+      }
+
+      w[0]    = (size_t)pdfioArrayGetNumber(w_array, 0);
+      w[1]    = (size_t)pdfioArrayGetNumber(w_array, 1);
+      w[2]    = (size_t)pdfioArrayGetNumber(w_array, 2);
+      w_total = w[0] + w[1] + w[2];
+      w_2     = w[0];
+      w_3     = w[0] + w[1];
+
+      if (w[1] == 0 || w[2] > 2 || w_total > sizeof(buffer))
+      {
+	_pdfioFileError(pdf, "Cross-reference stream has invalid W key.");
+	return (false);
+      }
+
+      if ((st = pdfioObjOpenStream(obj, true)) == NULL)
+      {
+	_pdfioFileError(pdf, "Unable to open cross-reference stream.");
+	return (false);
+      }
+
+      while (pdfioStreamRead(st, buffer, w_total) > 0)
+      {
+        PDFIO_DEBUG("load_xref: %02X%02X%02X%02X%02X\n", buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]);
+
+        // Check whether this is an object definition...
+        if (w[0] > 0)
+        {
+          if (buffer[0] == 0)
+          {
+            // Ignore free objects...
+            continue;
+	  }
+	  else if (buffer[0] == 2)
+	  {
+	    // TODO: Add support for compressed object streams...
+	    // Compressed object...
+	    _pdfioFileError(pdf, "PDF file contains compressed object streams which are not currently supported.");
+	    continue;
+	  }
+	}
+
+        for (i = 1, offset = buffer[w_2]; i < w[1]; i ++)
+          offset = (offset << 8) | buffer[w_2 + i];
+
+        switch (w[2])
+        {
+          default :
+              generation = 0;
+              break;
+	  case 1 :
+	      generation = buffer[w_3];
+	      break;
+	  case 2 :
+	      generation = (buffer[w_3] << 8) | buffer[w_3 + 1];
+	      break;
+        }
+
+	// Create a placeholder for the object in memory...
+	if (pdfioFileFindObject(pdf, (size_t)number))
+	  continue;			// Don't replace newer object...
+
+	if (!add_obj(pdf, (size_t)number, (unsigned short)generation, offset))
+	  return (false);
+      }
+
+      pdfioStreamClose(st);
     }
     else if (!strcmp(line, "xref"))
     {
diff --git a/pdfio-object.c b/pdfio-object.c
index 6a7c834..a9146bf 100644
--- a/pdfio-object.c
+++ b/pdfio-object.c
@@ -200,9 +200,21 @@ pdfio_stream_t *			// O - Stream or `NULL` on error
 pdfioObjOpenStream(pdfio_obj_t *obj,	// I - Object
                    bool        decode)	// I - Decode/decompress data?
 {
-  // TODO: Implement me
-  (void)obj;
-  (void)decode;
+  // A NULL object can never have a stream...
+  if (obj == NULL)
+    return (NULL);
 
-  return (NULL);
+  // Load the object's value on demand if it has not been read yet...
+  if (!obj->value.type && !_pdfioObjLoad(obj))
+    return (NULL);
+
+  // Only a dictionary object with a recorded stream offset has a stream...
+  if (obj->value.type == PDFIO_VALTYPE_DICT && obj->stream_offset)
+  {
+    // Hand the object off to the stream code...
+    return (_pdfioStreamOpen(obj, decode));
+  }
+
+  // Not a stream object...
+  return (NULL);
 }
diff --git a/pdfio-private.h b/pdfio-private.h
index e7f79c9..aa62575 100644
--- a/pdfio-private.h
+++ b/pdfio-private.h
@@ -69,6 +69,17 @@ typedef enum _pdfio_mode_e		// Read/write mode
   _PDFIO_MODE_WRITE			// Write a PDF file
 } _pdfio_mode_t;
 
+typedef enum _pdfio_predictor_e		// Predictor constants (none, TIFF, and PNG) - presumably the PDF "Predictor" values, TODO confirm
+{
+  _PDFIO_PREDICTOR_NONE = 1,		// No predictor (default)
+  _PDFIO_PREDICTOR_TIFF2 = 2,		// TIFF predictor 2 - NOTE(review): exact algorithm not shown here, confirm against the PDF spec
+  _PDFIO_PREDICTOR_PNG_NONE = 10,	// PNG None predictor (same as `_PDFIO_PREDICTOR_NONE`)
+  _PDFIO_PREDICTOR_PNG_SUB = 11,	// PNG Sub predictor
+  _PDFIO_PREDICTOR_PNG_UP = 12,		// PNG Up predictor
+  _PDFIO_PREDICTOR_PNG_AVERAGE = 13,	// PNG Average predictor
+  _PDFIO_PREDICTOR_PNG_PAETH = 14	// PNG Paeth predictor
+} _pdfio_predictor_t;
+
 typedef struct _pdfio_value_s		// Value structure
 {
   pdfio_valtype_t type;			// Type of value
@@ -174,9 +185,12 @@ struct _pdfio_stream_s			// Stream
   pdfio_file_t	*pdf;			// PDF file
   pdfio_obj_t	*obj;			// Object
   pdfio_filter_t filter;		// Compression/decompression filter
-  char		buffer[8192];		// Read/write buffer
-  size_t	bufused;		// Number of bytes in buffer
+  size_t	remaining;		// Remaining bytes in stream
+  char		buffer[8192],		// Read/write buffer
+		*bufptr,		// Current position in buffer
+	        *bufend;		// End of buffer
   z_stream	flate;			// Flate filter state
+  char		cbuffer[4096];		// Compressed data buffer
 };
 
 typedef ssize_t (*_pdfio_tconsume_cb_t)(void *data, size_t bytes);
diff --git a/pdfio-stream.c b/pdfio-stream.c
index 0f0715d..d921e80 100644
--- a/pdfio-stream.c
+++ b/pdfio-stream.c
@@ -14,6 +14,13 @@
 #include "pdfio-private.h"
 
 
+//
+// Local functions...
+//
+
+static ssize_t	stream_read(pdfio_stream_t *st, char *buffer, size_t bytes);
+
+
 //
 // 'pdfioStreamClose()' - Close a (data) stream in a PDF file.
 //
@@ -54,10 +61,35 @@ bool					// O - `true` on success, `false` on EOF
 pdfioStreamConsume(pdfio_stream_t *st,	// I - Stream
                    size_t         bytes)// I - Number of bytes to consume
 {
-  // TODO: Implement me
-  (void)st;
-  (void)bytes;
-  return (false);
+  size_t	buffered;		// Bytes currently held in the stream buffer
+  ssize_t	nread;			// Bytes returned by stream_read()
+
+
+  // Reject missing streams, streams not open for reading, and empty requests...
+  if (st == NULL || st->pdf->mode != _PDFIO_MODE_READ || bytes == 0)
+    return (false);
+
+  // Discard whole buffers until the rest of the request fits in what is
+  // buffered, or fail once the stream runs out of data...
+  for (;;)
+  {
+    buffered = (size_t)(st->bufend - st->bufptr);
+    if (buffered >= bytes)
+      break;
+
+    // Everything buffered is consumed; refill from the start of the buffer...
+    bytes      -= buffered;
+    nread      = stream_read(st, st->buffer, sizeof(st->buffer));
+    st->bufptr = st->buffer;
+    st->bufend = st->buffer + (nread > 0 ? nread : 0);
+
+    if (nread <= 0)
+      return (false);
+  }
+
+  // Skip the final portion of the request inside the current buffer...
+  st->bufptr += bytes;
+  return (true);
 }
 
 
@@ -107,6 +139,9 @@ _pdfioStreamOpen(pdfio_obj_t *obj,	// I - Object
                  bool        decode)	// I - Decode/decompress the stream?
 {
   pdfio_stream_t	*st;		// Stream
+  pdfio_dict_t		*dict = pdfioObjGetDict(obj);
+					// Object dictionary
+  size_t		length;		// Length of stream
 
 
   // Allocate a new stream object...
@@ -121,11 +156,36 @@ _pdfioStreamOpen(pdfio_obj_t *obj,	// I - Object
 
   _pdfioFileSeek(st->pdf, obj->stream_offset, SEEK_SET);
 
+  if ((length = (size_t)pdfioDictGetNumber(dict, "Length")) == 0)
+  {
+    // Length must be an indirect reference...
+    pdfio_obj_t	*lenobj;		// Length object
+
+    if ((lenobj = pdfioDictGetObject(dict, "Length")) == NULL)
+    {
+      _pdfioFileError(obj->pdf, "Unable to get length of stream.");
+      free(st);
+      return (NULL);
+    }
+
+    if (lenobj->value.type == PDFIO_VALTYPE_NONE)
+      _pdfioObjLoad(lenobj);
+
+    if (lenobj->value.type != PDFIO_VALTYPE_NUMBER || lenobj->value.value.number <= 0.0f)
+    {
+      _pdfioFileError(obj->pdf, "Unable to get length of stream.");
+      free(st);
+      return (NULL);
+    }
+
+    length = (size_t)lenobj->value.value.number;
+  }
+
+  st->remaining = length;
+
   if (decode)
   {
     // Try to decode/decompress the contents of this object...
-    pdfio_dict_t *dict = pdfioObjGetDict(obj);
-					// Object dictionary
     const char	*filter = pdfioDictGetName(dict, "Filter");
 					// Filter value
 
@@ -146,6 +206,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj,	// I - Object
     else if (!strcmp(filter, "FlateDecode"))
     {
       // Flate compression
+#if 0 // TODO: Determine whether we need to implement support for predictors
       int bpc = (int)pdfioDictGetNumber(dict, "BitsPerComponent");
 					// Bits per component
       int colors = (int)pdfioDictGetNumber(dict, "Colors");
@@ -154,8 +215,26 @@ _pdfioStreamOpen(pdfio_obj_t *obj,	// I - Object
 					// Number of columns
       int predictor = (int)pdfioDictGetNumber(dict, "Predictor");
 					// Predictory value, if any
+#endif // 0
 
       st->filter = PDFIO_FILTER_FLATE;
+
+      st->flate.zalloc    = (alloc_func)0;
+      st->flate.zfree     = (free_func)0;
+      st->flate.opaque    = (voidpf)0;
+      st->flate.next_in   = (Bytef *)st->cbuffer;
+      st->flate.next_out  = NULL;
+      st->flate.avail_in  = (uInt)_pdfioFileRead(st->pdf, st->cbuffer, sizeof(st->cbuffer));
+      st->flate.avail_out = 0;
+
+      if (inflateInit(&(st->flate)) != Z_OK)
+      {
+	_pdfioFileError(st->pdf, "Unable to start Flate filter.");
+	free(st);
+	return (NULL);
+      }
+
+      st->remaining -= st->flate.avail_in;
     }
     else if (!strcmp(filter, "LZWDecode"))
     {
@@ -189,12 +268,40 @@ pdfioStreamPeek(pdfio_stream_t *st,	// I - Stream
                 void           *buffer,	// I - Buffer
                 size_t         bytes)	// I - Size of buffer
 {
-  // TODO: Implement me
-  (void)st;
-  (void)buffer;
-  (void)bytes;
+  size_t	remaining;		// Remaining bytes in buffer
 
-  return (-1);
+
+  // Range check input; -1 is returned for a bad stream, mode, or buffer...
+  if (!st || st->pdf->mode != _PDFIO_MODE_READ || !buffer || !bytes)
+    return (-1);
+
+  // See if we have enough bytes in the buffer...
+  if ((remaining = (size_t)(st->bufend - st->bufptr)) < bytes)
+  {
+    // No, shift the unread bytes to the front of the buffer and read more...
+    ssize_t	rbytes;			// Bytes read
+
+    if (remaining > 0)
+      memmove(st->buffer, st->bufptr, remaining);
+
+    st->bufptr = st->buffer;
+    st->bufend = st->buffer + remaining;
+    // Append new data after the preserved bytes (bufend), not on top of them...
+    if ((rbytes = stream_read(st, st->bufend, sizeof(st->buffer) - remaining)) > 0)
+    {
+      st->bufend += rbytes;
+      remaining  += (size_t)rbytes;
+    }
+  }
+
+  // Copy at most what is buffered; bufptr is not advanced, so this is a peek...
+  if (bytes > remaining)
+    bytes = remaining;
+
+  memcpy(buffer, st->bufptr, bytes);
+
+  // Return the number of bytes that were copied...
+  return ((ssize_t)bytes);
 }
 
 
@@ -247,12 +354,57 @@ pdfioStreamRead(
     void           *buffer,		// I - Buffer
     size_t         bytes)		// I - Bytes to read
 {
-  // TODO: Implement me
-  (void)st;
-  (void)buffer;
-  (void)bytes;
+  char		*bufptr = (char *)buffer;
+					// Pointer into buffer
+  size_t	remaining;		// Remaining bytes in buffer
+  ssize_t	rbytes;			// Bytes read
 
-  return (-1);
+
+  // Range check input; -1 is returned for a bad stream, mode, or buffer...
+  if (!st || st->pdf->mode != _PDFIO_MODE_READ || !buffer || !bytes)
+    return (-1);
+
+  // Loop until we have the requested bytes or hit the end of the stream...
+  while ((remaining = (size_t)(st->bufend - st->bufptr)) < bytes)
+  {
+    memcpy(bufptr, st->bufptr, remaining);
+    bufptr += remaining;
+    bytes -= remaining;
+
+    if (bytes >= sizeof(st->buffer))
+    {
+      // Read large amounts directly to caller's buffer...
+      if ((rbytes = stream_read(st, bufptr, bytes)) > 0)
+      {
+        bufptr += rbytes;
+        bytes  = 0;
+      }
+
+      st->bufptr = st->bufend = st->buffer;
+      break;
+    }
+    else if ((rbytes = stream_read(st, st->buffer, sizeof(st->buffer))) > 0)
+    {
+      st->bufptr = st->buffer;
+      st->bufend = st->buffer + rbytes;
+    }
+    else
+    {
+      st->bufptr = st->bufend = st->buffer;
+      break;
+    }
+  }
+
+  // Copy the tail only if buffered; the buffer is empty after a failed read...
+  if (bytes > 0 && bytes <= (size_t)(st->bufend - st->bufptr))
+  {
+    memcpy(bufptr, st->bufptr, bytes);
+    bufptr     += bytes;
+    st->bufptr += bytes;
+  }
+
+  // Return the number of bytes that were read (0 at the end of the stream)...
+  return (bufptr - (char *)buffer);
 }
 
 
@@ -273,3 +425,68 @@ pdfioStreamWrite(
 
   return (false);
 }
+
+
+//
+// 'stream_read()' - Read data from a stream, applying the stream's filter.
+//
+// Unfiltered reads are capped at the bytes remaining in the stream.  Flate
+// reads refill the compressed buffer when it is empty and then inflate once
+// into the caller's buffer, so fewer than "bytes" bytes may be returned.
+//
+
+static ssize_t				// O - Number of bytes read or `-1` on error
+stream_read(pdfio_stream_t *st,		// I - Stream
+            char           *buffer,	// I - Buffer
+            size_t         bytes)	// I - Number of bytes to read
+{
+  ssize_t	nread;			// Bytes read from the file
+  size_t	want;			// Bytes to request from the file
+  int		zstatus;		// Status of decompression
+
+
+  switch (st->filter)
+  {
+    case PDFIO_FILTER_NONE :
+        // No filtering, but limit reads to the length of the stream...
+        want = bytes > st->remaining ? st->remaining : bytes;
+
+        if ((nread = _pdfioFileRead(st->pdf, buffer, want)) > 0)
+          st->remaining -= (size_t)nread;
+
+        return (nread);
+
+    case PDFIO_FILTER_FLATE :
+        if (st->flate.avail_in == 0)
+        {
+          // Compressed buffer is empty, read more from the file...
+          want  = sizeof(st->cbuffer) > st->remaining ? st->remaining : sizeof(st->cbuffer);
+          nread = _pdfioFileRead(st->pdf, st->cbuffer, want);
+
+          if (nread <= 0)
+            return (-1);		// End of file...
+
+          st->remaining      -= (size_t)nread;
+          st->flate.next_in  = (Bytef *)st->cbuffer;
+          st->flate.avail_in = (uInt)nread;
+        }
+
+        // Decompress into the caller's buffer...
+        st->flate.next_out  = (Bytef *)buffer;
+        st->flate.avail_out = (uInt)bytes;
+        zstatus             = inflate(&(st->flate), Z_NO_FLUSH);
+
+        if (zstatus < Z_OK)
+        {
+          _pdfioFileError(st->pdf, "Unable to decompress stream data: %d", zstatus);
+          return (-1);
+        }
+
+        return (st->flate.next_out - (Bytef *)buffer);
+
+    default :
+        // If we get here something bad happened...
+        return (-1);
+  }
+}
+