Update the token reading code to protect against obvious abuses of the PDF format.

Update the xref loading code to protect against looping xref tables.
2026-04-09 21:40:00 +02:00 · 2023-12-07 17:50:52 -05:00
parent ed723a46dc
commit c992b2ba89
3 changed files with 62 additions and 5 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -12,6 +12,9 @@ v1.2.0 (Month DD, YYYY)
  functions (Issue #24)
 - Renamed `pdfioContentTextNextLine` to `pdfioContentTextNewLine`.
 - Now use autoconf to configure the PDFio sources (Issue #54)
+- Updated the token reading code to protect against some obvious abuses of the
+  PDF format.
+- Updated the xref reading code to protect against loops.


 v1.1.4 (December 3, 2023)
--- a/pdfio-file.c
+++ b/pdfio-file.c
@@ -2091,8 +2091,19 @@ load_xref(
    PDFIO_DEBUG_VALUE(&trailer);
    PDFIO_DEBUG("\n");

-    if ((xref_offset = (off_t)pdfioDictGetNumber(trailer.value.dict, "Prev")) <= 0)
+    off_t new_offset = (off_t)pdfioDictGetNumber(trailer.value.dict, "Prev");
+
+    if (new_offset <= 0)
+    {
      done = true;
+    }
+    else if (new_offset == xref_offset)
+    {
+      _pdfioFileError(pdf, "Recursive xref table.");
+      return (false);
+    }
+
+    xref_offset = new_offset;
  }

  // Once we have all of the xref tables loaded, get the important objects and
--- a/pdfio-token.c
+++ b/pdfio-token.c
@@ -208,9 +208,10 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
 	*bufend,			// End of buffer
 	state = '\0';			// Current state
  bool	saw_nul = false;		// Did we see a nul character?
+  size_t count = 0;			// Number of whitespace/comment bytes
+


-  //
  // "state" is:
  //
  // - '\0' for idle
@@ -229,17 +230,38 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
  // Skip leading whitespace...
  while ((ch = get_char(tb)) != EOF)
  {
+    count ++;
+
    if (ch == '%')
    {
      // Skip comment
+      PDFIO_DEBUG("_pdfioTokenRead: Skipping comment...\n");
      while ((ch = get_char(tb)) != EOF)
      {
+        count ++;
+
 	if (ch == '\n' || ch == '\r')
+	{
 	  break;
+	}
+	else if (count > 2048)
+	{
+	  _pdfioFileError(tb->pdf, "Comment too long.");
+	  *bufptr = '\0';
+	  return (false);
+	}
      }
    }
    else if (!isspace(ch))
+    {
      break;
+    }
+    else if (count > 2048)
+    {
+      _pdfioFileError(tb->pdf, "Too much whitespace.");
+      *bufptr = '\0';
+      return (false);
+    }
  }

  if (ch == EOF)
@@ -266,6 +288,8 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
    *bufptr++ = (char)ch;
  }

+  PDFIO_DEBUG("_pdfioTokenRead: state='%c'\n", state);
+
  switch (state)
  {
    case '(' : // Literal string
@@ -431,6 +455,7 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
 	  if (!isdigit(ch) && ch != '.')
 	  {
 	    // End of number...
+	    PDFIO_DEBUG("_pdfioTokenRead: End of number with ch=0x%02x\n", ch);
 	    tb->bufptr --;
 	    break;
 	  }
@@ -496,6 +521,13 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
 	    return (false);
 	  }
 	}
+
+	if (bufptr == (buffer + 1))
+	{
+	  _pdfioFileError(tb->pdf, "Empty name.");
+	  *bufptr = '\0';
+	  return (false);
+	}
 	break;

    case '<' : // Potential hex string
@@ -519,6 +551,8 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
 	  return (false);
 	}

+        count = 0;
+
        do
 	{
 	  if (isxdigit(ch))
@@ -527,6 +561,7 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
 	    {
 	      // Hex digit
 	      *bufptr++ = (char)ch;
+	      count = 0;
 	    }
 	    else
 	    {
@@ -542,6 +577,16 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack
 	    *bufptr = '\0';
 	    return (false);
 	  }
+	  else
+	  {
+	    count ++;
+	    if (count > 2048)
+	    {
+	      _pdfioFileError(tb->pdf, "Too much whitespace.");
+	      *bufptr = '\0';
+	      return (false);
+	    }
+	  }
 	}
 	while ((ch = get_char(tb)) != EOF && ch != '>');

@@ -569,7 +614,7 @@ _pdfioTokenRead(_pdfio_token_t *tb,	// I - Token buffer/stack

  *bufptr = '\0';

-//  PDFIO_DEBUG("_pdfioTokenRead: Read '%s'.\n", buffer);
+  PDFIO_DEBUG("_pdfioTokenRead: Read '%s'.\n", buffer);

  return (bufptr > buffer);
 }
@@ -606,7 +651,6 @@ get_char(_pdfio_token_t *tb)		// I - Token buffer
    tb->bufptr = tb->buffer;
    tb->bufend = tb->buffer + bytes;

-#if 0
 #ifdef DEBUG
    unsigned char *ptr;			// Pointer into buffer

@@ -620,7 +664,6 @@ get_char(_pdfio_token_t *tb)		// I - Token buffer
    }
    PDFIO_DEBUG("'\n");
 #endif // DEBUG
-#endif // 0
  }

  // Return the next character...