Multiple fixes to allow PDFio to read more edge-case PDFs.

- Update _pdfioFileGets to allow for really long lines: a new "discard"
  argument lets callers drop the excess characters when it doesn't
  matter if we lose the end of the line.
- Update "startxref" detection at the end of the file.
- Refactor the repair logic so that you get just a single WARNING about
  the repair (debug messages are available for testing).
- Allow whitespace after the "obj" in the object header.
- Make sure to close xref stream on error.
- Update predictor code to support Colors <= 32 (some implementations
  set Colors to the number of bytes per record in the xref stream,
  which prevents the predictor from doing anything...)
- Allow CR CR in xref table.
- Clear old trailer/root/pages/etc. objects when repairing, and update
  existing objects that were already found in load_xref.
- Don't set current object in pdfioObjectCreate/OpenStream if the
  stream can't be created/opened.
This commit is contained in:
Michael R Sweet
2025-04-24 11:09:54 -04:00
parent 278ddb7fa7
commit cad8f450ab
6 changed files with 148 additions and 84 deletions

View File

@@ -134,19 +134,20 @@ _pdfioFileGetChar(pdfio_file_t *pdf) // I - PDF file
bool // O - `true` on success, `false` on error
_pdfioFileGets(pdfio_file_t *pdf, // I - PDF file
char *buffer, // I - Line buffer
size_t bufsize) // I - Size of line buffer
size_t bufsize, // I - Size of line buffer
bool discard) // I - OK to discard excess line chars?
{
bool eol = false; // End of line?
char *bufptr = buffer, // Pointer into buffer
*bufend = buffer + bufsize - 1; // Pointer to end of buffer
PDFIO_DEBUG("_pdfioFileGets(pdf=%p, buffer=%p, bufsize=%lu) bufpos=%ld, buffer=%p, bufptr=%p, bufend=%p, offset=%lu\n", pdf, buffer, (unsigned long)bufsize, (long)pdf->bufpos, pdf->buffer, pdf->bufptr, pdf->bufend, (unsigned long)(pdf->bufpos + (pdf->bufptr - pdf->buffer)));
PDFIO_DEBUG("_pdfioFileGets(pdf=%p, buffer=%p, bufsize=%lu, discard=%s) bufpos=%ld, buffer=%p, bufptr=%p, bufend=%p, offset=%lu\n", pdf, buffer, (unsigned long)bufsize, discard ? "true" : "false", (long)pdf->bufpos, pdf->buffer, pdf->bufptr, pdf->bufend, (unsigned long)(pdf->bufpos + (pdf->bufptr - pdf->buffer)));
while (!eol)
{
// If there are characters ready in the buffer, use them...
while (!eol && pdf->bufptr < pdf->bufend && bufptr < bufend)
while (!eol && pdf->bufptr < pdf->bufend)
{
char ch = *(pdf->bufptr++); // Next character in buffer
@@ -168,8 +169,10 @@ _pdfioFileGets(pdfio_file_t *pdf, // I - PDF file
pdf->bufptr ++;
}
}
else
else if (bufptr < bufend)
*bufptr++ = ch;
else if (!discard)
break;
}
// Fill the read buffer as needed...