mirror of
https://github.com/michaelrsweet/pdfio.git
synced 2025-02-26 22:02:49 +01:00
Add support for 'repairing' damaged PDF files (Issue #45)
This commit is contained in:
parent
77117ac789
commit
8d72f22efe
@ -8,6 +8,7 @@ v1.5.0 - YYYY-MM-DD
|
||||
- Added `pdfioFileCreateICCObjFromData` API.
|
||||
- Added `pdfioFileGetModDate()` API (Issue #88)
|
||||
- Added support for using libpng to embed PNG images in PDF output (Issue #90)
|
||||
- Now support opening damaged PDF files (Issue #45)
|
||||
- Updated the pdf2txt example to support font encodings.
|
||||
|
||||
|
||||
|
159
pdfio-file.c
159
pdfio-file.c
@ -25,6 +25,7 @@ static struct lconv *get_lconv(void);
|
||||
static bool load_obj_stream(pdfio_obj_t *obj);
|
||||
static bool load_pages(pdfio_file_t *pdf, pdfio_obj_t *obj, size_t depth);
|
||||
static bool load_xref(pdfio_file_t *pdf, off_t xref_offset, pdfio_password_cb_t password_cb, void *password_data);
|
||||
static bool repair_xref(pdfio_file_t *pdf, pdfio_password_cb_t password_cb, void *password_data);
|
||||
static bool write_pages(pdfio_file_t *pdf);
|
||||
static bool write_trailer(pdfio_file_t *pdf);
|
||||
|
||||
@ -1070,7 +1071,10 @@ pdfioFileOpen(
|
||||
xref_offset = (off_t)strtol(ptr + 9, NULL, 10);
|
||||
|
||||
if (!load_xref(pdf, xref_offset, password_cb, password_cbdata))
|
||||
goto error;
|
||||
{
|
||||
if (!repair_xref(pdf, password_cb, password_cbdata))
|
||||
goto error;
|
||||
}
|
||||
|
||||
return (pdf);
|
||||
|
||||
@ -2165,6 +2169,159 @@ load_xref(
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// 'repair_xref()' - Try to "repair" a PDF file and its cross-references...
|
||||
//
|
||||
|
||||
static bool // O - `true` on success, `false` on failure
|
||||
repair_xref(
|
||||
pdfio_file_t *pdf, // I - PDF file
|
||||
pdfio_password_cb_t password_cb, // I - Password callback or `NULL` for none
|
||||
void *password_data) // I - Password callback data, if any
|
||||
{
|
||||
char line[16384], // Line from file
|
||||
*ptr; // Pointer into line
|
||||
off_t line_offset; // Offset in file
|
||||
intmax_t number; // Object number
|
||||
int generation; // Generation number
|
||||
size_t i; // Looping var
|
||||
size_t num_sobjs = 0; // Number of object streams
|
||||
pdfio_obj_t *sobjs[16384]; // Object streams to load
|
||||
|
||||
|
||||
// Read from the beginning of the file, looking for
|
||||
if ((line_offset = _pdfioFileSeek(pdf, 0, SEEK_SET)) < 0)
|
||||
return (false);
|
||||
|
||||
while (_pdfioFileGets(pdf, line, sizeof(line)))
|
||||
{
|
||||
// See if this is the start of an object...
|
||||
if (line[0] >= '1' && line[0] <= '9')
|
||||
{
|
||||
// Maybe, look some more...
|
||||
if ((number = strtoimax(line, &ptr, 10)) >= 1 && (generation = (int)strtol(ptr, &ptr, 10)) >= 0 && generation < 65536)
|
||||
{
|
||||
while (isspace(*ptr & 255))
|
||||
ptr ++;
|
||||
|
||||
if (!strncmp(ptr, "obj", 3))
|
||||
{
|
||||
// Yes, start of an object...
|
||||
pdfio_obj_t *obj; // Object
|
||||
_pdfio_token_t tb; // Token buffer/stack
|
||||
|
||||
PDFIO_DEBUG("OBJECT %ld %d at offset %ld\n", (long)number, generation, (long)line_offset);
|
||||
|
||||
if ((obj = add_obj(pdf, (size_t)number, (unsigned short)generation, line_offset)) == NULL)
|
||||
{
|
||||
_pdfioFileError(pdf, "Unable to allocate memory for object.");
|
||||
return (false);
|
||||
}
|
||||
|
||||
_pdfioTokenInit(&tb, pdf, (_pdfio_tconsume_cb_t)_pdfioFileConsume, (_pdfio_tpeek_cb_t)_pdfioFilePeek, pdf);
|
||||
|
||||
if (!_pdfioValueRead(pdf, obj, &tb, &obj->value, 0))
|
||||
{
|
||||
_pdfioFileError(pdf, "Unable to read cross-reference stream dictionary.");
|
||||
return (false);
|
||||
}
|
||||
|
||||
if (_pdfioTokenGet(&tb, line, sizeof(line)) && strcmp(line, "stream"))
|
||||
{
|
||||
const char *type = pdfioObjGetType(obj);
|
||||
// Object type
|
||||
|
||||
_pdfioTokenFlush(&tb);
|
||||
obj->stream_offset = _pdfioFileTell(pdf);
|
||||
|
||||
if (type && !strcmp(type, "ObjStm") && num_sobjs < (sizeof(sobjs) / sizeof(sobjs[0])))
|
||||
{
|
||||
sobjs[num_sobjs] = obj;
|
||||
num_sobjs ++;
|
||||
}
|
||||
|
||||
if (type && !strcmp(type, "XRef") && !pdf->trailer_dict)
|
||||
{
|
||||
// Save the trailer dictionary...
|
||||
pdf->trailer_dict = pdfioObjGetDict(obj);
|
||||
pdf->encrypt_obj = pdfioDictGetObj(pdf->trailer_dict, "Encrypt");
|
||||
pdf->id_array = pdfioDictGetArray(pdf->trailer_dict, "ID");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!strncmp(line, "trailer", 7) && (!line[7] || isspace(line[7] & 255) || line[7] == '<'))
|
||||
{
|
||||
// Trailer dictionary
|
||||
_pdfio_token_t tb; // Token buffer/stack
|
||||
_pdfio_value_t trailer; // Trailer
|
||||
|
||||
if (line[7])
|
||||
{
|
||||
// Probably the start of the trailer dictionary, rewind the file so
|
||||
// we can read it...
|
||||
_pdfioFileSeek(pdf, line_offset + 7, SEEK_SET);
|
||||
}
|
||||
|
||||
PDFIO_DEBUG("TRAILER at offset %ld\n", (long)line_offset);
|
||||
|
||||
_pdfioTokenInit(&tb, pdf, (_pdfio_tconsume_cb_t)_pdfioFileConsume, (_pdfio_tpeek_cb_t)_pdfioFilePeek, pdf);
|
||||
if (!_pdfioValueRead(pdf, NULL, &tb, &trailer, 0))
|
||||
{
|
||||
_pdfioFileError(pdf, "Unable to read cross-reference stream dictionary.");
|
||||
return (false);
|
||||
}
|
||||
else if (trailer.type != PDFIO_VALTYPE_DICT)
|
||||
{
|
||||
_pdfioFileError(pdf, "Trailer is not a dictionary.");
|
||||
return (false);
|
||||
}
|
||||
|
||||
_pdfioTokenFlush(&tb);
|
||||
|
||||
if (!pdf->trailer_dict)
|
||||
{
|
||||
// Save the trailer dictionary and grab the root (catalog) and info
|
||||
// objects...
|
||||
pdf->trailer_dict = trailer.value.dict;
|
||||
pdf->encrypt_obj = pdfioDictGetObj(pdf->trailer_dict, "Encrypt");
|
||||
pdf->id_array = pdfioDictGetArray(pdf->trailer_dict, "ID");
|
||||
}
|
||||
}
|
||||
|
||||
// Get the offset for the next line...
|
||||
line_offset = _pdfioFileTell(pdf);
|
||||
}
|
||||
|
||||
// If the trailer contains an Encrypt key, try unlocking the file...
|
||||
if (pdf->encrypt_obj && !_pdfioCryptoUnlock(pdf, password_cb, password_data))
|
||||
return (false);
|
||||
|
||||
// Load any stream objects...
|
||||
for (i = 0; i < num_sobjs; i ++)
|
||||
{
|
||||
if (!load_obj_stream(sobjs[i]))
|
||||
return (false);
|
||||
}
|
||||
|
||||
// Once we have all of the xref tables loaded, get the important objects and
|
||||
// build the pages array...
|
||||
pdf->info_obj = pdfioDictGetObj(pdf->trailer_dict, "Info");
|
||||
|
||||
if ((pdf->root_obj = pdfioDictGetObj(pdf->trailer_dict, "Root")) == NULL)
|
||||
{
|
||||
_pdfioFileError(pdf, "Missing Root object.");
|
||||
return (false);
|
||||
}
|
||||
|
||||
PDFIO_DEBUG("repair_xref: Root=%p(%lu)\n", pdf->root_obj, (unsigned long)pdf->root_obj->number);
|
||||
|
||||
// Load pages...
|
||||
return (load_pages(pdf, pdfioDictGetObj(pdfioObjGetDict(pdf->root_obj), "Pages"), 0));
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// 'write_pages()' - Write the PDF pages objects.
|
||||
//
|
||||
|
Loading…
x
Reference in New Issue
Block a user