Compare commits

...

2 Commits

Author SHA1 Message Date
Michael R Sweet
5d760e7315
Update some debug printfs. 2023-12-13 12:48:31 -05:00
Michael R Sweet
2a85baaf81
Increase the maximum number of object streams in a file (Issue #58) - most files
only contain 1 or 2...

Change the implementation of add/find object to use a custom binary insertion
sort algorithm rather than doing a qsort after every addition.  This results in
a significant improvement in open speed - from 2371 seconds (about 39.5 minutes)
to 3.1 seconds for one large test file (an ESRI standard).
2023-12-13 12:26:25 -05:00
4 changed files with 111 additions and 39 deletions

View File

@ -5,16 +5,20 @@ Changes in PDFio
v1.2.0 (Month DD, YYYY) v1.2.0 (Month DD, YYYY)
----------------------- -----------------------
- Now use autoconf to configure the PDFio sources (Issue #54)
- Added `pdfioFileCreateNumberObj` and `pdfioFileCreateStringObj` functions - Added `pdfioFileCreateNumberObj` and `pdfioFileCreateStringObj` functions
(Issue #14) (Issue #14)
- Added `pdfioContentTextMeasure` function (Issue #17) - Added `pdfioContentTextMeasure` function (Issue #17)
- Added `pdfioContentTextNewLineShow` and `pdfioContentTextNewLineShowf` - Added `pdfioContentTextNewLineShow` and `pdfioContentTextNewLineShowf`
functions (Issue #24) functions (Issue #24)
- Renamed `pdfioContentTextNextLine` to `pdfioContentTextNewLine`. - Renamed `pdfioContentTextNextLine` to `pdfioContentTextNewLine`.
- Now use autoconf to configure the PDFio sources (Issue #54) - Updated the maximum number of object streams in a single file from 4096 to
8192 (Issue #58)
- Updated the token reading code to protect against some obvious abuses of the - Updated the token reading code to protect against some obvious abuses of the
PDF format. PDF format.
- Updated the xref reading code to protect against loops. - Updated the xref reading code to protect against loops.
- Updated the object handling code to use a binary insertion algorithm, which
provides a significant (~800x) improvement in open times.
- Fixed handling of encrypted PDFs with per-object file IDs (Issue #42) - Fixed handling of encrypted PDFs with per-object file IDs (Issue #42)
- Fixed handling of trailer dictionaries that started immediately after the - Fixed handling of trailer dictionaries that started immediately after the
"trailer" keyword (Issue #58) "trailer" keyword (Issue #58)

View File

@ -518,7 +518,7 @@ _pdfioDictRead(pdfio_file_t *pdf, // I - PDF file
_pdfio_value_t value; // Dictionary value _pdfio_value_t value; // Dictionary value
PDFIO_DEBUG("_pdfioDictRead(pdf=%p)\n", pdf); PDFIO_DEBUG("_pdfioDictRead(pdf=%p, obj=%p, tb=%p, depth=%lu)\n", pdf, obj, tb, (unsigned long)depth);
// Create a dictionary and start reading... // Create a dictionary and start reading...
if ((dict = pdfioDictCreate(pdf)) == NULL) if ((dict = pdfioDictCreate(pdf)) == NULL)
@ -530,6 +530,7 @@ _pdfioDictRead(pdfio_file_t *pdf, // I - PDF file
if (!strcmp(key, ">>")) if (!strcmp(key, ">>"))
{ {
// End of dictionary... // End of dictionary...
PDFIO_DEBUG("_pdfioDictRead: Returning dictionary value...\n");
return (dict); return (dict);
} }
else if (key[0] != '/') else if (key[0] != '/')
@ -548,14 +549,14 @@ _pdfioDictRead(pdfio_file_t *pdf, // I - PDF file
if (!_pdfioValueRead(pdf, obj, tb, &value, depth)) if (!_pdfioValueRead(pdf, obj, tb, &value, depth))
{ {
_pdfioFileError(pdf, "Missing value for dictionary key."); _pdfioFileError(pdf, "Missing value for dictionary key '%s'.", key + 1);
break; break;
} }
if (!_pdfioDictSetValue(dict, pdfioStringCreate(pdf, key + 1), &value)) if (!_pdfioDictSetValue(dict, pdfioStringCreate(pdf, key + 1), &value))
break; break;
// PDFIO_DEBUG("_pdfioDictRead: Set %s.\n", key); PDFIO_DEBUG("_pdfioDictRead: Set %s.\n", key);
} }
// Dictionary is invalid - pdfioFileClose will free the memory, return NULL // Dictionary is invalid - pdfioFileClose will free the memory, return NULL

View File

@ -19,7 +19,6 @@
static pdfio_obj_t *add_obj(pdfio_file_t *pdf, size_t number, unsigned short generation, off_t offset); static pdfio_obj_t *add_obj(pdfio_file_t *pdf, size_t number, unsigned short generation, off_t offset);
static int compare_objmaps(_pdfio_objmap_t *a, _pdfio_objmap_t *b); static int compare_objmaps(_pdfio_objmap_t *a, _pdfio_objmap_t *b);
static int compare_objs(pdfio_obj_t **a, pdfio_obj_t **b);
static const char *get_info_string(pdfio_file_t *pdf, const char *key); static const char *get_info_string(pdfio_file_t *pdf, const char *key);
static bool load_obj_stream(pdfio_obj_t *obj); static bool load_obj_stream(pdfio_obj_t *obj);
static bool load_pages(pdfio_file_t *pdf, pdfio_obj_t *obj, size_t depth); static bool load_pages(pdfio_file_t *pdf, pdfio_obj_t *obj, size_t depth);
@ -916,20 +915,56 @@ pdfioFileFindObj(
pdfio_file_t *pdf, // I - PDF file pdfio_file_t *pdf, // I - PDF file
size_t number) // I - Object number (1 to N) size_t number) // I - Object number (1 to N)
{ {
pdfio_obj_t key, // Search key size_t left, // Left object
*keyptr, // Pointer to key right, // Right object
**match; // Pointer to match current; // Current object
if (pdf->num_objs > 0) PDFIO_DEBUG("pdfioFileFindObj(pdf=%p, number=%lu) alloc_objs=%lu, num_objs=%lu, objs=%p\n", (void *)pdf, (unsigned long)number, (unsigned long)(pdf ? pdf->alloc_objs : 0), (unsigned long)(pdf ? pdf->num_objs : 0), (void *)(pdf ? pdf->objs : NULL));
// Range check input...
if (!pdf || pdf->num_objs == 0 || number < 1)
return (NULL);
// Do a binary search for the object...
if ((current = number - 1) >= pdf->num_objs)
current = pdf->num_objs / 2;
PDFIO_DEBUG("pdfioFileFindObj: objs[current=%lu]=%p\n", (unsigned long)current, (void *)pdf->objs[current]);
if (number == pdf->objs[current]->number)
{ {
key.number = number; // Fast match...
keyptr = &key; return (pdf->objs[current]);
match = (pdfio_obj_t **)bsearch(&keyptr, pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs); }
else if (number < pdf->objs[current]->number)
return (match ? *match : NULL); {
left = 0;
right = current;
}
else
{
left = current;
right = pdf->num_objs - 1;
} }
while ((right - left) > 1)
{
current = (left + right) / 2;
if (number == pdf->objs[current]->number)
return (pdf->objs[current]);
else if (number < pdf->objs[current]->number)
right = current;
else
left = current;
}
if (number == pdf->objs[left]->number)
return (pdf->objs[left]);
else if (number == pdf->objs[right]->number)
return (pdf->objs[right]);
else
return (NULL); return (NULL);
} }
@ -1385,6 +1420,9 @@ add_obj(pdfio_file_t *pdf, // I - PDF file
off_t offset) // I - Offset in file off_t offset) // I - Offset in file
{ {
pdfio_obj_t *obj; // Object pdfio_obj_t *obj; // Object
size_t left, // Left object
right, // Right object
current; // Current object (center)
// Allocate memory for the object... // Allocate memory for the object...
@ -1410,8 +1448,6 @@ add_obj(pdfio_file_t *pdf, // I - PDF file
pdf->alloc_objs += 32; pdf->alloc_objs += 32;
} }
pdf->objs[pdf->num_objs ++] = obj;
obj->pdf = pdf; obj->pdf = pdf;
obj->number = number; obj->number = number;
obj->generation = generation; obj->generation = generation;
@ -1419,9 +1455,56 @@ add_obj(pdfio_file_t *pdf, // I - PDF file
PDFIO_DEBUG("add_obj: obj=%p, ->pdf=%p, ->number=%lu, ->offset=%lu\n", obj, pdf, (unsigned long)obj->number, (unsigned long)offset); PDFIO_DEBUG("add_obj: obj=%p, ->pdf=%p, ->number=%lu, ->offset=%lu\n", obj, pdf, (unsigned long)obj->number, (unsigned long)offset);
// Re-sort object array as needed... // Insert object into array as needed...
if (pdf->num_objs > 1 && pdf->objs[pdf->num_objs - 2]->number > number) if (pdf->num_objs == 0 || obj->number > pdf->objs[pdf->num_objs - 1]->number)
qsort(pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs); {
// Append object...
PDFIO_DEBUG("add_obj: Appending at %lu\n", (unsigned long)pdf->num_objs);
pdf->objs[pdf->num_objs] = obj;
pdf->last_obj = pdf->num_objs;
}
else
{
// Insert object...
if (obj->number < pdf->objs[pdf->last_obj]->number)
{
left = 0;
right = pdf->last_obj;
}
else
{
left = pdf->last_obj;
right = pdf->num_objs - 1;
}
while ((right - left) > 1)
{
current = (left + right) / 2;
if (obj->number < pdf->objs[current]->number)
right = current;
else
left = current;
}
if (obj->number < pdf->objs[left]->number)
current = left;
else if (obj->number < pdf->objs[right]->number)
current = right;
else
current = right;
PDFIO_DEBUG("add_obj: Inserting at %lu\n", (unsigned long)current);
if (current < pdf->num_objs)
memmove(pdf->objs + current + 1, pdf->objs + current, (pdf->num_objs - current) * sizeof(pdfio_obj_t *));
pdf->objs[current] = obj;
pdf->last_obj = current;
}
pdf->num_objs ++;
return (obj); return (obj);
} }
@ -1448,23 +1531,6 @@ compare_objmaps(_pdfio_objmap_t *a, // I - First object map
} }
//
// 'compare_objs()' - Compare the object numbers of two objects.
//
static int // O - Result of comparison
compare_objs(pdfio_obj_t **a, // I - First object
pdfio_obj_t **b) // I - Second object
{
if ((*a)->number < (*b)->number)
return (-1);
else if ((*a)->number == (*b)->number)
return (0);
else
return (1);
}
// //
// 'get_info_string()' - Get a string value from the Info dictionary. // 'get_info_string()' - Get a string value from the Info dictionary.
// //
@ -1737,7 +1803,7 @@ load_xref(
pdfio_stream_t *st; // Stream pdfio_stream_t *st; // Stream
unsigned char buffer[32]; // Read buffer unsigned char buffer[32]; // Read buffer
size_t num_sobjs = 0, // Number of object streams size_t num_sobjs = 0, // Number of object streams
sobjs[4096]; // Object streams to load sobjs[8192]; // Object streams to load
pdfio_obj_t *current; // Current object pdfio_obj_t *current; // Current object
if ((number = strtoimax(line, &ptr, 10)) < 1) if ((number = strtoimax(line, &ptr, 10)) < 1)

View File

@ -266,7 +266,8 @@ struct _pdfio_file_s // PDF file structure
alloc_dicts; // Allocated dictionaries alloc_dicts; // Allocated dictionaries
pdfio_dict_t **dicts; // Dictionaries pdfio_dict_t **dicts; // Dictionaries
size_t num_objs, // Number of objects size_t num_objs, // Number of objects
alloc_objs; // Allocated objects alloc_objs, // Allocated objects
last_obj; // Last object added
pdfio_obj_t **objs, // Objects pdfio_obj_t **objs, // Objects
*current_obj; // Current object being written/read *current_obj; // Current object being written/read
size_t num_objmaps, // Number of object maps size_t num_objmaps, // Number of object maps