Increase the maximum number of object streams in a file (Issue #58) - most files only contain 1 or 2.

Change the implementation of add/find object to use a custom binary insertion
sort algorithm rather than doing a qsort after every addition.  This results in
a significant improvement in open speed - from 2371 seconds (about 39.5 minutes)
to 3.1 seconds for one large test file (an ESRI standard).
This commit is contained in:
Michael R Sweet 2023-12-13 12:21:59 -05:00
parent 2b92044504
commit 2a85baaf81
No known key found for this signature in database
GPG Key ID: BE67C75EC81F3244
3 changed files with 107 additions and 36 deletions

View File

@ -5,16 +5,20 @@ Changes in PDFio
v1.2.0 (Month DD, YYYY) v1.2.0 (Month DD, YYYY)
----------------------- -----------------------
- Now use autoconf to configure the PDFio sources (Issue #54)
- Added `pdfioFileCreateNumberObj` and `pdfioFileCreateStringObj` functions - Added `pdfioFileCreateNumberObj` and `pdfioFileCreateStringObj` functions
(Issue #14) (Issue #14)
- Added `pdfioContentTextMeasure` function (Issue #17) - Added `pdfioContentTextMeasure` function (Issue #17)
- Added `pdfioContentTextNewLineShow` and `pdfioContentTextNewLineShowf` - Added `pdfioContentTextNewLineShow` and `pdfioContentTextNewLineShowf`
functions (Issue #24) functions (Issue #24)
- Renamed `pdfioContentTextNextLine` to `pdfioContentTextNewLine`. - Renamed `pdfioContentTextNextLine` to `pdfioContentTextNewLine`.
- Now use autoconf to configure the PDFio sources (Issue #54) - Updated the maximum number of object streams in a single file from 4096 to
8192 (Issue #58)
- Updated the token reading code to protect against some obvious abuses of the - Updated the token reading code to protect against some obvious abuses of the
PDF format. PDF format.
- Updated the xref reading code to protect against loops. - Updated the xref reading code to protect against loops.
- Updated the object handling code to use a binary insertion algorithm -
provides a significant (~800x) improvement in open times.
- Fixed handling of encrypted PDFs with per-object file IDs (Issue #42) - Fixed handling of encrypted PDFs with per-object file IDs (Issue #42)
- Fixed handling of trailer dictionaries that started immediately after the - Fixed handling of trailer dictionaries that started immediately after the
"trailer" keyword (Issue #58) "trailer" keyword (Issue #58)

View File

@ -19,7 +19,6 @@
static pdfio_obj_t *add_obj(pdfio_file_t *pdf, size_t number, unsigned short generation, off_t offset); static pdfio_obj_t *add_obj(pdfio_file_t *pdf, size_t number, unsigned short generation, off_t offset);
static int compare_objmaps(_pdfio_objmap_t *a, _pdfio_objmap_t *b); static int compare_objmaps(_pdfio_objmap_t *a, _pdfio_objmap_t *b);
static int compare_objs(pdfio_obj_t **a, pdfio_obj_t **b);
static const char *get_info_string(pdfio_file_t *pdf, const char *key); static const char *get_info_string(pdfio_file_t *pdf, const char *key);
static bool load_obj_stream(pdfio_obj_t *obj); static bool load_obj_stream(pdfio_obj_t *obj);
static bool load_pages(pdfio_file_t *pdf, pdfio_obj_t *obj, size_t depth); static bool load_pages(pdfio_file_t *pdf, pdfio_obj_t *obj, size_t depth);
@ -916,20 +915,56 @@ pdfioFileFindObj(
pdfio_file_t *pdf, // I - PDF file pdfio_file_t *pdf, // I - PDF file
size_t number) // I - Object number (1 to N) size_t number) // I - Object number (1 to N)
{ {
pdfio_obj_t key, // Search key size_t left, // Left object
*keyptr, // Pointer to key right, // Right object
**match; // Pointer to match current; // Current object
if (pdf->num_objs > 0) PDFIO_DEBUG("pdfioFileFindObj(pdf=%p, number=%lu) alloc_objs=%lu, num_objs=%lu, objs=%p\n", (void *)pdf, (unsigned long)number, (unsigned long)(pdf ? pdf->alloc_objs : 0), (unsigned long)(pdf ? pdf->num_objs : 0), (void *)(pdf ? pdf->objs : NULL));
// Range check input...
if (!pdf || pdf->num_objs == 0 || number < 1)
return (NULL);
// Do a binary search for the object...
if ((current = number - 1) >= pdf->num_objs)
current = pdf->num_objs / 2;
PDFIO_DEBUG("pdfioFileFindObj: objs[current=%lu]=%p\n", (unsigned long)current, (void *)pdf->objs[current]);
if (number == pdf->objs[current]->number)
{ {
key.number = number; // Fast match...
keyptr = &key; return (pdf->objs[current]);
match = (pdfio_obj_t **)bsearch(&keyptr, pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs); }
else if (number < pdf->objs[current]->number)
return (match ? *match : NULL); {
left = 0;
right = current;
}
else
{
left = current;
right = pdf->num_objs - 1;
} }
while ((right - left) > 1)
{
current = (left + right) / 2;
if (number == pdf->objs[current]->number)
return (pdf->objs[current]);
else if (number < pdf->objs[current]->number)
right = current;
else
left = current;
}
if (number == pdf->objs[left]->number)
return (pdf->objs[left]);
else if (number == pdf->objs[right]->number)
return (pdf->objs[right]);
else
return (NULL); return (NULL);
} }
@ -1385,6 +1420,9 @@ add_obj(pdfio_file_t *pdf, // I - PDF file
off_t offset) // I - Offset in file off_t offset) // I - Offset in file
{ {
pdfio_obj_t *obj; // Object pdfio_obj_t *obj; // Object
size_t left, // Left object
right, // Right object
current; // Current object (center)
// Allocate memory for the object... // Allocate memory for the object...
@ -1410,8 +1448,6 @@ add_obj(pdfio_file_t *pdf, // I - PDF file
pdf->alloc_objs += 32; pdf->alloc_objs += 32;
} }
pdf->objs[pdf->num_objs ++] = obj;
obj->pdf = pdf; obj->pdf = pdf;
obj->number = number; obj->number = number;
obj->generation = generation; obj->generation = generation;
@ -1419,9 +1455,56 @@ add_obj(pdfio_file_t *pdf, // I - PDF file
PDFIO_DEBUG("add_obj: obj=%p, ->pdf=%p, ->number=%lu, ->offset=%lu\n", obj, pdf, (unsigned long)obj->number, (unsigned long)offset); PDFIO_DEBUG("add_obj: obj=%p, ->pdf=%p, ->number=%lu, ->offset=%lu\n", obj, pdf, (unsigned long)obj->number, (unsigned long)offset);
// Re-sort object array as needed... // Insert object into array as needed...
if (pdf->num_objs > 1 && pdf->objs[pdf->num_objs - 2]->number > number) if (pdf->num_objs == 0 || obj->number > pdf->objs[pdf->num_objs - 1]->number)
qsort(pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs); {
// Append object...
PDFIO_DEBUG("add_obj: Appending at %lu\n", (unsigned long)pdf->num_objs);
pdf->objs[pdf->num_objs] = obj;
pdf->last_obj = pdf->num_objs;
}
else
{
// Insert object...
if (obj->number < pdf->objs[pdf->last_obj]->number)
{
left = 0;
right = pdf->last_obj;
}
else
{
left = pdf->last_obj;
right = pdf->num_objs - 1;
}
while ((right - left) > 1)
{
current = (left + right) / 2;
if (obj->number < pdf->objs[current]->number)
right = current;
else
left = current;
}
if (obj->number < pdf->objs[left]->number)
current = left;
else if (obj->number < pdf->objs[right]->number)
current = right;
else
current = right;
PDFIO_DEBUG("add_obj: Inserting at %lu\n", (unsigned long)current);
if (current < pdf->num_objs)
memmove(pdf->objs + current + 1, pdf->objs + current, (pdf->num_objs - current) * sizeof(pdfio_obj_t *));
pdf->objs[current] = obj;
pdf->last_obj = current;
}
pdf->num_objs ++;
return (obj); return (obj);
} }
@ -1448,23 +1531,6 @@ compare_objmaps(_pdfio_objmap_t *a, // I - First object map
} }
//
// 'compare_objs()' - Compare the object numbers of two objects.
//
static int // O - Result of comparison
compare_objs(pdfio_obj_t **a, // I - First object
pdfio_obj_t **b) // I - Second object
{
if ((*a)->number < (*b)->number)
return (-1);
else if ((*a)->number == (*b)->number)
return (0);
else
return (1);
}
// //
// 'get_info_string()' - Get a string value from the Info dictionary. // 'get_info_string()' - Get a string value from the Info dictionary.
// //
@ -1737,7 +1803,7 @@ load_xref(
pdfio_stream_t *st; // Stream pdfio_stream_t *st; // Stream
unsigned char buffer[32]; // Read buffer unsigned char buffer[32]; // Read buffer
size_t num_sobjs = 0, // Number of object streams size_t num_sobjs = 0, // Number of object streams
sobjs[4096]; // Object streams to load sobjs[8192]; // Object streams to load
pdfio_obj_t *current; // Current object pdfio_obj_t *current; // Current object
if ((number = strtoimax(line, &ptr, 10)) < 1) if ((number = strtoimax(line, &ptr, 10)) < 1)

View File

@ -266,7 +266,8 @@ struct _pdfio_file_s // PDF file structure
alloc_dicts; // Allocated dictionaries alloc_dicts; // Allocated dictionaries
pdfio_dict_t **dicts; // Dictionaries pdfio_dict_t **dicts; // Dictionaries
size_t num_objs, // Number of objects size_t num_objs, // Number of objects
alloc_objs; // Allocated objects alloc_objs, // Allocated objects
last_obj; // Last object added
pdfio_obj_t **objs, // Objects pdfio_obj_t **objs, // Objects
*current_obj; // Current object being written/read *current_obj; // Current object being written/read
size_t num_objmaps, // Number of object maps size_t num_objmaps, // Number of object maps