From 2a85baaf816d5d37b37fd0f8b20c112648913a37 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Wed, 13 Dec 2023 12:21:59 -0500 Subject: [PATCH] Increase the maximum number of object streams in a file (Issue #58) - most files only contain 1 or 2... Change the implementation of add/find object to use a custom binary insertion sort algorithm rather than doing a qsort after every addition. This results in a significant improvement in open speed - from 2371 seconds (about 39.5 minutes) to 3.1 seconds for one large test file (an ESRI standard). --- CHANGES.md | 6 ++- pdfio-file.c | 134 ++++++++++++++++++++++++++++++++++++------------ pdfio-private.h | 3 +- 3 files changed, 107 insertions(+), 36 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 64e7ac3..b3d165d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,16 +5,20 @@ Changes in PDFio v1.2.0 (Month DD, YYYY) ----------------------- +- Now use autoconf to configure the PDFio sources (Issue #54) - Added `pdfioFileCreateNumberObj` and `pdfioFileCreateStringObj` functions (Issue #14) - Added `pdfioContentTextMeasure` function (Issue #17) - Added `pdfioContentTextNewLineShow` and `pdfioContentTextNewLineShowf` functions (Issue #24) - Renamed `pdfioContentTextNextLine` to `pdfioContentTextNewLine`. -- Now use autoconf to configure the PDFio sources (Issue #54) +- Updated the maximum number of object streams in a single file from 4096 to + 8192 (Issue #58) - Updated the token reading code to protect against some obvious abuses of the PDF format. - Updated the xref reading code to protect against loops. +- Updated the object handling code to use a binary insertion algorithm - + provides a significant (~800x) improvement in open times. - Fixed handling of encrypted PDFs with per-object file IDs (Issue #42) - Fixed handling of of trailer dictionaries that started immediately after the "trailer" keyword (Issue #58) diff --git a/pdfio-file.c b/pdfio-file.c index 486b1ba..a3ded25 100644 --- a/pdfio-file.c +++ b/pdfio-file.c @@ -19,7 +19,6 @@ static pdfio_obj_t *add_obj(pdfio_file_t *pdf, size_t number, unsigned short generation, off_t offset); static int compare_objmaps(_pdfio_objmap_t *a, _pdfio_objmap_t *b); -static int compare_objs(pdfio_obj_t **a, pdfio_obj_t **b); static const char *get_info_string(pdfio_file_t *pdf, const char *key); static bool load_obj_stream(pdfio_obj_t *obj); static bool load_pages(pdfio_file_t *pdf, pdfio_obj_t *obj, size_t depth); @@ -916,21 +915,57 @@ pdfioFileFindObj( pdfio_file_t *pdf, // I - PDF file size_t number) // I - Object number (1 to N) { - pdfio_obj_t key, // Search key - *keyptr, // Pointer to key - **match; // Pointer to match + size_t left, // Left object + right, // Right object + current; // Current object - if (pdf->num_objs > 0) + PDFIO_DEBUG("pdfioFileFindObj(pdf=%p, number=%lu) alloc_objs=%lu, num_objs=%lu, objs=%p\n", (void *)pdf, (unsigned long)number, (unsigned long)(pdf ? pdf->alloc_objs : 0), (unsigned long)(pdf ? pdf->num_objs : 0), (void *)(pdf ? pdf->objs : NULL)); + + // Range check input... + if (!pdf || pdf->num_objs == 0 || number < 1) + return (NULL); + + // Do a binary search for the object... + if ((current = number - 1) >= pdf->num_objs) + current = pdf->num_objs / 2; + + PDFIO_DEBUG("pdfioFileFindObj: objs[current=%lu]=%p\n", (unsigned long)current, (void *)pdf->objs[current]); + + if (number == pdf->objs[current]->number) { - key.number = number; - keyptr = &key; - match = (pdfio_obj_t **)bsearch(&keyptr, pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs); - - return (match ? *match : NULL); + // Fast match... + return (pdf->objs[current]); + } + else if (number < pdf->objs[current]->number) + { + left = 0; + right = current; + } + else + { + left = current; + right = pdf->num_objs - 1; } - return (NULL); + while ((right - left) > 1) + { + current = (left + right) / 2; + + if (number == pdf->objs[current]->number) + return (pdf->objs[current]); + else if (number < pdf->objs[current]->number) + right = current; + else + left = current; + } + + if (number == pdf->objs[left]->number) + return (pdf->objs[left]); + else if (number == pdf->objs[right]->number) + return (pdf->objs[right]); + else + return (NULL); } @@ -1385,6 +1420,9 @@ add_obj(pdfio_file_t *pdf, // I - PDF file off_t offset) // I - Offset in file { pdfio_obj_t *obj; // Object + size_t left, // Left object + right, // Right object + current; // Current object (center) // Allocate memory for the object... @@ -1410,8 +1448,6 @@ add_obj(pdfio_file_t *pdf, // I - PDF file pdf->alloc_objs += 32; } - pdf->objs[pdf->num_objs ++] = obj; - obj->pdf = pdf; obj->number = number; obj->generation = generation; @@ -1419,9 +1455,56 @@ add_obj(pdfio_file_t *pdf, // I - PDF file PDFIO_DEBUG("add_obj: obj=%p, ->pdf=%p, ->number=%lu, ->offset=%lu\n", obj, pdf, (unsigned long)obj->number, (unsigned long)offset); - // Re-sort object array as needed... - if (pdf->num_objs > 1 && pdf->objs[pdf->num_objs - 2]->number > number) - qsort(pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs); + // Insert object into array as needed... + if (pdf->num_objs == 0 || obj->number > pdf->objs[pdf->num_objs - 1]->number) + { + // Append object... + PDFIO_DEBUG("add_obj: Appending at %lu\n", (unsigned long)pdf->num_objs); + + pdf->objs[pdf->num_objs] = obj; + pdf->last_obj = pdf->num_objs; + } + else + { + // Insert object... + if (obj->number < pdf->objs[pdf->last_obj]->number) + { + left = 0; + right = pdf->last_obj; + } + else + { + left = pdf->last_obj; + right = pdf->num_objs - 1; + } + + while ((right - left) > 1) + { + current = (left + right) / 2; + + if (obj->number < pdf->objs[current]->number) + right = current; + else + left = current; + } + + if (obj->number < pdf->objs[left]->number) + current = left; + else if (obj->number < pdf->objs[right]->number) + current = right; + else + current = right; + + PDFIO_DEBUG("add_obj: Inserting at %lu\n", (unsigned long)current); + + if (current < pdf->num_objs) + memmove(pdf->objs + current + 1, pdf->objs + current, (pdf->num_objs - current) * sizeof(pdfio_obj_t *)); + + pdf->objs[current] = obj; + pdf->last_obj = current; + } + + pdf->num_objs ++; return (obj); } @@ -1448,23 +1531,6 @@ compare_objmaps(_pdfio_objmap_t *a, // I - First object map } -// -// 'compare_objs()' - Compare the object numbers of two objects. -// - -static int // O - Result of comparison -compare_objs(pdfio_obj_t **a, // I - First object - pdfio_obj_t **b) // I - Second object -{ - if ((*a)->number < (*b)->number) - return (-1); - else if ((*a)->number == (*b)->number) - return (0); - else - return (1); -} - - // // 'get_info_string()' - Get a string value from the Info dictionary. // @@ -1737,7 +1803,7 @@ load_xref( pdfio_stream_t *st; // Stream unsigned char buffer[32]; // Read buffer size_t num_sobjs = 0, // Number of object streams - sobjs[4096]; // Object streams to load + sobjs[8192]; // Object streams to load pdfio_obj_t *current; // Current object if ((number = strtoimax(line, &ptr, 10)) < 1) diff --git a/pdfio-private.h b/pdfio-private.h index 2093181..b32acf6 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -266,7 +266,8 @@ struct _pdfio_file_s // PDF file structure alloc_dicts; // Allocated dictionaries pdfio_dict_t **dicts; // Dictionaries size_t num_objs, // Number of objects - alloc_objs; // Allocated objects + alloc_objs, // Allocated objects + last_obj; // Last object added pdfio_obj_t **objs, // Objects *current_obj; // Current object being written/read size_t num_objmaps, // Number of object maps