pdfio/pdfio-file.c

669 lines
15 KiB
C
Raw Normal View History

2021-04-17 02:41:46 +02:00
//
// PDF file functions for pdfio.
//
// Copyright © 2021 by Michael R Sweet.
//
// Licensed under Apache License v2.0. See the file "LICENSE" for more
// information.
//
//
// Include necessary headers...
//
#include "pdfio-private.h"
2021-04-28 03:22:34 +02:00
#ifndef O_BINARY
# define O_BINARY 0
#endif // !O_BINARY
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
// Local functions...
2021-04-17 02:41:46 +02:00
//
static pdfio_obj_t *add_obj(pdfio_file_t *pdf, size_t number, unsigned short generation, off_t offset);
static int compare_objs(pdfio_obj_t **a, pdfio_obj_t **b);
2021-05-01 23:50:52 +02:00
static bool load_xref(pdfio_file_t *pdf, off_t xref_offset);
static bool write_trailer(pdfio_file_t *pdf);
2021-04-28 03:22:34 +02:00
//
// 'pdfioFileClose()' - Close a PDF file and free all memory used for it.
//
bool // O - `true` on success and `false` on failure
pdfioFileClose(pdfio_file_t *pdf) // I - PDF file
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
bool ret = true; // Return value
size_t i; // Looping var
// Range check input
if (!pdf)
return (false);
// Close the file itself...
if (pdf->mode == _PDFIO_MODE_WRITE)
ret = write_trailer(pdf);
if (close(pdf->fd) < 0)
ret = false;
// Free all data...
free(pdf->filename);
free(pdf->version);
for (i = 0; i < pdf->num_arrays; i ++)
_pdfioArrayDelete(pdf->arrays[i]);
free(pdf->arrays);
for (i = 0; i < pdf->num_dicts; i ++)
_pdfioDictDelete(pdf->dicts[i]);
free(pdf->dicts);
for (i = 0; i < pdf->num_objs; i ++)
_pdfioObjDelete(pdf->objs[i]);
free(pdf->objs);
free(pdf->pages);
for (i = 0; i < pdf->num_strings; i ++)
free(pdf->strings[i]);
free(pdf->strings);
free(pdf);
return (ret);
2021-04-17 02:41:46 +02:00
}
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileCreate()' - Create a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
pdfio_file_t * // O - PDF file or `NULL` on error
pdfioFileCreate(
const char *filename, // I - Filename
const char *version, // I - PDF version number or `NULL` for default (2.0)
pdfio_error_cb_t error_cb, // I - Error callback or `NULL` for default
void *error_data) // I - Error callback data, if any
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
pdfio_file_t *pdf; // PDF file
// Range check input...
if (!filename)
return (NULL);
if (!version)
version = "2.0";
if (!error_cb)
{
error_cb = _pdfioFileDefaultError;
error_data = NULL;
}
// Allocate a PDF file structure...
if ((pdf = (pdfio_file_t *)calloc(1, sizeof(pdfio_file_t))) == NULL)
{
pdfio_file_t temp; // Dummy file
char message[8192]; // Message string
temp.filename = (char *)filename;
snprintf(message, sizeof(message), "Unable to allocate memory for PDF file - %s", strerror(errno));
(error_cb)(&temp, message, error_data);
return (NULL);
}
pdf->filename = strdup(filename);
pdf->version = strdup(version);
pdf->mode = _PDFIO_MODE_WRITE;
pdf->error_cb = error_cb;
pdf->error_data = error_data;
// Create the file...
if ((pdf->fd = open(filename, O_WRONLY | O_BINARY | O_CREAT | O_TRUNC, 0666)) < 0)
{
_pdfioFileError(pdf, "Unable to create file - %s", strerror(errno));
free(pdf->filename);
free(pdf->version);
free(pdf);
return (NULL);
}
// Write a standard PDF header...
if (!_pdfioFilePrintf(pdf, "%%PDF-%s\n%%\342\343\317\323\n", version))
{
pdfioFileClose(pdf);
unlink(filename);
return (NULL);
}
return (pdf);
2021-04-17 02:41:46 +02:00
}
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileCreateObject()' - Create a new object in a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
pdfio_obj_t * // O - New object
pdfioFileCreateObject(
pdfio_file_t *pdf, // I - PDF file
pdfio_dict_t *dict) // I - Object dictionary
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
pdfio_obj_t *obj; // New object
// Range check input...
if (!pdf || !dict)
{
if (pdf)
_pdfioFileError(pdf, "Missing object dictionary.");
return (NULL);
}
if (pdf->mode != _PDFIO_MODE_WRITE)
return (NULL);
2021-04-28 03:22:34 +02:00
if (dict->pdf != pdf)
dict = pdfioDictCopy(pdf, dict); // Copy dictionary to new PDF
// Allocate memory for the object...
if ((obj = (pdfio_obj_t *)calloc(1, sizeof(pdfio_obj_t))) == NULL)
{
_pdfioFileError(pdf, "Unable to allocate memory for object - %s", strerror(errno));
return (NULL);
}
// Expand the objects array as needed
if (pdf->num_objs >= pdf->alloc_objs)
{
pdfio_obj_t **temp = (pdfio_obj_t **)realloc(pdf->objs, (pdf->alloc_objs + 32) * sizeof(pdfio_obj_t *));
if (!temp)
{
_pdfioFileError(pdf, "Unable to allocate memory for object - %s", strerror(errno));
free(obj);
2021-04-28 03:22:34 +02:00
return (NULL);
}
pdf->objs = temp;
pdf->alloc_objs += 32;
}
pdf->objs[pdf->num_objs ++] = obj;
// Initialize the object...
obj->pdf = pdf;
obj->number = pdf->num_objs;
obj->dict = dict;
obj->offset = _pdfioFileTell(pdf);
// Don't write anything just yet...
return (obj);
}
//
// 'pdfioFileCreatePage()' - Create a page in a PDF file.
//
pdfio_obj_t * // O - New object
pdfioFileCreatePage(pdfio_file_t *pdf, // I - PDF file
pdfio_dict_t *dict) // I - Page dictionary
{
// TODO: Implement me
(void)pdf;
(void)dict;
return (NULL);
2021-04-17 02:41:46 +02:00
}
//
// 'pdfioFileFindObject()' - Find an object using its object number.
//
// This differs from @link pdfioFileGetObject@ which takes an index into the
// list of objects while this function takes the object number.
//
pdfio_obj_t * // O - Object or `NULL` if not found
pdfioFileFindObject(
pdfio_file_t *pdf, // I - PDF file
size_t number) // I - Object number (1 to N)
{
pdfio_obj_t key, // Search key
*keyptr; // Pointer to key
if (pdf->num_objs > 0)
{
key.number = number;
keyptr = &key;
return ((pdfio_obj_t *)bsearch(&keyptr, pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs));
}
return (NULL);
}
2021-05-01 23:50:52 +02:00
//
// 'pdfioFileGetID()' - Get the PDF file's ID strings.
//
pdfio_array_t * // O - Array with binary strings
pdfioFileGetID(pdfio_file_t *pdf) // I - PDF file
{
return (pdf ? pdfioDictGetArray(pdf->trailer, "ID") : NULL);
}
2021-04-17 03:09:43 +02:00
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileGetName()' - Get a PDF's filename.
2021-04-17 03:09:43 +02:00
//
2021-04-28 03:22:34 +02:00
const char * // O - Filename
pdfioFileGetName(pdfio_file_t *pdf) // I - PDF file
2021-04-17 03:09:43 +02:00
{
2021-04-28 03:22:34 +02:00
return (pdf ? pdf->filename : NULL);
2021-04-17 03:09:43 +02:00
}
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileGetNumObjects()' - Get the number of objects in a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
size_t // O - Number of objects
pdfioFileGetNumObjects(
pdfio_file_t *pdf) // I - PDF file
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
return (pdf ? pdf->num_objs : 0);
2021-04-17 02:41:46 +02:00
}
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileGetNumPages()' - Get the number of pages in a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
size_t // O - Number of pages
pdfioFileGetNumPages(pdfio_file_t *pdf) // I - PDF file
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
return (pdf ? pdf->num_pages : 0);
2021-04-17 02:41:46 +02:00
}
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileGetObject()' - Get an object from a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
pdfio_obj_t * // O - Object
pdfioFileGetObject(pdfio_file_t *pdf, // I - PDF file
size_t number) // I - Object number (starting at 1)
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
if (!pdf || number < 1 || number > pdf->num_objs)
return (NULL);
else
return (pdf->objs[number - 1]);
2021-04-17 02:41:46 +02:00
}
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileGetPage()' - Get a page object from a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
pdfio_obj_t * // O - Object
pdfioFileGetPage(pdfio_file_t *pdf, // I - PDF file
size_t number) // I - Page number (starting at 1)
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
if (!pdf || number < 1 || number > pdf->num_pages)
return (NULL);
else
return (pdf->pages[number - 1]);
2021-04-17 02:41:46 +02:00
}
//
2021-05-01 23:50:52 +02:00
// 'pdfioFileGetVersion()' - Get the PDF version number for a PDF file.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
const char * // O - Version number or `NULL`
pdfioFileGetVersion(
pdfio_file_t *pdf) // I - PDF file
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
return (pdf ? pdf->version : NULL);
2021-04-17 02:41:46 +02:00
}
//
2021-04-28 03:22:34 +02:00
// 'pdfioFileOpen()' - Open a PDF file for reading.
2021-04-17 02:41:46 +02:00
//
2021-04-28 03:22:34 +02:00
pdfio_file_t * // O - PDF file
pdfioFileOpen(
const char *filename, // I - Filename
pdfio_error_cb_t error_cb, // I - Error callback or `NULL` for default
void *error_data) // I - Error callback data, if any
2021-04-17 02:41:46 +02:00
{
2021-04-28 03:22:34 +02:00
pdfio_file_t *pdf; // PDF file
char line[1024], // Line from file
*ptr; // Pointer into line
off_t xref_offset; // Offset to xref table
2021-04-28 03:22:34 +02:00
// Range check input...
if (!filename)
return (NULL);
if (!error_cb)
{
error_cb = _pdfioFileDefaultError;
error_data = NULL;
}
// Allocate a PDF file structure...
if ((pdf = (pdfio_file_t *)calloc(1, sizeof(pdfio_file_t))) == NULL)
{
pdfio_file_t temp; // Dummy file
char message[8192]; // Message string
temp.filename = (char *)filename;
snprintf(message, sizeof(message), "Unable to allocate memory for PDF file - %s", strerror(errno));
(error_cb)(&temp, message, error_data);
return (NULL);
}
pdf->filename = strdup(filename);
pdf->mode = _PDFIO_MODE_READ;
pdf->error_cb = error_cb;
pdf->error_data = error_data;
// Open the file...
if ((pdf->fd = open(filename, O_RDONLY | O_BINARY)) < 0)
{
_pdfioFileError(pdf, "Unable to open file - %s", strerror(errno));
free(pdf->filename);
free(pdf);
return (NULL);
}
// Read the header from the first line...
if (!_pdfioFileGets(pdf, line, sizeof(line)))
goto error;
if ((strncmp(line, "%PDF-1.", 7) && strncmp(line, "%PDF-2.", 7)) || !isdigit(line[7] & 255))
{
// Bad header
_pdfioFileError(pdf, "Bad header '%s'.", line);
goto error;
}
// Copy the version number...
pdf->version = strdup(line + 5);
// Grab the last 32 characters of the file to find the start of the xref table...
_pdfioFileSeek(pdf, -32, SEEK_END);
if (_pdfioFileRead(pdf, line, 32) < 32)
{
_pdfioFileError(pdf, "Unable to read startxref data.");
goto error;
}
line[32] = '\0';
if ((ptr = strstr(line, "startxref")) == NULL)
{
_pdfioFileError(pdf, "Unable to find start of xref table.");
goto error;
}
xref_offset = (off_t)strtol(ptr + 9, NULL, 10);
2021-05-01 23:50:52 +02:00
if (!load_xref(pdf, xref_offset))
goto error;
2021-04-28 03:22:34 +02:00
return (pdf);
// If we get here we had a fatal read error...
error:
pdfioFileClose(pdf);
return (NULL);
}
//
// '_pdfioObjAdd()' - Add an object to a file.
//
pdfio_obj_t * // O - Object
add_obj(pdfio_file_t *pdf, // I - PDF file
size_t number, // I - Object number
unsigned short generation, // I - Object generation
off_t offset) // I - Offset in file
{
pdfio_obj_t *obj; // Object
// Allocate memory for the object...
if ((obj = (pdfio_obj_t *)calloc(1, sizeof(pdfio_obj_t))) == NULL)
{
_pdfioFileError(pdf, "Unable to allocate memory for object - %s", strerror(errno));
return (NULL);
}
// Expand the objects array as needed
if (pdf->num_objs >= pdf->alloc_objs)
{
pdfio_obj_t **temp = (pdfio_obj_t **)realloc(pdf->objs, (pdf->alloc_objs + 32) * sizeof(pdfio_obj_t *));
if (!temp)
{
_pdfioFileError(pdf, "Unable to allocate memory for object - %s", strerror(errno));
free(obj);
return (NULL);
}
pdf->objs = temp;
pdf->alloc_objs += 32;
}
pdf->objs[pdf->num_objs ++] = obj;
obj->number = number;
obj->generation = generation;
obj->offset = offset;
// Re-sort object array as needed...
if (pdf->num_objs > 1 && pdf->objs[pdf->num_objs - 2]->number > number)
qsort(pdf->objs, pdf->num_objs, sizeof(pdfio_obj_t *), (int (*)(const void *, const void *))compare_objs);
return (obj);
}
//
// 'compare_objs()' - Compare the object numbers of two objects.
//
static int
compare_objs(pdfio_obj_t **a, // I - First object
pdfio_obj_t **b) // I - Second object
{
if ((*a)->number < (*b)->number)
return (-1);
else if ((*a)->number == (*b)->number)
return (0);
else
return (1);
2021-04-17 02:41:46 +02:00
}
2021-05-01 23:50:52 +02:00
//
// 'load_xref()' - Load an XREF table...
//
static bool // O - `true` on success, `false` on failure
load_xref(pdfio_file_t *pdf, // I - PDF file
off_t xref_offset) // I - Offset to xref
{
2021-05-04 18:24:29 +02:00
bool done = false; // Are we done?
char line[1024], // Line from file
*ptr; // Pointer into line
_pdfio_value_t trailer; // Trailer dictionary
2021-05-01 23:50:52 +02:00
while (!done)
{
if (_pdfioFileSeek(pdf, xref_offset, SEEK_SET) != xref_offset)
{
_pdfioFileError(pdf, "Unable to seek to start of xref table.");
return (false);
}
if (!_pdfioFileGets(pdf, line, sizeof(line)))
{
_pdfioFileError(pdf, "Unable to read start of xref table.");
return (false);
}
if (strcmp(line, "xref"))
{
_pdfioFileError(pdf, "Bad xref table header '%s'.", line);
return (false);
}
// Read the xref tables
while (_pdfioFileGets(pdf, line, sizeof(line)))
{
intmax_t number, // Object number
num_objects; // Number of objects
if (!strcmp(line, "trailer"))
break;
if (sscanf(line, "%jd%jd", &number, &num_objects) != 2)
{
_pdfioFileError(pdf, "Malformed xref table section '%s'.", line);
return (false);
}
// Read this group of objects...
for (; num_objects > 0; num_objects --, number ++)
{
2021-05-04 18:24:29 +02:00
intmax_t offset; // Offset in file
2021-05-01 23:50:52 +02:00
int generation; // Generation number
// Read a line from the file and validate it...
if (_pdfioFileRead(pdf, line, 20) != 20)
return (false);
line[20] = '\0';
if (strcmp(line + 18, "\r\n") && strcmp(line + 18, " \n") && strcmp(line + 18, " \r"))
{
_pdfioFileError(pdf, "Malformed xref table entry '%s'.", line);
return (false);
}
line[18] = '\0';
// Parse the line
if ((offset = strtoimax(line, &ptr, 10)) < 0)
{
_pdfioFileError(pdf, "Malformed xref table entry '%s'.", line);
return (false);
}
if ((generation = (int)strtol(ptr, &ptr, 10)) < 0 || generation > 65535)
{
_pdfioFileError(pdf, "Malformed xref table entry '%s'.", line);
return (false);
}
if (*ptr != ' ')
{
_pdfioFileError(pdf, "Malformed xref table entry '%s'.", line);
return (false);
}
ptr ++;
if (*ptr != 'f' && *ptr != 'n')
{
_pdfioFileError(pdf, "Malformed xref table entry '%s'.", line);
return (false);
}
if (*ptr == 'f')
continue; // Don't care about free objects...
// Create a placeholder for the object in memory...
if (pdfioFileFindObject(pdf, (size_t)number))
continue; // Don't replace newer object...
2021-05-01 23:50:52 +02:00
if (!add_obj(pdf, (size_t)number, (unsigned short)generation, offset))
return (false);
2021-05-01 23:50:52 +02:00
}
}
if (strcmp(line, "trailer"))
{
_pdfioFileError(pdf, "Missing trailer.");
return (false);
}
2021-05-04 18:24:29 +02:00
if (!_pdfioValueRead(pdf, &trailer))
{
_pdfioFileError(pdf, "Unable to read trailer dictionary.");
return (false);
}
else if (trailer.type != PDFIO_VALTYPE_DICT)
2021-05-04 18:24:29 +02:00
{
_pdfioFileError(pdf, "Trailer is not a dictionary.");
return (false);
}
PDFIO_DEBUG("load_xref: Contents of trailer dictionary:\n");
PDFIO_DEBUG_DICT(trailer.value.dict, "load_xref");
2021-05-04 18:24:29 +02:00
if (!pdf->trailer)
{
// Save the trailer dictionary and grab the root (catalog) and info
// objects...
pdf->trailer = trailer.value.dict;
if ((pdf->root = pdfioDictGetObject(pdf->trailer, "Root")) == NULL)
{
_pdfioFileError(pdf, "Missing Root object.");
return (false);
}
pdf->info = pdfioDictGetObject(pdf->trailer, "Info");
pdf->encrypt = pdfioDictGetObject(pdf->trailer, "Encrypt");
pdf->id_array = pdfioDictGetArray(pdf->trailer, "ID");
}
if ((xref_offset = (off_t)pdfioDictGetNumber(pdf->trailer, "Prev")) <= 0)
done = true;
2021-05-01 23:50:52 +02:00
}
return (true);
}
2021-04-28 03:22:34 +02:00
//
// 'write_trailer()' - Write the PDF catalog object, xref table, and trailer.
//
static bool // O - `true` on success, `false` on failure
write_trailer(pdfio_file_t *pdf) // I - PDF file
{
// TODO: Write trailer
(void)pdf;
return (false);
}