From a24fdee33502853d7afb0b0654d6d26e02763cb5 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Sat, 21 Dec 2024 11:31:54 -0500 Subject: [PATCH] Fix an uninitialized pointer issue in format_block, and some margin issues on the top of the page. --- doc/pdfio.md | 227 +++++++++++++++++++++++++++++++++++++++++++++- examples/Makefile | 4 +- examples/md2pdf.c | 76 +++++++++++----- 3 files changed, 279 insertions(+), 28 deletions(-) diff --git a/doc/pdfio.md b/doc/pdfio.md index 1112cd7..3e6a467 100644 --- a/doc/pdfio.md +++ b/doc/pdfio.md @@ -2,7 +2,7 @@ Introduction ============ PDFio is a simple C library for reading and writing PDF files. The primary -goals of pdfio are: +goals of PDFio are: - Read and write any version of PDF file - Provide access to pages, objects, and streams within a PDF file @@ -1203,5 +1203,228 @@ a PDF file that can be distributed. > Note: The md2pdf example is by far the most complex example code included with > PDFio and shows how to layout text, add headers and footers, add links, embed -> images, and format tables. +> images, format tables, and add an outline (table of contents) for navigation. 
+### Managing Document State + +The `md2pdf` program needs to maintain three sets of state - one for the +markdown document which is represented by nodes of type `mmd_t` and the others +for the PDF document and current PDF page which are contained in the `docdata_t` +structure: + +```c +typedef struct docdata_s // Document formatting data +{ + // State for the whole document + pdfio_file_t *pdf; // PDF file + pdfio_rect_t media_box; // Media (page) box + pdfio_rect_t crop_box; // Crop box (for margins) + pdfio_rect_t art_box; // Art box (for markdown content) + pdfio_obj_t *fonts[DOCFONT_MAX]; // Embedded fonts + size_t num_images; // Number of embedded images + docimage_t images[DOCIMAGE_MAX]; // Embedded images + const char *title; // Document title + char *heading; // Current document heading + size_t num_actions; // Number of actions for this document + docaction_t actions[DOCACTION_MAX]; // Actions for this document + size_t num_targets; // Number of targets for this document + doctarget_t targets[DOCTARGET_MAX]; // Targets for this document + size_t num_toc; // Number of table-of-contents entries + doctoc_t toc[DOCTOC_MAX]; // Table-of-contents entries + + // State for the current page + pdfio_stream_t *st; // Current page stream + double y; // Current position on page + docfont_t font; // Current font + double fsize; // Current font size + doccolor_t color; // Current color + pdfio_array_t *annots_array; // Annotations array (for links) + pdfio_obj_t *annots_obj; // Annotations object (for links) + size_t num_links; // Number of links for this page + doclink_t links[DOCLINK_MAX]; // Links for this page +} docdata_t; +``` + + +#### Document State + +The output is fixed to the "universal" media size (the intersection of US Letter +and ISO A4) with 1/2 inch margins - the `PAGE_` constants can be changed to +select a different size or margins. 
The `media_box` member contains the +"MediaBox" rectangle for the PDF pages, while the `crop_box` and `art_box` +members contain the "CropBox" and "ArtBox" values, respectively. + +Four embedded fonts are used: + +- `DOCFONT_REGULAR`: the default font used for text, +- `DOCFONT_BOLD`: a boldface font used for headings and strong text, +- `DOCFONT_ITALIC`: an italic/oblique font used for emphasized text, and +- `DOCFONT_MONOSPACE`: a fixed-width font used for code. + +By default the code uses the base PostScript fonts Helvetica, Helvetica-Bold, +Helvetica-Oblique, and Courier. The `USE_TRUETYPE` define can be used to +replace these with the Roboto TrueType fonts. + +Embedded JPEG and PNG images are copied into the PDF document, with the `images` +array containing the list of the images and their objects. + +The `title` member contains the document title, while the `heading` member +contains the current heading text. + +The `actions` array contains a list of action dictionaries for interior document +links that need to be resolved, while the `targets` array keeps track of the +location of the headings in the PDF document. + +The `toc` array contains a list of headings and is used to construct the PDF +outlines dictionaries/objects, which provide a table of contents for navigation +in most PDF readers. + + +#### Page State + +The `st` member provides the stream for the current page content. The `color`, +`font`, `fsize`, and `y` members provide the current graphics state on the page. + +The `annots_array`, `annots_obj`, `num_links`, and `links` members contain a +list of hyperlinks on the current page. + + +### Creating Pages + +The `new_page` function is used to start a new page. Aside from creating the +new page object and stream, it adds a standard header and footer to the page. +It starts by closing the current page if it is open: + +```c +// Close the current page... 
+if (dd->st) +{ + pdfioStreamClose(dd->st); + add_links(dd); +} +``` + +The new page needs a dictionary containing any link annotations, the media and +art boxes, the four fonts, and any images: + +```c +// Prep the new page... +page_dict = pdfioDictCreate(dd->pdf); + +dd->annots_array = pdfioArrayCreate(dd->pdf); +dd->annots_obj = pdfioFileCreateArrayObj(dd->pdf, dd->annots_array); +pdfioDictSetObj(page_dict, "Annots", dd->annots_obj); + +pdfioDictSetRect(page_dict, "MediaBox", &dd->media_box); +pdfioDictSetRect(page_dict, "ArtBox", &dd->art_box); + +for (fontface = DOCFONT_REGULAR; fontface < DOCFONT_MAX; fontface ++) + pdfioPageDictAddFont(page_dict, docfont_names[fontface], + dd->fonts[fontface]); + +for (i = 0; i < dd->num_images; i ++) + pdfioPageDictAddImage(page_dict, + pdfioStringCreatef(dd->pdf, "I%u", + (unsigned)i), + dd->images[i].obj); +``` + +Once the page dictionary is initialized, we create a new page and initialize +the current graphics state: + +```c +dd->st = pdfioFileCreatePage(dd->pdf, page_dict); +dd->color = DOCCOLOR_BLACK; +dd->font = DOCFONT_MAX; +dd->fsize = 0.0; +dd->y = dd->art_box.y2; +``` + +The header consists of a dark gray separating line and the document title. We +don't show the header on the first page: + +```c +// Add header/footer text +set_color(dd, DOCCOLOR_GRAY); +set_font(dd, DOCFONT_REGULAR, SIZE_HEADFOOT); + +if (pdfioFileGetNumPages(dd->pdf) > 1 && dd->title) +{ + // Show title in header... 
+ width = pdfioContentTextMeasure(dd->fonts[DOCFONT_REGULAR], + dd->title, SIZE_HEADFOOT); + + pdfioContentTextBegin(dd->st); + pdfioContentTextMoveTo(dd->st, + dd->crop_box.x1 + 0.5 * (dd->crop_box.x2 - + dd->crop_box.x1 - width), + dd->crop_box.y2 - SIZE_HEADFOOT); + pdfioContentTextShow(dd->st, UNICODE_VALUE, dd->title); + pdfioContentTextEnd(dd->st); + + pdfioContentPathMoveTo(dd->st, dd->crop_box.x1, + dd->crop_box.y2 - + 2 * SIZE_HEADFOOT * LINE_HEIGHT + + SIZE_HEADFOOT); + pdfioContentPathLineTo(dd->st, dd->crop_box.x2, + dd->crop_box.y2 - + 2 * SIZE_HEADFOOT * LINE_HEIGHT + + SIZE_HEADFOOT); + pdfioContentStroke(dd->st); +} +``` + +The footer contains the same dark gray separating line with the current heading +and page number on opposite sides. The page number is always positioned on the +outer edge for a two-sided print - right justified on odd numbered pages and +left justified on even numbered pages: + +```c +// Show page number and current heading... +pdfioContentPathMoveTo(dd->st, dd->crop_box.x1, + dd->crop_box.y1 + SIZE_HEADFOOT * LINE_HEIGHT); +pdfioContentPathLineTo(dd->st, dd->crop_box.x2, + dd->crop_box.y1 + SIZE_HEADFOOT * LINE_HEIGHT); +pdfioContentStroke(dd->st); + +pdfioContentTextBegin(dd->st); +snprintf(temp, sizeof(temp), "%u", + (unsigned)pdfioFileGetNumPages(dd->pdf)); +if (pdfioFileGetNumPages(dd->pdf) & 1) +{ + // Page number on right... + width = pdfioContentTextMeasure(dd->fonts[DOCFONT_REGULAR], temp, + SIZE_HEADFOOT); + pdfioContentTextMoveTo(dd->st, dd->crop_box.x2 - width, + dd->crop_box.y1); +} +else +{ + // Page number on left... + pdfioContentTextMoveTo(dd->st, dd->crop_box.x1, dd->crop_box.y1); +} + +pdfioContentTextShow(dd->st, UNICODE_VALUE, temp); +pdfioContentTextEnd(dd->st); + +if (dd->heading) +{ + pdfioContentTextBegin(dd->st); + + if (pdfioFileGetNumPages(dd->pdf) & 1) + { + // Current heading on left... 
+ pdfioContentTextMoveTo(dd->st, dd->crop_box.x1, dd->crop_box.y1); + } + else + { + width = pdfioContentTextMeasure(dd->fonts[DOCFONT_REGULAR], + dd->heading, SIZE_HEADFOOT); + pdfioContentTextMoveTo(dd->st, dd->crop_box.x2 - width, + dd->crop_box.y1); + } + + pdfioContentTextShow(dd->st, UNICODE_VALUE, dd->heading); + pdfioContentTextEnd(dd->st); +} +``` diff --git a/examples/Makefile b/examples/Makefile index 10bca9c..ada00b6 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -12,8 +12,8 @@ # Common options -CFLAGS = -g $(CPPFLAGS) -#CFLAGS = -g -fsanitize=address $(CPPFLAGS) +#CFLAGS = -g $(CPPFLAGS) +CFLAGS = -g -fsanitize=address $(CPPFLAGS) CPPFLAGS = -I.. LIBS = -L.. -lpdfio -lz diff --git a/examples/md2pdf.c b/examples/md2pdf.c index 97896e8..5f16bf4 100644 --- a/examples/md2pdf.c +++ b/examples/md2pdf.c @@ -101,6 +101,7 @@ typedef struct doctoc_s // Document table-of-contents entry typedef struct docdata_s // Document formatting data { + // State for the whole document pdfio_file_t *pdf; // PDF file pdfio_rect_t media_box; // Media (page) box pdfio_rect_t crop_box; // Crop box (for margins) @@ -110,6 +111,14 @@ typedef struct docdata_s // Document formatting data docimage_t images[DOCIMAGE_MAX]; // Embedded images const char *title; // Document title char *heading; // Current document heading + size_t num_actions; // Number of actions for this document + docaction_t actions[DOCACTION_MAX]; // Actions for this document + size_t num_targets; // Number of targets for this document + doctarget_t targets[DOCTARGET_MAX]; // Targets for this document + size_t num_toc; // Number of table-of-contents entries + doctoc_t toc[DOCTOC_MAX]; // Table-of-contents entries + + // State for the current page pdfio_stream_t *st; // Current page stream double y; // Current position on page docfont_t font; // Current font @@ -119,12 +128,6 @@ typedef struct docdata_s // Document formatting data pdfio_obj_t *annots_obj; // Annotations object (for links) size_t num_links; 
// Number of links for this page doclink_t links[DOCLINK_MAX]; // Links for this page - size_t num_actions; // Number of actions for this document - docaction_t actions[DOCACTION_MAX]; // Actions for this document - size_t num_targets; // Number of targets for this document - doctarget_t targets[DOCTARGET_MAX]; // Targets for this document - size_t num_toc; // Number of table-of-contents entries - doctoc_t toc[DOCTOC_MAX]; // Table-of-contents entries } docdata_t; typedef struct linefrag_s // Line fragment @@ -302,7 +305,8 @@ main(int argc, // I - Number of command-line arguments dd.art_box.x2 = PAGE_RIGHT; dd.art_box.y2 = PAGE_TOP; - dd.title = mmdGetMetadata(doc, "title"); + if ((dd.title = mmdGetMetadata(doc, "title")) == NULL) + dd.art_box.y2 = PAGE_HEADER; // No header if there is no title if (argc == 2) { @@ -587,12 +591,16 @@ format_block(docdata_t *dd, // I - Document data if (leader) { // Add leader text on first line... - frags[0].width = pdfioContentTextMeasure(dd->fonts[deffont], leader, fsize); - frags[0].height = fsize; - frags[0].x = left - frags[0].width; - frags[0].text = leader; - frags[0].font = deffont; - frags[0].color = DOCCOLOR_BLACK; + frags[0].type = MMD_TYPE_NORMAL_TEXT; + frags[0].width = pdfioContentTextMeasure(dd->fonts[deffont], leader, fsize); + frags[0].height = fsize; + frags[0].x = left - frags[0].width; + frags[0].imagenum = 0; + frags[0].text = leader; + frags[0].url = NULL; + frags[0].ws = false; + frags[0].font = deffont; + frags[0].color = DOCCOLOR_BLACK; num_frags = 1; lineheight = fsize * LINE_HEIGHT; @@ -806,20 +814,29 @@ format_code(docdata_t *dd, // I - Document data double right) // I - Right margin { mmd_t *code; // Current code block - double lineheight; // Line height + double lineheight, // Line height + margin_top; // Top margin + // Compute line height and initial top margin... + lineheight = SIZE_CODEBLOCK * LINE_HEIGHT; + margin_top = lineheight; + // Start a new page as needed... 
if (!dd->st) + { new_page(dd); - lineheight = SIZE_CODEBLOCK * LINE_HEIGHT; - dd->y -= 2.0 * lineheight; + margin_top = (1.0 - LINE_HEIGHT) * lineheight; + } + + dd->y -= lineheight + margin_top; + if ((dd->y - lineheight) < dd->art_box.y1) { new_page(dd); - dd->y -= lineheight; + dd->y -= lineheight / LINE_HEIGHT; } // Start a code text block... @@ -1338,15 +1355,15 @@ new_page(docdata_t *dd) // I - Document data // Prep the new page... page_dict = pdfioDictCreate(dd->pdf); + dd->annots_array = pdfioArrayCreate(dd->pdf); dd->annots_obj = pdfioFileCreateArrayObj(dd->pdf, dd->annots_array); + pdfioDictSetObj(page_dict, "Annots", dd->annots_obj); pdfioDictSetRect(page_dict, "MediaBox", &dd->media_box); // pdfioDictSetRect(page_dict, "CropBox", &dd->crop_box); pdfioDictSetRect(page_dict, "ArtBox", &dd->art_box); - pdfioDictSetObj(page_dict, "Annots", dd->annots_obj); - for (fontface = DOCFONT_REGULAR; fontface < DOCFONT_MAX; fontface ++) pdfioPageDictAddFont(page_dict, docfont_names[fontface], dd->fonts[fontface]); @@ -1452,12 +1469,13 @@ render_line(docdata_t *dd, // I - Document data size_t i; // Looping var linefrag_t *frag; // Current line fragment bool in_text = false; // Are we in a text block? +// bool ws_after; // Do we have whitespace after this fragment? 
if (!dd->st) { new_page(dd); - margin_top = 0.0; + margin_top = (1.0 - LINE_HEIGHT) * lineheight; } dd->y -= margin_top + lineheight; @@ -1465,7 +1483,7 @@ render_line(docdata_t *dd, // I - Document data { new_page(dd); - dd->y -= lineheight; + dd->y -= lineheight / LINE_HEIGHT; } for (i = 0, frag = frags; i < num_frags; i ++, frag ++) @@ -1510,10 +1528,20 @@ render_line(docdata_t *dd, // I - Document data in_text = true; } - if (frag->ws) - pdfioContentTextShowf(dd->st, UNICODE_VALUE, " %s", frag->text); +#if 0 + if (frag->font != DOCFONT_MONOSPACE && (i + 1) < num_frags && frag[1].ws && frag[1].font == DOCFONT_MONOSPACE) + { + // Don't use a monospace space to separate it from non-monospace + ws_after = true; + frag[1].ws = false; + } else - pdfioContentTextShow(dd->st, UNICODE_VALUE, frag->text); + { + ws_after = false; + } +#endif // 0 + + pdfioContentTextShowf(dd->st, UNICODE_VALUE, "%s%s%s", frag->ws ? " " : "", frag->text, /*ws_after ? " " :*/ ""); if (frag->url && dd->num_links < DOCLINK_MAX) {