From 258205237734ed5d23a02332bb9f227d545e5082 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Thu, 18 Sep 2025 12:05:15 -0400 Subject: [PATCH] More tagged PDF changes (Issue #123): - Add MarkInfo dictionary to document catalog when generating marked content. - Add basic structured tags to md2pdf example. --- examples/md2pdf.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++- pdfio-content.c | 17 ++++++++- pdfio-private.h | 1 + 3 files changed, 111 insertions(+), 4 deletions(-) diff --git a/examples/md2pdf.c b/examples/md2pdf.c index 1c77889..4e3d46b 100644 --- a/examples/md2pdf.c +++ b/examples/md2pdf.c @@ -122,6 +122,9 @@ typedef struct docdata_s // Document formatting data // State for the current page pdfio_stream_t *st; // Current page stream double y; // Current position on page + const char *tag; // Current block tag + bool in_table, // Are we in a table? + in_row; // Are we in a table row? docfont_t font; // Current font double fsize; // Current font size doccolor_t color; // Current color @@ -472,6 +475,7 @@ add_links(docdata_t *dd) // I - Document data // Create the annotation object pointing to the action... 
dict = pdfioDictCreate(dd->pdf); pdfioDictSetName(dict, "Subtype", "Link"); + pdfioDictSetNumber(dict, "F", 4); // Print flag pdfioDictSetRect(dict, "Rect", &l->box); border = pdfioArrayCreate(dd->pdf); pdfioArrayAppendNumber(border, 0.0); @@ -618,6 +622,50 @@ format_block(docdata_t *dd, // I - Document data frag = frags + num_frags; + switch (mmdGetType(block)) + { + case MMD_TYPE_HEADING_1 : + dd->tag = "H1"; + break; + + case MMD_TYPE_HEADING_2 : + dd->tag = "H2"; + break; + + case MMD_TYPE_HEADING_3 : + dd->tag = "H3"; + break; + + case MMD_TYPE_HEADING_4 : + dd->tag = "H4"; + break; + + case MMD_TYPE_HEADING_5 : + dd->tag = "H5"; + break; + + case MMD_TYPE_HEADING_6 : + dd->tag = "H6"; + break; + + case MMD_TYPE_TABLE_HEADER_CELL : + dd->tag = "TH"; + break; + + case MMD_TYPE_TABLE_BODY_CELL_LEFT : + case MMD_TYPE_TABLE_BODY_CELL_CENTER : + case MMD_TYPE_TABLE_BODY_CELL_RIGHT : + dd->tag = "TD"; + break; + + default : + dd->tag = "P"; + break; + } + + if (dd->st) + pdfioContentBeginMarked(dd->st, dd->tag, /*dict*/NULL); + // Loop through the block and render lines... for (current = mmdGetFirstChild(block), x = left; current; current = next) { @@ -796,6 +844,9 @@ format_block(docdata_t *dd, // I - Document data pdfioContentRestore(dd->st); } } + + pdfioContentEndMarked(dd->st); + dd->tag = NULL; } @@ -835,6 +886,9 @@ format_code(docdata_t *dd, // I - Document data pdfioContentFillAndStroke(dd->st, false); // Start a code text block... + dd->tag = "P"; + pdfioContentBeginMarked(dd->st, dd->tag, /*dict*/NULL); + set_font(dd, DOCFONT_MONOSPACE, SIZE_CODEBLOCK); pdfioContentTextBegin(dd->st); pdfioContentTextMoveTo(dd->st, left, dd->y); @@ -869,6 +923,9 @@ format_code(docdata_t *dd, // I - Document data pdfioContentTextEnd(dd->st); dd->y += lineheight; + pdfioContentEndMarked(dd->st); + dd->tag = NULL; + // Draw the bottom padding... 
set_color(dd, DOCCOLOR_LTGRAY); pdfioContentPathRect(dd->st, left - CODE_PADDING, dd->y - CODE_PADDING - (LINE_HEIGHT - 1.0) * SIZE_CODEBLOCK, right - left + 2.0 * CODE_PADDING, CODE_PADDING); @@ -1144,8 +1201,17 @@ format_table(docdata_t *dd, // I - Document data } // Render each table row... + dd->in_table = true; + + if (dd->st) + pdfioContentBeginMarked(dd->st, "Table", /*dict*/NULL); + for (row = 0, rowptr = rows; row < num_rows; row ++, rowptr ++) render_row(dd, num_cols, cols, rowptr); + + pdfioContentEndMarked(dd->st); + + dd->in_table = false; } @@ -1359,6 +1425,16 @@ new_page(docdata_t *dd) // I - Document data // Close the current page... if (dd->st) { + // Close the current tag, then any open row and table. A row or table + // can be open with no block tag (render_row starts a new page between + // rows), so each mark is tested on its own to keep BMC/EMC balanced... + if (dd->tag) + pdfioContentEndMarked(dd->st); + if (dd->in_row) + pdfioContentEndMarked(dd->st); + if (dd->in_table) + pdfioContentEndMarked(dd->st); + pdfioStreamClose(dd->st); add_links(dd); } @@ -1387,6 +1463,7 @@ new_page(docdata_t *dd) // I - Document data dd->y = dd->art_box.y2; // Add header/footer text + pdfioContentBeginMarked(dd->st, "Artifact", /*dict*/NULL); set_color(dd, DOCCOLOR_GRAY); set_font(dd, DOCFONT_REGULAR, SIZE_HEADFOOT); @@ -1445,6 +1522,17 @@ new_page(docdata_t *dd) // I - Document data pdfioContentTextShow(dd->st, UNICODE_VALUE, dd->heading); pdfioContentTextEnd(dd->st); } + + pdfioContentEndMarked(dd->st); + + if (dd->in_table) + pdfioContentBeginMarked(dd->st, "Table", /*dict*/NULL); + + if (dd->in_row) + pdfioContentBeginMarked(dd->st, "TR", /*dict*/NULL); + + if (dd->tag) + pdfioContentBeginMarked(dd->st, dd->tag, /*dict*/NULL); } @@ -1617,10 +1705,12 @@ render_row(docdata_t *dd, // I - Document data // Start a new page as needed... 
if (!dd->st) new_page(dd); - - if ((dd->y - row->height) < dd->art_box.y1) + else if ((dd->y - row->height) < dd->art_box.y1) new_page(dd); + dd->in_row = true; + pdfioContentBeginMarked(dd->st, "TR", /*dict*/NULL); + if (mmdGetType(row->cells[0]) == MMD_TYPE_TABLE_HEADER_CELL) { // Header row, no border... @@ -1651,6 +1741,9 @@ render_row(docdata_t *dd, // I - Document data } dd->y = row_y - row->height; + + pdfioContentEndMarked(dd->st); + dd->in_row = false; } diff --git a/pdfio-content.c b/pdfio-content.c index b22509f..771073d 100644 --- a/pdfio-content.c +++ b/pdfio-content.c @@ -479,6 +479,7 @@ pdfioContentBeginMarked( if (!st || !name) return (false); + // Send the BDC/BMC command... if (!pdfioStreamPrintf(st, "%N", name)) return (false); @@ -488,13 +489,25 @@ pdfioContentBeginMarked( if (!write_dict(st, dict)) return (false); - return (pdfioStreamPuts(st, "BDC\n")); + if (!pdfioStreamPuts(st, "BDC\n")) + return (false); } else { // No dictionary so use the BMC operator... - return (pdfioStreamPuts(st, " BMC\n")); + if (!pdfioStreamPuts(st, " BMC\n")) + return (false); } + + // Make sure we have the MarkInfo dictionary in the catalog... + if (!st->pdf->markinfo) + { + st->pdf->markinfo = pdfioDictCreate(st->pdf); + pdfioDictSetBoolean(st->pdf->markinfo, "Marked", true); + pdfioDictSetDict(pdfioObjGetDict(st->pdf->root_obj), "MarkInfo", st->pdf->markinfo); + } + + return (true); } diff --git a/pdfio-private.h b/pdfio-private.h index 4a1d992..2531a1c 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -273,6 +273,7 @@ struct _pdfio_file_s // PDF file structure *unicode_obj; // Unicode font encoding object pdfio_array_t *id_array; // ID array bool encrypt_metadata; // Encrypt metadata? + pdfio_dict_t *markinfo; // MarkInfo dictionary, if any // Allocated data elements size_t num_arrays, // Number of arrays