From 7c69f13ba9811d147e20717053c13eb79029c810 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Tue, 30 Sep 2025 18:38:02 -0400 Subject: [PATCH 1/4] Document tagged PDF support (Issue #123) --- doc/pdfio.3 | 96 +++++++++++++++++++++++++++++++++++++++++++++++- doc/pdfio.html | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ doc/pdfio.md | 67 +++++++++++++++++++++++++++++++++ pdfio-content.c | 11 +++++- pdfio-content.h | 2 +- 5 files changed, 271 insertions(+), 3 deletions(-) diff --git a/doc/pdfio.3 b/doc/pdfio.3 index 9e158e3..d16995b 100644 --- a/doc/pdfio.3 +++ b/doc/pdfio.3 @@ -1,4 +1,4 @@ -.TH pdfio 3 "pdf read/write library" "2025-08-26" "pdf read/write library" +.TH pdfio 3 "pdf read/write library" "2025-09-30" "pdf read/write library" .SH NAME pdfio \- pdf read/write library .SH Introduction @@ -1032,6 +1032,45 @@ pdfioContentTextShowf draws a formatted string in a text block pdfioContentTextShowJustified draws an array of literal strings with offsets between them +.SS Tagged and Marked PDF Content +.PP +Content in a page stream can be tagged to help a PDF reader application know the kind and organization of that content. Content inserted using the PDFio Page Stream Functions can be tagged by surrounding it with the pdfioContentBeginMarked and pdfioContentEndMarked functions. +.PP +The pdfioContentBeginMarked function accepts a named tag and optional dictionary of attributes such as the marked content identifier ("MCID"). 
For example, the following code tags a paragraph of text: +.nf + + pdfio_file_t *pdf; // PDF file + pdfio_stream_t *st; // Page stream + + pdfioContentBeginMarked(st, "P", /*dict*/NULL); + + pdfioContentTextShow(st, /*unicode*/false, "Mary had a little lamb\\n"); + pdfioContentTextShow(st, /*unicode*/false, "whose fleece was white as snow.\\n"); + pdfioContentTextShow(st, /*unicode*/false, "And everywhere that Mary went\\n"); + pdfioContentTextShow(st, /*unicode*/false, "the lamb was sure to go,\\n"); + + pdfioContentEndMarked(st); +.fi +.PP +To mark the same paragraph with a content identifier you would first create a dictionary containing the "MCID" key/value pair and then mark the paragraph with that dictionary: +.nf + + pdfio_file_t *pdf; // PDF file + pdfio_stream_t *st; // Page stream + pdfio_dict_t *dict; // Content dictionary + + dict = pdfioDictCreate(pdf); + pdfioDictSetNumber(dict, "MCID", 42); + + pdfioContentBeginMarked(st, "P", dict); + + pdfioContentTextShow(st, /*unicode*/false, "Mary had a little lamb\\n"); + pdfioContentTextShow(st, /*unicode*/false, "whose fleece was white as snow.\\n"); + pdfioContentTextShow(st, /*unicode*/false, "And everywhere that Mary went\\n"); + pdfioContentTextShow(st, /*unicode*/false, "the lamb was sure to go,\\n"); + + pdfioContentEndMarked(st); +.fi .SH Examples .PP PDFio includes several example programs that are typically installed to the /usr/share/doc/pdfio/examples or /usr/local/share/doc/pdfio/examples directories. A makefile is included to build them. @@ -1691,6 +1730,9 @@ The md2pdf program needs to maintain three sets of state \- one for the markdown // State for the current page pdfio_stream_t *st; // Current page stream double y; // Current position on page + const char *tag; // Current block tag + bool in_table, // Are we in a table? + in_row; // Are we in a table row? 
docfont_t font; // Current font double fsize; // Current font size doccolor_t color; // Current color @@ -2167,6 +2209,9 @@ Code blocks consist of one or more lines of plain monospaced text. We draw a lig pdfioContentFillAndStroke(dd\->st, false); // Start a code text block... + dd\->tag = "P"; + pdfioContentBeginMarked(dd\->st, dd\->tag, /*dict*/NULL); + set_font(dd, DOCFONT_MONOSPACE, SIZE_CODEBLOCK); pdfioContentTextBegin(dd\->st); pdfioContentTextMoveTo(dd\->st, left, dd\->y); @@ -2203,6 +2248,9 @@ Code blocks consist of one or more lines of plain monospaced text. We draw a lig pdfioContentTextEnd(dd\->st); dd\->y += lineheight; + pdfioContentEndMarked(dd\->st); + dd\->tag = NULL; + // Draw the bottom padding... set_color(dd, DOCCOLOR_LTGRAY); pdfioContentPathRect(dd\->st, left \- CODE_PADDING, @@ -2332,8 +2380,17 @@ Finally, we render each row in the table: .nf // Render each table row... + dd\->in_table = true; + + if (dd\->st) + pdfioContentBeginMarked(dd\->st, "Table", /*dict*/NULL); + for (row = 0, rowptr = rows; row < num_rows; row ++, rowptr ++) render_row(dd, num_cols, cols, rowptr); + + pdfioContentEndMarked(dd\->st); + + dd\->in_table = false; .fi .PP Rendering the Markdown Document @@ -2957,6 +3014,30 @@ bool pdfioArrayRemove ( .fi .PP +.SS pdfioContentBeginMarked +Start marked content with an optional dictionary. +.PP +.nf +bool pdfioContentBeginMarked ( + pdfio_stream_t *st, + const char *tag, + pdfio_dict_t *dict +); +.fi +.PP +This function starts an area of marked content with an optional dictionary. +It must be paired with a call to the \fIpdfioContentEndMarked\fR function. +.PP +The "tag" argument specifies the tag name string for the content such as "P" +for a paragraph, "H1" for a top-level heading, and so forth. The "dict" +argument specifies an optional dictionary of properties for the content such +as the marked content identifier ("MCID") number. 
+.PP +Calling this function sets the "Marked" key in the "MarkInfo" dictionary of +the document catalog. The caller is responsible for setting the +"StructTreeRoot" dictionary when creating marked content. + + .SS pdfioContentClip Clip output to the current path. .PP @@ -2982,6 +3063,19 @@ bool pdfioContentDrawImage ( .PP The object name must be part of the page dictionary resources, typically using the \fIpdfioPageDictAddImage\fR function. +.SS pdfioContentEndMarked +End marked content. +.PP +.nf +bool pdfioContentEndMarked ( + pdfio_stream_t *st +); +.fi +.PP +This function ends an area of marked content that was started using the +\fIpdfioContentBeginMarked\fR function. + + .SS pdfioContentFill Fill the current path. .PP diff --git a/doc/pdfio.html b/doc/pdfio.html index 8feec04..fe2b175 100644 --- a/doc/pdfio.html +++ b/doc/pdfio.html @@ -273,6 +273,7 @@ span.string {
  • PDF Objects
  • PDF Streams
  • PDF Content Helper Functions
  • +
  • Tagged and Marked PDF Content
  • Examples +

    Tagged and Marked PDF Content

    +

    Content in a page stream can be tagged to help a PDF reader application know the kind and organization of that content. Content inserted using the PDFio Page Stream Functions can be tagged by surrounding it with the pdfioContentBeginMarked and pdfioContentEndMarked functions.

    +

    The pdfioContentBeginMarked function accepts a named tag and optional dictionary of attributes such as the marked content identifier ("MCID"). For example, the following code tags a paragraph of text:

    +
    pdfio_file_t   *pdf;  // PDF file
    +pdfio_stream_t *st;   // Page stream
    +
    +pdfioContentBeginMarked(st, "P", /*dict*/NULL);
    +
    +pdfioContentTextShow(st, /*unicode*/false, "Mary had a little lamb\n");
    +pdfioContentTextShow(st, /*unicode*/false, "whose fleece was white as snow.\n");
    +pdfioContentTextShow(st, /*unicode*/false, "And everywhere that Mary went\n");
    +pdfioContentTextShow(st, /*unicode*/false, "the lamb was sure to go.\n");
    +
    +pdfioContentEndMarked(st);
    +
    +

    To mark the same paragraph with a content identifier you would first create a dictionary containing the "MCID" key/value pair and then mark the paragraph with that dictionary:

    +
    pdfio_file_t   *pdf;  // PDF file
    +pdfio_stream_t *st;   // Page stream
    +pdfio_dict_t   *dict; // Content dictionary
    +
    +dict = pdfioDictCreate(pdf);
    +pdfioDictSetNumber(dict, "MCID", 42);
    +
    +pdfioContentBeginMarked(st, "P", dict);
    +
    +pdfioContentTextShow(st, /*unicode*/false, "Mary had a little lamb\n");
    +pdfioContentTextShow(st, /*unicode*/false, "whose fleece was white as snow.\n");
    +pdfioContentTextShow(st, /*unicode*/false, "And everywhere that Mary went\n");
    +pdfioContentTextShow(st, /*unicode*/false, "the lamb was sure to go.\n");
    +
    +pdfioContentEndMarked(st);
    +

    Examples

    PDFio includes several example programs that are typically installed to the /usr/share/doc/pdfio/examples or /usr/local/share/doc/pdfio/examples directories. A makefile is included to build them.

    Read PDF Metadata

    @@ -1753,6 +1788,9 @@ pdfioStreamClose(page_st); // State for the current page pdfio_stream_t *st; // Current page stream double y; // Current position on page + const char *tag; // Current block tag + bool in_table, // Are we in a table? + in_row; // Are we in a table row? docfont_t font; // Current font double fsize; // Current font size doccolor_t color; // Current color @@ -2130,6 +2168,9 @@ pdfioContentPathRect(dd->st, left - CODE_PADDING, dd->y + SIZE_CODEBLOCK, pdfioContentFillAndStroke(dd->st, false); // Start a code text block... +dd->tag = "P"; +pdfioContentBeginMarked(dd->st, dd->tag, /*dict*/NULL); + set_font(dd, DOCFONT_MONOSPACE, SIZE_CODEBLOCK); pdfioContentTextBegin(dd->st); pdfioContentTextMoveTo(dd->st, left, dd->y); @@ -2166,6 +2207,9 @@ pdfioContentTextMoveTo(dd->st, left, dd->y); pdfioContentTextEnd(dd->st); dd->y += lineheight; +pdfioContentEndMarked(dd->st); +dd->tag = NULL; + // Draw the bottom padding... set_color(dd, DOCCOLOR_LTGRAY); pdfioContentPathRect(dd->st, left - CODE_PADDING, @@ -2276,8 +2320,17 @@ pdfioContentFillAndStroke(dd->st, false);

    Finally, we render each row in the table:

    // Render each table row...
    +dd->in_table = true;
    +
    +if (dd->st)
    +  pdfioContentBeginMarked(dd->st, "Table", /*dict*/NULL);
    +
     for (row = 0, rowptr = rows; row < num_rows; row ++, rowptr ++)
       render_row(dd, num_cols, cols, rowptr);
    +
    +pdfioContentEndMarked(dd->st);
    +
    +dd->in_table = false;
     

    Rendering the Markdown Document

    The formatted content in arrays of linefrag_t and tablerow_t structures is passed to the render_line and render_row functions respectively to produce content in the PDF document.

    @@ -2767,6 +2820,35 @@ size_t pdfioArrayGetSize(pdfio_array_t *a);

    Return Value

    true on success, false otherwise

    +

     PDFio 1.6 pdfioContentBeginMarked

    +

    Start marked content with an optional dictionary.

    +

    +bool pdfioContentBeginMarked(pdfio_stream_t *st, const char *tag, pdfio_dict_t *dict);

    +

    Parameters

    + + + + + + + +
    st: Stream
    tag: Tag name of marked content
    dict: Dictionary of parameters or NULL if none
    +

    Return Value

    +

    true on success, false on failure

    +

    Discussion

    +

    This function starts an area of marked content with an optional dictionary. +It must be paired with a call to the pdfioContentEndMarked function.
    +
    +The "tag" argument specifies the tag name string for the content such as "P" +for a paragraph, "H1" for a top-level heading, and so forth. The "dict" +argument specifies an optional dictionary of properties for the content such +as the marked content identifier ("MCID") number.
    +
    +Calling this function sets the "Marked" key in the "MarkInfo" dictionary of +the document catalog. The caller is responsible for setting the +"StructTreeRoot" dictionary when creating marked content. + +

    pdfioContentClip

    Clip output to the current path.

    @@ -2804,6 +2886,22 @@ size_t pdfioArrayGetSize(pdfio_array_t *a);

    Discussion

    The object name must be part of the page dictionary resources, typically using the pdfioPageDictAddImage function.

    +

     PDFio 1.6 pdfioContentEndMarked

    +

    End marked content.

    +

    +bool pdfioContentEndMarked(pdfio_stream_t *st);

    +

    Parameters

    + + + +
    st: Stream
    +

    Return Value

    +

    true on success, false on failure

    +

    Discussion

    +

    This function ends an area of marked content that was started using the +pdfioContentBeginMarked function. + +

    pdfioContentFill

    Fill the current path.

    diff --git a/doc/pdfio.md b/doc/pdfio.md index 8709d33..eac3c67 100644 --- a/doc/pdfio.md +++ b/doc/pdfio.md @@ -868,6 +868,55 @@ escaping, as needed: offsets between them +Tagged and Marked PDF Content +----------------------------- + +Content in a page stream can be tagged to help a PDF reader application know the +kind and organization of that content. Content inserted using the PDFio +[Page Stream Functions](@) can be tagged by surrounding it with the +[`pdfioContentBeginMarked`](@@) and [`pdfioContentEndMarked`](@@) functions. + +The `pdfioContentBeginMarked` function accepts a named tag and optional +dictionary of attributes such as the marked content identifier ("MCID"). For +example, the following code tags a paragraph of text: + +```c +pdfio_file_t *pdf; // PDF file +pdfio_stream_t *st; // Page stream + +pdfioContentBeginMarked(st, "P", /*dict*/NULL); + +pdfioContentTextShow(st, /*unicode*/false, "Mary had a little lamb\n"); +pdfioContentTextShow(st, /*unicode*/false, "whose fleece was white as snow.\n"); +pdfioContentTextShow(st, /*unicode*/false, "And everywhere that Mary went\n"); +pdfioContentTextShow(st, /*unicode*/false, "the lamb was sure to go,\n"); + +pdfioContentEndMarked(st); +``` + +To mark the same paragraph with a content identifier you would first create a +dictionary containing the "MCID" key/value pair and then mark the paragraph with +that dictionary: + +```c +pdfio_file_t *pdf; // PDF file +pdfio_stream_t *st; // Page stream +pdfio_dict_t *dict; // Content dictionary + +dict = pdfioDictCreate(pdf); +pdfioDictSetNumber(dict, "MCID", 42); + +pdfioContentBeginMarked(st, "P", dict); + +pdfioContentTextShow(st, /*unicode*/false, "Mary had a little lamb\n"); +pdfioContentTextShow(st, /*unicode*/false, "whose fleece was white as snow.\n"); +pdfioContentTextShow(st, /*unicode*/false, "And everywhere that Mary went\n"); +pdfioContentTextShow(st, /*unicode*/false, "the lamb was sure to go,\n"); + +pdfioContentEndMarked(st); +``` + + Examples 
======== @@ -1597,6 +1646,9 @@ typedef struct docdata_s // Document formatting data // State for the current page pdfio_stream_t *st; // Current page stream double y; // Current position on page + const char *tag; // Current block tag + bool in_table, // Are we in a table? + in_row; // Are we in a table row? docfont_t font; // Current font double fsize; // Current font size doccolor_t color; // Current color @@ -2100,6 +2152,9 @@ pdfioContentPathRect(dd->st, left - CODE_PADDING, dd->y + SIZE_CODEBLOCK, pdfioContentFillAndStroke(dd->st, false); // Start a code text block... +dd->tag = "P"; +pdfioContentBeginMarked(dd->st, dd->tag, /*dict*/NULL); + set_font(dd, DOCFONT_MONOSPACE, SIZE_CODEBLOCK); pdfioContentTextBegin(dd->st); pdfioContentTextMoveTo(dd->st, left, dd->y); @@ -2136,6 +2191,9 @@ for (code = mmdGetFirstChild(block); code; code = mmdGetNextSibling(code)) pdfioContentTextEnd(dd->st); dd->y += lineheight; +pdfioContentEndMarked(dd->st); +dd->tag = NULL; + // Draw the bottom padding... set_color(dd, DOCCOLOR_LTGRAY); pdfioContentPathRect(dd->st, left - CODE_PADDING, @@ -2276,8 +2334,17 @@ Finally, we render each row in the table: ```c // Render each table row... +dd->in_table = true; + +if (dd->st) + pdfioContentBeginMarked(dd->st, "Table", /*dict*/NULL); + for (row = 0, rowptr = rows; row < num_rows; row ++, rowptr ++) render_row(dd, num_cols, cols, rowptr); + +pdfioContentEndMarked(dd->st); + +dd->in_table = false; ``` diff --git a/pdfio-content.c b/pdfio-content.c index 81e1134..78f0ad5 100644 --- a/pdfio-content.c +++ b/pdfio-content.c @@ -467,13 +467,22 @@ pdfioArrayCreateColorFromStandard( // This function starts an area of marked content with an optional dictionary. // It must be paired with a call to the @link pdfioContentEndMarked@ function. // +// The "tag" argument specifies the tag name string for the content such as "P" +// for a paragraph, "H1" for a top-level heading, and so forth. 
The "dict" +// argument specifies an optional dictionary of properties for the content such +// as the marked content identifier ("MCID") number. +// +// Calling this function sets the "Marked" key in the "MarkInfo" dictionary of +// the document catalog. The caller is responsible for setting the +// "StructTreeRoot" dictionary when creating marked content. +// // @since PDFio 1.6@ // bool // O - `true` on success, `false` on failure pdfioContentBeginMarked( pdfio_stream_t *st, // I - Stream - const char *name, // I - Name of marked content + const char *tag, // I - Tag name of marked content pdfio_dict_t *dict) // I - Dictionary of parameters or `NULL` if none { if (!st || !name) diff --git a/pdfio-content.h b/pdfio-content.h index 0c325c5..6ee4d24 100644 --- a/pdfio-content.h +++ b/pdfio-content.h @@ -69,7 +69,7 @@ extern pdfio_array_t *pdfioArrayCreateColorFromPrimaries(pdfio_file_t *pdf, size extern pdfio_array_t *pdfioArrayCreateColorFromStandard(pdfio_file_t *pdf, size_t num_colors, pdfio_cs_t cs); // PDF content drawing functions... -extern bool pdfioContentBeginMarked(pdfio_stream_t *st, const char *name, pdfio_dict_t *dict) _PDFIO_PUBLIC; +extern bool pdfioContentBeginMarked(pdfio_stream_t *st, const char *tag, pdfio_dict_t *dict) _PDFIO_PUBLIC; extern bool pdfioContentClip(pdfio_stream_t *st, bool even_odd) _PDFIO_PUBLIC; extern bool pdfioContentDrawImage(pdfio_stream_t *st, const char *name, double x, double y, double w, double h) _PDFIO_PUBLIC; extern bool pdfioContentEndMarked(pdfio_stream_t *st) _PDFIO_PUBLIC; From 203a974682bd1091833bafe98e797e2c0aa12722 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Tue, 30 Sep 2025 18:40:28 -0400 Subject: [PATCH 2/4] Fix typo. 
--- pdfio-content.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pdfio-content.c b/pdfio-content.c index 78f0ad5..9ee1f2f 100644 --- a/pdfio-content.c +++ b/pdfio-content.c @@ -485,11 +485,11 @@ pdfioContentBeginMarked( const char *tag, // I - Tag name of marked content pdfio_dict_t *dict) // I - Dictionary of parameters or `NULL` if none { - if (!st || !name) + if (!st || !tag) return (false); // Send the BDC/BMC command... - if (!pdfioStreamPrintf(st, "%N", name)) + if (!pdfioStreamPrintf(st, "%N", tag)) return (false); if (dict) From da58eec96df51106f99c13770ab0c96c4b44bb10 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Tue, 30 Sep 2025 18:41:26 -0400 Subject: [PATCH 3/4] Update Windows DLL exports file. --- pdfio1.def | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pdfio1.def b/pdfio1.def index c38ec32..89f149d 100644 --- a/pdfio1.def +++ b/pdfio1.def @@ -1,6 +1,10 @@ LIBRARY pdfio1 VERSION 1.6 EXPORTS +_pdfio_strlcpy +_pdfio_strtod +_pdfio_utf16cpy +_pdfio_vsnprintf _pdfioArrayDebug _pdfioArrayDecrypt _pdfioArrayDelete @@ -11,12 +15,12 @@ _pdfioCryptoAESDecrypt _pdfioCryptoAESEncrypt _pdfioCryptoAESInit _pdfioCryptoLock -_pdfioCryptoMD5Append -_pdfioCryptoMD5Finish -_pdfioCryptoMD5Init _pdfioCryptoMakeRandom _pdfioCryptoMakeReader _pdfioCryptoMakeWriter +_pdfioCryptoMD5Append +_pdfioCryptoMD5Finish +_pdfioCryptoMD5Init _pdfioCryptoRC4Crypt _pdfioCryptoRC4Init _pdfioCryptoSHA256Append @@ -69,10 +73,6 @@ _pdfioValueDecrypt _pdfioValueDelete _pdfioValueRead _pdfioValueWrite -_pdfio_strlcpy -_pdfio_strtod -_pdfio_utf16cpy -_pdfio_vsnprintf pdfioArrayAppendArray pdfioArrayAppendBinary pdfioArrayAppendBoolean @@ -101,8 +101,10 @@ pdfioArrayGetSize pdfioArrayGetString pdfioArrayGetType pdfioArrayRemove +pdfioContentBeginMarked pdfioContentClip pdfioContentDrawImage +pdfioContentEndMarked pdfioContentFill pdfioContentFillAndStroke pdfioContentMatrixConcat @@ -156,8 +158,8 @@ 
pdfioContentTextNewLineShow pdfioContentTextNewLineShowf pdfioContentTextNextLine pdfioContentTextShow -pdfioContentTextShowJustified pdfioContentTextShowf +pdfioContentTextShowJustified pdfioDictClear pdfioDictCopy pdfioDictCreate @@ -168,8 +170,8 @@ pdfioDictGetDate pdfioDictGetDict pdfioDictGetKey pdfioDictGetName -pdfioDictGetNumPairs pdfioDictGetNumber +pdfioDictGetNumPairs pdfioDictGetObj pdfioDictGetRect pdfioDictGetString From 4032eef8264ecf6350016afbba9d0b4750200225 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Tue, 30 Sep 2025 19:48:30 -0400 Subject: [PATCH 4/4] Fix the unsupported filter error message (Issue #130) --- CHANGES.md | 1 + pdfio-stream.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index d6baa49..1b4bc24 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -26,6 +26,7 @@ v1.6.0 - YYYY-MM-DD v1.5.5 - YYYY-MM-DD ------------------- +- Fixed unsupported filter error (Issue #130) - Fixed EOF comment written to the PDF (Issue #136) - Fixed TTF cmap underflow error. - Fixed some Clang warnings. diff --git a/pdfio-stream.c b/pdfio-stream.c index 4791b7c..528a3c9 100644 --- a/pdfio-stream.c +++ b/pdfio-stream.c @@ -623,7 +623,7 @@ _pdfioStreamOpen(pdfio_obj_t *obj, // I - Object else { // Something else we don't support - _pdfioFileError(st->pdf, "Unsupported stream filter '%N'.", filter); + _pdfioFileError(st->pdf, "Unsupported stream filter '/%s'.", filter); goto error; } }