From cca7383c73142e936aaf42d8acb797562483f760 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Fri, 24 Jan 2025 10:43:41 -0500 Subject: [PATCH] Fix support for UTF-16 string values in dictionaries (Issue #92) Specifically to support Unicode Title and Author values. --- CHANGES.md | 1 + doc/pdfio.3 | 2 +- pdfio-dict.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 129 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index bbaf253..0c657cf 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,6 +10,7 @@ v1.4.1 - YYYY-MM-DD - Fixed opening of PDF files less than 1024 bytes in length (Issue #87) - Fixed potential `NULL` dereference when reading (Issue #89) - Fixed reading of compressed object streams (Issue #92) +- Fixed reading of UTF-16 string values (Issue #92) v1.4.0 - 2024-12-26 diff --git a/doc/pdfio.3 b/doc/pdfio.3 index 41eeedb..d4325fe 100644 --- a/doc/pdfio.3 +++ b/doc/pdfio.3 @@ -1,4 +1,4 @@ -.TH pdfio 3 "pdf read/write library" "2025-01-17" "pdf read/write library" +.TH pdfio 3 "pdf read/write library" "2025-01-24" "pdf read/write library" .SH NAME pdfio \- pdf read/write library .SH Introduction diff --git a/pdfio-dict.c b/pdfio-dict.c index a559c71..062607d 100644 --- a/pdfio-dict.c +++ b/pdfio-dict.c @@ -465,10 +465,134 @@ pdfioDictGetString(pdfio_dict_t *dict, // I - Dictionary else if (value && value->type == PDFIO_VALTYPE_BINARY && value->value.binary.datalen < 4096) { // Convert binary string to regular string... - char temp[4096]; // Temporary string + char temp[4096], // Temporary string + *tempptr; // Pointer into temporary string + unsigned char *dataptr; // Pointer into the data string - memcpy(temp, value->value.binary.data, value->value.binary.datalen); - temp[value->value.binary.datalen] = '\0'; + if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\377\376", 2)) + { + // Copy UTF-16 BE + int ch; // Unicode character + size_t remaining; // Remaining bytes + + for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2) + { + ch = (dataptr[0] << 8) | dataptr[1]; + + if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3) + { + // Multi-word UTF-16 char... + int lch; // Lower bits + + lch = (dataptr[2] << 8) | dataptr[3]; + + if (lch < 0xdc00 || lch >= 0xdfff) + break; + + ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000; + dataptr += 2; + remaining -= 2; + } + else if (ch >= 0xfffe) + { + continue; + } + + if (ch < 128) + { + // ASCII + *tempptr++ = (char)ch; + } + else if (ch < 4096) + { + // 2-byte UTF-8 + *tempptr++ = (char)(0xc0 | (ch >> 6)); + *tempptr++ = (char)(0x80 | (ch & 0x3f)); + } + else if (ch < 65536) + { + // 3-byte UTF-8 + *tempptr++ = (char)(0xe0 | (ch >> 12)); + *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *tempptr++ = (char)(0x80 | (ch & 0x3f)); + } + else + { + // 4-byte UTF-8 + *tempptr++ = (char)(0xe0 | (ch >> 18)); + *tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f)); + *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *tempptr++ = (char)(0x80 | (ch & 0x3f)); + } + } + + *tempptr = '\0'; + } + else if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\376\377", 2)) + { + // Copy UTF-16 LE + int ch; // Unicode character + size_t remaining; // Remaining bytes + + for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2) + { + ch = (dataptr[1] << 8) | dataptr[0]; + + if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3) + { + // Multi-word UTF-16 char... + int lch; // Lower bits + + lch = (dataptr[3] << 8) | dataptr[2]; + + if (lch < 0xdc00 || lch >= 0xdfff) + break; + + ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000; + dataptr += 2; + remaining -= 2; + } + else if (ch >= 0xfffe) + { + continue; + } + + if (ch < 128) + { + // ASCII + *tempptr++ = (char)ch; + } + else if (ch < 4096) + { + // 2-byte UTF-8 + *tempptr++ = (char)(0xc0 | (ch >> 6)); + *tempptr++ = (char)(0x80 | (ch & 0x3f)); + } + else if (ch < 65536) + { + // 3-byte UTF-8 + *tempptr++ = (char)(0xe0 | (ch >> 12)); + *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *tempptr++ = (char)(0x80 | (ch & 0x3f)); + } + else + { + // 4-byte UTF-8 + *tempptr++ = (char)(0xe0 | (ch >> 18)); + *tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f)); + *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *tempptr++ = (char)(0x80 | (ch & 0x3f)); + } + } + + *tempptr = '\0'; + } + else + { + // Copy as-is... + memcpy(temp, value->value.binary.data, value->value.binary.datalen); + temp[value->value.binary.datalen] = '\0'; + } free(value->value.binary.data); value->type = PDFIO_VALTYPE_STRING;