From 3ae95f15454e4de05c82f5ad1aa16ee17476d55b Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Sun, 13 Apr 2025 09:04:00 -0400 Subject: [PATCH] Fix decryption of UTF-16 strings (Issue #42) --- CHANGES.md | 3 +- pdfio-dict.c | 123 ++---------------------------------------------- pdfio-private.h | 1 + pdfio-string.c | 83 ++++++++++++++++++++++++++++++++ pdfio-value.c | 21 ++++++++- 5 files changed, 110 insertions(+), 121 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 80bd736..56a84c3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,7 +12,8 @@ v1.6.0 - YYYY-MM-DD v1.5.3 - YYYY-MM-DD ------------------- -- Fixed decryption of PDF files "protected" by 40-bit RC4. +- Fixed decryption of PDF files "protected" by 40-bit RC4 (Issue #42) +- Fixed decryption of UTF-16 strings (Issue #42) v1.5.2 - 2025-04-12 diff --git a/pdfio-dict.c b/pdfio-dict.c index 2ea63de..8702846 100644 --- a/pdfio-dict.c +++ b/pdfio-dict.c @@ -465,127 +465,12 @@ pdfioDictGetString(pdfio_dict_t *dict, // I - Dictionary else if (value && value->type == PDFIO_VALTYPE_BINARY && value->value.binary.datalen < 4096) { // Convert binary string to regular string... - char temp[4096], // Temporary string - *tempptr; // Pointer into temporary string - unsigned char *dataptr; // Pointer into the data string + char temp[4096]; // Temporary UTF-8 string - if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\376\377", 2)) + if (!(value->value.binary.datalen & 1) && (!memcmp(value->value.binary.data, "\376\377", 2) || !memcmp(value->value.binary.data, "\377\376", 2))) { - // Copy UTF-16 BE - int ch; // Unicode character - size_t remaining; // Remaining bytes - - for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2) - { - ch = (dataptr[0] << 8) | dataptr[1]; - - if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3) - { - // Multi-word UTF-16 char... 
- int lch; // Lower bits - - lch = (dataptr[2] << 8) | dataptr[3]; - - if (lch < 0xdc00 || lch >= 0xdfff) - break; - - ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000; - dataptr += 2; - remaining -= 2; - } - else if (ch >= 0xfffe) - { - continue; - } - - if (ch < 128) - { - // ASCII - *tempptr++ = (char)ch; - } - else if (ch < 4096) - { - // 2-byte UTF-8 - *tempptr++ = (char)(0xc0 | (ch >> 6)); - *tempptr++ = (char)(0x80 | (ch & 0x3f)); - } - else if (ch < 65536) - { - // 3-byte UTF-8 - *tempptr++ = (char)(0xe0 | (ch >> 12)); - *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *tempptr++ = (char)(0x80 | (ch & 0x3f)); - } - else - { - // 4-byte UTF-8 - *tempptr++ = (char)(0xe0 | (ch >> 18)); - *tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f)); - *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *tempptr++ = (char)(0x80 | (ch & 0x3f)); - } - } - - *tempptr = '\0'; - } - else if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\377\376", 2)) - { - // Copy UTF-16 LE - int ch; // Unicode character - size_t remaining; // Remaining bytes - - for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2) - { - ch = (dataptr[1] << 8) | dataptr[0]; - - if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3) - { - // Multi-word UTF-16 char... 
- int lch; // Lower bits - - lch = (dataptr[3] << 8) | dataptr[2]; - - if (lch < 0xdc00 || lch >= 0xdfff) - break; - - ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000; - dataptr += 2; - remaining -= 2; - } - else if (ch >= 0xfffe) - { - continue; - } - - if (ch < 128) - { - // ASCII - *tempptr++ = (char)ch; - } - else if (ch < 4096) - { - // 2-byte UTF-8 - *tempptr++ = (char)(0xc0 | (ch >> 6)); - *tempptr++ = (char)(0x80 | (ch & 0x3f)); - } - else if (ch < 65536) - { - // 3-byte UTF-8 - *tempptr++ = (char)(0xe0 | (ch >> 12)); - *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *tempptr++ = (char)(0x80 | (ch & 0x3f)); - } - else - { - // 4-byte UTF-8 - *tempptr++ = (char)(0xe0 | (ch >> 18)); - *tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f)); - *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *tempptr++ = (char)(0x80 | (ch & 0x3f)); - } - } - - *tempptr = '\0'; + // Copy UTF-16... + _pdfio_utf16cpy(temp, value->value.binary.data, value->value.binary.datalen, sizeof(temp)); } else { diff --git a/pdfio-private.h b/pdfio-private.h index 9a82fe3..7d0ba9b 100644 --- a/pdfio-private.h +++ b/pdfio-private.h @@ -339,6 +339,7 @@ struct _pdfio_stream_s // Stream extern size_t _pdfio_strlcpy(char *dst, const char *src, size_t dstsize) _PDFIO_INTERNAL; extern double _pdfio_strtod(pdfio_file_t *pdf, const char *s) _PDFIO_INTERNAL; +extern void _pdfio_utf16cpy(char *dst, const unsigned char *src, size_t srclen, size_t dstsize) _PDFIO_INTERNAL; extern ssize_t _pdfio_vsnprintf(pdfio_file_t *pdf, char *buffer, size_t bufsize, const char *format, va_list ap) _PDFIO_INTERNAL; extern bool _pdfioArrayDecrypt(pdfio_file_t *pdf, pdfio_obj_t *obj, pdfio_array_t *a, size_t depth) _PDFIO_INTERNAL; diff --git a/pdfio-string.c b/pdfio-string.c index 507e1a8..1295169 100644 --- a/pdfio-string.c +++ b/pdfio-string.c @@ -158,6 +158,89 @@ _pdfio_strtod(pdfio_file_t *pdf, // I - PDF file } +// +// '_pdfio_utf16cpy()' - Convert UTF-16 to UTF-8. 
//
// The source must start with a 2-byte byte-order mark: "\376\377" selects
// big-endian, anything else is read as little-endian.  Conversion stops at
// the end of the source, when the destination can no longer hold another
// character plus the terminating nul, or at a malformed/truncated surrogate
// pair.  U+FFFE and U+FFFF are skipped.  The destination is always
// nul-terminated when "dstsize" is greater than 0.
//

void
_pdfio_utf16cpy(
    char                *dst,           // I - Destination buffer for UTF-8
    const unsigned char *src,           // I - Source UTF-16 (including BOM)
    size_t              srclen,         // I - Length of UTF-16 in bytes
    size_t              dstsize)        // I - Destination buffer size
{
  char  *dstptr = dst,                  // Pointer into buffer
        *dstend;                        // Characters may only start before this
  int   ch;                             // Unicode character
  bool  is_be;                          // Big-endian string?


  // Range check input...
  if (!dst || dstsize == 0)
    return;

  if (!src || srclen < 2 || dstsize < 5)
  {
    // No BOM to skip (subtracting 2 from srclen would wrap) or no room for
    // a 4-byte character plus nul (dstend would point before the buffer)...
    *dst = '\0';
    return;
  }

  dstend = dst + dstsize - 5;
  is_be  = !memcmp(src, "\376\377", 2);

  // Loop through the UTF-16 string, converting to Unicode then UTF-8...
  for (src += 2, srclen -= 2; srclen > 1 && dstptr < dstend; src += 2, srclen -= 2)
  {
    // Initial character...
    if (is_be)
      ch = (src[0] << 8) | src[1];
    else
      ch = (src[1] << 8) | src[0];

    if (ch >= 0xd800 && ch <= 0xdbff)
    {
      // Multi-word UTF-16 char...
      int lch;                          // Lower bits

      if (srclen < 4)
        break;                          // High surrogate without a partner

      if (is_be)
        lch = (src[2] << 8) | src[3];
      else
        lch = (src[3] << 8) | src[2];

      // Low surrogates span 0xDC00 through 0xDFFF inclusive...
      if (lch < 0xdc00 || lch > 0xdfff)
        break;

      ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000;
      src    += 2;
      srclen -= 2;
    }
    else if (ch >= 0xdc00 && ch <= 0xdfff)
    {
      // Low surrogate without a preceding high surrogate; surrogates cannot
      // be represented in UTF-8 so stop like any other malformed pair...
      break;
    }
    else if (ch >= 0xfffe)
    {
      continue;
    }

    // Convert Unicode to UTF-8...
    if (ch < 0x80)
    {
      // ASCII
      *dstptr++ = (char)ch;
    }
    else if (ch < 0x800)
    {
      // 2-byte UTF-8 (11 bits of payload)
      *dstptr++ = (char)(0xc0 | (ch >> 6));
      *dstptr++ = (char)(0x80 | (ch & 0x3f));
    }
    else if (ch < 0x10000)
    {
      // 3-byte UTF-8
      *dstptr++ = (char)(0xe0 | (ch >> 12));
      *dstptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
      *dstptr++ = (char)(0x80 | (ch & 0x3f));
    }
    else
    {
      // 4-byte UTF-8 (lead byte is 11110xxx)
      *dstptr++ = (char)(0xf0 | (ch >> 18));
      *dstptr++ = (char)(0x80 | ((ch >> 12) & 0x3f));
      *dstptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
      *dstptr++ = (char)(0x80 | (ch & 0x3f));
    }
  }

  // Nul-terminate the UTF-8 string...
  *dstptr = '\0';
}


//
// '_pdfio_vsnprintf()' - Format a string.
// diff --git a/pdfio-value.c b/pdfio-value.c index 802dfc6..86a9423 100644 --- a/pdfio-value.c +++ b/pdfio-value.c @@ -205,7 +205,26 @@ _pdfioValueDecrypt(pdfio_file_t *pdf, // I - PDF file temp[templen] = '\0'; - if ((timeval = get_date_time((char *)temp)) != 0) + if ((templen & 1) == 0 && (!memcmp(temp, "\376\377", 2) || !memcmp(temp, "\377\376", 2))) + { + // Convert UTF-16 to UTF-8... + char utf8[4096]; // Temporary string + + _pdfio_utf16cpy(utf8, temp, templen, sizeof(utf8)); + + if ((timeval = get_date_time((char *)utf8)) != 0) + { + // Change the type to date... + v->type = PDFIO_VALTYPE_DATE; + v->value.date = timeval; + } + else + { + // Copy the decrypted string back to the value... + v->value.string = pdfioStringCreate(pdf, utf8); + } + } + else if ((timeval = get_date_time((char *)temp)) != 0) { // Change the type to date... v->type = PDFIO_VALTYPE_DATE;