Fix decryption of UTF-16 strings (Issue #42)

This commit is contained in:
Michael R Sweet 2025-04-13 09:04:00 -04:00
parent cfdd6f55d9
commit 3ae95f1545
No known key found for this signature in database
GPG Key ID: BE67C75EC81F3244
5 changed files with 110 additions and 121 deletions

View File

@ -12,7 +12,8 @@ v1.6.0 - YYYY-MM-DD
v1.5.3 - YYYY-MM-DD
-------------------
- Fixed decryption of PDF files "protected" by 40-bit RC4.
- Fixed decryption of PDF files "protected" by 40-bit RC4 (Issue #42)
- Fixed decryption of UTF-16 strings (Issue #42)
v1.5.2 - 2025-04-12

View File

@ -465,127 +465,12 @@ pdfioDictGetString(pdfio_dict_t *dict, // I - Dictionary
else if (value && value->type == PDFIO_VALTYPE_BINARY && value->value.binary.datalen < 4096)
{
// Convert binary string to regular string...
char temp[4096], // Temporary string
*tempptr; // Pointer into temporary string
unsigned char *dataptr; // Pointer into the data string
char temp[4096]; // Temporary UTF-8 string
if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\376\377", 2))
if (!(value->value.binary.datalen & 1) && (!memcmp(value->value.binary.data, "\376\377", 2) || !memcmp(value->value.binary.data, "\377\376", 2)))
{
// Copy UTF-16 BE
int ch; // Unicode character
size_t remaining; // Remaining bytes
for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2)
{
ch = (dataptr[0] << 8) | dataptr[1];
if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3)
{
// Multi-word UTF-16 char...
int lch; // Lower bits
lch = (dataptr[2] << 8) | dataptr[3];
if (lch < 0xdc00 || lch >= 0xdfff)
break;
ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000;
dataptr += 2;
remaining -= 2;
}
else if (ch >= 0xfffe)
{
continue;
}
if (ch < 128)
{
// ASCII
*tempptr++ = (char)ch;
}
else if (ch < 4096)
{
// 2-byte UTF-8
*tempptr++ = (char)(0xc0 | (ch >> 6));
*tempptr++ = (char)(0x80 | (ch & 0x3f));
}
else if (ch < 65536)
{
// 3-byte UTF-8
*tempptr++ = (char)(0xe0 | (ch >> 12));
*tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*tempptr++ = (char)(0x80 | (ch & 0x3f));
}
else
{
// 4-byte UTF-8
*tempptr++ = (char)(0xe0 | (ch >> 18));
*tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*tempptr++ = (char)(0x80 | (ch & 0x3f));
}
}
*tempptr = '\0';
}
else if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\377\376", 2))
{
// Copy UTF-16 LE
int ch; // Unicode character
size_t remaining; // Remaining bytes
for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2)
{
ch = (dataptr[1] << 8) | dataptr[0];
if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3)
{
// Multi-word UTF-16 char...
int lch; // Lower bits
lch = (dataptr[3] << 8) | dataptr[2];
if (lch < 0xdc00 || lch >= 0xdfff)
break;
ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000;
dataptr += 2;
remaining -= 2;
}
else if (ch >= 0xfffe)
{
continue;
}
if (ch < 128)
{
// ASCII
*tempptr++ = (char)ch;
}
else if (ch < 4096)
{
// 2-byte UTF-8
*tempptr++ = (char)(0xc0 | (ch >> 6));
*tempptr++ = (char)(0x80 | (ch & 0x3f));
}
else if (ch < 65536)
{
// 3-byte UTF-8
*tempptr++ = (char)(0xe0 | (ch >> 12));
*tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*tempptr++ = (char)(0x80 | (ch & 0x3f));
}
else
{
// 4-byte UTF-8
*tempptr++ = (char)(0xe0 | (ch >> 18));
*tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*tempptr++ = (char)(0x80 | (ch & 0x3f));
}
}
*tempptr = '\0';
// Copy UTF-16...
_pdfio_utf16cpy(temp, value->value.binary.data, value->value.binary.datalen, sizeof(temp));
}
else
{

View File

@ -339,6 +339,7 @@ struct _pdfio_stream_s // Stream
extern size_t _pdfio_strlcpy(char *dst, const char *src, size_t dstsize) _PDFIO_INTERNAL;
extern double _pdfio_strtod(pdfio_file_t *pdf, const char *s) _PDFIO_INTERNAL;
extern void _pdfio_utf16cpy(char *dst, const unsigned char *src, size_t srclen, size_t dstsize) _PDFIO_INTERNAL;
extern ssize_t _pdfio_vsnprintf(pdfio_file_t *pdf, char *buffer, size_t bufsize, const char *format, va_list ap) _PDFIO_INTERNAL;
extern bool _pdfioArrayDecrypt(pdfio_file_t *pdf, pdfio_obj_t *obj, pdfio_array_t *a, size_t depth) _PDFIO_INTERNAL;

View File

@ -158,6 +158,89 @@ _pdfio_strtod(pdfio_file_t *pdf, // I - PDF file
}
//
// '_pdfio_utf16cpy()' - Convert UTF-16 to UTF-8.
//
void
_pdfio_utf16cpy(
char *dst, // I - Destination buffer for UTF-8
const unsigned char *src, // I - Source UTF-16
size_t srclen, // I - Length of UTF-16
size_t dstsize) // I - Destination buffer size
{
char *dstptr = dst, // Pointer into buffer
*dstend = dst + dstsize - 5; // End of buffer
int ch; // Unicode character
bool is_be = !memcmp(src, "\376\377", 2);
// Big-endian strings?
// Loop through the UTF-16 string, converting to Unicode then UTF-8...
for (src += 2, srclen -= 2; srclen > 1 && dstptr < dstend; src += 2, srclen -= 2)
{
// Initial character...
if (is_be)
ch = (src[0] << 8) | src[1];
else
ch = (src[1] << 8) | src[0];
if (ch >= 0xd800 && ch <= 0xdbff && srclen > 3)
{
// Multi-word UTF-16 char...
int lch; // Lower bits
if (is_be)
lch = (src[2] << 8) | src[3];
else
lch = (src[3] << 8) | src[2];
if (lch < 0xdc00 || lch >= 0xdfff)
break;
ch = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000;
src += 2;
srclen -= 2;
}
else if (ch >= 0xfffe)
{
continue;
}
// Convert Unicode to UTF-8...
if (ch < 128)
{
// ASCII
*dstptr++ = (char)ch;
}
else if (ch < 4096)
{
// 2-byte UTF-8
*dstptr++ = (char)(0xc0 | (ch >> 6));
*dstptr++ = (char)(0x80 | (ch & 0x3f));
}
else if (ch < 65536)
{
// 3-byte UTF-8
*dstptr++ = (char)(0xe0 | (ch >> 12));
*dstptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*dstptr++ = (char)(0x80 | (ch & 0x3f));
}
else
{
// 4-byte UTF-8
*dstptr++ = (char)(0xe0 | (ch >> 18));
*dstptr++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*dstptr++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*dstptr++ = (char)(0x80 | (ch & 0x3f));
}
}
// Nul-terminate the UTF-8 string...
*dstptr = '\0';
}
//
// '_pdfio_vsnprintf()' - Format a string.
//

View File

@ -205,7 +205,26 @@ _pdfioValueDecrypt(pdfio_file_t *pdf, // I - PDF file
temp[templen] = '\0';
if ((timeval = get_date_time((char *)temp)) != 0)
if ((templen & 1) == 0 && (!memcmp(temp, "\376\377", 2) || !memcmp(temp, "\377\376", 2)))
{
// Convert UTF-16 to UTF-8...
char utf8[4096]; // Temporary string
_pdfio_utf16cpy(utf8, temp, templen, sizeof(utf8));
if ((timeval = get_date_time((char *)utf8)) != 0)
{
// Change the type to date...
v->type = PDFIO_VALTYPE_DATE;
v->value.date = timeval;
}
else
{
// Copy the decrypted string back to the value...
v->value.string = pdfioStringCreate(pdf, utf8);
}
}
else if ((timeval = get_date_time((char *)temp)) != 0)
{
// Change the type to date...
v->type = PDFIO_VALTYPE_DATE;