mirror of
				https://github.com/michaelrsweet/pdfio.git
				synced 2025-10-31 10:26:22 +01:00 
			
		
		
		
	Fix support for UTF-16 string values in dictionaries (Issue #92)
Specifically to support Unicode Title and Author values.
This commit is contained in:
		| @@ -10,6 +10,7 @@ v1.4.1 - YYYY-MM-DD | |||||||
| - Fixed opening of PDF files less than 1024 bytes in length (Issue #87) | - Fixed opening of PDF files less than 1024 bytes in length (Issue #87) | ||||||
| - Fixed potential `NULL` dereference when reading (Issue #89) | - Fixed potential `NULL` dereference when reading (Issue #89) | ||||||
| - Fixed reading of compressed object streams (Issue #92) | - Fixed reading of compressed object streams (Issue #92) | ||||||
|  | - Fixed reading of UTF-16 string values (Issue #92) | ||||||
|  |  | ||||||
|  |  | ||||||
| v1.4.0 - 2024-12-26 | v1.4.0 - 2024-12-26 | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| .TH pdfio 3 "pdf read/write library" "2025-01-17" "pdf read/write library" | .TH pdfio 3 "pdf read/write library" "2025-01-24" "pdf read/write library" | ||||||
| .SH NAME | .SH NAME | ||||||
| pdfio \- pdf read/write library | pdfio \- pdf read/write library | ||||||
| .SH Introduction | .SH Introduction | ||||||
|   | |||||||
							
								
								
									
										130
									
								
								pdfio-dict.c
									
									
									
									
									
								
							
							
						
						
									
										130
									
								
								pdfio-dict.c
									
									
									
									
									
								
							| @@ -465,10 +465,134 @@ pdfioDictGetString(pdfio_dict_t *dict,	// I - Dictionary | |||||||
|   else if (value && value->type == PDFIO_VALTYPE_BINARY && value->value.binary.datalen < 4096) |   else if (value && value->type == PDFIO_VALTYPE_BINARY && value->value.binary.datalen < 4096) | ||||||
|   { |   { | ||||||
|     // Convert binary string to regular string... |     // Convert binary string to regular string... | ||||||
|     char	temp[4096];		// Temporary string |     char	temp[4096],		// Temporary string | ||||||
|  | 		*tempptr;		// Pointer into temporary string | ||||||
|  |     unsigned char *dataptr;		// Pointer into the data string | ||||||
|  |  | ||||||
|     memcpy(temp, value->value.binary.data, value->value.binary.datalen); |     if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\377\376", 2)) | ||||||
|     temp[value->value.binary.datalen] = '\0'; |     { | ||||||
|  |       // Copy UTF-16 BE | ||||||
|  |       int	ch;			// Unicode character | ||||||
|  |       size_t	remaining;		// Remaining bytes | ||||||
|  |  | ||||||
|  |       for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2) | ||||||
|  |       { | ||||||
|  |         ch = (dataptr[0] << 8) | dataptr[1]; | ||||||
|  |  | ||||||
|  |         if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3) | ||||||
|  | 	{ | ||||||
|  | 	  // Multi-word UTF-16 char... | ||||||
|  |           int lch;			// Lower bits | ||||||
|  |  | ||||||
|  | 	  lch = (dataptr[2] << 8) | dataptr[3]; | ||||||
|  |  | ||||||
|  |           if (lch < 0xdc00 || lch >= 0xdfff) | ||||||
|  | 	    break; | ||||||
|  |  | ||||||
|  |           ch        = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000; | ||||||
|  |           dataptr   += 2; | ||||||
|  |           remaining -= 2; | ||||||
|  | 	} | ||||||
|  | 	else if (ch >= 0xfffe) | ||||||
|  | 	{ | ||||||
|  | 	  continue; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  |         if (ch < 128) | ||||||
|  |         { | ||||||
|  |           // ASCII | ||||||
|  |           *tempptr++ = (char)ch; | ||||||
|  |         } | ||||||
|  |         else if (ch < 4096) | ||||||
|  |         { | ||||||
|  |           // 2-byte UTF-8 | ||||||
|  |           *tempptr++ = (char)(0xc0 | (ch >> 6)); | ||||||
|  |           *tempptr++ = (char)(0x80 | (ch & 0x3f)); | ||||||
|  |         } | ||||||
|  |         else if (ch < 65536) | ||||||
|  |         { | ||||||
|  |           // 3-byte UTF-8 | ||||||
|  |           *tempptr++ = (char)(0xe0 | (ch >> 12)); | ||||||
|  |           *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | ||||||
|  |           *tempptr++ = (char)(0x80 | (ch & 0x3f)); | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |           // 4-byte UTF-8 | ||||||
|  |           *tempptr++ = (char)(0xe0 | (ch >> 18)); | ||||||
|  |           *tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | ||||||
|  |           *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | ||||||
|  |           *tempptr++ = (char)(0x80 | (ch & 0x3f)); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       *tempptr = '\0'; | ||||||
|  |     } | ||||||
|  |     else if (!(value->value.binary.datalen & 1) && !memcmp(value->value.binary.data, "\376\377", 2)) | ||||||
|  |     { | ||||||
|  |       // Copy UTF-16 LE | ||||||
|  |       int	ch;			// Unicode character | ||||||
|  |       size_t	remaining;		// Remaining bytes | ||||||
|  |  | ||||||
|  |       for (dataptr = value->value.binary.data + 2, remaining = value->value.binary.datalen - 2, tempptr = temp; remaining > 1 && tempptr < (temp + sizeof(temp) - 5); dataptr += 2, remaining -= 2) | ||||||
|  |       { | ||||||
|  |         ch = (dataptr[1] << 8) | dataptr[0]; | ||||||
|  |  | ||||||
|  |         if (ch >= 0xd800 && ch <= 0xdbff && remaining > 3) | ||||||
|  | 	{ | ||||||
|  | 	  // Multi-word UTF-16 char... | ||||||
|  |           int lch;			// Lower bits | ||||||
|  |  | ||||||
|  | 	  lch = (dataptr[3] << 8) | dataptr[2]; | ||||||
|  |  | ||||||
|  |           if (lch < 0xdc00 || lch >= 0xdfff) | ||||||
|  | 	    break; | ||||||
|  |  | ||||||
|  |           ch        = (((ch & 0x3ff) << 10) | (lch & 0x3ff)) + 0x10000; | ||||||
|  |           dataptr   += 2; | ||||||
|  |           remaining -= 2; | ||||||
|  | 	} | ||||||
|  | 	else if (ch >= 0xfffe) | ||||||
|  | 	{ | ||||||
|  | 	  continue; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  |         if (ch < 128) | ||||||
|  |         { | ||||||
|  |           // ASCII | ||||||
|  |           *tempptr++ = (char)ch; | ||||||
|  |         } | ||||||
|  |         else if (ch < 4096) | ||||||
|  |         { | ||||||
|  |           // 2-byte UTF-8 | ||||||
|  |           *tempptr++ = (char)(0xc0 | (ch >> 6)); | ||||||
|  |           *tempptr++ = (char)(0x80 | (ch & 0x3f)); | ||||||
|  |         } | ||||||
|  |         else if (ch < 65536) | ||||||
|  |         { | ||||||
|  |           // 3-byte UTF-8 | ||||||
|  |           *tempptr++ = (char)(0xe0 | (ch >> 12)); | ||||||
|  |           *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | ||||||
|  |           *tempptr++ = (char)(0x80 | (ch & 0x3f)); | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |           // 4-byte UTF-8 | ||||||
|  |           *tempptr++ = (char)(0xe0 | (ch >> 18)); | ||||||
|  |           *tempptr++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | ||||||
|  |           *tempptr++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | ||||||
|  |           *tempptr++ = (char)(0x80 | (ch & 0x3f)); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       *tempptr = '\0'; | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |       // Copy as-is... | ||||||
|  |       memcpy(temp, value->value.binary.data, value->value.binary.datalen); | ||||||
|  |       temp[value->value.binary.datalen] = '\0'; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     free(value->value.binary.data); |     free(value->value.binary.data); | ||||||
|     value->type         = PDFIO_VALTYPE_STRING; |     value->type         = PDFIO_VALTYPE_STRING; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user