2021-05-01 23:50:52 +02:00
//
2021-05-30 13:10:44 +02:00
// PDF token parsing functions for PDFio.
2021-05-01 23:50:52 +02:00
//
2023-02-04 02:39:04 +01:00
// Copyright © 2021-2023 by Michael R Sweet.
2021-05-01 23:50:52 +02:00
//
// Licensed under Apache License v2.0. See the file "LICENSE" for more
// information.
//
//
// Include necessary headers...
//
# include "pdfio-private.h"
//
// This file parses PDF language syntax:
//
// << dict >> "<<" and ">>" delimit a dictionary
// (string) "(" and ")" delimit a string
// [array] "[" and "]" delimit an array
// <hex-string> "<" and ">" delimit a hex string
2021-05-02 04:38:11 +02:00
// {...} "{" and "}" are reserved as future delimiters
2021-05-01 23:50:52 +02:00
// /name "/" starts a name with any special characters
// quoted as "#HH" where HH is the byte value in hex.
// %comment "%" starts a comment to the end of a line
2021-05-02 04:38:11 +02:00
// keyword A keyword consists of other unreserved characters
2021-05-01 23:50:52 +02:00
// [-+]?[0-9]*(.[0-9]*)? A number optionally starts with "+" or "-".
//
// Newlines are CR, LF, or CR LF.
//
// Strings and names are returned with the leading delimiter ("(string",
// "<hex-string", "/name") and all escaping/whitespace removal resolved.
// Other delimiters, keywords, and numbers are returned as-is.
//
2021-05-02 04:38:11 +02:00
//
// Constants...
//
// Characters that, when seen as the first byte of a token, start a number
// token (checked only after the delimiter set below did not match).
# define PDFIO_NUMBER_CHARS "0123456789-+."
// Characters that, when seen as the first byte of a token, select a
// delimiter state; the same set also terminates keyword and name tokens.
// NOTE(review): the literal as shown has spaces around '/'. Whitespace is
// filtered with isspace() before this set is consulted, so they look
// harmless - confirm they are intended.
# define PDFIO_DELIM_CHARS "<>(){}[] / %"
//
2021-05-08 13:38:44 +02:00
// Local functions...
//
static int get_char ( _pdfio_token_t * tb ) ;
//
// '_pdfioTokenClear()' - Clear the token stack.
2021-05-02 04:38:11 +02:00
//
2021-05-08 13:38:44 +02:00
void
_pdfioTokenClear(_pdfio_token_t *tb)    // I - Token buffer/stack
{
  PDFIO_DEBUG("_pdfioTokenClear(tb=%p)\n", tb);

  // An empty stack needs no work...
  if (tb->num_tokens <= 0)
    return;

  // Release the saved tokens from the top of the stack downwards, leaving
  // every vacated slot NULL so it can never be freed twice...
  do
  {
    --tb->num_tokens;

    free(tb->tokens[tb->num_tokens]);
    tb->tokens[tb->num_tokens] = NULL;
  }
  while (tb->num_tokens > 0);
}
2021-05-02 04:38:11 +02:00
2021-05-10 23:37:57 +02:00
//
// '_pdfioTokenFlush()' - Flush (consume) any bytes that have been used.
//
void
_pdfioTokenFlush(_pdfio_token_t *tb)    // I - Token buffer/stack
{
  size_t used,                          // Bytes already handed out of the buffer
         unread;                        // Bytes peeked but not yet handed out


  // Nothing has been taken from the buffer, so there is nothing to consume...
  if (tb->bufptr <= tb->buffer)
    return;

  // Measure both regions before the callback runs...
  used   = (size_t)(tb->bufptr - tb->buffer);
  unread = (size_t)(tb->bufend - tb->bufptr);

  // Tell the source that the used bytes are finished with...
  PDFIO_DEBUG("_pdfioTokenFlush: Consuming %d bytes.\n", (int)used);
  (tb->consume_cb)(tb->cb_data, used);

  if (unread == 0)
  {
    // Buffer fully drained, start over from the beginning...
    PDFIO_DEBUG("_pdfioTokenFlush: Resetting pointers.\n");
    tb->bufptr = tb->bufend = tb->buffer;
  }
  else
  {
    // Slide the unread tail to the front for the next call (the regions can
    // overlap, hence memmove)...
    memmove(tb->buffer, tb->bufptr, unread);
    tb->bufptr = tb->buffer;
    tb->bufend = tb->buffer + unread;

#ifdef DEBUG
    unsigned char *ptr;                 // Pointer into buffer

    PDFIO_DEBUG("_pdfioTokenFlush: Remainder '");
    for (ptr = tb->buffer; ptr < tb->bufend; ptr ++)
    {
      if (*ptr < ' ' || *ptr == 0x7f)
        PDFIO_DEBUG("\\%03o", *ptr);
      else
        PDFIO_DEBUG("%c", *ptr);
    }
    PDFIO_DEBUG("'\n");
#endif // DEBUG
  }
}
2021-05-02 04:38:11 +02:00
//
2021-05-08 13:38:44 +02:00
// '_pdfioTokenGet()' - Get a token.
//
bool                                    // O - `true` on success, `false` on failure
_pdfioTokenGet(_pdfio_token_t *tb,      // I - Token buffer/stack
               char           *buffer,  // I - String buffer
               size_t         bufsize)  // I - Size of string buffer
{
  // Tokens that were pushed back take priority over reading new input.
  if (tb->num_tokens > 0)
  {
    char   *token;                      // Token popped from the stack
    size_t len;                         // Length of token
    bool   fits;                        // Does the token fit in the caller's buffer?

    // Pop the top entry.  From here on this function owns the string and
    // must release it on every path, otherwise the slot is overwritten by a
    // later push and the allocation is lost.
    tb->num_tokens --;
    token = tb->tokens[tb->num_tokens];
    len   = strlen(token);

    // "len >= bufsize" is the same test as "len > bufsize - 1" for every
    // non-zero bufsize, but does not wrap around when bufsize is 0.
    fits = len < bufsize;

    if (fits)
    {
      memcpy(buffer, token, len);
      buffer[len] = '\0';

      PDFIO_DEBUG("_pdfioTokenGet(tb=%p, buffer=%p, bufsize=%u): Popping '%s' from stack.\n", tb, buffer, (unsigned)bufsize, buffer);
    }
    else
    {
      // Value too large...
      PDFIO_DEBUG("_pdfioTokenGet(tb=%p, buffer=%p, bufsize=%u): Token '%s' from stack too large.\n", tb, buffer, (unsigned)bufsize, token);

      // Only touch the caller's buffer if it has room for the terminator.
      if (bufsize > 0)
        *buffer = '\0';
    }

    // Release the popped token whether or not it could be returned.
    free(token);
    tb->tokens[tb->num_tokens] = NULL;

    return (fits);
  }

  // Nothing saved, read a new token from the input...
  return (_pdfioTokenRead(tb, buffer, bufsize));
}
2021-05-02 04:38:11 +02:00
//
2021-05-08 13:38:44 +02:00
// '_pdfioTokenInit()' - Initialize a token buffer/stack.
//
void
_pdfioTokenInit(
    _pdfio_token_t       *ts,           // I - Token buffer/stack
    pdfio_file_t         *pdf,          // I - PDF file
    _pdfio_tconsume_cb_t consume_cb,    // I - Consume callback
    _pdfio_tpeek_cb_t    peek_cb,       // I - Peek callback
    void                 *cb_data)      // I - Callback data
{
  // Start from an all-zero structure...
  memset(ts, 0, sizeof(*ts));

  // An empty read buffer has both of its pointers at the start...
  ts->bufptr = ts->bufend = ts->buffer;

  // Remember the owning file and how to peek/consume input...
  ts->cb_data    = cb_data;
  ts->peek_cb    = peek_cb;
  ts->consume_cb = consume_cb;
  ts->pdf        = pdf;
}
2021-05-02 04:38:11 +02:00
2021-05-08 13:38:44 +02:00
//
// '_pdfioTokenPush()' - Push a token on the token stack.
//
void
_pdfioTokenPush(_pdfio_token_t *tb,     // I - Token buffer/stack
                const char     *token)  // I - Token to push
{
  // A full stack silently drops the token...
  if (tb->num_tokens >= (sizeof(tb->tokens) / sizeof(tb->tokens[0])))
    return;

  // Store a private copy in the next free slot; the slot only becomes part
  // of the stack when the copy could be allocated...
  tb->tokens[tb->num_tokens] = strdup(token);

  if (tb->tokens[tb->num_tokens] != NULL)
    tb->num_tokens ++;
}
2021-05-02 04:38:11 +02:00
2021-05-01 23:50:52 +02:00
//
// '_pdfioTokenRead()' - Read a token from a file/stream.
//
bool // O - `true` on success, `false` on failure
2021-05-08 13:38:44 +02:00
_pdfioTokenRead ( _pdfio_token_t * tb , // I - Token buffer/stack
char * buffer , // I - String buffer
size_t bufsize ) // I - Size of string buffer
2021-05-01 23:50:52 +02:00
{
2021-05-10 03:32:09 +02:00
int ch , // Character
parens = 0 ; // Parenthesis level
2021-05-08 13:38:44 +02:00
char * bufptr , // Pointer into buffer
* bufend , // End of buffer
state = ' \0 ' ; // Current state
2021-11-02 02:30:46 +01:00
bool saw_nul = false ; // Did we see a nul character?
2021-05-02 04:38:11 +02:00
//
// "state" is:
//
// - '\0' for idle
2021-05-05 03:31:58 +02:00
// - '(' for literal string
2021-05-02 04:38:11 +02:00
// - '/' for name
// - '<' for possible hex string or dict
// - '>' for possible dict
// - '%' for comment
// - 'K' for keyword
// - 'N' for number
// Read the next token, skipping any leading whitespace...
bufptr = buffer ;
bufend = buffer + bufsize - 1 ;
// Skip leading whitespace...
2021-05-08 13:38:44 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF )
2021-05-02 04:38:11 +02:00
{
if ( ch = = ' % ' )
{
// Skip comment
2021-05-08 13:38:44 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF )
2021-05-02 04:38:11 +02:00
{
if ( ch = = ' \n ' | | ch = = ' \r ' )
break ;
}
}
else if ( ! isspace ( ch ) )
break ;
}
if ( ch = = EOF )
return ( false ) ;
// Check for delimiters...
if ( strchr ( PDFIO_DELIM_CHARS , ch ) ! = NULL )
{
* bufptr + + = state = ( char ) ch ;
}
else if ( strchr ( PDFIO_NUMBER_CHARS , ch ) ! = NULL )
{
// Number
state = ' N ' ;
* bufptr + + = ( char ) ch ;
}
else
{
// Keyword
state = ' K ' ;
* bufptr + + = ( char ) ch ;
}
switch ( state )
{
2021-05-05 03:31:58 +02:00
case ' ( ' : // Literal string
2021-05-10 03:32:09 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF )
2021-05-02 04:38:11 +02:00
{
2021-11-02 02:30:46 +01:00
if ( ch = = 0 )
saw_nul = true ;
2021-05-02 04:38:11 +02:00
if ( ch = = ' \\ ' )
{
// Quoted character...
int i ; // Looping var
2021-05-08 13:38:44 +02:00
switch ( ch = get_char ( tb ) )
2021-05-02 04:38:11 +02:00
{
case ' 0 ' : // Octal character escape
case ' 1 ' :
case ' 2 ' :
case ' 3 ' :
case ' 4 ' :
case ' 5 ' :
case ' 6 ' :
case ' 7 ' :
for ( ch - = ' 0 ' , i = 0 ; i < 2 ; i + + )
{
2021-05-08 13:38:44 +02:00
int tch = get_char ( tb ) ; // Next char
2021-05-02 04:38:11 +02:00
if ( tch > = ' 0 ' & & tch < = ' 7 ' )
2021-11-02 14:12:43 +01:00
{
2021-05-02 04:38:11 +02:00
ch = ( char ) ( ( ch < < 3 ) | ( tch - ' 0 ' ) ) ;
2021-11-02 14:12:43 +01:00
}
2021-05-02 04:38:11 +02:00
else
{
2021-05-08 13:38:44 +02:00
tb - > bufptr - - ;
2021-05-02 04:38:11 +02:00
break ;
}
}
break ;
case ' \\ ' :
case ' ( ' :
case ' ) ' :
break ;
case ' n ' :
ch = ' \n ' ;
break ;
case ' r ' :
ch = ' \r ' ;
break ;
case ' t ' :
ch = ' \t ' ;
break ;
2021-05-01 23:50:52 +02:00
2021-05-02 04:38:11 +02:00
case ' b ' :
ch = ' \b ' ;
break ;
2021-05-01 23:50:52 +02:00
2021-05-02 04:38:11 +02:00
case ' f ' :
ch = ' \f ' ;
break ;
default :
2021-05-10 03:32:09 +02:00
// Ignore blackslash per PDF spec...
break ;
2021-05-02 04:38:11 +02:00
}
}
2021-05-10 03:32:09 +02:00
else if ( ch = = ' ( ' )
{
// Keep track of parenthesis
parens + + ;
}
else if ( ch = = ' ) ' )
{
if ( parens = = 0 )
break ;
parens - - ;
}
2021-05-02 04:38:11 +02:00
if ( bufptr < bufend )
{
// Normal character...
* bufptr + + = ( char ) ch ;
}
else
{
// Out of space
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Token too large. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
}
if ( ch ! = ' ) ' )
{
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Unterminated string literal. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
2021-11-02 02:30:46 +01:00
if ( saw_nul )
{
// Convert to a hex (binary) string...
char * litptr , // Pointer to literal character
* hexptr ; // Pointer to hex character
size_t bytes = ( size_t ) ( bufptr - buffer - 1 ) ;
// Bytes of data...
static const char * hexchars = " 0123456789ABCDEF " ;
// Hex digits
PDFIO_DEBUG ( " _pdfioTokenRead: Converting nul-containing string to binary. \n " ) ;
if ( ( 2 * ( bytes + 1 ) ) > bufsize )
{
// Out of space...
_pdfioFileError ( tb - > pdf , " Token too large. " ) ;
return ( false ) ;
}
* buffer = ' < ' ;
for ( litptr = bufptr - 1 , hexptr = buffer + 2 * bytes - 1 ; litptr > buffer ; litptr - - , hexptr - = 2 )
{
2021-11-02 14:12:43 +01:00
int litch = * litptr ; // Grab the character
hexptr [ 0 ] = hexchars [ ( litch > > 4 ) & 15 ] ;
hexptr [ 1 ] = hexchars [ litch & 15 ] ;
2021-11-02 02:30:46 +01:00
}
bufptr = buffer + 2 * bytes + 1 ;
}
2021-05-02 04:38:11 +02:00
break ;
case ' K ' : // keyword
2021-05-08 13:38:44 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF & & ! isspace ( ch ) )
2021-05-02 04:38:11 +02:00
{
if ( strchr ( PDFIO_DELIM_CHARS , ch ) ! = NULL )
{
// End of keyword...
2021-05-08 13:38:44 +02:00
tb - > bufptr - - ;
2021-05-02 04:38:11 +02:00
break ;
}
else if ( bufptr < bufend )
{
// Normal character...
* bufptr + + = ( char ) ch ;
}
else
{
// Out of space...
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Token too large. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
}
break ;
case ' N ' : // number
2021-05-08 13:38:44 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF & & ! isspace ( ch ) )
2021-05-02 04:38:11 +02:00
{
if ( ! isdigit ( ch ) & & ch ! = ' . ' )
{
// End of number...
2021-05-08 13:38:44 +02:00
tb - > bufptr - - ;
2021-05-02 04:38:11 +02:00
break ;
}
else if ( bufptr < bufend )
{
// Normal character...
* bufptr + + = ( char ) ch ;
}
else
{
// Out of space...
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Token too large. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
}
break ;
case ' / ' : // "/name"
2021-05-08 13:38:44 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF & & ! isspace ( ch ) )
2021-05-02 04:38:11 +02:00
{
2021-05-05 03:31:58 +02:00
if ( strchr ( PDFIO_DELIM_CHARS , ch ) ! = NULL )
{
// End of keyword...
2021-05-08 13:38:44 +02:00
tb - > bufptr - - ;
2021-05-05 03:31:58 +02:00
break ;
}
else if ( ch = = ' # ' )
2021-05-02 04:38:11 +02:00
{
// Quoted character (#xx) in name...
int i ; // Looping var
for ( i = 0 , ch = 0 ; i < 2 ; i + + )
{
2021-05-08 13:38:44 +02:00
int tch = get_char ( tb ) ;
2021-05-02 04:38:11 +02:00
if ( ! isxdigit ( tch & 255 ) )
{
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Bad # escape in name. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
else if ( isdigit ( tch ) )
2021-06-04 17:03:24 +02:00
ch = ( ( ch & 255 ) < < 4 ) | ( tch - ' 0 ' ) ;
2021-05-02 04:38:11 +02:00
else
2021-06-04 17:03:24 +02:00
ch = ( ( ch & 255 ) < < 4 ) | ( tolower ( tch ) - ' a ' + 10 ) ;
2021-05-02 04:38:11 +02:00
}
}
if ( bufptr < bufend )
{
* bufptr + + = ( char ) ch ;
}
else
{
// Out of space
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Token too large. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
}
break ;
case ' < ' : // Potential hex string
2021-05-08 13:38:44 +02:00
if ( ( ch = get_char ( tb ) ) = = ' < ' )
2021-05-02 04:38:11 +02:00
{
// Dictionary delimiter
* bufptr + + = ( char ) ch ;
break ;
}
2023-10-06 16:46:30 +02:00
else if ( ch = = ' > ' )
{
// Issue #46: Empty hex string from Microsoft PDF generator; treat as
// empty literal string...
* buffer = ' ( ' ;
break ;
}
2021-05-02 04:38:11 +02:00
else if ( ! isspace ( ch & 255 ) & & ! isxdigit ( ch & 255 ) )
{
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Syntax error: '<%c' " , ch ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
2021-10-26 01:36:39 +02:00
do
2021-05-02 04:38:11 +02:00
{
if ( isxdigit ( ch ) )
{
if ( bufptr < bufend )
{
// Hex digit
* bufptr + + = ( char ) ch ;
}
else
{
// Too large
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Token too large. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
}
else if ( ! isspace ( ch ) )
{
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Invalid hex string character '%c'. " , ch ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
}
2021-10-26 01:36:39 +02:00
while ( ( ch = get_char ( tb ) ) ! = EOF & & ch ! = ' > ' ) ;
2021-05-02 04:38:11 +02:00
if ( ch = = EOF )
{
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Unterminated hex string. " ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
break ;
case ' > ' : // Dictionary
2021-05-08 13:38:44 +02:00
if ( ( ch = get_char ( tb ) ) = = ' > ' )
2021-05-02 04:38:11 +02:00
{
* bufptr + + = ' > ' ;
}
else
{
2021-05-08 13:38:44 +02:00
_pdfioFileError ( tb - > pdf , " Syntax error: '>%c'. " , ch ) ;
2021-05-02 04:38:11 +02:00
return ( false ) ;
}
break ;
}
* bufptr = ' \0 ' ;
2023-02-04 02:39:04 +01:00
// PDFIO_DEBUG("_pdfioTokenRead: Read '%s'.\n", buffer);
2021-05-04 18:59:10 +02:00
2021-05-02 04:38:11 +02:00
return ( bufptr > buffer ) ;
2021-05-01 23:50:52 +02:00
}
2021-05-02 04:38:11 +02:00
//
// 'get_char()' - Get a character from the token buffer.
//
static int // O - Character or `EOF` on end-of-file
2021-05-08 13:38:44 +02:00
get_char ( _pdfio_token_t * tb ) // I - Token buffer
2021-05-02 04:38:11 +02:00
{
ssize_t bytes ; // Bytes peeked
// Refill the buffer as needed...
if ( tb - > bufptr > = tb - > bufend )
{
// Consume previous bytes...
if ( tb - > bufend > tb - > buffer )
2021-05-10 23:37:57 +02:00
{
PDFIO_DEBUG ( " get_char: Consuming %d bytes. \n " , ( int ) ( tb - > bufend - tb - > buffer ) ) ;
2021-05-08 13:38:44 +02:00
( tb - > consume_cb ) ( tb - > cb_data , ( size_t ) ( tb - > bufend - tb - > buffer ) ) ;
2021-05-10 23:37:57 +02:00
}
2021-05-02 04:38:11 +02:00
// Peek new bytes...
2021-05-08 13:38:44 +02:00
if ( ( bytes = ( tb - > peek_cb ) ( tb - > cb_data , tb - > buffer , sizeof ( tb - > buffer ) ) ) < = 0 )
2021-05-02 04:38:11 +02:00
{
tb - > bufptr = tb - > bufend = tb - > buffer ;
return ( EOF ) ;
}
// Update pointers...
tb - > bufptr = tb - > buffer ;
tb - > bufend = tb - > buffer + bytes ;
2021-05-10 23:37:57 +02:00
2023-02-04 02:39:04 +01:00
#if 0
2021-05-10 23:37:57 +02:00
# ifdef DEBUG
unsigned char * ptr ; // Pointer into buffer
PDFIO_DEBUG ( " get_char: Read ' " ) ;
for ( ptr = tb - > buffer ; ptr < tb - > bufend ; ptr + + )
{
if ( * ptr < ' ' | | * ptr = = 0x7f )
PDFIO_DEBUG ( " \\ %03o " , * ptr ) ;
else
PDFIO_DEBUG ( " %c " , * ptr ) ;
}
PDFIO_DEBUG ( " ' \n " ) ;
# endif // DEBUG
2023-02-04 02:39:04 +01:00
# endif // 0
2021-05-02 04:38:11 +02:00
}
// Return the next character...
return ( * ( tb - > bufptr ) + + ) ;
}