Token parsing code.

This commit is contained in:
Michael R Sweet 2021-05-01 22:38:11 -04:00
parent e107b94c83
commit 4b05ca6436
No known key found for this signature in database
GPG Key ID: 999559A027815955
5 changed files with 409 additions and 19 deletions

View File

@ -23,6 +23,23 @@ static ssize_t read_buffer(pdfio_file_t *pdf, char *buffer, size_t bytes);
static bool write_buffer(pdfio_file_t *pdf, const void *buffer, size_t bytes);
//
// '_pdfioFileConsume()' - Consume bytes from the file.
//
bool                                    // O - `true` on success, `false` on EOF
_pdfioFileConsume(pdfio_file_t *pdf,    // I - PDF file
                  size_t       bytes)   // I - Bytes to consume
{
  size_t buffered = (size_t)(pdf->bufend - pdf->bufptr);
                                        // Bytes between the read pointer and the end of the buffer


  // When the request is strictly smaller than what is buffered, advancing
  // the read pointer is all that is needed...
  if (bytes < buffered)
  {
    pdf->bufptr += bytes;
    return (true);
  }

  // Otherwise seek forward by "bytes" from the current position; a negative
  // result from the seek is reported as failure...
  return (_pdfioFileSeek(pdf, (off_t)bytes, SEEK_CUR) >= 0);
}
//
// '_pdfioFileDefaultError()' - Default error callback.
//
@ -114,7 +131,7 @@ _pdfioFileGetToken(pdfio_file_t *pdf, // I - PDF file
char *buffer,// I - String buffer
size_t bufsize)// I - Size of string buffer
{
return (_pdfioTokenRead(buffer, bufsize, (_pdfio_token_cb_t)_pdfioFilePeek, (_pdfio_token_cb_t)_pdfioFileRead, pdf));
return (_pdfioTokenRead(pdf, buffer, bufsize, (_pdfio_tpeek_cb_t)_pdfioFilePeek, (_pdfio_tconsume_cb_t)_pdfioFileConsume, pdf));
}

View File

@ -165,7 +165,8 @@ struct _pdfio_stream_s // Stream
z_stream flate; // Flate filter state
};
typedef ssize_t (*_pdfio_token_cb_t)(void *data, void *buffer, size_t bufsize);
// Token "consume" callback: asked to discard `bytes` bytes from the source
// identified by `data` once the token reader has finished examining them.
// NOTE(review): the consume functions declared alongside this type
// (_pdfioFileConsume, pdfioStreamConsume) return `bool` and take a typed
// pointer, and are cast to this type by their callers - confirm that calling
// through the cast is acceptable on all supported platforms.
typedef ssize_t (*_pdfio_tconsume_cb_t)(void *data, size_t bytes);
// Token "peek" callback: asked to copy up to `bytes` upcoming bytes into
// `buffer`.  The token reader uses the return value as the number of bytes
// stored and treats a negative value as end-of-file; it calls the consume
// callback separately afterwards, so this presumably must not advance the
// source position - TODO confirm against the peek implementations.
typedef ssize_t (*_pdfio_tpeek_cb_t)(void *data, void *buffer, size_t bytes);
//
@ -182,6 +183,7 @@ extern pdfio_dict_t *_pdfioDictRead(pdfio_file_t *pdf) PDFIO_INTERNAL;
extern bool _pdfioDictSetValue(pdfio_dict_t *dict, const char *key, _pdfio_value_t *value) PDFIO_INTERNAL;
extern bool _pdfioDictWrite(pdfio_dict_t *dict, off_t *length) PDFIO_INTERNAL;
extern bool _pdfioFileConsume(pdfio_file_t *pdf, size_t bytes) PDFIO_INTERNAL;
extern bool _pdfioFileDefaultError(pdfio_file_t *pdf, const char *message, void *data) PDFIO_INTERNAL;
extern bool _pdfioFileError(pdfio_file_t *pdf, const char *format, ...) PDFIO_FORMAT(2,3) PDFIO_INTERNAL;
extern bool _pdfioFileFlush(pdfio_file_t *pdf) PDFIO_INTERNAL;
@ -202,7 +204,7 @@ extern void _pdfioStreamDelete(pdfio_stream_t *st) PDFIO_INTERNAL;
extern bool _pdfioStringIsAllocated(pdfio_file_t *pdf, const char *s) PDFIO_INTERNAL;
extern bool _pdfioTokenRead(char *buffer, size_t bufsize, _pdfio_token_cb_t peek_cb, _pdfio_token_cb_t read_cb, void *data);
extern bool _pdfioTokenRead(pdfio_file_t *pdf, char *buffer, size_t bufsize, _pdfio_tpeek_cb_t peek_cb, _pdfio_tconsume_cb_t consume_cb, void *data);
extern _pdfio_value_t *_pdfioValueCopy(pdfio_file_t *pdfdst, _pdfio_value_t *vdst, pdfio_file_t *pdfsrc, _pdfio_value_t *vsrc) PDFIO_INTERNAL;
extern void _pdfioValueDelete(_pdfio_value_t *v) PDFIO_INTERNAL;

View File

@ -27,6 +27,21 @@ pdfioStreamClose(pdfio_stream_t *st) // I - Stream
}
//
// 'pdfioStreamConsume()' - Consume bytes from the stream.
//
bool                                    // O - `true` on success, `false` on EOF
pdfioStreamConsume(pdfio_stream_t *st,  // I - Stream
                   size_t         bytes)// I - Number of bytes to consume
{
  // TODO: Implement me
  //
  // Placeholder: neither argument is used yet and every call reports failure.
  (void)bytes;
  (void)st;

  return (false);
}
//
// '_pdfioStreamDelete()' - Free all memory used by a stream.
//
@ -57,7 +72,7 @@ pdfioStreamGetToken(
char *buffer, // I - String buffer
size_t bufsize) // I - Size of string buffer
{
return (_pdfioTokenRead(buffer, bufsize, (_pdfio_token_cb_t)pdfioStreamPeek, (_pdfio_token_cb_t)pdfioStreamRead, st));
return (_pdfioTokenRead(st->pdf, buffer, bufsize, (_pdfio_tpeek_cb_t)pdfioStreamPeek, (_pdfio_tconsume_cb_t)pdfioStreamConsume, st));
}

View File

@ -21,11 +21,11 @@
// (string) "(" and ")" delimit a string
// [array] "[" and "]" delimit an array
// <hex-string> "<" and ">" delimit a hex string
// {...} "{" and "}" delimit ???
// {...} "{" and "}" are reserved as future delimiters
// /name "/" starts a name with any special characters
// quoted as "#HH" where HH is the byte value in hex.
// %comment "%" starts a comment to the end of a line
// keyword A keyword consists of upper/lowercase letters
// keyword A keyword consists of other unreserved characters
// [-+]?[0-9]*(.[0-9]*)? A number optionally starts with "+" or "-".
//
// Newlines are CR, LF, or CR LF.
@ -36,27 +36,382 @@
//
//
// Constants...
//
// Characters that put the token reader into the number state when one of
// them is the first character of a token.
#define PDFIO_NUMBER_CHARS "0123456789-+."
// Delimiter characters: one of these as the first character of a token
// selects delimiter handling, and one of these ends a keyword.
#define PDFIO_DELIM_CHARS "<>(){}[]/%"
//
// Types...
//
typedef struct _pdfio_tbuffer_s         // Token reading buffer
{
  unsigned char        buffer[32];      // Bytes most recently peeked from the source
  unsigned char        *bufptr;         // Next unread byte in `buffer`
  unsigned char        *bufend;         // One past the last valid byte in `buffer`
  _pdfio_tpeek_cb_t    peek_cb;         // Peek callback
  _pdfio_tconsume_cb_t consume_cb;      // Consume callback
  void                 *data;           // Callback data
} _pdfio_tbuffer_t;
//
// Local functions...
//
static int get_char(_pdfio_tbuffer_t *tb);
//
// '_pdfioTokenRead()' - Read a token from a file/stream.
//
bool // O - `true` on success, `false` on failure
_pdfioTokenRead(
char *buffer, // I - String buffer
size_t bufsize, // I - Size of string buffer
_pdfio_token_cb_t peek_cb, // I - "peek" callback
_pdfio_token_cb_t read_cb, // I - "read" callback
void *data) // I - Callback data
pdfio_file_t *pdf, // I - PDF file
char *buffer, // I - String buffer
size_t bufsize, // I - Size of string buffer
_pdfio_tpeek_cb_t peek_cb, // I - "peek" callback
_pdfio_tconsume_cb_t consume_cb, // I - "consume" callback
void *data) // I - Callback data
{
char *bufptr, // Pointer into buffer
*bufend, // End of buffer
temp[256], // Temporary buffer
*tempptr, // Pointer into temporary buffer
*tempend; // End of temporary buffer
ssize_t bytes; // Bytes read/peeked
size_t len; // Length of value
_pdfio_tbuffer_t tb; // Token buffer
int ch; // Character
char *bufptr, // Pointer into buffer
*bufend, // End of buffer
state = '\0'; // Current state
return (false);
//
// "state" is:
//
// - '\0' for idle
// - ')' for literal string
// - '/' for name
// - '<' for possible hex string or dict
// - '>' for possible dict
// - '%' for comment
// - 'K' for keyword
// - 'N' for number
// - 'X' for hex string
// Read the next token, skipping any leading whitespace...
memset(&tb, 0, sizeof(tb));
tb.peek_cb = peek_cb;
tb.consume_cb = consume_cb;
tb.data = data;
bufptr = buffer;
bufend = buffer + bufsize - 1;
// Skip leading whitespace...
while ((ch = get_char(&tb)) != EOF)
{
if (ch == '%')
{
// Skip comment
while ((ch = get_char(&tb)) != EOF)
{
if (ch == '\n' || ch == '\r')
break;
}
}
else if (!isspace(ch))
break;
}
if (ch == EOF)
return (false);
// Check for delimiters...
if (strchr(PDFIO_DELIM_CHARS, ch) != NULL)
{
*bufptr++ = state = (char)ch;
}
else if (strchr(PDFIO_NUMBER_CHARS, ch) != NULL)
{
// Number
state = 'N';
*bufptr++ = (char)ch;
}
else
{
// Keyword
state = 'K';
*bufptr++ = (char)ch;
}
switch (state)
{
case ')' : // Literal string
while ((ch = get_char(&tb)) != EOF && ch != ')')
{
if (ch == '\\')
{
// Quoted character...
int i; // Looping var
switch (ch = get_char(&tb))
{
case '0' : // Octal character escape
case '1' :
case '2' :
case '3' :
case '4' :
case '5' :
case '6' :
case '7' :
for (ch -= '0', i = 0; i < 2; i ++)
{
int tch = get_char(&tb); // Next char
if (tch >= '0' && tch <= '7')
ch = (char)((ch << 3) | (tch - '0'));
else
{
tb.bufptr --;
break;
}
}
break;
case '\\' :
case '(' :
case ')' :
break;
case 'n' :
ch = '\n';
break;
case 'r' :
ch = '\r';
break;
case 't' :
ch = '\t';
break;
case 'b' :
ch = '\b';
break;
case 'f' :
ch = '\f';
break;
default :
_pdfioFileError(pdf, "Unknown escape '\\%c' in literal string.", ch);
return (false);
}
}
if (bufptr < bufend)
{
// Normal character...
*bufptr++ = (char)ch;
}
else
{
// Out of space
_pdfioFileError(pdf, "Token too large.");
return (false);
}
}
if (ch != ')')
{
_pdfioFileError(pdf, "Unterminated string literal.");
return (false);
}
break;
case 'K' : // keyword
while ((ch = get_char(&tb)) != EOF && !isspace(ch))
{
if (strchr(PDFIO_DELIM_CHARS, ch) != NULL)
{
// End of keyword...
tb.bufptr --;
break;
}
else if (bufptr < bufend)
{
// Normal character...
*bufptr++ = (char)ch;
}
else
{
// Out of space...
_pdfioFileError(pdf, "Token too large.");
return (false);
}
}
break;
case 'N' : // number
while ((ch = get_char(&tb)) != EOF && !isspace(ch))
{
if (!isdigit(ch) && ch != '.')
{
// End of number...
break;
}
else if (bufptr < bufend)
{
// Normal character...
*bufptr++ = (char)ch;
}
else
{
// Out of space...
_pdfioFileError(pdf, "Token too large.");
return (false);
}
}
break;
case '/' : // "/name"
while ((ch = get_char(&tb)) != EOF && !isspace(ch))
{
if (ch == '#')
{
// Quoted character (#xx) in name...
int i; // Looping var
for (i = 0, ch = 0; i < 2; i ++)
{
int tch = get_char(&tb);
if (!isxdigit(tch & 255))
{
_pdfioFileError(pdf, "Bad # escape in name.");
return (false);
}
else if (isdigit(tch))
ch = (char)((ch << 4) | (tch - '0'));
else
ch = (char)((ch << 4) | (tolower(tch) - 'a' + 10));
}
}
if (bufptr < bufend)
{
*bufptr++ = (char)ch;
}
else
{
// Out of space
_pdfioFileError(pdf, "Token too large.");
return (false);
}
}
break;
case '<' : // Potential hex string
if ((ch = get_char(&tb)) == '<')
{
// Dictionary delimiter
*bufptr++ = (char)ch;
break;
}
else if (!isspace(ch & 255) && !isxdigit(ch & 255))
{
_pdfioFileError(pdf, "Syntax error: '<%c'", ch);
return (false);
}
// Fall through to parse a hex string...
case 'X' : // Hex string
while ((ch = get_char(&tb)) != EOF && ch != '>')
{
if (isxdigit(ch))
{
if (bufptr < bufend)
{
// Hex digit
*bufptr++ = (char)ch;
}
else
{
// Too large
_pdfioFileError(pdf, "Token too large.");
return (false);
}
}
else if (!isspace(ch))
{
_pdfioFileError(pdf, "Invalid hex string character '%c'.", ch);
return (false);
}
}
if (ch == EOF)
{
_pdfioFileError(pdf, "Unterminated hex string.");
return (false);
}
break;
case '>' : // Dictionary
if ((ch = get_char(&tb)) == '>')
{
*bufptr++ = '>';
}
else
{
_pdfioFileError(pdf, "Syntax error: '>%c'.", ch);
return (false);
}
break;
}
while (tb.bufptr < tb.bufend && isspace(*(tb.bufptr)))
tb.bufptr ++;
if (tb.bufptr > tb.buffer)
(consume_cb)(data, (size_t)(tb.bufptr - tb.buffer));
*bufptr = '\0';
return (bufptr > buffer);
}
//
// 'get_char()' - Get a character from the token buffer.
//
static int                              // O - Character or `EOF` on end-of-file
get_char(_pdfio_tbuffer_t *tb)          // I - Token buffer
{
  ssize_t bytes;                        // Bytes peeked


  // Refill the buffer as needed...
  if (tb->bufptr >= tb->bufend)
  {
    // Consume previous bytes so the next peek returns new data...
    if (tb->bufend > tb->buffer)
      (tb->consume_cb)(tb->data, (size_t)(tb->bufend - tb->buffer));

    // Peek new bytes.  A result of 0 means nothing more is available and is
    // treated like an error; otherwise the code below would hand back a
    // stale byte from an empty buffer on every call and callers looping
    // until `EOF` would never finish...
    if ((bytes = (tb->peek_cb)(tb->data, tb->buffer, sizeof(tb->buffer))) <= 0)
    {
      tb->bufptr = tb->bufend = tb->buffer;
      return (EOF);
    }

    // Update pointers...
    tb->bufptr = tb->buffer;
    tb->bufend = tb->buffer + bytes;
  }

  // Return the next character (0-255, since the buffer is unsigned char)...
  return (*(tb->bufptr)++);
}

View File

@ -161,6 +161,7 @@ extern pdfio_stream_t *pdfioObjOpenStream(pdfio_obj_t *obj) PDFIO_PUBLIC;
extern pdfio_obj_t *pdfioPageCopy(pdfio_file_t *pdf, pdfio_obj_t *src) PDFIO_PUBLIC;
extern bool pdfioStreamClose(pdfio_stream_t *st) PDFIO_PUBLIC;
extern bool pdfioStreamConsume(pdfio_stream_t *st, size_t bytes) PDFIO_PUBLIC;
extern bool pdfioStreamGetToken(pdfio_stream_t *st, char *buffer, size_t bufsize) PDFIO_PUBLIC;
extern ssize_t pdfioStreamPeek(pdfio_stream_t *st, void *buffer, size_t bytes) PDFIO_PUBLIC;
extern bool pdfioStreamPrintf(pdfio_stream_t *st, const char *format, ...) PDFIO_PUBLIC PDFIO_FORMAT(2,3);