diff --git a/pdfio-common.c b/pdfio-common.c
index 9f08037..bcf217e 100644
--- a/pdfio-common.c
+++ b/pdfio-common.c
@@ -23,6 +23,23 @@ static ssize_t read_buffer(pdfio_file_t *pdf, char *buffer, size_t bytes);
 static bool write_buffer(pdfio_file_t *pdf, const void *buffer, size_t bytes);
 
 
+//
+// '_pdfioFileConsume()' - Consume bytes from the file.
+//
+
+bool                                    // O - `true` on success, `false` on EOF
+_pdfioFileConsume(pdfio_file_t *pdf,    // I - PDF file
+                  size_t       bytes)   // I - Bytes to consume
+{
+  if ((size_t)(pdf->bufend - pdf->bufptr) > bytes)
+    pdf->bufptr += bytes;
+  else if (_pdfioFileSeek(pdf, (off_t)bytes, SEEK_CUR) < 0)
+    return (false);
+
+  return (true);
+}
+
+
 //
 // '_pdfioFileDefaultError()' - Default error callback.
 //
@@ -114,7 +131,7 @@ _pdfioFileGetToken(pdfio_file_t *pdf, // I - PDF file
                    char         *buffer,// I - String buffer
                    size_t       bufsize)// I - Size of string buffer
 {
-  return (_pdfioTokenRead(buffer, bufsize, (_pdfio_token_cb_t)_pdfioFilePeek, (_pdfio_token_cb_t)_pdfioFileRead, pdf));
+  return (_pdfioTokenRead(pdf, buffer, bufsize, (_pdfio_tpeek_cb_t)_pdfioFilePeek, (_pdfio_tconsume_cb_t)_pdfioFileConsume, pdf));
 }
 
 
diff --git a/pdfio-private.h b/pdfio-private.h
index 74e18b8..e9a84ca 100644
--- a/pdfio-private.h
+++ b/pdfio-private.h
@@ -165,7 +165,8 @@ struct _pdfio_stream_s // Stream
   z_stream flate; // Flate filter state
 };
 
-typedef ssize_t (*_pdfio_token_cb_t)(void *data, void *buffer, size_t bufsize);
+typedef bool (*_pdfio_tconsume_cb_t)(void *data, size_t bytes);
+typedef ssize_t (*_pdfio_tpeek_cb_t)(void *data, void *buffer, size_t bytes);
 
 
 //
@@ -182,6 +183,7 @@ extern pdfio_dict_t *_pdfioDictRead(pdfio_file_t *pdf) PDFIO_INTERNAL;
 extern bool _pdfioDictSetValue(pdfio_dict_t *dict, const char *key, _pdfio_value_t *value) PDFIO_INTERNAL;
 extern bool _pdfioDictWrite(pdfio_dict_t *dict, off_t *length) PDFIO_INTERNAL;
 
+extern bool _pdfioFileConsume(pdfio_file_t *pdf, size_t bytes) PDFIO_INTERNAL;
 extern bool _pdfioFileDefaultError(pdfio_file_t *pdf, const char *message, void *data) PDFIO_INTERNAL;
 extern bool _pdfioFileError(pdfio_file_t *pdf, const char *format, ...) PDFIO_FORMAT(2,3) PDFIO_INTERNAL;
 extern bool _pdfioFileFlush(pdfio_file_t *pdf) PDFIO_INTERNAL;
@@ -202,7 +204,7 @@ extern void _pdfioStreamDelete(pdfio_stream_t *st) PDFIO_INTERNAL;
 
 extern bool _pdfioStringIsAllocated(pdfio_file_t *pdf, const char *s) PDFIO_INTERNAL;
 
-extern bool _pdfioTokenRead(char *buffer, size_t bufsize, _pdfio_token_cb_t peek_cb, _pdfio_token_cb_t read_cb, void *data);
+extern bool _pdfioTokenRead(pdfio_file_t *pdf, char *buffer, size_t bufsize, _pdfio_tpeek_cb_t peek_cb, _pdfio_tconsume_cb_t consume_cb, void *data);
 
 extern _pdfio_value_t *_pdfioValueCopy(pdfio_file_t *pdfdst, _pdfio_value_t *vdst, pdfio_file_t *pdfsrc, _pdfio_value_t *vsrc) PDFIO_INTERNAL;
 extern void _pdfioValueDelete(_pdfio_value_t *v) PDFIO_INTERNAL;
diff --git a/pdfio-stream.c b/pdfio-stream.c
index e3e6408..1daf28f 100644
--- a/pdfio-stream.c
+++ b/pdfio-stream.c
@@ -27,6 +27,21 @@ pdfioStreamClose(pdfio_stream_t *st) // I - Stream
 }
 
 
+//
+// 'pdfioStreamConsume()' - Consume bytes from the stream.
+//
+
+bool                                    // O - `true` on success, `false` on EOF
+pdfioStreamConsume(pdfio_stream_t *st,  // I - Stream
+                   size_t         bytes)// I - Number of bytes to consume
+{
+  // TODO: Implement me
+  (void)st;
+  (void)bytes;
+  return (false);
+}
+
+
 //
 // '_pdfioStreamDelete()' - Free all memory used by a stream.
 //
@@ -57,7 +72,7 @@ pdfioStreamGetToken(
     char           *buffer, // I - String buffer
     size_t         bufsize) // I - Size of string buffer
 {
-  return (_pdfioTokenRead(buffer, bufsize, (_pdfio_token_cb_t)pdfioStreamPeek, (_pdfio_token_cb_t)pdfioStreamRead, st));
+  return (_pdfioTokenRead(st->pdf, buffer, bufsize, (_pdfio_tpeek_cb_t)pdfioStreamPeek, (_pdfio_tconsume_cb_t)pdfioStreamConsume, st));
 }
 
 
diff --git a/pdfio-token.c b/pdfio-token.c
index 25a6b71..8225615 100644
--- a/pdfio-token.c
+++ b/pdfio-token.c
@@ -21,11 +21,11 @@
 // (string) "(" and ")" delimit a string
 // [array] "[" and "]" delimit an array
 // "<" and ">" delimit a hex string
-// {...} "{" and "}" delimit ???
+// {...} "{" and "}" are reserved as future delimiters
 // /name "/" starts a name with any special characters
 // quoted as "#HH" where HH is the byte value in hex.
 // %comment "%" starts a comment to the end of a line
-// keyword A keyword consists of upper/lowercase letters
+// keyword A keyword consists of other unreserved characters
 // [-+]?[0-9]*(.[0-9]*)? A number optionally starts with "+" or "-".
 //
 // Newlines are CR, LF, or CR LF.
@@ -36,27 +36,393 @@
 //
 
 
+//
+// Constants...
+//
+
+#define PDFIO_NUMBER_CHARS "0123456789-+."
+#define PDFIO_DELIM_CHARS "<>(){}[]/%"
+
+
+//
+// Types...
+//
+
+typedef struct _pdfio_tbuffer_s         // Token reading buffer
+{
+  unsigned char buffer[32],             // Buffer
+                *bufptr,                // Pointer into buffer
+                *bufend;                // Last valid byte in buffer
+  _pdfio_tpeek_cb_t peek_cb;            // Peek callback
+  _pdfio_tconsume_cb_t consume_cb;      // Consume callback
+  void          *data;                  // Callback data
+} _pdfio_tbuffer_t;
+
+
+//
+// Local functions...
+//
+
+static int get_char(_pdfio_tbuffer_t *tb);
+
+
 //
 // '_pdfioTokenRead()' - Read a token from a file/stream.
 //
 
 bool // O - `true` on success, `false` on failure
 _pdfioTokenRead(
-    char *buffer, // I - String buffer
-    size_t bufsize, // I - Size of string buffer
-    _pdfio_token_cb_t peek_cb, // I - "peek" callback
-    _pdfio_token_cb_t read_cb, // I - "read" callback
-    void *data) // I - Callback data
+    pdfio_file_t         *pdf,          // I - PDF file
+    char                 *buffer,       // I - String buffer
+    size_t               bufsize,       // I - Size of string buffer
+    _pdfio_tpeek_cb_t    peek_cb,       // I - "peek" callback
+    _pdfio_tconsume_cb_t consume_cb,    // I - "consume" callback
+    void                 *data)         // I - Callback data
 {
-  char *bufptr, // Pointer into buffer
-       *bufend, // End of buffer
-       temp[256], // Temporary buffer
-       *tempptr, // Pointer into temporary buffer
-       *tempend; // End of temporary buffer
-  ssize_t bytes; // Bytes read/peeked
-  size_t len; // Length of value
+  _pdfio_tbuffer_t tb;                  // Token buffer
+  int           ch;                     // Character
+  char          *bufptr,                // Pointer into buffer
+                *bufend,                // End of buffer
+                state = '\0';           // Current state
 
 
-  return (false);
+  //
+  // "state" is:
+  //
+  // - '\0' for idle
+  // - '(' for literal string
+  // - '/' for name
+  // - '<' for possible hex string or dict
+  // - '>' for possible dict
+  // - '%' for comment
+  // - 'K' for keyword
+  // - 'N' for number
+  // - 'X' for hex string
+
+  // Read the next token, skipping any leading whitespace...
+  memset(&tb, 0, sizeof(tb));
+  tb.peek_cb    = peek_cb;
+  tb.consume_cb = consume_cb;
+  tb.data       = data;
+
+  bufptr = buffer;
+  bufend = buffer + bufsize - 1;
+
+  // Skip leading whitespace...
+  while ((ch = get_char(&tb)) != EOF)
+  {
+    if (ch == '%')
+    {
+      // Skip comment
+      while ((ch = get_char(&tb)) != EOF)
+      {
+        if (ch == '\n' || ch == '\r')
+          break;
+      }
+    }
+    else if (!isspace(ch))
+      break;
+  }
+
+  if (ch == EOF)
+    return (false);
+
+  // Check for delimiters...
+  if (strchr(PDFIO_DELIM_CHARS, ch) != NULL)
+  {
+    *bufptr++ = state = (char)ch;
+  }
+  else if (strchr(PDFIO_NUMBER_CHARS, ch) != NULL)
+  {
+    // Number
+    state     = 'N';
+    *bufptr++ = (char)ch;
+  }
+  else
+  {
+    // Keyword
+    state     = 'K';
+    *bufptr++ = (char)ch;
+  }
+
+  switch (state)
+  {
+    case '(' : // Literal string
+        while ((ch = get_char(&tb)) != EOF && ch != ')')
+        {
+          if (ch == '\\')
+          {
+            // Quoted character...
+            int i; // Looping var
+
+            switch (ch = get_char(&tb))
+            {
+              case '0' : // Octal character escape
+              case '1' :
+              case '2' :
+              case '3' :
+              case '4' :
+              case '5' :
+              case '6' :
+              case '7' :
+                  for (ch -= '0', i = 0; i < 2; i ++)
+                  {
+                    int tch = get_char(&tb); // Next char
+
+                    if (tch >= '0' && tch <= '7')
+                      ch = (char)((ch << 3) | (tch - '0'));
+                    else
+                    {
+                      if (tch != EOF) // Nothing to put back at end-of-file
+                        tb.bufptr --;
+                      break;
+                    }
+                  }
+                  break;
+
+              case '\\' :
+              case '(' :
+              case ')' :
+                  break;
+
+              case 'n' :
+                  ch = '\n';
+                  break;
+
+              case 'r' :
+                  ch = '\r';
+                  break;
+
+              case 't' :
+                  ch = '\t';
+                  break;
+
+              case 'b' :
+                  ch = '\b';
+                  break;
+
+              case 'f' :
+                  ch = '\f';
+                  break;
+
+              default :
+                  _pdfioFileError(pdf, "Unknown escape '\\%c' in literal string.", ch);
+                  return (false);
+            }
+          }
+
+          if (bufptr < bufend)
+          {
+            // Normal character...
+            *bufptr++ = (char)ch;
+          }
+          else
+          {
+            // Out of space
+            _pdfioFileError(pdf, "Token too large.");
+            return (false);
+          }
+        }
+
+        if (ch != ')')
+        {
+          _pdfioFileError(pdf, "Unterminated string literal.");
+          return (false);
+        }
+        break;
+
+    case 'K' : // keyword
+        while ((ch = get_char(&tb)) != EOF && !isspace(ch))
+        {
+          if (strchr(PDFIO_DELIM_CHARS, ch) != NULL)
+          {
+            // End of keyword...
+            tb.bufptr --;
+            break;
+          }
+          else if (bufptr < bufend)
+          {
+            // Normal character...
+            *bufptr++ = (char)ch;
+          }
+          else
+          {
+            // Out of space...
+            _pdfioFileError(pdf, "Token too large.");
+            return (false);
+          }
+        }
+        break;
+
+    case 'N' : // number
+        while ((ch = get_char(&tb)) != EOF && !isspace(ch))
+        {
+          if (!isdigit(ch) && ch != '.')
+          {
+            // End of number, leave the terminator for the next token...
+            tb.bufptr --;
+            break;
+          }
+          else if (bufptr < bufend)
+          {
+            // Normal character...
+            *bufptr++ = (char)ch;
+          }
+          else
+          {
+            // Out of space...
+            _pdfioFileError(pdf, "Token too large.");
+            return (false);
+          }
+        }
+        break;
+
+    case '/' : // "/name"
+        while ((ch = get_char(&tb)) != EOF && !isspace(ch))
+        {
+          if (strchr(PDFIO_DELIM_CHARS, ch) != NULL)
+          {
+            // End of name, leave the delimiter for the next token...
+            tb.bufptr --;
+            break;
+          }
+          else if (ch == '#')
+          {
+            // Quoted character (#xx) in name...
+            int i; // Looping var
+
+            for (i = 0, ch = 0; i < 2; i ++)
+            {
+              int tch = get_char(&tb);
+
+              if (!isxdigit(tch & 255))
+              {
+                _pdfioFileError(pdf, "Bad # escape in name.");
+                return (false);
+              }
+              else if (isdigit(tch))
+                ch = (char)((ch << 4) | (tch - '0'));
+              else
+                ch = (char)((ch << 4) | (tolower(tch) - 'a' + 10));
+            }
+          }
+
+          if (bufptr < bufend)
+          {
+            *bufptr++ = (char)ch;
+          }
+          else
+          {
+            // Out of space
+            _pdfioFileError(pdf, "Token too large.");
+            return (false);
+          }
+        }
+        break;
+
+    case '<' : // Potential hex string
+        if ((ch = get_char(&tb)) == '<')
+        {
+          // Dictionary delimiter
+          *bufptr++ = (char)ch;
+          break;
+        }
+        else if (ch == EOF || (ch != '>' && !isspace(ch) && !isxdigit(ch)))
+        {
+          _pdfioFileError(pdf, "Syntax error: '<%c'", ch);
+          return (false);
+        }
+
+        // Put the character back so the hex string loop below sees it...
+        tb.bufptr --;
+
+        // Fall through to parse a hex string...
+
+    case 'X' : // Hex string
+        while ((ch = get_char(&tb)) != EOF && ch != '>')
+        {
+          if (isxdigit(ch))
+          {
+            if (bufptr < bufend)
+            {
+              // Hex digit
+              *bufptr++ = (char)ch;
+            }
+            else
+            {
+              // Too large
+              _pdfioFileError(pdf, "Token too large.");
+              return (false);
+            }
+          }
+          else if (!isspace(ch))
+          {
+            _pdfioFileError(pdf, "Invalid hex string character '%c'.", ch);
+            return (false);
+          }
+        }
+
+        if (ch == EOF)
+        {
+          _pdfioFileError(pdf, "Unterminated hex string.");
+          return (false);
+        }
+        break;
+
+    case '>' : // Dictionary
+        if ((ch = get_char(&tb)) == '>')
+        {
+          *bufptr++ = '>';
+        }
+        else
+        {
+          _pdfioFileError(pdf, "Syntax error: '>%c'.", ch);
+          return (false);
+        }
+        break;
+  }
+
+  while (tb.bufptr < tb.bufend && isspace(*(tb.bufptr)))
+    tb.bufptr ++;
+
+  if (tb.bufptr > tb.buffer)
+    (consume_cb)(data, (size_t)(tb.bufptr - tb.buffer));
+
+  *bufptr = '\0';
+
+  return (bufptr > buffer);
 }
 
+
+//
+// 'get_char()' - Get a character from the token buffer.
+//
+
+static int                              // O - Character or `EOF` on end-of-file
+get_char(_pdfio_tbuffer_t *tb)          // I - Token buffer
+{
+  ssize_t bytes;                        // Bytes peeked
+
+
+  // Refill the buffer as needed...
+  if (tb->bufptr >= tb->bufend)
+  {
+    // Consume previous bytes...
+    if (tb->bufend > tb->buffer)
+      (tb->consume_cb)(tb->data, (size_t)(tb->bufend - tb->buffer));
+
+    // Peek new bytes, an empty or failed peek must not return stale bytes...
+    if ((bytes = (tb->peek_cb)(tb->data, tb->buffer, sizeof(tb->buffer))) <= 0)
+    {
+      tb->bufptr = tb->bufend = tb->buffer;
+      return (EOF);
+    }
+
+    // Update pointers...
+    tb->bufptr = tb->buffer;
+    tb->bufend = tb->buffer + bytes;
+  }
+
+  // Return the next character...
+  return (*(tb->bufptr)++);
+}
diff --git a/pdfio.h b/pdfio.h
index fd9beef..7950de3 100644
--- a/pdfio.h
+++ b/pdfio.h
@@ -161,6 +161,7 @@ extern pdfio_stream_t *pdfioObjOpenStream(pdfio_obj_t *obj) PDFIO_PUBLIC;
 extern pdfio_obj_t *pdfioPageCopy(pdfio_file_t *pdf, pdfio_obj_t *src) PDFIO_PUBLIC;
 
 extern bool pdfioStreamClose(pdfio_stream_t *st) PDFIO_PUBLIC;
+extern bool pdfioStreamConsume(pdfio_stream_t *st, size_t bytes) PDFIO_PUBLIC;
 extern bool pdfioStreamGetToken(pdfio_stream_t *st, char *buffer, size_t bufsize) PDFIO_PUBLIC;
 extern ssize_t pdfioStreamPeek(pdfio_stream_t *st, void *buffer, size_t bytes) PDFIO_PUBLIC;
 extern bool pdfioStreamPrintf(pdfio_stream_t *st, const char *format, ...) PDFIO_PUBLIC PDFIO_FORMAT(2,3);