Improve PDF parser: use long file offsets, zero-pad the final short read, and shift token search offsets by one byte.

Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
yzrh 2023-01-01 18:58:43 +00:00
parent 9019a18449
commit cde014cffb

View file

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org> * Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -19,12 +19,12 @@ static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{ {
const char whitespace[6] = { const char whitespace[6] = {
' ',
'\r', '\r',
'\n', '\n',
'\f', '\f',
'\t', '\t',
'\0', '\0'
' '
}; };
char tmp[s1 + 1]; char tmp[s1 + 1];
@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
end = ftell(*fp); end = ftell(*fp);
fseek(*fp, cur, SEEK_SET); fseek(*fp, cur, SEEK_SET);
int head = 0; long head = 0;
int tail = 0; long tail = 0;
char *pos; char *pos;
char *tmp; char *tmp;
for (;;) { for (;;) {
if (cur + size_buf < end) {
fread(buf, size_buf, 1, *fp); fread(buf, size_buf, 1, *fp);
} else {
fread(buf, end - cur, 1, *fp);
memset(buf + end - cur, 0, size_buf - end + cur);
}
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
head = cur + (pos - buf) + 7; head = cur + (pos - buf) + 7;
@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
/* We need to check if it is the object stored in stream */ /* We need to check if it is the object stored in stream */
while (memcmp(pos + 7, while (memcmp(pos + 7,
"\r\nendstream", 11) == 0 && "\r\nendstream", 11) == 0 &&
(tmp = _memmem_whitespace(pos + 6, (tmp = _memmem_whitespace(pos + 7,
size_buf - (pos - buf) - 6, size_buf - (pos - buf) - 7,
"endobj", 6)) != NULL) "endobj", 6)) != NULL)
pos = tmp; pos = tmp;
@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->address = head; ptr->address = head;
ptr->size = tail - head; ptr->size = tail - head;
fseek(*fp, tail + 6, SEEK_SET); fseek(*fp, tail + 7, SEEK_SET);
head = tail = 0; head = tail = 0;
} else if (head > 0 && tail > 0) {
fseek(*fp, head, SEEK_SET);
tail = 0;
} else { } else {
fseek(*fp, -6, SEEK_CUR); fseek(*fp, -7, SEEK_CUR);
} }
if ((cur = ftell(*fp)) + 6 >= end) if ((cur = ftell(*fp)) + 7 >= end)
break; break;
} }
@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
* A dictionary object may have nested dictionary, * A dictionary object may have nested dictionary,
* but it should not be in a stream * but it should not be in a stream
*/ */
while ((tmp = _memmem_whitespace(tail + 2, while ((tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 2, ptr->size - (tail - buf) - 3,
">>", 2)) != NULL && ">>", 2)) != NULL &&
memmem(tail + 2, memmem(tail + 3,
ptr->size - (tail - buf) - 2, ptr->size - (tail - buf) - 3,
"stream\r\n", 8) == NULL) "stream\r\n", 8) == NULL)
tail = tmp; tail = tmp;
@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
while (_memmem_whitespace(tail, while (_memmem_whitespace(tail,
ptr->size - (tail - buf), ptr->size - (tail - buf),
"endobj", 6) != NULL && "endobj", 6) != NULL &&
(tmp = _memmem_whitespace(tail + 9, (tmp = _memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 9, ptr->size - (tail - buf) - 10,
"endstream", 9)) != NULL) "endstream", 9)) != NULL)
tail = tmp; tail = tmp;