Improve PDF parser.

Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
yzrh 2023-01-01 18:58:43 +00:00
parent 9019a18449
commit cde014cffb

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -19,12 +19,12 @@ static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
const char whitespace[6] = {
' ',
'\r',
'\n',
'\f',
'\t',
'\0',
' '
'\0'
};
char tmp[s1 + 1];
@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
for (int i = 0; i < 6; i++) {
tmp[s1] = whitespace[i];
if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
return ret;
}
@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
end = ftell(*fp);
fseek(*fp, cur, SEEK_SET);
int head = 0;
int tail = 0;
long head = 0;
long tail = 0;
char *pos;
char *tmp;
for (;;) {
if (cur + size_buf < end) {
fread(buf, size_buf, 1, *fp);
} else {
fread(buf, end - cur, 1, *fp);
memset(buf + end - cur, 0, size_buf - end + cur);
}
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
head = cur + (pos - buf) + 7;
@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
/* We need to check whether this "endobj" belongs to an object stored inside a stream */
while (memcmp(pos + 7,
"\r\nendstream", 11) == 0 &&
(tmp = _memmem_whitespace(pos + 6,
size_buf - (pos - buf) - 6,
(tmp = _memmem_whitespace(pos + 7,
size_buf - (pos - buf) - 7,
"endobj", 6)) != NULL)
pos = tmp;
@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->address = head;
ptr->size = tail - head;
fseek(*fp, tail + 6, SEEK_SET);
fseek(*fp, tail + 7, SEEK_SET);
head = tail = 0;
} else if (head > 0 && tail > 0) {
fseek(*fp, head, SEEK_SET);
tail = 0;
} else {
fseek(*fp, -6, SEEK_CUR);
fseek(*fp, -7, SEEK_CUR);
}
if ((cur = ftell(*fp)) + 6 >= end)
if ((cur = ftell(*fp)) + 7 >= end)
break;
}
@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
* A dictionary object may have a nested dictionary,
* but it should not be in a stream
*/
while ((tmp = _memmem_whitespace(tail + 2,
ptr->size - (tail - buf) - 2,
while ((tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
memmem(tail + 2,
ptr->size - (tail - buf) - 2,
memmem(tail + 3,
ptr->size - (tail - buf) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
while (_memmem_whitespace(tail,
ptr->size - (tail - buf),
"endobj", 6) != NULL &&
(tmp = _memmem_whitespace(tail + 9,
ptr->size - (tail - buf) - 9,
(tmp = _memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
"endstream", 9)) != NULL)
tail = tmp;