Improve PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
parent
9019a18449
commit
cde014cffb
1 changed file with 26 additions and 18 deletions
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
|
||||
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
@ -19,12 +19,12 @@ static void *
|
|||
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
|
||||
{
|
||||
const char whitespace[6] = {
|
||||
' ',
|
||||
'\r',
|
||||
'\n',
|
||||
'\f',
|
||||
'\t',
|
||||
'\0',
|
||||
' '
|
||||
'\0'
|
||||
};
|
||||
|
||||
char tmp[s1 + 1];
|
||||
|
@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
|
|||
|
||||
for (int i = 0; i < 6; i++) {
|
||||
tmp[s1] = whitespace[i];
|
||||
if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
|
||||
if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
|
|||
end = ftell(*fp);
|
||||
fseek(*fp, cur, SEEK_SET);
|
||||
|
||||
int head = 0;
|
||||
int tail = 0;
|
||||
long head = 0;
|
||||
long tail = 0;
|
||||
char *pos;
|
||||
char *tmp;
|
||||
|
||||
for (;;) {
|
||||
if (cur + size_buf < end) {
|
||||
fread(buf, size_buf, 1, *fp);
|
||||
} else {
|
||||
fread(buf, end - cur, 1, *fp);
|
||||
memset(buf + end - cur, 0, size_buf - end + cur);
|
||||
}
|
||||
|
||||
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
|
||||
head = cur + (pos - buf) + 7;
|
||||
|
@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
|
|||
/* We need to check whether this is an object stored in a stream */
|
||||
while (memcmp(pos + 7,
|
||||
"\r\nendstream", 11) == 0 &&
|
||||
(tmp = _memmem_whitespace(pos + 6,
|
||||
size_buf - (pos - buf) - 6,
|
||||
(tmp = _memmem_whitespace(pos + 7,
|
||||
size_buf - (pos - buf) - 7,
|
||||
"endobj", 6)) != NULL)
|
||||
pos = tmp;
|
||||
|
||||
|
@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
|
|||
ptr->address = head;
|
||||
ptr->size = tail - head;
|
||||
|
||||
fseek(*fp, tail + 6, SEEK_SET);
|
||||
fseek(*fp, tail + 7, SEEK_SET);
|
||||
head = tail = 0;
|
||||
} else if (head > 0 && tail > 0) {
|
||||
fseek(*fp, head, SEEK_SET);
|
||||
tail = 0;
|
||||
} else {
|
||||
fseek(*fp, -6, SEEK_CUR);
|
||||
fseek(*fp, -7, SEEK_CUR);
|
||||
}
|
||||
|
||||
if ((cur = ftell(*fp)) + 6 >= end)
|
||||
if ((cur = ftell(*fp)) + 7 >= end)
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
|
|||
* A dictionary object may contain nested dictionaries,
|
||||
* but it should not be in a stream
|
||||
*/
|
||||
while ((tmp = _memmem_whitespace(tail + 2,
|
||||
ptr->size - (tail - buf) - 2,
|
||||
while ((tmp = _memmem_whitespace(tail + 3,
|
||||
ptr->size - (tail - buf) - 3,
|
||||
">>", 2)) != NULL &&
|
||||
memmem(tail + 2,
|
||||
ptr->size - (tail - buf) - 2,
|
||||
memmem(tail + 3,
|
||||
ptr->size - (tail - buf) - 3,
|
||||
"stream\r\n", 8) == NULL)
|
||||
tail = tmp;
|
||||
|
||||
|
@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
|
|||
while (_memmem_whitespace(tail,
|
||||
ptr->size - (tail - buf),
|
||||
"endobj", 6) != NULL &&
|
||||
(tmp = _memmem_whitespace(tail + 9,
|
||||
ptr->size - (tail - buf) - 9,
|
||||
(tmp = _memmem_whitespace(tail + 10,
|
||||
ptr->size - (tail - buf) - 10,
|
||||
"endstream", 9)) != NULL)
|
||||
tail = tmp;
|
||||
|
||||
|
|
Loading…
Reference in a new issue