Improve PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
parent
9019a18449
commit
cde014cffb
1 changed file with 26 additions and 18 deletions
|
@@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
|
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*/
|
*/
|
||||||
|
@@ -19,12 +19,12 @@ static void *
|
||||||
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
|
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
|
||||||
{
|
{
|
||||||
const char whitespace[6] = {
|
const char whitespace[6] = {
|
||||||
|
' ',
|
||||||
'\r',
|
'\r',
|
||||||
'\n',
|
'\n',
|
||||||
'\f',
|
'\f',
|
||||||
'\t',
|
'\t',
|
||||||
'\0',
|
'\0'
|
||||||
' '
|
|
||||||
};
|
};
|
||||||
|
|
||||||
char tmp[s1 + 1];
|
char tmp[s1 + 1];
|
||||||
|
@@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
|
||||||
|
|
||||||
for (int i = 0; i < 6; i++) {
|
for (int i = 0; i < 6; i++) {
|
||||||
tmp[s1] = whitespace[i];
|
tmp[s1] = whitespace[i];
|
||||||
if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
|
if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
end = ftell(*fp);
|
end = ftell(*fp);
|
||||||
fseek(*fp, cur, SEEK_SET);
|
fseek(*fp, cur, SEEK_SET);
|
||||||
|
|
||||||
int head = 0;
|
long head = 0;
|
||||||
int tail = 0;
|
long tail = 0;
|
||||||
char *pos;
|
char *pos;
|
||||||
char *tmp;
|
char *tmp;
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
if (cur + size_buf < end) {
|
||||||
fread(buf, size_buf, 1, *fp);
|
fread(buf, size_buf, 1, *fp);
|
||||||
|
} else {
|
||||||
|
fread(buf, end - cur, 1, *fp);
|
||||||
|
memset(buf + end - cur, 0, size_buf - end + cur);
|
||||||
|
}
|
||||||
|
|
||||||
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
|
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
|
||||||
head = cur + (pos - buf) + 7;
|
head = cur + (pos - buf) + 7;
|
||||||
|
@@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
/* We need to check if it is the object stored in stream */
|
/* We need to check if it is the object stored in stream */
|
||||||
while (memcmp(pos + 7,
|
while (memcmp(pos + 7,
|
||||||
"\r\nendstream", 11) == 0 &&
|
"\r\nendstream", 11) == 0 &&
|
||||||
(tmp = _memmem_whitespace(pos + 6,
|
(tmp = _memmem_whitespace(pos + 7,
|
||||||
size_buf - (pos - buf) - 6,
|
size_buf - (pos - buf) - 7,
|
||||||
"endobj", 6)) != NULL)
|
"endobj", 6)) != NULL)
|
||||||
pos = tmp;
|
pos = tmp;
|
||||||
|
|
||||||
|
@@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
ptr->address = head;
|
ptr->address = head;
|
||||||
ptr->size = tail - head;
|
ptr->size = tail - head;
|
||||||
|
|
||||||
fseek(*fp, tail + 6, SEEK_SET);
|
fseek(*fp, tail + 7, SEEK_SET);
|
||||||
head = tail = 0;
|
head = tail = 0;
|
||||||
|
} else if (head > 0 && tail > 0) {
|
||||||
|
fseek(*fp, head, SEEK_SET);
|
||||||
|
tail = 0;
|
||||||
} else {
|
} else {
|
||||||
fseek(*fp, -6, SEEK_CUR);
|
fseek(*fp, -7, SEEK_CUR);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((cur = ftell(*fp)) + 6 >= end)
|
if ((cur = ftell(*fp)) + 7 >= end)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
* A dictionary object may have nested dictionary,
|
* A dictionary object may have nested dictionary,
|
||||||
* but it should not be in a stream
|
* but it should not be in a stream
|
||||||
*/
|
*/
|
||||||
while ((tmp = _memmem_whitespace(tail + 2,
|
while ((tmp = _memmem_whitespace(tail + 3,
|
||||||
ptr->size - (tail - buf) - 2,
|
ptr->size - (tail - buf) - 3,
|
||||||
">>", 2)) != NULL &&
|
">>", 2)) != NULL &&
|
||||||
memmem(tail + 2,
|
memmem(tail + 3,
|
||||||
ptr->size - (tail - buf) - 2,
|
ptr->size - (tail - buf) - 3,
|
||||||
"stream\r\n", 8) == NULL)
|
"stream\r\n", 8) == NULL)
|
||||||
tail = tmp;
|
tail = tmp;
|
||||||
|
|
||||||
|
@@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
while (_memmem_whitespace(tail,
|
while (_memmem_whitespace(tail,
|
||||||
ptr->size - (tail - buf),
|
ptr->size - (tail - buf),
|
||||||
"endobj", 6) != NULL &&
|
"endobj", 6) != NULL &&
|
||||||
(tmp = _memmem_whitespace(tail + 9,
|
(tmp = _memmem_whitespace(tail + 10,
|
||||||
ptr->size - (tail - buf) - 9,
|
ptr->size - (tail - buf) - 10,
|
||||||
"endstream", 9)) != NULL)
|
"endstream", 9)) != NULL)
|
||||||
tail = tmp;
|
tail = tmp;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue