diff --git a/src/pdf_parser.c b/src/pdf_parser.c index ed7bfba..70d72d5 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -79,8 +79,25 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) memset(buf + end - cur, 0, size_buf - end + cur); } - if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) - head = cur + (pos - buf) + 7; + if (head == 0) { + /* Hack needed for invalid object */ + pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6); + tmp = memmem(buf, size_buf, " 0 obj", 6); + + while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b) + tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6); + + if (pos != NULL && tmp != NULL) { + if (pos - buf < tmp - buf) + head = cur + (pos - buf) + 7; + else + head = cur + (tmp - buf) + 6; + } else if (pos != NULL) { + head = cur + (pos - buf) + 7; + } else if (tmp != NULL) { + head = cur + (tmp - buf) + 6; + } + } if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { /* We need to check if it is the object stored in stream */ @@ -156,9 +173,46 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (buf == NULL) return 1; - fseek(*fp, ptr->address - 15, SEEK_SET); + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + /* Hack needed for invalid object */ + while ((tmp = memmem(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 6; + + if (head - buf > 0) { + ptr->address += head - buf; + ptr->size -= head - buf; + + tmp = realloc(buf, ptr->size); + + if (tmp == NULL) + return 1; + + buf = tmp; + + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + } + + /* Hack needed for invalid object */ + fseek(*fp, ptr->address - 14, SEEK_SET); fread(str, 8, 1, *fp); + if (str[7] < '0' || str[7] > '9') { + fseek(*fp, ptr->address - 15, SEEK_SET); + fread(str, 8, 1, *fp); + } + for (int i = 7; i >= 0; i--) { if (str[i] < '0' || str[i] > '9') { if (i < 7) @@ -170,11 +224,10 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) } } - fseek(*fp, ptr->address, SEEK_SET); - fread(buf, ptr->size, 1, *fp); - if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && - (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { + ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || + /* Hack needed for invalid object */ + (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { /* * A dictionary object may have nested dictionary, * but it should not be in a stream @@ -187,6 +240,15 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) "stream\r\n", 8) == NULL) tail = tmp; + /* Hack needed for invalid object */ + while ((tmp = memmem(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + (tmp - tail) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; + ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); @@ -226,27 +288,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) free(buf); } else { - /* Handle incomplete object */ - head = buf; - while ((tmp = _memmem_whitespace(head, - ptr->size - (head - buf), - " 0 obj", 6)) != NULL) - head = tmp + 7; - - if (head - buf > 0) { - ptr->object_size = ptr->size - (head - buf); - ptr->object = malloc(ptr->object_size); - - if (ptr->object == NULL) - return 1; - - memcpy(ptr->object, head, ptr->object_size); - - free(buf); - } else { - ptr->object_size = ptr->size; - ptr->object = buf; - } + ptr->object_size = ptr->size; + ptr->object = buf; } ptr = ptr->next;