From 8276423eb8395eae3e3002442307272eff1c9e8f Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 13:51:13 +0000 Subject: [PATCH] Prioritise incomplete object during deduplication. Signed-off-by: yzrh --- CHANGE.md | 3 +++ src/cnki_pdf.c | 30 ++++++++++++++++++++---------- src/pdf_parser.c | 25 ++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index 063d93a..3e05e10 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,6 +2,8 @@ ================== * Support HN text overlay. +* Handle invalid PDF object token in CAJ and KDH. +* Handle inaccurate page count in CAJ and KDH. 0.2.5 (2023-01-XX) ================== @@ -9,6 +11,7 @@ * Improve PDF parser. * Handle duplicated object in CAJ. * Handle duplicated image in HN. +* Handle incomplete PDF object in CAJ and KDH. * Fix JBIG decoder. 0.2.4 (2022-12-31) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index af38aa6..76931ea 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) printf("Deleting duplicated object\n"); ptr = *pdf; - while (ptr != NULL && ptr->next != NULL) { - if (ptr->id == ptr->next->id) { - pdf_get_obj(&ptr, ptr->id, &tmp); - pdf_obj_del(&ptr, ptr->id); + while (ptr->next != NULL && ptr->next->next != NULL) { + if (ptr->next->id == ptr->next->next->id) { + /* Keep the bigger one, the smaller one is usually incomplete */ + if (ptr->next->size < ptr->next->next->size) { + pdf_get_obj(&ptr, ptr->next->id, &tmp); + pdf_obj_del(&ptr, ptr->next->id); + } else { + pdf_get_obj(&ptr->next, ptr->next->id, &tmp); + pdf_obj_del(&ptr->next, ptr->next->id); + } tmp->next = NULL; pdf_obj_destroy(&tmp); @@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) ret++; if ((*param)->stat > 1) - printf("Deleted duplicated object %d.\n", ptr->id); + printf("Deleted duplicated object %d.\n", ptr->next->id); + + continue; } ptr = ptr->next; @@ -236,6 +244,10 @@ cnki_pdf(cnki_t **param) 
printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); + pdf_obj_sort(&pdf); + + _pdf_obj_dedup(param, &pdf); + int dictionary_size; char *dictionary; @@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "]\n/Count %d\n>>", - pdf_get_kid_count(&pdf, parent[i])); + pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]); strcat(dictionary, buf); pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); @@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param) } else { for (int i = 0; i < parent[0]; i++) if (parent_missing[i] == 1) - root = i; + root = parent[i + 1]; } if (root == 0) @@ -471,8 +483,6 @@ cnki_pdf(cnki_t **param) _pdf_obj_sort(param, &pdf); - _pdf_obj_dedup(param, &pdf); - _pdf_dump(param, &pdf); pdf_obj_destroy(&pdf); @@ -510,7 +520,7 @@ cnki_pdf_hn(cnki_t **param) if (root_kid == NULL) return 1; - memset(root_kid, 0, (*param)->file_stat->page); + memset(root_kid, 0, (*param)->file_stat->page * sizeof(int)); object_hn_t *ptr = (*param)->object_hn; while (ptr != NULL) { diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 781bafa..ed7bfba 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -183,7 +183,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && memmem(tail + 3, - ptr->size - (tail - buf) - 3, + (tmp - tail) - 3, "stream\r\n", 8) == NULL) tail = tmp; @@ -226,8 +226,27 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) free(buf); } else { - ptr->object_size = ptr->size; - ptr->object = buf; + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + if (head - buf > 0) { + ptr->object_size = ptr->size - (head - buf); + ptr->object = malloc(ptr->object_size); + + if (ptr->object == NULL) + return 1; + + memcpy(ptr->object, head, ptr->object_size); + + free(buf); + } else { + ptr->object_size = ptr->size; + ptr->object = buf; + } } ptr = ptr->next;