Prioritise incomplete object during deduplication.
Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
parent
7ac0971a17
commit
8276423eb8
3 changed files with 45 additions and 13 deletions
|
@ -2,6 +2,8 @@
|
||||||
==================
|
==================
|
||||||
|
|
||||||
* Support HN text overlay.
|
* Support HN text overlay.
|
||||||
|
* Handle invalid PDF object token in CAJ and KDH.
|
||||||
|
* Handle inaccurate page count in CAJ and KDH.
|
||||||
|
|
||||||
0.2.5 (2023-01-XX)
|
0.2.5 (2023-01-XX)
|
||||||
==================
|
==================
|
||||||
|
@ -9,6 +11,7 @@
|
||||||
* Improve PDF parser.
|
* Improve PDF parser.
|
||||||
* Handle duplicated object in CAJ.
|
* Handle duplicated object in CAJ.
|
||||||
* Handle duplicated image in HN.
|
* Handle duplicated image in HN.
|
||||||
|
* Handle incomplete PDF object in CAJ and KDH.
|
||||||
* Fix JBIG decoder.
|
* Fix JBIG decoder.
|
||||||
|
|
||||||
0.2.4 (2022-12-31)
|
0.2.4 (2022-12-31)
|
||||||
|
|
|
@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
|
||||||
printf("Deleting duplicated object\n");
|
printf("Deleting duplicated object\n");
|
||||||
|
|
||||||
ptr = *pdf;
|
ptr = *pdf;
|
||||||
while (ptr != NULL && ptr->next != NULL) {
|
while (ptr->next != NULL && ptr->next->next != NULL) {
|
||||||
if (ptr->id == ptr->next->id) {
|
if (ptr->next->id == ptr->next->next->id) {
|
||||||
pdf_get_obj(&ptr, ptr->id, &tmp);
|
/* Keep the bigger one, the smaller one is usually incomplete */
|
||||||
pdf_obj_del(&ptr, ptr->id);
|
if (ptr->next->size < ptr->next->next->size) {
|
||||||
|
pdf_get_obj(&ptr, ptr->next->id, &tmp);
|
||||||
|
pdf_obj_del(&ptr, ptr->next->id);
|
||||||
|
} else {
|
||||||
|
pdf_get_obj(&ptr->next, ptr->next->id, &tmp);
|
||||||
|
pdf_obj_del(&ptr->next, ptr->next->id);
|
||||||
|
}
|
||||||
|
|
||||||
tmp->next = NULL;
|
tmp->next = NULL;
|
||||||
pdf_obj_destroy(&tmp);
|
pdf_obj_destroy(&tmp);
|
||||||
|
@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
|
||||||
ret++;
|
ret++;
|
||||||
|
|
||||||
if ((*param)->stat > 1)
|
if ((*param)->stat > 1)
|
||||||
printf("Deleted duplicated object %d.\n", ptr->id);
|
printf("Deleted duplicated object %d.\n", ptr->next->id);
|
||||||
|
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptr = ptr->next;
|
ptr = ptr->next;
|
||||||
|
@ -236,6 +244,10 @@ cnki_pdf(cnki_t **param)
|
||||||
printf("Loaded %d object(s)\n",
|
printf("Loaded %d object(s)\n",
|
||||||
pdf_get_count(&pdf));
|
pdf_get_count(&pdf));
|
||||||
|
|
||||||
|
pdf_obj_sort(&pdf);
|
||||||
|
|
||||||
|
_pdf_obj_dedup(param, &pdf);
|
||||||
|
|
||||||
int dictionary_size;
|
int dictionary_size;
|
||||||
char *dictionary;
|
char *dictionary;
|
||||||
|
|
||||||
|
@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param)
|
||||||
|
|
||||||
snprintf(buf, 64,
|
snprintf(buf, 64,
|
||||||
"]\n/Count %d\n>>",
|
"]\n/Count %d\n>>",
|
||||||
pdf_get_kid_count(&pdf, parent[i]));
|
pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]);
|
||||||
strcat(dictionary, buf);
|
strcat(dictionary, buf);
|
||||||
|
|
||||||
pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0);
|
pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0);
|
||||||
|
@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param)
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < parent[0]; i++)
|
for (int i = 0; i < parent[0]; i++)
|
||||||
if (parent_missing[i] == 1)
|
if (parent_missing[i] == 1)
|
||||||
root = i;
|
root = parent[i + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (root == 0)
|
if (root == 0)
|
||||||
|
@ -471,8 +483,6 @@ cnki_pdf(cnki_t **param)
|
||||||
|
|
||||||
_pdf_obj_sort(param, &pdf);
|
_pdf_obj_sort(param, &pdf);
|
||||||
|
|
||||||
_pdf_obj_dedup(param, &pdf);
|
|
||||||
|
|
||||||
_pdf_dump(param, &pdf);
|
_pdf_dump(param, &pdf);
|
||||||
|
|
||||||
pdf_obj_destroy(&pdf);
|
pdf_obj_destroy(&pdf);
|
||||||
|
@ -510,7 +520,7 @@ cnki_pdf_hn(cnki_t **param)
|
||||||
if (root_kid == NULL)
|
if (root_kid == NULL)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
memset(root_kid, 0, (*param)->file_stat->page);
|
memset(root_kid, 0, (*param)->file_stat->page * sizeof(int));
|
||||||
|
|
||||||
object_hn_t *ptr = (*param)->object_hn;
|
object_hn_t *ptr = (*param)->object_hn;
|
||||||
while (ptr != NULL) {
|
while (ptr != NULL) {
|
||||||
|
|
|
@ -183,7 +183,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
ptr->size - (tail - buf) - 3,
|
ptr->size - (tail - buf) - 3,
|
||||||
">>", 2)) != NULL &&
|
">>", 2)) != NULL &&
|
||||||
memmem(tail + 3,
|
memmem(tail + 3,
|
||||||
ptr->size - (tail - buf) - 3,
|
(tmp - tail) - 3,
|
||||||
"stream\r\n", 8) == NULL)
|
"stream\r\n", 8) == NULL)
|
||||||
tail = tmp;
|
tail = tmp;
|
||||||
|
|
||||||
|
@ -226,8 +226,27 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
|
||||||
|
|
||||||
free(buf);
|
free(buf);
|
||||||
} else {
|
} else {
|
||||||
ptr->object_size = ptr->size;
|
/* Handle incomplete object */
|
||||||
ptr->object = buf;
|
head = buf;
|
||||||
|
while ((tmp = _memmem_whitespace(head,
|
||||||
|
ptr->size - (head - buf),
|
||||||
|
" 0 obj", 6)) != NULL)
|
||||||
|
head = tmp + 7;
|
||||||
|
|
||||||
|
if (head - buf > 0) {
|
||||||
|
ptr->object_size = ptr->size - (head - buf);
|
||||||
|
ptr->object = malloc(ptr->object_size);
|
||||||
|
|
||||||
|
if (ptr->object == NULL)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
memcpy(ptr->object, head, ptr->object_size);
|
||||||
|
|
||||||
|
free(buf);
|
||||||
|
} else {
|
||||||
|
ptr->object_size = ptr->size;
|
||||||
|
ptr->object = buf;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ptr = ptr->next;
|
ptr = ptr->next;
|
||||||
|
|
Loading…
Reference in a new issue