Compare commits

...

10 commits

Author SHA1 Message Date
2fa2b760ae Fix HN text parsing.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-15 15:34:46 +00:00
dd5854678c Fix JBIG2 allocation.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-06 12:02:43 +00:00
123d62141c Add document information dictionary to output.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 19:15:01 +00:00
283446dba5 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 17:32:13 +00:00
13cb0a1b8d Fix invalid token parsing.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 11:21:54 +00:00
a7ecc15614 Replace catalog object only if root object does not exist.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:50:25 +00:00
56ffe14d5a Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:29:07 +00:00
c2afbb3cbc Handle invalid PDF object.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:19:06 +00:00
8cd8a8fbba Replace catalog object if found.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:07:57 +00:00
8276423eb8 Prioritise incomplete object during deduplication.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 13:51:13 +00:00
6 changed files with 215 additions and 106 deletions

View file

@ -2,13 +2,17 @@
================== ==================
* Support HN text overlay. * Support HN text overlay.
* Support HN page with text.
* Handle inaccurate page count in CAJ and KDH.
0.2.5 (2023-01-XX) 0.2.5 (2023-01-05)
================== ==================
* Improve PDF parser. * Improve PDF parser.
* Handle duplicated object in CAJ. * Handle duplicated object in CAJ.
* Handle duplicated image in HN. * Handle duplicated image in HN.
* Handle incomplete PDF object in CAJ and KDH.
* Handle invalid PDF object token in CAJ and KDH.
* Fix JBIG decoder. * Fix JBIG decoder.
0.2.4 (2022-12-31) 0.2.4 (2022-12-31)

View file

@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
printf("Deleting duplicated object\n"); printf("Deleting duplicated object\n");
ptr = *pdf; ptr = *pdf;
while (ptr != NULL && ptr->next != NULL) { while (ptr->next != NULL && ptr->next->next != NULL) {
if (ptr->id == ptr->next->id) { if (ptr->next->id == ptr->next->next->id) {
pdf_get_obj(&ptr, ptr->id, &tmp); /* Keep the bigger one, the smaller one is usually incomplete */
pdf_obj_del(&ptr, ptr->id); if (ptr->next->size < ptr->next->next->size) {
pdf_get_obj(&ptr, ptr->next->id, &tmp);
pdf_obj_del(&ptr, ptr->next->id);
} else {
pdf_get_obj(&ptr->next, ptr->next->id, &tmp);
pdf_obj_del(&ptr->next, ptr->next->id);
}
tmp->next = NULL; tmp->next = NULL;
pdf_obj_destroy(&tmp); pdf_obj_destroy(&tmp);
@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
ret++; ret++;
if ((*param)->stat > 1) if ((*param)->stat > 1)
printf("Deleted duplicated object %d.\n", ptr->id); printf("Deleted duplicated object %d.\n", ptr->next->id);
continue;
} }
ptr = ptr->next; ptr = ptr->next;
@ -250,6 +258,10 @@ cnki_pdf(cnki_t **param)
if ((*param)->stat > 0) if ((*param)->stat > 0)
printf("Discovered %d parent object(s)\n", parent[0]); printf("Discovered %d parent object(s)\n", parent[0]);
pdf_obj_sort(&pdf);
_pdf_obj_dedup(param, &pdf);
int8_t *parent_missing; int8_t *parent_missing;
int *kid; int *kid;
@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param)
snprintf(buf, 64, snprintf(buf, 64,
"]\n/Count %d\n>>", "]\n/Count %d\n>>",
pdf_get_kid_count(&pdf, parent[i])); pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]);
strcat(dictionary, buf); strcat(dictionary, buf);
pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0);
@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param)
} else { } else {
for (int i = 0; i < parent[0]; i++) for (int i = 0; i < parent[0]; i++)
if (parent_missing[i] == 1) if (parent_missing[i] == 1)
root = i; root = parent[i + 1];
} }
if (root == 0) if (root == 0)
@ -363,9 +375,11 @@ cnki_pdf(cnki_t **param)
printf("Root object is %d.\n", root); printf("Root object is %d.\n", root);
} }
int root_gen;
pdf_object_t *tmp; pdf_object_t *tmp;
if (pdf_get_obj(&pdf, root, &tmp) != 0) { if ((root_gen = pdf_get_obj(&pdf, root, &tmp)) != 0) {
if ((*param)->stat > 0) if ((*param)->stat > 0)
printf("Root object is missing\n"); printf("Root object is missing\n");
@ -407,6 +421,20 @@ cnki_pdf(cnki_t **param)
int outline = _pdf_cnki_outline(param, &pdf); int outline = _pdf_cnki_outline(param, &pdf);
snprintf(buf, 64,
"<<\n/Type /Catalog\n/Pages %d 0 R\n",
root);
strcat(dictionary, buf);
if (outline != -1) {
snprintf(buf, 64,
"/Outlines %d 0 R\n/PageMode /UseOutlines\n",
outline);
strcat(dictionary, buf);
}
strcat(dictionary, ">>");
if ((*param)->stat > 1) if ((*param)->stat > 1)
printf("Searching for catalog object\n"); printf("Searching for catalog object\n");
@ -415,6 +443,16 @@ cnki_pdf(cnki_t **param)
if (catalog != 0) { if (catalog != 0) {
if ((*param)->stat > 0) if ((*param)->stat > 0)
printf("Catalog object is %d.\n", catalog); printf("Catalog object is %d.\n", catalog);
if (root_gen != 0) {
if ((*param)->stat > 1)
printf("Replacing catalog object\n");
pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0);
if ((*param)->stat > 0)
printf("Replaced catalog object\n");
}
} else { } else {
if ((*param)->stat > 0) if ((*param)->stat > 0)
printf("Catalog object is missing\n"); printf("Catalog object is missing\n");
@ -422,20 +460,6 @@ cnki_pdf(cnki_t **param)
if ((*param)->stat > 1) if ((*param)->stat > 1)
printf("Generating catalog object\n"); printf("Generating catalog object\n");
snprintf(buf, 64,
"<<\n/Type /Catalog\n/Pages %d 0 R\n",
root);
strcat(dictionary, buf);
if (outline != -1) {
snprintf(buf, 64,
"/Outlines %d 0 R\n/PageMode /UseOutlines\n",
outline);
strcat(dictionary, buf);
}
strcat(dictionary, ">>");
pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0);
if ((*param)->stat > 0) if ((*param)->stat > 0)
@ -471,8 +495,6 @@ cnki_pdf(cnki_t **param)
_pdf_obj_sort(param, &pdf); _pdf_obj_sort(param, &pdf);
_pdf_obj_dedup(param, &pdf);
_pdf_dump(param, &pdf); _pdf_dump(param, &pdf);
pdf_obj_destroy(&pdf); pdf_obj_destroy(&pdf);
@ -510,7 +532,7 @@ cnki_pdf_hn(cnki_t **param)
if (root_kid == NULL) if (root_kid == NULL)
return 1; return 1;
memset(root_kid, 0, (*param)->file_stat->page); memset(root_kid, 0, (*param)->file_stat->page * sizeof(int));
object_hn_t *ptr = (*param)->object_hn; object_hn_t *ptr = (*param)->object_hn;
while (ptr != NULL) { while (ptr != NULL) {
@ -828,73 +850,75 @@ cnki_pdf_hn(cnki_t **param)
for (int i = 0, j = 0; i < ptr->text_size - 1;) { for (int i = 0, j = 0; i < ptr->text_size - 1;) {
switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) {
case 0x8001: case 0x8001:
if (ptr->address_next > ptr->address) if (ptr->address_next <= ptr->address) {
strcat(dictionary, "T*\n"); if (i + 7 >= ptr->text_size) {
case 0x8070: i += 2;
if (ptr->address_next > ptr->address) { break;
i += 4; }
for (;;) { conv_src[0] = ptr->text[i + 7];
if (i + 3 >= ptr->text_size || conv_src[1] = ptr->text[i + 6];
(unsigned char) ptr->text[i + 1] == 0x80)
break;
conv_src[0] = ptr->text[i + 3]; //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n")
conv_src[1] = ptr->text[i + 2]; //strcat(dictionary, buf);
//snprintf(buf, 64, "%f %f Td\n"); conv_size = 6;
//strcat(dictionary, buf);
conv_size = 6; if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
if (strconv(&conv_dst, "UTF-16BE", if (conv_size - 2 > 0) {
conv_src, "GB18030", &conv_size) == 0) { strcat(dictionary, "<");
if (conv_size - 2 > 0) { for (int k = 0; k < conv_size - 2; k++) {
strcat(dictionary, "<feff"); snprintf(conv_hex, 3,
for (int k = 0; k < conv_size - 2; k++) { "%02x", (unsigned char) conv_dst[k]);
snprintf(conv_hex, 3, strcat(dictionary, conv_hex);
"%02x", (unsigned char) conv_dst[k]);
strcat(dictionary, conv_hex);
}
strcat(dictionary, "> Tj\n");
} }
free(conv_dst); strcat(dictionary, "> Tj\n");
} }
free(conv_dst);
i += 4;
} }
i += 8;
break; break;
} }
if (i + 7 >= ptr->text_size) { strcat(dictionary, "T*\n");
i += 2; case 0x8070:
i += 4;
if (ptr->address_next <= ptr->address)
break; break;
}
conv_src[0] = ptr->text[i + 7]; for (;;) {
conv_src[1] = ptr->text[i + 6]; if (i + 3 >= ptr->text_size ||
(unsigned char) ptr->text[i + 1] == 0x80)
break;
//snprintf(buf, 64, "%f %f Td\n"); conv_src[0] = ptr->text[i + 3];
//strcat(dictionary, buf); conv_src[1] = ptr->text[i + 2];
conv_size = 6; //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n")
//strcat(dictionary, buf);
if (strconv(&conv_dst, "UTF-16BE", conv_size = 6;
conv_src, "GB18030", &conv_size) == 0) {
if (conv_size - 2 > 0) { if (strconv(&conv_dst, "UTF-16BE",
strcat(dictionary, "<feff"); conv_src, "GB18030", &conv_size) == 0) {
for (int k = 0; k < conv_size - 2; k++) { if (conv_size - 2 > 0) {
snprintf(conv_hex, 3, strcat(dictionary, "<");
"%02x", (unsigned char) conv_dst[k]); for (int k = 0; k < conv_size - 2; k++) {
strcat(dictionary, conv_hex); snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[k]);
strcat(dictionary, conv_hex);
}
strcat(dictionary, "> Tj\n");
} }
strcat(dictionary, "> Tj\n"); free(conv_dst);
} }
free(conv_dst);
i += 4;
} }
i += 8;
break; break;
case 0x800a: case 0x800a:
if (i + 27 >= ptr->text_size || j >= ptr->image_length) { if (i + 27 >= ptr->text_size || j >= ptr->image_length) {

View file

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2022, yzrh <yzrh@noema.org> * Copyright (c) 2022-2023, yzrh <yzrh@noema.org>
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -31,5 +31,6 @@ strdec_jbig2(char **bitmap,
} }
jbig2_release_page(ctx, image); jbig2_release_page(ctx, image);
jbig2_ctx_free(ctx);
return 0; return 0;
} }

View file

@ -79,8 +79,25 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
memset(buf + end - cur, 0, size_buf - end + cur); memset(buf + end - cur, 0, size_buf - end + cur);
} }
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) if (head == 0) {
head = cur + (pos - buf) + 7; /* Hack needed for invalid object */
pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6);
tmp = memmem(buf, size_buf, " 0 obj", 6);
while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b)
tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6);
if (pos != NULL && tmp != NULL) {
if (pos - buf < tmp - buf)
head = cur + (pos - buf) + 7;
else
head = cur + (tmp - buf) + 6;
} else if (pos != NULL) {
head = cur + (pos - buf) + 7;
} else if (tmp != NULL) {
head = cur + (tmp - buf) + 6;
}
}
if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) {
/* We need to check if it is the object stored in stream */ /* We need to check if it is the object stored in stream */
@ -156,9 +173,46 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (buf == NULL) if (buf == NULL)
return 1; return 1;
fseek(*fp, ptr->address - 15, SEEK_SET); fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
/* Handle incomplete object */
head = buf;
while ((tmp = _memmem_whitespace(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 7;
/* Hack needed for invalid object */
while ((tmp = memmem(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 6;
if (head - buf > 0) {
ptr->address += head - buf;
ptr->size -= head - buf;
tmp = realloc(buf, ptr->size);
if (tmp == NULL)
return 1;
buf = tmp;
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
}
/* Hack needed for invalid object */
fseek(*fp, ptr->address - 14, SEEK_SET);
fread(str, 8, 1, *fp); fread(str, 8, 1, *fp);
if (str[7] < '0' || str[7] > '9') {
fseek(*fp, ptr->address - 15, SEEK_SET);
fread(str, 8, 1, *fp);
}
for (int i = 7; i >= 0; i--) { for (int i = 7; i >= 0; i--) {
if (str[i] < '0' || str[i] > '9') { if (str[i] < '0' || str[i] > '9') {
if (i < 7) if (i < 7)
@ -170,22 +224,35 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
} }
} }
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL &&
(tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL ||
/* /* Hack needed for invalid object */
* A dictionary object may have nested dictionary, (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) {
* but it should not be in a stream if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) {
*/ tail = memmem(buf, ptr->size, ">>", 2);
while ((tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 3, while (ptr->size - (tail - buf) > 2 &&
">>", 2)) != NULL && (tmp = memmem(tail + 2,
memmem(tail + 3, ptr->size - (tail - buf) - 2,
ptr->size - (tail - buf) - 3, ">>", 2)) != NULL &&
"stream\r\n", 8) == NULL) memmem(tail + 2,
tail = tmp; (tmp - tail) - 2,
"stream\r\n", 8) == NULL)
tail = tmp;
} else {
/*
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
while (ptr->size - (tail - buf) > 3 &&
(tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
memmem(tail + 3,
(tmp - tail) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
}
ptr->dictionary_size = tail - head + 2; ptr->dictionary_size = tail - head + 2;
ptr->dictionary = malloc(ptr->dictionary_size + 1); ptr->dictionary = malloc(ptr->dictionary_size + 1);

View file

@ -1,19 +1,39 @@
/* /*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org> * Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <time.h> #include <time.h>
#include "version.h"
#include "md5.h" #include "md5.h"
#include "pdf.h" #include "pdf.h"
/*
 * Append the document information dictionary to the object list.
 *
 * The dictionary holds a /Producer entry built from the VERSION, RELEASE,
 * PATCH and EXTRA macros, and a /CreationDate entry with the current time
 * expressed in UTC (offset +00'00').
 *
 * Returns the result of pdf_obj_append(), or 1 if the current time cannot
 * be formatted or the dictionary does not fit in the local buffer.
 */
static int
_info_obj(pdf_object_t **pdf)
{
	char date[32];
	char dictionary[128];

	time_t timestamp = time(NULL);
	struct tm *utc = gmtime(&timestamp);

	/*
	 * gmtime() returns NULL on failure and strftime() returns 0 when
	 * the result does not fit, leaving the buffer indeterminate
	 */
	if (utc == NULL ||
	    strftime(date, sizeof(date), "%Y%m%d%H%M%S", utc) == 0)
		return 1;

	int len = snprintf(dictionary, sizeof(dictionary),
	    "<<\n"
	    "/Producer (Melon " VERSION "." RELEASE "." PATCH EXTRA ")\n"
	    "/CreationDate (D:%s+00'00')\n>>",
	    date);

	/* Refuse to emit a truncated dictionary, e.g. with a long EXTRA */
	if (len < 0 || (size_t) len >= sizeof(dictionary))
		return 1;

	return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0);
}
int int
pdf_dump_obj(pdf_object_t **pdf, FILE **fp) pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
{ {
if (*pdf == NULL || *fp == NULL) if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0)
return 1; return 1;
long cur; long cur;
@ -152,18 +172,11 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
while (ptr->next != NULL) while (ptr->next != NULL)
ptr = ptr->next; ptr = ptr->next;
/*
* TODO: Document information dictionary
* `"/Producer (Melon)"'
* `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"'
*
* Trailer dictionary
* `"/Info %d 0 R"'
*/
fprintf(*fp, fprintf(*fp,
"/Size %d\n/Root %d 0 R\n", "/Size %d\n/Root %d 0 R\n/Info %d 0 R\n",
ptr->id + 1, ptr->id + 1,
pdf_get_catalog_id(pdf)); pdf_get_catalog_id(pdf),
ptr->id);
fputs("/ID [", *fp); fputs("/ID [", *fp);

View file

@ -5,6 +5,6 @@
*/ */
#define VERSION "0" #define VERSION "0"
#define RELEASE "2" #define RELEASE "3"
#define PATCH "5" #define PATCH "0"
#define EXTRA "" #define EXTRA ""