diff --git a/CHANGE.md b/CHANGE.md index 063d93a..9071c91 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,13 +2,17 @@ ================== * Support HN text overlay. +* Support HN page with text. +* Handle inaccurate page count in CAJ and KDH. -0.2.5 (2023-01-XX) +0.2.5 (2023-01-05) ================== * Improve PDF parser. * Handle duplicated object in CAJ. * Handle duplicated image in HN. +* Handle incomplete PDF object in CAJ and KDH. +* Handle invalid PDF object token in CAJ and KDH. * Fix JBIG decoder. 0.2.4 (2022-12-31) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index af38aa6..d96ea49 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) printf("Deleting duplicated object\n"); ptr = *pdf; - while (ptr != NULL && ptr->next != NULL) { - if (ptr->id == ptr->next->id) { - pdf_get_obj(&ptr, ptr->id, &tmp); - pdf_obj_del(&ptr, ptr->id); + while (ptr->next != NULL && ptr->next->next != NULL) { + if (ptr->next->id == ptr->next->next->id) { + /* Keep the bigger one, the smaller one is usually incomplete */ + if (ptr->next->size < ptr->next->next->size) { + pdf_get_obj(&ptr, ptr->next->id, &tmp); + pdf_obj_del(&ptr, ptr->next->id); + } else { + pdf_get_obj(&ptr->next, ptr->next->id, &tmp); + pdf_obj_del(&ptr->next, ptr->next->id); + } tmp->next = NULL; pdf_obj_destroy(&tmp); @@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) ret++; if ((*param)->stat > 1) - printf("Deleted duplicated object %d.\n", ptr->id); + printf("Deleted duplicated object %d.\n", ptr->next->id); + + continue; } ptr = ptr->next; @@ -250,6 +258,10 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); + pdf_obj_sort(&pdf); + + _pdf_obj_dedup(param, &pdf); + int8_t *parent_missing; int *kid; @@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "]\n/Count %d\n>>", - pdf_get_kid_count(&pdf, parent[i])); + pdf_get_kid_count(&pdf, parent[i]) > 0 ? 
pdf_get_kid_count(&pdf, parent[i]) : kid[0]); strcat(dictionary, buf); pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); @@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param) } else { for (int i = 0; i < parent[0]; i++) if (parent_missing[i] == 1) - root = i; + root = parent[i + 1]; } if (root == 0) @@ -363,9 +375,11 @@ cnki_pdf(cnki_t **param) printf("Root object is %d.\n", root); } + int root_gen; + pdf_object_t *tmp; - if (pdf_get_obj(&pdf, root, &tmp) != 0) { + if ((root_gen = pdf_get_obj(&pdf, root, &tmp)) != 0) { if ((*param)->stat > 0) printf("Root object is missing\n"); @@ -407,6 +421,20 @@ cnki_pdf(cnki_t **param) int outline = _pdf_cnki_outline(param, &pdf); + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (outline != -1) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + outline); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>"); + if ((*param)->stat > 1) printf("Searching for catalog object\n"); @@ -415,6 +443,16 @@ cnki_pdf(cnki_t **param) if (catalog != 0) { if ((*param)->stat > 0) printf("Catalog object is %d.\n", catalog); + + if (root_gen != 0) { + if ((*param)->stat > 1) + printf("Replacing catalog object\n"); + + pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); + + if ((*param)->stat > 0) + printf("Replaced catalog object\n"); + } } else { if ((*param)->stat > 0) printf("Catalog object is missing\n"); @@ -422,20 +460,6 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Generating catalog object\n"); - snprintf(buf, 64, - "<<\n/Type /Catalog\n/Pages %d 0 R\n", - root); - strcat(dictionary, buf); - - if (outline != -1) { - snprintf(buf, 64, - "/Outlines %d 0 R\n/PageMode /UseOutlines\n", - outline); - strcat(dictionary, buf); - } - - strcat(dictionary, ">>"); - pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); if ((*param)->stat > 0) @@ -471,8 +495,6 @@ cnki_pdf(cnki_t **param) _pdf_obj_sort(param, &pdf); - _pdf_obj_dedup(param, &pdf); - 
_pdf_dump(param, &pdf); pdf_obj_destroy(&pdf); @@ -510,7 +532,7 @@ cnki_pdf_hn(cnki_t **param) if (root_kid == NULL) return 1; - memset(root_kid, 0, (*param)->file_stat->page); + memset(root_kid, 0, (*param)->file_stat->page * sizeof(int)); object_hn_t *ptr = (*param)->object_hn; while (ptr != NULL) { @@ -828,73 +850,75 @@ cnki_pdf_hn(cnki_t **param) for (int i = 0, j = 0; i < ptr->text_size - 1;) { switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { case 0x8001: - if (ptr->address_next > ptr->address) - strcat(dictionary, "T*\n"); - case 0x8070: - if (ptr->address_next > ptr->address) { - i += 4; + if (ptr->address_next <= ptr->address) { + if (i + 7 >= ptr->text_size) { + i += 2; + break; + } - for (;;) { - if (i + 3 >= ptr->text_size || - (unsigned char) ptr->text[i + 1] == 0x80) - break; + conv_src[0] = ptr->text[i + 7]; + conv_src[1] = ptr->text[i + 6]; - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_size = 6; - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " Tj\n"); + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); } - free(conv_dst); + strcat(dictionary, "> Tj\n"); } - - i += 4; + free(conv_dst); } + i += 8; break; } - if (i + 7 >= ptr->text_size) { - i += 2; + strcat(dictionary, "T*\n"); + case 0x8070: + i += 4; + + if (ptr->address_next <= ptr->address) break; - } - conv_src[0] = ptr->text[i + 7]; - conv_src[1] = ptr->text[i + 6]; + for (;;) { + if (i + 3 >= ptr->text_size || + (unsigned char) ptr->text[i + 1] == 0x80) + break; - //snprintf(buf, 
64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; - conv_size = 6; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); + } + strcat(dictionary, "> Tj\n"); } - strcat(dictionary, "> Tj\n"); + free(conv_dst); } - free(conv_dst); + + i += 4; } - i += 8; break; case 0x800a: if (i + 27 >= ptr->text_size || j >= ptr->image_length) { diff --git a/src/jbig2.c b/src/jbig2.c index 9b3a9be..ea9233c 100644 --- a/src/jbig2.c +++ b/src/jbig2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, yzrh + * Copyright (c) 2022-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -31,5 +31,6 @@ strdec_jbig2(char **bitmap, } jbig2_release_page(ctx, image); + jbig2_ctx_free(ctx); return 0; } diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 781bafa..e6d8ac6 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -79,8 +79,25 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) memset(buf + end - cur, 0, size_buf - end + cur); } - if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) - head = cur + (pos - buf) + 7; + if (head == 0) { + /* Hack needed for invalid object */ + pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6); + tmp = memmem(buf, size_buf, " 0 obj", 6); + + while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b) + tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6); + + if (pos != NULL && tmp != NULL) { + if (pos - buf < tmp - buf) + head = cur + (pos - buf) + 7; + else + head = cur + (tmp - buf) + 6; + } else if (pos != NULL) { + head = cur + (pos - buf) + 7; + } else if (tmp != NULL) { + head = cur + (tmp - buf) + 6; + } + } if (tail == 0 && (pos 
= _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { /* We need to check if it is the object stored in stream */ @@ -156,9 +173,46 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (buf == NULL) return 1; - fseek(*fp, ptr->address - 15, SEEK_SET); + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + /* Hack needed for invalid object */ + while ((tmp = memmem(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 6; + + if (head - buf > 0) { + ptr->address += head - buf; + ptr->size -= head - buf; + + tmp = realloc(buf, ptr->size); + + if (tmp == NULL) + return 1; + + buf = tmp; + + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + } + + /* Hack needed for invalid object */ + fseek(*fp, ptr->address - 14, SEEK_SET); fread(str, 8, 1, *fp); + if (str[7] < '0' || str[7] > '9') { + fseek(*fp, ptr->address - 15, SEEK_SET); + fread(str, 8, 1, *fp); + } + for (int i = 7; i >= 0; i--) { if (str[i] < '0' || str[i] > '9') { if (i < 7) @@ -170,22 +224,35 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) } } - fseek(*fp, ptr->address, SEEK_SET); - fread(buf, ptr->size, 1, *fp); - if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && - (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { - /* - * A dictionary object may have nested dictionary, - * but it should not be in a stream - */ - while ((tmp = _memmem_whitespace(tail + 3, - ptr->size - (tail - buf) - 3, - ">>", 2)) != NULL && - memmem(tail + 3, - ptr->size - (tail - buf) - 3, - "stream\r\n", 8) == NULL) - tail = tmp; + ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || + /* Hack needed for invalid object */ + (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { + if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) { + tail = memmem(buf, ptr->size, 
">>", 2); + + while (ptr->size - (tail - buf) > 2 && + (tmp = memmem(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + (tmp - tail) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; + } else { + /* + * A dictionary object may have nested dictionary, + * but it should not be in a stream + */ + while (ptr->size - (tail - buf) > 3 && + (tmp = _memmem_whitespace(tail + 3, + ptr->size - (tail - buf) - 3, + ">>", 2)) != NULL && + memmem(tail + 3, + (tmp - tail) - 3, + "stream\r\n", 8) == NULL) + tail = tmp; + } ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); diff --git a/src/pdf_writer.c b/src/pdf_writer.c index 465d26b..6afa89b 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -1,19 +1,39 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ #include +#include #include +#include "version.h" #include "md5.h" #include "pdf.h" +static int +_info_obj(pdf_object_t **pdf) +{ + char dictionary[128] = "<<\n" + "/Producer (Melon " VERSION "." RELEASE "." 
PATCH EXTRA ")\n" + "/CreationDate (D:"; + + char buf[64]; + + time_t timestamp = time(NULL); + + strftime(buf, 64, "%Y%m%d%H%M%S", gmtime(&timestamp)); + strcat(dictionary, buf); + strcat(dictionary, "+00'00')\n>>"); + + return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0); +} + int pdf_dump_obj(pdf_object_t **pdf, FILE **fp) { - if (*pdf == NULL || *fp == NULL) + if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0) return 1; long cur; @@ -152,18 +172,11 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) while (ptr->next != NULL) ptr = ptr->next; - /* - * TODO: Document information dictionary - * `"/Producer (Melon)"' - * `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"' - * - * Trailer dictionary - * `"/Info %d 0 R"' - */ fprintf(*fp, - "/Size %d\n/Root %d 0 R\n", + "/Size %d\n/Root %d 0 R\n/Info %d 0 R\n", ptr->id + 1, - pdf_get_catalog_id(pdf)); + pdf_get_catalog_id(pdf), + ptr->id); fputs("/ID [", *fp); diff --git a/src/version.h b/src/version.h index 46eeb34..c3ff314 100644 --- a/src/version.h +++ b/src/version.h @@ -5,6 +5,6 @@ */ #define VERSION "0" -#define RELEASE "2" -#define PATCH "5" +#define RELEASE "3" +#define PATCH "0" #define EXTRA ""