From 8276423eb8395eae3e3002442307272eff1c9e8f Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 13:51:13 +0000 Subject: [PATCH 01/10] Prioritise incomplete object during deduplication. Signed-off-by: yzrh --- CHANGE.md | 3 +++ src/cnki_pdf.c | 30 ++++++++++++++++++++---------- src/pdf_parser.c | 25 ++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index 063d93a..3e05e10 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,6 +2,8 @@ ================== * Support HN text overlay. +* Handle invalid PDF object token in CAJ and KDH. +* Handle inaccuracy page count in CAJ and KDH. 0.2.5 (2023-01-XX) ================== @@ -9,6 +11,7 @@ * Improve PDF parser. * Handle duplicated object in CAJ. * Handle duplicated image in HN. +* Handle incomplete PDF object in CAJ and KDH. * Fix JBIG decoder. 0.2.4 (2022-12-31) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index af38aa6..76931ea 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) printf("Deleting duplicated object\n"); ptr = *pdf; - while (ptr != NULL && ptr->next != NULL) { - if (ptr->id == ptr->next->id) { - pdf_get_obj(&ptr, ptr->id, &tmp); - pdf_obj_del(&ptr, ptr->id); + while (ptr->next != NULL && ptr->next->next != NULL) { + if (ptr->next->id == ptr->next->next->id) { + /* Keep the bigger one, the smaller one is usually incomplete */ + if (ptr->next->size < ptr->next->next->size) { + pdf_get_obj(&ptr, ptr->next->id, &tmp); + pdf_obj_del(&ptr, ptr->next->id); + } else { + pdf_get_obj(&ptr->next, ptr->next->id, &tmp); + pdf_obj_del(&ptr->next, ptr->next->id); + } tmp->next = NULL; pdf_obj_destroy(&tmp); @@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) ret++; if ((*param)->stat > 1) - printf("Deleted duplicated object %d.\n", ptr->id); + printf("Deleted duplicated object %d.\n", ptr->next->id); + + continue; } ptr = ptr->next; @@ -236,6 +244,10 @@ cnki_pdf(cnki_t **param) 
printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); + pdf_obj_sort(&pdf); + + _pdf_obj_dedup(param, &pdf); + int dictionary_size; char *dictionary; @@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "]\n/Count %d\n>>", - pdf_get_kid_count(&pdf, parent[i])); + pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]); strcat(dictionary, buf); pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); @@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param) } else { for (int i = 0; i < parent[0]; i++) if (parent_missing[i] == 1) - root = i; + root = parent[i + 1]; } if (root == 0) @@ -471,8 +483,6 @@ cnki_pdf(cnki_t **param) _pdf_obj_sort(param, &pdf); - _pdf_obj_dedup(param, &pdf); - _pdf_dump(param, &pdf); pdf_obj_destroy(&pdf); @@ -510,7 +520,7 @@ cnki_pdf_hn(cnki_t **param) if (root_kid == NULL) return 1; - memset(root_kid, 0, (*param)->file_stat->page); + memset(root_kid, 0, (*param)->file_stat->page * sizeof(int)); object_hn_t *ptr = (*param)->object_hn; while (ptr != NULL) { diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 781bafa..ed7bfba 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -183,7 +183,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && memmem(tail + 3, - ptr->size - (tail - buf) - 3, + (tmp - tail) - 3, "stream\r\n", 8) == NULL) tail = tmp; @@ -226,8 +226,27 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) free(buf); } else { - ptr->object_size = ptr->size; - ptr->object = buf; + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + if (head - buf > 0) { + ptr->object_size = ptr->size - (head - buf); + ptr->object = malloc(ptr->object_size); + + if (ptr->object == NULL) + return 1; + + memcpy(ptr->object, head, ptr->object_size); + + free(buf); + } else { + ptr->object_size = ptr->size; + ptr->object = buf; + } } ptr = ptr->next; 
From 8cd8a8fbbadaeee6563d6cb5d7c648570d78b2fc Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:07:57 +0000 Subject: [PATCH 02/10] Replace catalog object if found. Signed-off-by: yzrh --- src/cnki_pdf.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 76931ea..87fe3f6 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -244,10 +244,6 @@ cnki_pdf(cnki_t **param) printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); - pdf_obj_sort(&pdf); - - _pdf_obj_dedup(param, &pdf); - int dictionary_size; char *dictionary; @@ -262,6 +258,10 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); + pdf_obj_sort(&pdf); + + _pdf_obj_dedup(param, &pdf); + int8_t *parent_missing; int *kid; @@ -419,6 +419,20 @@ cnki_pdf(cnki_t **param) int outline = _pdf_cnki_outline(param, &pdf); + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (outline != -1) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + outline); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>"); + if ((*param)->stat > 1) printf("Searching for catalog object\n"); @@ -427,6 +441,14 @@ cnki_pdf(cnki_t **param) if (catalog != 0) { if ((*param)->stat > 0) printf("Catalog object is %d.\n", catalog); + + if ((*param)->stat > 1) + printf("Replacing catalog object\n"); + + pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); + + if ((*param)->stat > 0) + printf("Replaced catalog object\n"); } else { if ((*param)->stat > 0) printf("Catalog object is missing\n"); @@ -434,20 +456,6 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Generating catalog object\n"); - snprintf(buf, 64, - "<<\n/Type /Catalog\n/Pages %d 0 R\n", - root); - strcat(dictionary, buf); - - if (outline != -1) { - snprintf(buf, 64, - "/Outlines %d 0 R\n/PageMode /UseOutlines\n", - outline); - 
strcat(dictionary, buf); - } - - strcat(dictionary, ">>"); - pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); if ((*param)->stat > 0) From c2afbb3cbc947dec4d2878c9c3608306039f9c8b Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:19:06 +0000 Subject: [PATCH 03/10] Handle invalid PDF object. Signed-off-by: yzrh --- src/pdf_parser.c | 99 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index ed7bfba..70d72d5 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -79,8 +79,25 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) memset(buf + end - cur, 0, size_buf - end + cur); } - if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) - head = cur + (pos - buf) + 7; + if (head == 0) { + /* Hack needed for invalid object */ + pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6); + tmp = memmem(buf, size_buf, " 0 obj", 6); + + while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b) + tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6); + + if (pos != NULL && tmp != NULL) { + if (pos - buf < tmp - buf) + head = cur + (pos - buf) + 7; + else + head = cur + (tmp - buf) + 6; + } else if (pos != NULL) { + head = cur + (pos - buf) + 7; + } else if (tmp != NULL) { + head = cur + (tmp - buf) + 6; + } + } if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { /* We need to check if it is the object stored in stream */ @@ -156,9 +173,46 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (buf == NULL) return 1; - fseek(*fp, ptr->address - 15, SEEK_SET); + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + /* Hack needed for invalid object */ + while ((tmp = memmem(head, + ptr->size - (head - buf), + " 0 obj", 6)) 
!= NULL) + head = tmp + 6; + + if (head - buf > 0) { + ptr->address += head - buf; + ptr->size -= head - buf; + + tmp = realloc(buf, ptr->size); + + if (tmp == NULL) + return 1; + + buf = tmp; + + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + } + + /* Hack needed for invalid object */ + fseek(*fp, ptr->address - 14, SEEK_SET); fread(str, 8, 1, *fp); + if (str[7] < '0' || str[7] > '9') { + fseek(*fp, ptr->address - 15, SEEK_SET); + fread(str, 8, 1, *fp); + } + for (int i = 7; i >= 0; i--) { if (str[i] < '0' || str[i] > '9') { if (i < 7) @@ -170,11 +224,10 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) } } - fseek(*fp, ptr->address, SEEK_SET); - fread(buf, ptr->size, 1, *fp); - if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && - (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { + ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || + /* Hack needed for invalid object */ + (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { /* * A dictionary object may have nested dictionary, * but it should not be in a stream @@ -187,6 +240,15 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) "stream\r\n", 8) == NULL) tail = tmp; + /* Hack needed for invalid object */ + while ((tmp = memmem(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + (tmp - tail) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; + ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); @@ -226,27 +288,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) free(buf); } else { - /* Handle incomplete object */ - head = buf; - while ((tmp = _memmem_whitespace(head, - ptr->size - (head - buf), - " 0 obj", 6)) != NULL) - head = tmp + 7; - - if (head - buf > 0) { - ptr->object_size = ptr->size - (head - buf); - ptr->object = malloc(ptr->object_size); - - if (ptr->object == NULL) - return 1; - - memcpy(ptr->object, head, ptr->object_size); - - free(buf); - } else { - 
ptr->object_size = ptr->size; - ptr->object = buf; - } + ptr->object_size = ptr->size; + ptr->object = buf; } ptr = ptr->next; From 56ffe14d5a8c10163850d3e682c67a282d9b8abc Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:29:07 +0000 Subject: [PATCH 04/10] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGE.md b/CHANGE.md index 3e05e10..4b5a830 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,7 +2,6 @@ ================== * Support HN text overlay. -* Handle invalid PDF object token in CAJ and KDH. * Handle inaccuracy page count in CAJ and KDH. 0.2.5 (2023-01-XX) @@ -12,6 +11,7 @@ * Handle duplicated object in CAJ. * Handle duplicated image in HN. * Handle incomplete PDF object in CAJ and KDH. +* Handle invalid PDF object token in CAJ and KDH. * Fix JBIG decoder. 0.2.4 (2022-12-31) From a7ecc156141b15cfae8b309697e12deae2740841 Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:50:25 +0000 Subject: [PATCH 05/10] Replace catalog object only if root object does not exist. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 16 ++++++++++------ src/pdf_parser.c | 6 ++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 87fe3f6..0c1ebb0 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -375,9 +375,11 @@ cnki_pdf(cnki_t **param) printf("Root object is %d.\n", root); } + int root_gen; + pdf_object_t *tmp; - if (pdf_get_obj(&pdf, root, &tmp) != 0) { + if ((root_gen = pdf_get_obj(&pdf, root, &tmp)) != 0) { if ((*param)->stat > 0) printf("Root object is missing\n"); @@ -442,13 +444,15 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Catalog object is %d.\n", catalog); - if ((*param)->stat > 1) - printf("Replacing catalog object\n"); + if (root_gen != 0) { + if ((*param)->stat > 1) + printf("Replacing catalog object\n"); - pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); + pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); - if ((*param)->stat > 0) - printf("Replaced catalog object\n"); + if ((*param)->stat > 0) + printf("Replaced catalog object\n"); + } } else { if ((*param)->stat > 0) printf("Catalog object is missing\n"); diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 70d72d5..6520fd5 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -232,7 +232,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) * A dictionary object may have nested dictionary, * but it should not be in a stream */ - while ((tmp = _memmem_whitespace(tail + 3, + while (ptr->size - (tail - buf) > 3 && + (tmp = _memmem_whitespace(tail + 3, ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && memmem(tail + 3, @@ -241,7 +242,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) tail = tmp; /* Hack needed for invalid object */ - while ((tmp = memmem(tail + 2, + while (ptr->size - (tail - buf) > 2 && + (tmp = memmem(tail + 2, ptr->size - (tail - buf) - 2, ">>", 2)) != NULL && memmem(tail + 2, From 13cb0a1b8dd7cdee9af519dab10d6b1c1036c321 Mon Sep 17 00:00:00 2001 From: yzrh Date: 
Thu, 5 Jan 2023 11:21:54 +0000 Subject: [PATCH 06/10] Fix invalid token parsing. Signed-off-by: yzrh --- src/pdf_parser.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 6520fd5..e6d8ac6 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -228,28 +228,31 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || /* Hack needed for invalid object */ (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { - /* - * A dictionary object may have nested dictionary, - * but it should not be in a stream - */ - while (ptr->size - (tail - buf) > 3 && - (tmp = _memmem_whitespace(tail + 3, - ptr->size - (tail - buf) - 3, - ">>", 2)) != NULL && - memmem(tail + 3, - (tmp - tail) - 3, - "stream\r\n", 8) == NULL) - tail = tmp; + if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) { + tail = memmem(buf, ptr->size, ">>", 2); - /* Hack needed for invalid object */ - while (ptr->size - (tail - buf) > 2 && - (tmp = memmem(tail + 2, - ptr->size - (tail - buf) - 2, - ">>", 2)) != NULL && - memmem(tail + 2, - (tmp - tail) - 2, - "stream\r\n", 8) == NULL) - tail = tmp; + while (ptr->size - (tail - buf) > 2 && + (tmp = memmem(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + (tmp - tail) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; + } else { + /* + * A dictionary object may have nested dictionary, + * but it should not be in a stream + */ + while (ptr->size - (tail - buf) > 3 && + (tmp = _memmem_whitespace(tail + 3, + ptr->size - (tail - buf) - 3, + ">>", 2)) != NULL && + memmem(tail + 3, + (tmp - tail) - 3, + "stream\r\n", 8) == NULL) + tail = tmp; + } ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); From 283446dba5ce6e1a61b46377999a45f0b85a6937 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 5 Jan 2023 17:32:13 +0000 Subject: [PATCH 
07/10] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index 4b5a830..9071c91 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,9 +2,10 @@ ================== * Support HN text overlay. -* Handle inaccuracy page count in CAJ and KDH. +* Support HN page with text. +* Handle inaccurate page count in CAJ and KDH. -0.2.5 (2023-01-XX) +0.2.5 (2023-01-05) ================== * Improve PDF parser. From 123d62141cce0cbeb2ae6eb80b669af7db1c8c72 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 5 Jan 2023 19:13:37 +0000 Subject: [PATCH 08/10] Add document information dictionary to output. Signed-off-by: yzrh --- src/pdf_writer.c | 37 +++++++++++++++++++++++++------------ src/version.h | 4 ++-- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/pdf_writer.c b/src/pdf_writer.c index 465d26b..6afa89b 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -1,19 +1,39 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ #include +#include #include +#include "version.h" #include "md5.h" #include "pdf.h" +static int +_info_obj(pdf_object_t **pdf) +{ + char dictionary[128] = "<<\n" + "/Producer (Melon " VERSION "." RELEASE "." 
PATCH EXTRA ")\n" + "/CreationDate (D:"; + + char buf[64]; + + time_t timestamp = time(NULL); + + strftime(buf, 64, "%Y%m%d%H%M%S", gmtime(&timestamp)); + strcat(dictionary, buf); + strcat(dictionary, "+00'00')\n>>"); + + return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0); +} + int pdf_dump_obj(pdf_object_t **pdf, FILE **fp) { - if (*pdf == NULL || *fp == NULL) + if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0) return 1; long cur; @@ -152,18 +172,11 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) while (ptr->next != NULL) ptr = ptr->next; - /* - * TODO: Document information dictionary - * `"/Producer (Melon)"' - * `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"' - * - * Trailer dictionary - * `"/Info %d 0 R"' - */ fprintf(*fp, - "/Size %d\n/Root %d 0 R\n", + "/Size %d\n/Root %d 0 R\n/Info %d 0 R\n", ptr->id + 1, - pdf_get_catalog_id(pdf)); + pdf_get_catalog_id(pdf), + ptr->id); fputs("/ID [", *fp); diff --git a/src/version.h b/src/version.h index 46eeb34..c3ff314 100644 --- a/src/version.h +++ b/src/version.h @@ -5,6 +5,6 @@ */ #define VERSION "0" -#define RELEASE "2" -#define PATCH "5" +#define RELEASE "3" +#define PATCH "0" #define EXTRA "" From dd5854678cfe7bab499925175b5b40314d71fede Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 6 Jan 2023 12:00:01 +0000 Subject: [PATCH 09/10] Fix JBIG2 allocation. Signed-off-by: yzrh --- src/jbig2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/jbig2.c b/src/jbig2.c index 9b3a9be..ea9233c 100644 --- a/src/jbig2.c +++ b/src/jbig2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, yzrh + * Copyright (c) 2022-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -31,5 +31,6 @@ strdec_jbig2(char **bitmap, } jbig2_release_page(ctx, image); + jbig2_ctx_free(ctx); return 0; } From 2fa2b760aef552982250dad346bd255be08cd9bb Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 14 Jan 2023 23:52:28 +0000 Subject: [PATCH 10/10] Fix HN text parsing. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 96 ++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 0c1ebb0..d96ea49 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -850,73 +850,75 @@ cnki_pdf_hn(cnki_t **param) for (int i = 0, j = 0; i < ptr->text_size - 1;) { switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { case 0x8001: - if (ptr->address_next > ptr->address) - strcat(dictionary, "T*\n"); - case 0x8070: - if (ptr->address_next > ptr->address) { - i += 4; + if (ptr->address_next <= ptr->address) { + if (i + 7 >= ptr->text_size) { + i += 2; + break; + } - for (;;) { - if (i + 3 >= ptr->text_size || - (unsigned char) ptr->text[i + 1] == 0x80) - break; + conv_src[0] = ptr->text[i + 7]; + conv_src[1] = ptr->text[i + 6]; - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_size = 6; - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " Tj\n"); + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); } - free(conv_dst); + strcat(dictionary, "> Tj\n"); } - - i += 4; + free(conv_dst); } + i += 8; break; } - if (i + 7 >= ptr->text_size) { - i += 2; + strcat(dictionary, "T*\n"); + case 0x8070: + i += 4; + + if (ptr->address_next <= ptr->address) break; - } - conv_src[0] = ptr->text[i + 7]; - conv_src[1] = ptr->text[i + 6]; + for (;;) { + if (i + 3 >= ptr->text_size || + (unsigned char) ptr->text[i + 1] == 0x80) + break; - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, 
buf); + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; - conv_size = 6; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); + } + strcat(dictionary, "> Tj\n"); } - strcat(dictionary, "> Tj\n"); + free(conv_dst); } - free(conv_dst); + + i += 4; } - i += 8; break; case 0x800a: if (i + 27 >= ptr->text_size || j >= ptr->image_length) {