From 1f62c53da6edc5a82a1e0eceb401b2274cd4a0d1 Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 30 Dec 2020 17:06:55 +0000 Subject: [PATCH] Produce PDF directly from KDH. --- README.md | 6 +---- src/cnki_kdh.c | 23 ++++++++++++++--- src/cnki_pdf.c | 66 ++++++++++++++++++++++++++++++++++++------------ src/pdf.h | 1 + src/pdf_get.c | 21 +++++++++++++++ src/pdf_parser.c | 10 ++++++-- src/pdf_writer.c | 4 +-- 7 files changed, 103 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 9bd01af..fbc3d66 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,11 @@ Melon: Converter that produces PDF from CNKI proprietary formats Development ----------- -Currently, PDF, CAJ, and KDH can be converted. Please report +Currently, CAJ and KDH can be converted. Please report any failures with a sample that can reproduce the behaviour. HN support is being worked on. -KDH is essentially an invalid PDF file xor'ed with a predetermined key. -You may want to convert the decrypted KDH to valid PDF, although some -PDF readers can display the invalid PDF. - Usage ===== diff --git a/src/cnki_kdh.c b/src/cnki_kdh.c index 450242d..78e8957 100644 --- a/src/cnki_kdh.c +++ b/src/cnki_kdh.c @@ -27,6 +27,11 @@ cnki_kdh(cnki_t **param) char buf[(*param)->size_buf]; + FILE *tmp = tmpfile(); + + if (tmp == NULL) + return 1; + for (;;) { fread(buf, (*param)->size_buf, 1, (*param)->fp_i); @@ -35,15 +40,27 @@ cnki_kdh(cnki_t **param) key_cur++; } - fwrite(buf, (*param)->size_buf, 1, (*param)->fp_o); + fwrite(buf, (*param)->size_buf, 1, tmp); if (ftell((*param)->fp_i) == size) break; } if ((*param)->stat > 0) - printf("Decryption ended total %ld byte(s) written\n", - ftell((*param)->fp_o)); + printf("Decrypted %ld byte(s)\n", ftell(tmp)); + + fseek(tmp, 0, SEEK_SET); + + FILE *orig = (*param)->fp_i; + (*param)->fp_i = tmp; + + cnki_pdf(param); + + (*param)->fp_i = orig; + fclose(tmp); + + if ((*param)->stat > 0) + printf("Conversion ended\n"); return 0; } diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 6964cac..d8eabff 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -232,29 +232,63 @@ cnki_pdf(cnki_t **param) } if ((*param)->stat > 1) - printf("Generating '/Catalog' dictionary\n"); + printf("Searching for catalog object\n"); - snprintf(buf, 64, - "<<\n/Type /Catalog\n/Pages %d 0 R\n", - root); - strcat(dictionary, buf); + int catalog = pdf_get_catalog_id(&pdf); + + if (catalog != 0) { + if ((*param)->stat > 0) + printf("catalog object is %d.\n", catalog); + } else { + if ((*param)->stat > 0) + printf("catalog object is missing\n"); + + if ((*param)->stat > 1) + printf("Generating catalog object\n"); - if (ids != NULL) { snprintf(buf, 64, - "/Outlines %d 0 R\n/PageMode /UseOutlines\n", - ids[0]); + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); strcat(dictionary, buf); + + if (ids != NULL) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + ids[0]); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>\n"); + + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL); + + if ((*param)->stat > 0) + printf("Generated catalog object\n"); } - strcat(dictionary, ">>\n"); + if ((*param)->stat > 1) + printf("Searching for xref object\n"); - pdf_obj_append(&pdf, 0, NULL, dictionary, NULL); + int xref = pdf_get_xref_id(&pdf); + + if (xref != 0) { + if ((*param)->stat > 0) + printf("xref object is %d.\n", xref); + + if ((*param)->stat > 1) + printf("Deleting xref object\n"); + + pdf_obj_del(&pdf, xref); + + if ((*param)->stat > 0) + printf("Deleted xref object\n"); + } else { + if ((*param)->stat > 0) + printf("xref object is missing\n"); + } free(dictionary); - if ((*param)->stat > 0) - printf("Generated '/Catalog' dictionary\n"); - if ((*param)->stat > 1) printf("Sorting object(s)\n"); @@ -312,7 +346,7 @@ cnki_pdf(cnki_t **param) pdf_get_count(&pdf), ftell((*param)->fp_o)); - long xref = ftell((*param)->fp_o); + long cur_xref = ftell((*param)->fp_o); if ((*param)->stat > 1) printf("Writing cross-reference table\n"); @@ -323,7 +357,7 @@ cnki_pdf(cnki_t **param) } else { if ((*param)->stat > 0) printf("Cross-reference table %ld byte(s) written\n", - ftell((*param)->fp_o) - xref); + ftell((*param)->fp_o) - cur_xref); } if ((*param)->stat > 1) @@ -332,7 +366,7 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) cur = ftell((*param)->fp_o); - if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) { + if (pdf_dump_trailer(&pdf, &(*param)->fp_o, cur_xref) != 0) { if ((*param)->stat > 0) printf("Trailer not written\n"); } else { diff --git a/src/pdf.h b/src/pdf.h index 394da5a..a5630d8 100644 --- a/src/pdf.h +++ b/src/pdf.h @@ -54,6 +54,7 @@ int pdf_get_size(pdf_object_t **pdf); int pdf_get_free_id(pdf_object_t **pdf); int pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count); int pdf_get_catalog_id(pdf_object_t **pdf); +int pdf_get_xref_id(pdf_object_t **pdf); int pdf_get_parent_id(pdf_object_t **pdf, int **id); int pdf_get_kid_id(pdf_object_t **pdf, int id, int **kid); int pdf_get_kid_count(pdf_object_t **pdf, int id); diff --git a/src/pdf_get.c b/src/pdf_get.c index a441b1e..95d5b66 100644 --- a/src/pdf_get.c +++ b/src/pdf_get.c @@ -161,6 +161,27 @@ pdf_get_catalog_id(pdf_object_t **pdf) return catalog_id; } +int +pdf_get_xref_id(pdf_object_t **pdf) +{ + if (*pdf == NULL) + return 1; + + int xref_id = 0; + + pdf_object_t *ptr = (*pdf)->next; + + while (ptr != NULL) { + if (ptr->dictionary != NULL && + strstr(ptr->dictionary, "/XRef") != NULL) + xref_id = ptr->id; + + ptr = ptr->next; + } + + return xref_id; +} + int pdf_get_parent_id(pdf_object_t **pdf, int **id) { diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 9c361a2..9531d28 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -154,10 +154,16 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { - /* A dictionary object may have nested dictionary */ + /* + * A dictionary object may have nested dictionary, + * but it should not be in a stream + */ while ((tmp = _memmem_whitespace(tail + 2, ptr->size - (tail - buf) - 2, - ">>", 2)) != NULL) + ">>", 2)) != NULL && + memmem(tail + 2, + ptr->size - (tail - buf) - 2, + "stream\r\n", 8) == NULL) tail = tmp; ptr->dictionary_size = tail - head + 2; diff --git a/src/pdf_writer.c b/src/pdf_writer.c index 8d5fc16..cda998a 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -27,10 +27,10 @@ pdf_dump_obj(pdf_object_t **pdf, FILE **fp) fprintf(*fp, "%d 0 obj\n", ptr->id); if (ptr->dictionary != NULL) { - fputs(ptr->dictionary, *fp); + fwrite(ptr->dictionary, ptr->dictionary_size, 1, *fp); fputs("\n", *fp); } else if (ptr->object != NULL) { - fputs(ptr->object, *fp); + fwrite(ptr->object, ptr->object_size, 1, *fp); fputs("\n", *fp); } else if (ptr->stream == NULL) { fputs("null\n", *fp);