From 1ce3f89574fa7256ab019eefc96a7362165cca52 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 21:10:03 +0000 Subject: [PATCH] Handle combination of text and image in page content. Signed-off-by: yzrh --- src/cnki_pdf.c | 125 ++++++++++++++++++++++++------------------------- src/version.h | 2 +- 2 files changed, 63 insertions(+), 64 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 328a989..3d446e4 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -460,6 +460,9 @@ cnki_pdf_hn(cnki_t **param) if (pdf_obj_create(&pdf) != 0) return 1; + int font = pdf_get_free_id(&pdf); + pdf_obj_append(&pdf, font, NULL, "<<\n/Type /Font\n/Subtype /TrueType\n/BaseFont /NotoSansCJKSC\n>>", NULL, 0); + if ((*param)->stat > 1) printf("Generating PDF object(s)\n"); @@ -480,15 +483,12 @@ cnki_pdf_hn(cnki_t **param) while (ptr != NULL) { /* * External object (ptr->image_length) + - * content object + * resource object + + * content object + * page object */ int *ids = NULL; - if (ptr->image_length > 0) - pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); - else - pdf_get_free_ids(&pdf, &ids, 2); + pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); int bitmap_size; char *bitmap; @@ -721,19 +721,39 @@ cnki_pdf_hn(cnki_t **param) if ((*param)->stat > 2) printf("Not extracted.\n"); - pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); + pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0); } else { if ((*param)->stat > 2) printf("Unsupported format.\n"); - pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); + pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0); } } - if (ptr->image_length > 0) { - memset(dictionary, 0, dictionary_size); + if (ptr->image_length > 0) + free(dictionary); - strcat(dictionary, "<<\n/XObject <<"); + dictionary_size = 128 + 2 * ptr->text_size + 128 * ptr->image_length; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + 
+ strcat(dictionary, "<<\n"); + + if (ptr->text_size > 0) { + snprintf(buf, 64, "/Font <</F0 %d 0 R>>\n", font); + strcat(dictionary, buf); + } + + if (ptr->image_length > 0) { + strcat(dictionary, "/XObject <<"); for (int i = 0; i < ptr->image_length; i++) { snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); @@ -743,13 +763,15 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, " "); } - strcat(dictionary, ">>\n>>"); - - pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); - - free(dictionary); + strcat(dictionary, ">>\n"); } + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + + memset(dictionary, 0, dictionary_size); + int conv_size; char *conv_dst; char conv_src[2]; @@ -766,20 +788,10 @@ cnki_pdf_hn(cnki_t **param) ptr->text = stream; } - dictionary_size = 64 + 2 * ptr->text_size; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; - } - - memset(dictionary, 0, dictionary_size); - strcat(dictionary, "BT\n"); + strcat(dictionary, "/F0 10 Tf\n"); + for (int i = 0, j = 0; i < ptr->text_size - 1;) { switch ((uint16_t) (ptr->text[i + 1] << 8 | ptr->text[i])) { case 0x8001: @@ -801,6 +813,9 @@ cnki_pdf_hn(cnki_t **param) conv_src[0] = ptr->text[i + 3]; conv_src[1] = ptr->text[i + 2]; + //snprintf(buf, 64, "%f %f Td\n"); + //strcat(dictionary, buf); + conv_size = 6; if (strconv(&conv_dst, "UTF-16BE", @@ -831,6 +846,9 @@ cnki_pdf_hn(cnki_t **param) conv_src[0] = ptr->text[i + 7]; conv_src[1] = ptr->text[i + 6]; + //snprintf(buf, 64, "%f %f Td\n"); + //strcat(dictionary, buf); + conv_size = 6; if (strconv(&conv_dst, "UTF-16BE", @@ -885,23 +903,11 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "ET"); - /* FIXME: Use the text somehow? 
*/ - free(dictionary); - } - - dictionary_size = 128 + 128 * ptr->image_length; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; + if (ptr->image_length > 0) + strcat(dictionary, "\n"); } if (ptr->image_length > 0) { - memset(dictionary, 0, dictionary_size); - char resize_str[64] = "0.25 0 0 0.25 0 0 cm\n"; double resize_x = 1; double resize_y = 1; @@ -970,10 +976,13 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "\n"); } + free(dim); + } + + if (strlen(dictionary) > 0) { if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { free(root_kid); free(ids); - free(dim); free(dictionary); return 1; } @@ -993,6 +1002,9 @@ cnki_pdf_hn(cnki_t **param) NULL, dictionary, stream, stream_size); free(stream); + } else { + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + "null", NULL, NULL, 0); } memset(dictionary, 0, dictionary_size); @@ -1002,32 +1014,19 @@ cnki_pdf_hn(cnki_t **param) /* A4 paper */ strcat(dictionary, "/MediaBox [0 0 595.2756 841.8898]\n"); - if (ptr->image_length > 0) { - free(dim); + snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); - snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); - strcat(dictionary, buf); + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); + strcat(dictionary, buf); - snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); - strcat(dictionary, buf); + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); - /* Add /Parent when we know root */ - pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); - - root_kid[cnt++] = ids[ptr->image_length + 2]; - } else { - snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length]); - strcat(dictionary, buf); - - /* Add /Parent when we know root */ - pdf_obj_append(&pdf, ids[ptr->image_length + 1], NULL, dictionary, NULL, 
0); - - root_kid[cnt++] = ids[ptr->image_length + 1]; - } - - free(dictionary); + root_kid[cnt++] = ids[ptr->image_length + 2]; free(ids); + free(dictionary); ptr = ptr->next; } diff --git a/src/version.h b/src/version.h index db25a27..08b70ea 100644 --- a/src/version.h +++ b/src/version.h @@ -6,5 +6,5 @@ #define VERSION "0" #define RELEASE "2" -#define PATCH "2" +#define PATCH "3" #define EXTRA ""