diff --git a/README.md b/README.md index 471282e..b94a3e7 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,9 @@ Specify output file Set buffer size (default 512k) -v, --verbose -Print more information (twice for even more, three times for HN image decoding information as well) +Print more information (twice for even more, three times for HN image processing information as well) Thanks ====== -This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf) +This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf) diff --git a/src/cnki.c b/src/cnki.c index cc49d73..8c2e6e6 100644 --- a/src/cnki.c +++ b/src/cnki.c @@ -138,7 +138,7 @@ cnki_info(cnki_t **param) if ((*param)->file_stat->outline > 0) { if ((*param)->stat > 1) { printf("Loading outline(s)\n"); - printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n", + printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n", "title", "hierarchy", "page", diff --git a/src/cnki.h b/src/cnki.h index 193e69b..e9cc5d1 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -58,6 +58,10 @@ typedef struct _hn_image_t { int32_t format; /* hn_code */ int32_t address; int32_t size; + int16_t x; + int16_t y; + int16_t w; + int16_t h; char *image; } hn_image_t; diff --git a/src/cnki_hn.c b/src/cnki_hn.c index 4d32092..c2f76ec 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -93,6 +93,10 @@ cnki_hn(cnki_t **param) fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); + ptr->image_data[i].x = 0; + ptr->image_data[i].y = 0; + ptr->image_data[i].w = 0; + ptr->image_data[i].h = 0; fseek((*param)->fp_i, ptr->image_data[i].address + ptr->image_data[i].size, SEEK_SET); diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 603ce01..0cb30ca 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -524,7 +524,7 @@ cnki_pdf_hn(cnki_t **param) "/Subtype /Image\n"); if ((*param)->stat > 2) - printf("\tDecoding data, page %04d item %02d format %d... ", + printf("\tProcessing image, page %04d item %d format %d... ", ptr->page, i, ptr->image_data[i].format); switch (ptr->image_data[i].format) { @@ -700,7 +700,7 @@ cnki_pdf_hn(cnki_t **param) snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); strcat(dictionary, buf); - if (i + 1 < ptr->image_length) + if (i < ptr->image_length - 1) strcat(dictionary, " "); } @@ -739,43 +739,112 @@ cnki_pdf_hn(cnki_t **param) memset(dictionary, 0, dictionary_size); - strcat(dictionary, "text_size; i += 6) { - if (i + 5 >= ptr->text_size) - break; + for (int i = 0, j = 0; i < ptr->text_size - 1;) { + switch ((uint16_t) (ptr->text[i + 1] << 8 | ptr->text[i])) { + case 0x8001: + if (ptr->address_next <= ptr->address) { + i += 2; + break; + } - conv_src[0] = ptr->text[i + 5]; - conv_src[1] = ptr->text[i + 4]; + strcat(dictionary, "T*\n"); + case 0x8070: + if (ptr->address_next > ptr->address) { + i += 4; - if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) { - strcat(dictionary, "a389"); - continue; - } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) { - strcat(dictionary, "a38a"); - continue; - } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) { - strcat(dictionary, "a38d"); - continue; - } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) { - strcat(dictionary, "a3a0"); - continue; - } + for (;;) { + if (i + 3 >= ptr->text_size || + (unsigned char) ptr->text[i + 1] == 0x80) + break; - conv_size = 6; + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - for (int j = 0; j < conv_size - 2; j++) { - snprintf(conv_hex, 3, - "%02x", (unsigned char) conv_dst[j]); - strcat(dictionary, conv_hex); - } - free(conv_dst); + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, " Tj\n"); + } + free(conv_dst); + } + + i += 4; + } + + break; + } + + if (i + 7 >= ptr->text_size) { + i += 2; + break; + } + + conv_src[0] = ptr->text[i + 7]; + conv_src[1] = ptr->text[i + 6]; + + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, " Tj\n"); + } + free(conv_dst); + } + + i += 8; + break; + case 0x800a: + if (i + 27 >= ptr->text_size || j >= ptr->image_length) { + i += 2; + break; + } + + if (ptr->image_length > 0) { + ptr->image_data[j].x = + ptr->text[i + 5] << 8 | ptr->text[i + 4]; + ptr->image_data[j].y = + ptr->text[i + 7] << 8 | ptr->text[i + 6]; + ptr->image_data[j].w = + ptr->text[i + 9] << 8 | ptr->text[i + 8]; + ptr->image_data[j].h = + ptr->text[i + 11] << 8 | ptr->text[i + 10]; + + if ((*param)->stat > 2) + printf("\tItem %d: origin (%4d, %4d), width %4d, height %4d\n", + j, + ptr->image_data[j].x, + ptr->image_data[j].y, + ptr->image_data[j].w, + ptr->image_data[j].h); + } + + i += 28; + j++; + break; + default: + i += 4; + break; } } - strcat(dictionary, ">"); + strcat(dictionary, "ET"); /* FIXME: Use the text somehow? */ free(dictionary); @@ -794,20 +863,14 @@ cnki_pdf_hn(cnki_t **param) if (ptr->image_length > 0) { memset(dictionary, 0, dictionary_size); - strcat(dictionary, "q\n"); - - strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); - - double resize_x; - double resize_y; - - for (int i = 0; i < ptr->image_length; i++) { - if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) - continue; + char resize_str[64] = "0.25 0 0 0.25 0 0 cm\n"; + double resize_x = 1; + double resize_y = 1; + if (dim[0] > 0 && dim[1] > 0) { /* Scale within bound of A4 paper */ - resize_x = 595.276 * 4 / dim[i * 2]; - resize_y = 841.89 * 4 / dim[i * 2 + 1]; + resize_x = 4 * 595.2756 / dim[0]; + resize_y = 4 * 841.8898 / dim[1]; if (resize_y < resize_x) snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", @@ -815,9 +878,18 @@ cnki_pdf_hn(cnki_t **param) else snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", resize_x, resize_x); - strcat(dictionary, buf); + strcat(resize_str, buf); + } - /* Apply transformation matrix */ + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; + + strcat(dictionary, "q\n"); + + strcat(dictionary, resize_str); + + /* Rotate image */ if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { snprintf(buf, 64, "1 0 0 1 0 %d cm\n", dim[i * 2 + 1]); @@ -826,15 +898,38 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "1 0 0 -1 0 0 cm\n"); } + /* Translate figure */ + if (i > 0) { + double origin_x = 0.4043745 * ptr->image_data[i].x; + double origin_y = 0.4043561 * ptr->image_data[i].y; + + if (origin_x < 0) + origin_x += (2381.102 - dim[i * 2]) / 2; + + if (origin_y < 0) + origin_y += (3367.559 + dim[i * 2 + 1]) / 2; + + if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) + origin_y = -3367.559 + origin_y + dim[i * 2 + 1]; + else + origin_y = 3367.559 - origin_y - dim[i * 2 + 1]; + + snprintf(buf, 64, "1 0 0 1 %f %f cm\n", origin_x, origin_y); + strcat(dictionary, buf); + } + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", dim[i * 2], dim[i * 2 + 1]); strcat(dictionary, buf); snprintf(buf, 64, "/Im%d Do\n", i); strcat(dictionary, buf); - } - strcat(dictionary, "Q"); + strcat(dictionary, "Q"); + + if (i < ptr->image_length - 1) + strcat(dictionary, "\n"); + } if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { free(root_kid); @@ -866,7 +961,7 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "<<\n/Type /Page\n"); /* A4 paper */ - strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n"); + strcat(dictionary, "/MediaBox [0 0 595.2756 841.8898]\n"); if (ptr->image_length > 0) { free(dim); @@ -946,7 +1041,7 @@ cnki_pdf_hn(cnki_t **param) for (int i = 0; i < (*param)->file_stat->page; i++) { snprintf(buf, 64, "%d 0 R", root_kid[i]); strcat(dictionary, buf); - if (i + 1 < (*param)->file_stat->page) + if (i < (*param)->file_stat->page - 1) strcat(dictionary, " "); } diff --git a/src/version.h b/src/version.h index 7c1ca3b..db25a27 100644 --- a/src/version.h +++ b/src/version.h @@ -6,5 +6,5 @@ #define VERSION "0" #define RELEASE "2" -#define PATCH "1" +#define PATCH "2" #define EXTRA ""