Add preliminary support for HN figure placement.

Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
yzrh 2022-12-26 03:46:01 +00:00
parent 224a09a015
commit abce2fd2e4
6 changed files with 156 additions and 53 deletions

View file

@ -39,9 +39,9 @@ Specify output file
Set buffer size (default 512k)
-v, --verbose
Print more information (twice for even more, three times for HN image decoding information as well)
Print more information (twice for even more, three times for HN image processing information as well)
Thanks
======
This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf)
This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf)

View file

@ -138,7 +138,7 @@ cnki_info(cnki_t **param)
if ((*param)->file_stat->outline > 0) {
if ((*param)->stat > 1) {
printf("Loading outline(s)\n");
printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n",
printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n",
"title",
"hierarchy",
"page",

View file

@ -58,6 +58,10 @@ typedef struct _hn_image_t {
int32_t format; /* hn_code */
int32_t address;
int32_t size;
int16_t x;
int16_t y;
int16_t w;
int16_t h;
char *image;
} hn_image_t;

View file

@ -93,6 +93,10 @@ cnki_hn(cnki_t **param)
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
ptr->image_data[i].x = 0;
ptr->image_data[i].y = 0;
ptr->image_data[i].w = 0;
ptr->image_data[i].h = 0;
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);

View file

@ -524,7 +524,7 @@ cnki_pdf_hn(cnki_t **param)
"/Subtype /Image\n");
if ((*param)->stat > 2)
printf("\tDecoding data, page %04d item %02d format %d... ",
printf("\tProcessing image, page %04d item %d format %d... ",
ptr->page, i, ptr->image_data[i].format);
switch (ptr->image_data[i].format) {
@ -700,7 +700,7 @@ cnki_pdf_hn(cnki_t **param)
snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]);
strcat(dictionary, buf);
if (i + 1 < ptr->image_length)
if (i < ptr->image_length - 1)
strcat(dictionary, " ");
}
@ -739,43 +739,112 @@ cnki_pdf_hn(cnki_t **param)
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<feff");
strcat(dictionary, "BT\n");
for (int i = 0; i < ptr->text_size; i += 6) {
if (i + 5 >= ptr->text_size)
for (int i = 0, j = 0; i < ptr->text_size - 1;) {
switch ((uint16_t) (ptr->text[i + 1] << 8 | ptr->text[i])) {
case 0x8001:
if (ptr->address_next <= ptr->address) {
i += 2;
break;
}
strcat(dictionary, "T*\n");
case 0x8070:
if (ptr->address_next > ptr->address) {
i += 4;
for (;;) {
if (i + 3 >= ptr->text_size ||
(unsigned char) ptr->text[i + 1] == 0x80)
break;
conv_src[0] = ptr->text[i + 5];
conv_src[1] = ptr->text[i + 4];
if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) {
strcat(dictionary, "a389");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) {
strcat(dictionary, "a38a");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) {
strcat(dictionary, "a38d");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) {
strcat(dictionary, "a3a0");
continue;
}
conv_src[0] = ptr->text[i + 3];
conv_src[1] = ptr->text[i + 2];
conv_size = 6;
if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
for (int j = 0; j < conv_size - 2; j++) {
if (conv_size - 2 > 0) {
strcat(dictionary, "<feff");
for (int k = 0; k < conv_size - 2; k++) {
snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[j]);
"%02x", (unsigned char) conv_dst[k]);
strcat(dictionary, conv_hex);
}
strcat(dictionary, "> Tj\n");
}
free(conv_dst);
}
i += 4;
}
strcat(dictionary, ">");
break;
}
if (i + 7 >= ptr->text_size) {
i += 2;
break;
}
conv_src[0] = ptr->text[i + 7];
conv_src[1] = ptr->text[i + 6];
conv_size = 6;
if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
if (conv_size - 2 > 0) {
strcat(dictionary, "<feff");
for (int k = 0; k < conv_size - 2; k++) {
snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[k]);
strcat(dictionary, conv_hex);
}
strcat(dictionary, "> Tj\n");
}
free(conv_dst);
}
i += 8;
break;
case 0x800a:
if (i + 27 >= ptr->text_size || j >= ptr->image_length) {
i += 2;
break;
}
if (ptr->image_length > 0) {
ptr->image_data[j].x =
ptr->text[i + 5] << 8 | ptr->text[i + 4];
ptr->image_data[j].y =
ptr->text[i + 7] << 8 | ptr->text[i + 6];
ptr->image_data[j].w =
ptr->text[i + 9] << 8 | ptr->text[i + 8];
ptr->image_data[j].h =
ptr->text[i + 11] << 8 | ptr->text[i + 10];
if ((*param)->stat > 2)
printf("\tItem %d: origin (%4d, %4d), width %4d, height %4d\n",
j,
ptr->image_data[j].x,
ptr->image_data[j].y,
ptr->image_data[j].w,
ptr->image_data[j].h);
}
i += 28;
j++;
break;
default:
i += 4;
break;
}
}
strcat(dictionary, "ET");
/* FIXME: Use the text somehow? */
free(dictionary);
@ -794,20 +863,14 @@ cnki_pdf_hn(cnki_t **param)
if (ptr->image_length > 0) {
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "q\n");
strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n");
double resize_x;
double resize_y;
for (int i = 0; i < ptr->image_length; i++) {
if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0)
continue;
char resize_str[64] = "0.25 0 0 0.25 0 0 cm\n";
double resize_x = 1;
double resize_y = 1;
if (dim[0] > 0 && dim[1] > 0) {
/* Scale within bound of A4 paper */
resize_x = 595.276 * 4 / dim[i * 2];
resize_y = 841.89 * 4 / dim[i * 2 + 1];
resize_x = 4 * 595.2756 / dim[0];
resize_y = 4 * 841.8898 / dim[1];
if (resize_y < resize_x)
snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n",
@ -815,9 +878,18 @@ cnki_pdf_hn(cnki_t **param)
else
snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n",
resize_x, resize_x);
strcat(dictionary, buf);
strcat(resize_str, buf);
}
/* Apply transformation matrix */
for (int i = 0; i < ptr->image_length; i++) {
if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0)
continue;
strcat(dictionary, "q\n");
strcat(dictionary, resize_str);
/* Rotate image */
if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) {
snprintf(buf, 64, "1 0 0 1 0 %d cm\n",
dim[i * 2 + 1]);
@ -826,16 +898,39 @@ cnki_pdf_hn(cnki_t **param)
strcat(dictionary, "1 0 0 -1 0 0 cm\n");
}
/* Translate figure */
if (i > 0) {
double origin_x = 0.4043745 * ptr->image_data[i].x;
double origin_y = 0.4043561 * ptr->image_data[i].y;
if (origin_x < 0)
origin_x += (2381.102 - dim[i * 2]) / 2;
if (origin_y < 0)
origin_y += (3367.559 + dim[i * 2 + 1]) / 2;
if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1)
origin_y = -3367.559 + origin_y + dim[i * 2 + 1];
else
origin_y = 3367.559 - origin_y - dim[i * 2 + 1];
snprintf(buf, 64, "1 0 0 1 %f %f cm\n", origin_x, origin_y);
strcat(dictionary, buf);
}
snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n",
dim[i * 2], dim[i * 2 + 1]);
strcat(dictionary, buf);
snprintf(buf, 64, "/Im%d Do\n", i);
strcat(dictionary, buf);
}
strcat(dictionary, "Q");
if (i < ptr->image_length - 1)
strcat(dictionary, "\n");
}
if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) {
free(root_kid);
free(ids);
@ -866,7 +961,7 @@ cnki_pdf_hn(cnki_t **param)
strcat(dictionary, "<<\n/Type /Page\n");
/* A4 paper */
strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n");
strcat(dictionary, "/MediaBox [0 0 595.2756 841.8898]\n");
if (ptr->image_length > 0) {
free(dim);
@ -946,7 +1041,7 @@ cnki_pdf_hn(cnki_t **param)
for (int i = 0; i < (*param)->file_stat->page; i++) {
snprintf(buf, 64, "%d 0 R", root_kid[i]);
strcat(dictionary, buf);
if (i + 1 < (*param)->file_stat->page)
if (i < (*param)->file_stat->page - 1)
strcat(dictionary, " ");
}

View file

@ -6,5 +6,5 @@
#define VERSION "0"
#define RELEASE "2"
#define PATCH "1"
#define PATCH "2"
#define EXTRA ""