Handle headless HN files and pages with no images.

Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
yzrh 2022-12-25 18:03:01 +00:00
parent d2826fa075
commit c2ad6549fb
6 changed files with 248 additions and 191 deletions

View file

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org> * Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -54,6 +54,11 @@ cnki_destroy(cnki_t **param)
object_hn_t *ptr_hn; object_hn_t *ptr_hn;
while ((ptr_hn = (*param)->object_hn) != NULL) { while ((ptr_hn = (*param)->object_hn) != NULL) {
(*param)->object_hn = (*param)->object_hn->next; (*param)->object_hn = (*param)->object_hn->next;
free(ptr_hn->text);
if (ptr_hn->image_data != NULL)
for (int i = 0; i < ptr_hn->image_length; i++)
free(ptr_hn->image_data[i].image);
free(ptr_hn->image_data);
free(ptr_hn); free(ptr_hn);
} }
@ -71,12 +76,19 @@ cnki_info(cnki_t **param)
printf("Reading file header at 0x%x\n", ADDRESS_HEAD); printf("Reading file header at 0x%x\n", ADDRESS_HEAD);
int addr[2]; int addr[2];
unsigned char str[2];
fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET); fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET);
fread((*param)->file_stat->type, 4, 1, (*param)->fp_i); fread((*param)->file_stat->type, 4, 1, (*param)->fp_i);
if ((*param)->stat > 0) fread(str, 2, 1, (*param)->fp_i);
printf("File type is '%s'\n", (*param)->file_stat->type);
if ((*param)->stat > 0) {
if ((unsigned char) (*param)->file_stat->type[0] > 0x7f)
printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]);
else
printf("File type is '%s'\n", (*param)->file_stat->type);
}
if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) { if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
return 0; return 0;
@ -86,6 +98,9 @@ cnki_info(cnki_t **param)
} else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) { } else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
addr[0] = ADDRESS_HN_PAGE; addr[0] = ADDRESS_HN_PAGE;
addr[1] = ADDRESS_HN_OUTLINE; addr[1] = ADDRESS_HN_OUTLINE;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
addr[0] = ADDRESS_C8_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) { } else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
return 0; return 0;
} else { } else {
@ -102,6 +117,14 @@ cnki_info(cnki_t **param)
printf("Advised %d page(s)\n", printf("Advised %d page(s)\n",
(*param)->file_stat->page); (*param)->file_stat->page);
if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) {
fseek((*param)->fp_i, 0xd8, SEEK_SET);
return 0;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
fseek((*param)->fp_i, 0x50, SEEK_SET);
return 0;
}
if ((*param)->stat > 1) if ((*param)->stat > 1)
printf("Reading outline count at 0x%x\n", addr[1]); printf("Reading outline count at 0x%x\n", addr[1]);

View file

@ -16,6 +16,8 @@
#define ADDRESS_HN_PAGE 0x0090 #define ADDRESS_HN_PAGE 0x0090
#define ADDRESS_HN_OUTLINE 0x0158 #define ADDRESS_HN_OUTLINE 0x0158
#define ADDRESS_C8_PAGE 0x0008
#define ADDRESS_KDH_BODY 0x00fe #define ADDRESS_KDH_BODY 0x00fe
#define KEY_KDH "FZHMEI" #define KEY_KDH "FZHMEI"
@ -64,7 +66,8 @@ typedef struct _object_hn_t {
int32_t text_size; int32_t text_size;
int16_t image_length; int16_t image_length;
int16_t page; int16_t page;
int32_t unknown[2]; /* TODO: what is it? */ int32_t unknown; /* TODO: what is it? */
int32_t address_next;
char *text; char *text;
struct _hn_image_t *image_data; struct _hn_image_t *image_data;
struct _object_hn_t *next; struct _object_hn_t *next;

View file

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org> * Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -27,12 +27,13 @@ cnki_hn(cnki_t **param)
if ((*param)->stat > 1) { if ((*param)->stat > 1) {
printf("Loading page(s)\n"); printf("Loading page(s)\n");
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n", printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n",
"address", "address",
"text", "text",
"length", "length",
"page", "page",
"unknown", "unknown",
"next",
"code", "code",
"address", "address",
"image"); "image");
@ -44,7 +45,8 @@ cnki_hn(cnki_t **param)
fread(&ptr->text_size, 4, 1, (*param)->fp_i); fread(&ptr->text_size, 4, 1, (*param)->fp_i);
fread(&ptr->image_length, 2, 1, (*param)->fp_i); fread(&ptr->image_length, 2, 1, (*param)->fp_i);
fread(&ptr->page, 2, 1, (*param)->fp_i); fread(&ptr->page, 2, 1, (*param)->fp_i);
fread(&ptr->unknown, 8, 1, (*param)->fp_i); fread(&ptr->unknown, 4, 1, (*param)->fp_i);
fread(&ptr->address_next, 4, 1, (*param)->fp_i);
ptr->text = NULL; ptr->text = NULL;
ptr->image_data = NULL; ptr->image_data = NULL;
@ -62,66 +64,76 @@ cnki_hn(cnki_t **param)
ptr = (*param)->object_hn; ptr = (*param)->object_hn;
while (ptr != NULL) { while (ptr != NULL) {
ptr->text = malloc(ptr->text_size); if (ptr->text_size > 0) {
ptr->text = malloc(ptr->text_size);
if (ptr->text == NULL) if (ptr->text == NULL)
return 1; return 1;
fseek((*param)->fp_i, ptr->address, SEEK_SET); fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
}
if ((*param)->stat > 1) if ((*param)->stat > 1)
printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}", printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x",
ptr->address, ptr->address,
ptr->text_size, ptr->text_size,
ptr->image_length, ptr->image_length,
ptr->page, ptr->page,
ptr->unknown[0], ptr->unknown,
ptr->unknown[1]); ptr->address_next);
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); if (ptr->image_length > 0) {
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
if (ptr->image_data == NULL) if (ptr->image_data == NULL)
return 1;
for (int i = 0; i < ptr->image_length; i++) {
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);
}
for (int i = 0; i < ptr->image_length; i++) {
ptr->image_data[i].image = malloc(ptr->image_data[i].size);
if (ptr->image_data[i].image == NULL)
return 1; return 1;
fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); for (int i = 0; i < ptr->image_length; i++) {
fread(ptr->image_data[i].image, fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
ptr->image_data[i].size, 1, fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
(*param)->fp_i); fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);
}
if ((*param)->stat > 1) { for (int i = 0; i < ptr->image_length; i++) {
if (i == 0) { ptr->image_data[i].image = malloc(ptr->image_data[i].size);
printf("\t%4d\t%08x\t%8d\n",
ptr->image_data[i].format, if (ptr->image_data[i].image == NULL)
ptr->image_data[i].address, return 1;
ptr->image_data[i].size);
} else { fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET);
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n", fread(ptr->image_data[i].image,
"", ptr->image_data[i].size, 1,
"", (*param)->fp_i);
"",
"", if ((*param)->stat > 1) {
"", if (i == 0) {
ptr->image_data[i].format, printf("\t%4d\t%08x\t%8d\n",
ptr->image_data[i].address, ptr->image_data[i].format,
ptr->image_data[i].size); ptr->image_data[i].address,
ptr->image_data[i].size);
} else {
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n",
"",
"",
"",
"",
"",
"",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
}
} }
} }
} else if ((*param)->stat > 1) {
printf("\t%4s\t%8s\t%8s\n",
"",
"",
"");
} }
ptr = ptr->next; ptr = ptr->next;

View file

@ -481,7 +481,10 @@ cnki_pdf_hn(cnki_t **param)
* page object * page object
*/ */
int *ids = NULL; int *ids = NULL;
pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); if (ptr->image_length > 0)
pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3);
else
pdf_get_free_ids(&pdf, &ids, 2);
int bitmap_size; int bitmap_size;
char *bitmap; char *bitmap;
@ -489,27 +492,31 @@ cnki_pdf_hn(cnki_t **param)
int stream_size; int stream_size;
char *stream; char *stream;
int *dim = malloc(2 * ptr->image_length * sizeof(int)); int *dim;
if (ptr->image_length > 0) {
dim = malloc(2 * ptr->image_length * sizeof(int));
if (dim == NULL) {
free(root_kid);
free(ids);
return 1;
}
dictionary_size = 256;
dictionary = malloc(dictionary_size);
if (dictionary == NULL) {
free(root_kid);
free(ids);
free(dim);
return 1;
}
}
int ret; int ret;
int info[3]; int info[3];
if (dim == NULL) {
free(root_kid);
free(ids);
return 1;
}
dictionary_size = 256;
dictionary = malloc(dictionary_size);
if (dictionary == NULL) {
free(root_kid);
free(ids);
free(dim);
return 1;
}
for (int i = 0; i < ptr->image_length; i++) { for (int i = 0; i < ptr->image_length; i++) {
memset(dictionary, 0, dictionary_size); memset(dictionary, 0, dictionary_size);
@ -684,66 +691,42 @@ cnki_pdf_hn(cnki_t **param)
} }
} }
memset(dictionary, 0, dictionary_size); if (ptr->image_length > 0) {
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<<\n/XObject <<"); strcat(dictionary, "<<\n/XObject <<");
for (int i = 0; i < ptr->image_length; i++) { for (int i = 0; i < ptr->image_length; i++) {
snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]);
strcat(dictionary, buf); strcat(dictionary, buf);
if (i + 1 < ptr->image_length) if (i + 1 < ptr->image_length)
strcat(dictionary, " "); strcat(dictionary, " ");
}
strcat(dictionary, ">>\n>>");
pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0);
free(dictionary);
} }
strcat(dictionary, ">>\n>>");
pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0);
free(dictionary);
int conv_size; int conv_size;
char *conv_dst; char *conv_dst;
char conv_src[2]; char conv_src[2];
char conv_hex[3]; char conv_hex[3];
if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { if (ptr->text_size > 0) {
cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0 ||
strncmp(ptr->text, "COMPRESSTEXT", 12) == 0) {
cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size);
dictionary_size = 64 + 2 * stream_size; free(ptr->text);
dictionary = malloc(dictionary_size);
if (dictionary == NULL) { ptr->text_size = stream_size;
free(root_kid); ptr->text = stream;
free(ids);
free(dim);
return 1;
} }
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<feff");
for (int i = 0; i < stream_size; i += 16) {
conv_src[0] = stream[i + 7];
conv_src[1] = stream[i + 6];
conv_size = 6;
if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
for (int j = 0; j < conv_size - 2; j++) {
snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[j]);
strcat(dictionary, conv_hex);
}
free(conv_dst);
}
}
free(stream);
strcat(dictionary, ">");
} else {
dictionary_size = 64 + 2 * ptr->text_size; dictionary_size = 64 + 2 * ptr->text_size;
dictionary = malloc(dictionary_size); dictionary = malloc(dictionary_size);
@ -758,9 +741,26 @@ cnki_pdf_hn(cnki_t **param)
strcat(dictionary, "<feff"); strcat(dictionary, "<feff");
for (int i = 0; i < ptr->text_size; i += 4) { for (int i = 0; i < ptr->text_size; i += 6) {
conv_src[0] = ptr->text[i + 3]; if (i + 5 >= ptr->text_size)
conv_src[1] = ptr->text[i + 2]; break;
conv_src[0] = ptr->text[i + 5];
conv_src[1] = ptr->text[i + 4];
if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) {
strcat(dictionary, "a389");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) {
strcat(dictionary, "a38a");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) {
strcat(dictionary, "a38d");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) {
strcat(dictionary, "a3a0");
continue;
}
conv_size = 6; conv_size = 6;
@ -776,12 +776,12 @@ cnki_pdf_hn(cnki_t **param)
} }
strcat(dictionary, ">"); strcat(dictionary, ">");
/* FIXME: Use the text somehow? */
free(dictionary);
} }
/* FIXME: Use the text somehow? */ dictionary_size = 64 + 128 * ptr->image_length;
free(dictionary);
dictionary_size = 64 + 64 * ptr->image_length;
dictionary = malloc(dictionary_size); dictionary = malloc(dictionary_size);
if (dictionary == NULL) { if (dictionary == NULL) {
@ -791,96 +791,109 @@ cnki_pdf_hn(cnki_t **param)
return 1; return 1;
} }
memset(dictionary, 0, dictionary_size); if (ptr->image_length > 0) {
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "q\n"); strcat(dictionary, "q\n");
strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n");
double resize_x; double resize_x;
double resize_y; double resize_y;
for (int i = 0; i < ptr->image_length; i++) { for (int i = 0; i < ptr->image_length; i++) {
if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0)
continue; continue;
/* Scale within bound of A4 paper */ /* Scale within bound of A4 paper */
resize_x = 595.276 * 4 / dim[i * 2]; resize_x = 595.276 * 4 / dim[i * 2];
resize_y = 841.89 * 4 / dim[i * 2 + 1]; resize_y = 841.89 * 4 / dim[i * 2 + 1];
if (resize_y < resize_x) if (resize_y < resize_x)
snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n",
resize_y, resize_y); resize_y, resize_y);
else else
snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n",
resize_x, resize_x); resize_x, resize_x);
strcat(dictionary, buf);
/* Apply transformation matrix */
if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) {
snprintf(buf, 64, "1 0 0 1 0 %d cm\n",
dim[i * 2 + 1]);
strcat(dictionary, buf); strcat(dictionary, buf);
strcat(dictionary, "1 0 0 -1 0 0 cm\n"); /* Apply transformation matrix */
if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) {
snprintf(buf, 64, "1 0 0 1 0 %d cm\n",
dim[i * 2 + 1]);
strcat(dictionary, buf);
strcat(dictionary, "1 0 0 -1 0 0 cm\n");
}
snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n",
dim[i * 2], dim[i * 2 + 1]);
strcat(dictionary, buf);
snprintf(buf, 64, "/Im%d Do\n", i);
strcat(dictionary, buf);
} }
snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", strcat(dictionary, "Q");
dim[i * 2], dim[i * 2 + 1]);
if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) {
free(root_kid);
free(ids);
free(dim);
free(dictionary);
return 1;
}
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<<\n");
snprintf(buf, 64, "/Length %d\n", stream_size);
strcat(dictionary, buf); strcat(dictionary, buf);
snprintf(buf, 64, "/Im%d Do\n", i); strcat(dictionary, "/Filter /FlateDecode\n");
strcat(dictionary, buf);
strcat(dictionary, ">>");
pdf_obj_append(&pdf, ids[ptr->image_length + 1],
NULL, dictionary, stream, stream_size);
free(stream);
} }
strcat(dictionary, "Q");
if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) {
free(root_kid);
free(ids);
free(dim);
free(dictionary);
return 1;
}
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<<\n");
snprintf(buf, 64, "/Length %d\n", stream_size);
strcat(dictionary, buf);
strcat(dictionary, "/Filter /FlateDecode\n");
strcat(dictionary, ">>");
pdf_obj_append(&pdf, ids[ptr->image_length + 1],
NULL, dictionary, stream, stream_size);
free(stream);
memset(dictionary, 0, dictionary_size); memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<<\n/Type /Page\n"); strcat(dictionary, "<<\n/Type /Page\n");
snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]);
strcat(dictionary, buf);
snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]);
strcat(dictionary, buf);
/* A4 paper */ /* A4 paper */
strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n"); strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n");
/* Add /Parent when we know root */ if (ptr->image_length > 0) {
pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); free(dim);
snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]);
strcat(dictionary, buf);
snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]);
strcat(dictionary, buf);
/* Add /Parent when we know root */
pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0);
root_kid[cnt++] = ids[ptr->image_length + 2];
} else {
snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length]);
strcat(dictionary, buf);
/* Add /Parent when we know root */
pdf_obj_append(&pdf, ids[ptr->image_length + 1], NULL, dictionary, NULL, 0);
root_kid[cnt++] = ids[ptr->image_length + 1];
}
free(dictionary); free(dictionary);
root_kid[cnt++] = ids[ptr->image_length + 2];
free(ids); free(ids);
free(dim);
ptr = ptr->next; ptr = ptr->next;
} }

View file

@ -13,12 +13,17 @@ int
cnki_zlib(char **dst, int *dst_size, cnki_zlib(char **dst, int *dst_size,
const char * restrict src, int src_size) const char * restrict src, int src_size)
{ {
uint8_t padding = 0;
int32_t size; int32_t size;
memcpy(&size, src + 20, 4);
if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0)
padding = 8;
memcpy(&size, src + 12 + padding, 4);
*dst_size = size; *dst_size = size;
if (strinflate(dst, size, src + 24, src_size - 24) != 0) if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0)
return 1; return 1;
return 0; return 0;

View file

@ -98,7 +98,8 @@ main(int argc, char **argv)
strerror(errno)); strerror(errno));
return EXIT_FAILURE; return EXIT_FAILURE;
} }
} else if (strncmp(param->file_stat->type, "HN", 2) == 0) { } else if (strncmp(param->file_stat->type, "HN", 2) == 0 ||
(unsigned char) param->file_stat->type[0] == 0xc8) {
if (cnki_hn(&param) != 0) { if (cnki_hn(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0], fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno)); strerror(errno));