Handle headless HN files and pages with no images.

Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
yzrh 2022-12-25 18:03:01 +00:00
parent d2826fa075
commit c2ad6549fb
6 changed files with 248 additions and 191 deletions

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -54,6 +54,11 @@ cnki_destroy(cnki_t **param)
object_hn_t *ptr_hn;
while ((ptr_hn = (*param)->object_hn) != NULL) {
(*param)->object_hn = (*param)->object_hn->next;
free(ptr_hn->text);
if (ptr_hn->image_data != NULL)
for (int i = 0; i < ptr_hn->image_length; i++)
free(ptr_hn->image_data[i].image);
free(ptr_hn->image_data);
free(ptr_hn);
}
@ -71,12 +76,19 @@ cnki_info(cnki_t **param)
printf("Reading file header at 0x%x\n", ADDRESS_HEAD);
int addr[2];
unsigned char str[2];
fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET);
fread((*param)->file_stat->type, 4, 1, (*param)->fp_i);
if ((*param)->stat > 0)
fread(str, 2, 1, (*param)->fp_i);
if ((*param)->stat > 0) {
if ((unsigned char) (*param)->file_stat->type[0] > 0x7f)
printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]);
else
printf("File type is '%s'\n", (*param)->file_stat->type);
}
if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
return 0;
@ -86,6 +98,9 @@ cnki_info(cnki_t **param)
} else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
addr[0] = ADDRESS_HN_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
addr[0] = ADDRESS_C8_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
return 0;
} else {
@ -102,6 +117,14 @@ cnki_info(cnki_t **param)
printf("Advised %d page(s)\n",
(*param)->file_stat->page);
if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) {
fseek((*param)->fp_i, 0xd8, SEEK_SET);
return 0;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
fseek((*param)->fp_i, 0x50, SEEK_SET);
return 0;
}
if ((*param)->stat > 1)
printf("Reading outline count at 0x%x\n", addr[1]);

View file

@ -16,6 +16,8 @@
#define ADDRESS_HN_PAGE 0x0090
#define ADDRESS_HN_OUTLINE 0x0158
#define ADDRESS_C8_PAGE 0x0008
#define ADDRESS_KDH_BODY 0x00fe
#define KEY_KDH "FZHMEI"
@ -64,7 +66,8 @@ typedef struct _object_hn_t {
int32_t text_size;
int16_t image_length;
int16_t page;
int32_t unknown[2]; /* TODO: what is it? */
int32_t unknown; /* TODO: what is it? */
int32_t address_next;
char *text;
struct _hn_image_t *image_data;
struct _object_hn_t *next;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -27,12 +27,13 @@ cnki_hn(cnki_t **param)
if ((*param)->stat > 1) {
printf("Loading page(s)\n");
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n",
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n",
"address",
"text",
"length",
"page",
"unknown",
"next",
"code",
"address",
"image");
@ -44,7 +45,8 @@ cnki_hn(cnki_t **param)
fread(&ptr->text_size, 4, 1, (*param)->fp_i);
fread(&ptr->image_length, 2, 1, (*param)->fp_i);
fread(&ptr->page, 2, 1, (*param)->fp_i);
fread(&ptr->unknown, 8, 1, (*param)->fp_i);
fread(&ptr->unknown, 4, 1, (*param)->fp_i);
fread(&ptr->address_next, 4, 1, (*param)->fp_i);
ptr->text = NULL;
ptr->image_data = NULL;
@ -62,6 +64,7 @@ cnki_hn(cnki_t **param)
ptr = (*param)->object_hn;
while (ptr != NULL) {
if (ptr->text_size > 0) {
ptr->text = malloc(ptr->text_size);
if (ptr->text == NULL)
@ -69,16 +72,18 @@ cnki_hn(cnki_t **param)
fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
}
if ((*param)->stat > 1)
printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}",
printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x",
ptr->address,
ptr->text_size,
ptr->image_length,
ptr->page,
ptr->unknown[0],
ptr->unknown[1]);
ptr->unknown,
ptr->address_next);
if (ptr->image_length > 0) {
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
if (ptr->image_data == NULL)
@ -111,7 +116,8 @@ cnki_hn(cnki_t **param)
ptr->image_data[i].address,
ptr->image_data[i].size);
} else {
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n",
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n",
"",
"",
"",
"",
@ -123,6 +129,12 @@ cnki_hn(cnki_t **param)
}
}
}
} else if ((*param)->stat > 1) {
printf("\t%4s\t%8s\t%8s\n",
"",
"",
"");
}
ptr = ptr->next;
}

View file

@ -481,7 +481,10 @@ cnki_pdf_hn(cnki_t **param)
* page object
*/
int *ids = NULL;
if (ptr->image_length > 0)
pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3);
else
pdf_get_free_ids(&pdf, &ids, 2);
int bitmap_size;
char *bitmap;
@ -489,10 +492,10 @@ cnki_pdf_hn(cnki_t **param)
int stream_size;
char *stream;
int *dim = malloc(2 * ptr->image_length * sizeof(int));
int *dim;
int ret;
int info[3];
if (ptr->image_length > 0) {
dim = malloc(2 * ptr->image_length * sizeof(int));
if (dim == NULL) {
free(root_kid);
@ -509,6 +512,10 @@ cnki_pdf_hn(cnki_t **param)
free(dim);
return 1;
}
}
int ret;
int info[3];
for (int i = 0; i < ptr->image_length; i++) {
memset(dictionary, 0, dictionary_size);
@ -684,6 +691,7 @@ cnki_pdf_hn(cnki_t **param)
}
}
if (ptr->image_length > 0) {
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<<\n/XObject <<");
@ -701,49 +709,24 @@ cnki_pdf_hn(cnki_t **param)
pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0);
free(dictionary);
}
int conv_size;
char *conv_dst;
char conv_src[2];
char conv_hex[3];
if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) {
if (ptr->text_size > 0) {
if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0 ||
strncmp(ptr->text, "COMPRESSTEXT", 12) == 0) {
cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size);
dictionary_size = 64 + 2 * stream_size;
dictionary = malloc(dictionary_size);
free(ptr->text);
if (dictionary == NULL) {
free(root_kid);
free(ids);
free(dim);
return 1;
ptr->text_size = stream_size;
ptr->text = stream;
}
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<feff");
for (int i = 0; i < stream_size; i += 16) {
conv_src[0] = stream[i + 7];
conv_src[1] = stream[i + 6];
conv_size = 6;
if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
for (int j = 0; j < conv_size - 2; j++) {
snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[j]);
strcat(dictionary, conv_hex);
}
free(conv_dst);
}
}
free(stream);
strcat(dictionary, ">");
} else {
dictionary_size = 64 + 2 * ptr->text_size;
dictionary = malloc(dictionary_size);
@ -758,9 +741,26 @@ cnki_pdf_hn(cnki_t **param)
strcat(dictionary, "<feff");
for (int i = 0; i < ptr->text_size; i += 4) {
conv_src[0] = ptr->text[i + 3];
conv_src[1] = ptr->text[i + 2];
for (int i = 0; i < ptr->text_size; i += 6) {
if (i + 5 >= ptr->text_size)
break;
conv_src[0] = ptr->text[i + 5];
conv_src[1] = ptr->text[i + 4];
if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) {
strcat(dictionary, "a389");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) {
strcat(dictionary, "a38a");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) {
strcat(dictionary, "a38d");
continue;
} else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) {
strcat(dictionary, "a3a0");
continue;
}
conv_size = 6;
@ -776,12 +776,12 @@ cnki_pdf_hn(cnki_t **param)
}
strcat(dictionary, ">");
}
/* FIXME: Use the text somehow? */
free(dictionary);
}
dictionary_size = 64 + 64 * ptr->image_length;
dictionary_size = 64 + 128 * ptr->image_length;
dictionary = malloc(dictionary_size);
if (dictionary == NULL) {
@ -791,6 +791,7 @@ cnki_pdf_hn(cnki_t **param)
return 1;
}
if (ptr->image_length > 0) {
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "q\n");
@ -858,29 +859,41 @@ cnki_pdf_hn(cnki_t **param)
NULL, dictionary, stream, stream_size);
free(stream);
}
memset(dictionary, 0, dictionary_size);
strcat(dictionary, "<<\n/Type /Page\n");
/* A4 paper */
strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n");
if (ptr->image_length > 0) {
free(dim);
snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]);
strcat(dictionary, buf);
snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]);
strcat(dictionary, buf);
/* A4 paper */
strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n");
/* Add /Parent when we know root */
pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0);
root_kid[cnt++] = ids[ptr->image_length + 2];
} else {
snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length]);
strcat(dictionary, buf);
/* Add /Parent when we know root */
pdf_obj_append(&pdf, ids[ptr->image_length + 1], NULL, dictionary, NULL, 0);
root_kid[cnt++] = ids[ptr->image_length + 1];
}
free(dictionary);
root_kid[cnt++] = ids[ptr->image_length + 2];
free(ids);
free(dim);
ptr = ptr->next;
}

View file

@ -13,12 +13,17 @@ int
cnki_zlib(char **dst, int *dst_size,
const char * restrict src, int src_size)
{
uint8_t padding = 0;
int32_t size;
memcpy(&size, src + 20, 4);
if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0)
padding = 8;
memcpy(&size, src + 12 + padding, 4);
*dst_size = size;
if (strinflate(dst, size, src + 24, src_size - 24) != 0)
if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0)
return 1;
return 0;

View file

@ -98,7 +98,8 @@ main(int argc, char **argv)
strerror(errno));
return EXIT_FAILURE;
}
} else if (strncmp(param->file_stat->type, "HN", 2) == 0) {
} else if (strncmp(param->file_stat->type, "HN", 2) == 0 ||
(unsigned char) param->file_stat->type[0] == 0xc8) {
if (cnki_hn(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));