From 288b65a1fd6bcb1908a31c0a3e18ce7b6bf53a89 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 25 Dec 2022 01:26:05 +0000 Subject: [PATCH 01/41] Handle different JPEG colour component. Signed-off-by: yzrh --- CHANGE.md | 5 +++++ src/cnki_pdf.c | 45 ++++++++++++++++++++++++++------------------- src/jpeg.c | 5 +++-- src/jpeg.h | 4 ++-- src/version.h | 2 +- 5 files changed, 37 insertions(+), 24 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index e4217a5..b4d1136 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -3,6 +3,11 @@ * Support JPEG 2000 for HN. +0.2.1 (2022-12-XX) +================== + +* Handle different JPEG colour component. + 0.2.0 (2022-12-22) ================== diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index b59b7c6..c56a45f 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -492,7 +492,7 @@ cnki_pdf_hn(cnki_t **param) int *dim = malloc(2 * ptr->image_length * sizeof(int)); int ret; - int wh[2]; + int info[3]; if (dim == NULL) { free(root_kid); @@ -524,8 +524,8 @@ cnki_pdf_hn(cnki_t **param) case JBIG: ret = cnki_jbig(&bitmap, &bitmap_size, - &wh[0], - &wh[1], + &info[0], + &info[1], ptr->image_data[i].image, ptr->image_data[i].size); @@ -547,7 +547,7 @@ cnki_pdf_hn(cnki_t **param) free(bitmap); snprintf(buf, 64, "/Width %d\n/Height %d\n", - wh[0], wh[1]); + info[0], info[1]); strcat(dictionary, buf); strcat(dictionary, "/ColorSpace /DeviceGray\n" @@ -560,13 +560,14 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/Filter /FlateDecode\n"); - dim[i * 2] = wh[0]; - dim[i * 2 + 1] = wh[1]; + dim[i * 2] = info[0]; + dim[i * 2 + 1] = info[1]; break; case DCT_0: case DCT_1: - ret = strinfo_jpeg_dim(&wh[0], - &wh[1], + ret = strinfo_jpeg_dim(&info[0], + &info[1], + &info[2], ptr->image_data[i].image, ptr->image_data[i].size); @@ -588,11 +589,17 @@ cnki_pdf_hn(cnki_t **param) memcpy(stream, ptr->image_data[i].image, stream_size); snprintf(buf, 64, "/Width %d\n/Height %d\n", - wh[0], wh[1]); + info[0], info[1]); strcat(dictionary, buf); - strcat(dictionary, 
"/ColorSpace /DeviceGray\n" - "/BitsPerComponent 8\n"); + if (info[2] == 1) + strcat(dictionary, "/ColorSpace /DeviceGray\n"); + else if (info[2] == 3) + strcat(dictionary, "/ColorSpace /DeviceRGB\n"); + else + strcat(dictionary, "/ColorSpace /DeviceCMYK\n"); + + strcat(dictionary, "/BitsPerComponent 8\n"); snprintf(buf, 64, "/Length %d\n", stream_size); @@ -600,14 +607,14 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/Filter /DCTDecode\n"); - dim[i * 2] = wh[0]; - dim[i * 2 + 1] = wh[1]; + dim[i * 2] = info[0]; + dim[i * 2 + 1] = info[1]; break; case JBIG2: ret = cnki_jbig2(&bitmap, &bitmap_size, - &wh[0], - &wh[1], + &info[0], + &info[1], ptr->image_data[i].image, ptr->image_data[i].size); @@ -629,7 +636,7 @@ cnki_pdf_hn(cnki_t **param) free(bitmap); snprintf(buf, 64, "/Width %d\n/Height %d\n", - wh[0], wh[1]); + info[0], info[1]); strcat(dictionary, buf); strcat(dictionary, "/ColorSpace /DeviceGray\n" @@ -642,8 +649,8 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/Filter /FlateDecode\n"); - dim[i * 2] = wh[0]; - dim[i * 2 + 1] = wh[1]; + dim[i * 2] = info[0]; + dim[i * 2 + 1] = info[1]; break; case JPX: default: @@ -658,7 +665,7 @@ cnki_pdf_hn(cnki_t **param) if (ret == 0) { if ((*param)->stat > 2) printf("%6d byte(s), width %4d, height %4d.\n", - stream_size, wh[0], wh[1]); + stream_size, info[0], info[1]); pdf_obj_append(&pdf, ids[i], NULL, dictionary, stream, stream_size); diff --git a/src/jpeg.c b/src/jpeg.c index 4ea4d7f..cdcae7b 100644 --- a/src/jpeg.c +++ b/src/jpeg.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -9,7 +9,7 @@ #include int -strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, +strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components, const char * restrict data, int data_size) { struct jpeg_decompress_struct cinfo; @@ -27,6 +27,7 @@ strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, *jpeg_width = cinfo.output_width; *jpeg_height 
= cinfo.output_height; + *jpeg_components = cinfo.output_components; jpeg_destroy((struct jpeg_common_struct *) &cinfo); diff --git a/src/jpeg.h b/src/jpeg.h index db35d94..1f5caa7 100644 --- a/src/jpeg.h +++ b/src/jpeg.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ -int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, +int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components, const char * restrict data, int data_size); diff --git a/src/version.h b/src/version.h index 4e5cfa6..7c1ca3b 100644 --- a/src/version.h +++ b/src/version.h @@ -6,5 +6,5 @@ #define VERSION "0" #define RELEASE "2" -#define PATCH "0" +#define PATCH "1" #define EXTRA "" From d2826fa075544ada1fb9f530a375ef85f58c8ea0 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 25 Dec 2022 05:15:56 +0000 Subject: [PATCH 02/41] Simplify JBIG decoder. Signed-off-by: yzrh --- src/jbig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jbig.c b/src/jbig.c index 09a3d92..fce8e02 100644 --- a/src/jbig.c +++ b/src/jbig.c @@ -207,7 +207,7 @@ static void _procline(int line, char *a, char *b, char *c) { /* The encoder must be erroneous */ - uint16_t cx = (*b & 0x01) << 2; + uint16_t cx = 0; for (int i = 0; i < _width; i++) { _decode(cx); From c2ad6549fb337ce707e04aa441c9b492171a3b9d Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 25 Dec 2022 18:03:01 +0000 Subject: [PATCH 03/41] Handle headless HN and page with no image. 
Signed-off-by: yzrh --- src/cnki.c | 29 ++++- src/cnki.h | 5 +- src/cnki_hn.c | 108 ++++++++++-------- src/cnki_pdf.c | 285 +++++++++++++++++++++++++----------------------- src/cnki_zlib.c | 9 +- src/melon.c | 3 +- 6 files changed, 248 insertions(+), 191 deletions(-) diff --git a/src/cnki.c b/src/cnki.c index 5f120d0..cc49d73 100644 --- a/src/cnki.c +++ b/src/cnki.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -54,6 +54,11 @@ cnki_destroy(cnki_t **param) object_hn_t *ptr_hn; while ((ptr_hn = (*param)->object_hn) != NULL) { (*param)->object_hn = (*param)->object_hn->next; + free(ptr_hn->text); + if (ptr_hn->image_data != NULL) + for (int i = 0; i < ptr_hn->image_length; i++) + free(ptr_hn->image_data[i].image); + free(ptr_hn->image_data); free(ptr_hn); } @@ -71,12 +76,19 @@ cnki_info(cnki_t **param) printf("Reading file header at 0x%x\n", ADDRESS_HEAD); int addr[2]; + unsigned char str[2]; fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET); fread((*param)->file_stat->type, 4, 1, (*param)->fp_i); - if ((*param)->stat > 0) - printf("File type is '%s'\n", (*param)->file_stat->type); + fread(str, 2, 1, (*param)->fp_i); + + if ((*param)->stat > 0) { + if ((unsigned char) (*param)->file_stat->type[0] > 0x7f) + printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]); + else + printf("File type is '%s'\n", (*param)->file_stat->type); + } if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) { return 0; @@ -86,6 +98,9 @@ cnki_info(cnki_t **param) } else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) { addr[0] = ADDRESS_HN_PAGE; addr[1] = ADDRESS_HN_OUTLINE; + } else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) { + addr[0] = ADDRESS_C8_PAGE; + addr[1] = ADDRESS_HN_OUTLINE; } else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) { return 0; } else { @@ -102,6 +117,14 @@ cnki_info(cnki_t **param) printf("Advised %d page(s)\n", 
(*param)->file_stat->page); + if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) { + fseek((*param)->fp_i, 0xd8, SEEK_SET); + return 0; + } else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) { + fseek((*param)->fp_i, 0x50, SEEK_SET); + return 0; + } + if ((*param)->stat > 1) printf("Reading outline count at 0x%x\n", addr[1]); diff --git a/src/cnki.h b/src/cnki.h index 237a2c1..193e69b 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -16,6 +16,8 @@ #define ADDRESS_HN_PAGE 0x0090 #define ADDRESS_HN_OUTLINE 0x0158 +#define ADDRESS_C8_PAGE 0x0008 + #define ADDRESS_KDH_BODY 0x00fe #define KEY_KDH "FZHMEI" @@ -64,7 +66,8 @@ typedef struct _object_hn_t { int32_t text_size; int16_t image_length; int16_t page; - int32_t unknown[2]; /* TODO: what is it? */ + int32_t unknown; /* TODO: what is it? */ + int32_t address_next; char *text; struct _hn_image_t *image_data; struct _object_hn_t *next; diff --git a/src/cnki_hn.c b/src/cnki_hn.c index feabb48..4d32092 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -27,12 +27,13 @@ cnki_hn(cnki_t **param) if ((*param)->stat > 1) { printf("Loading page(s)\n"); - printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n", + printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n", "address", "text", "length", "page", "unknown", + "next", "code", "address", "image"); @@ -44,7 +45,8 @@ cnki_hn(cnki_t **param) fread(&ptr->text_size, 4, 1, (*param)->fp_i); fread(&ptr->image_length, 2, 1, (*param)->fp_i); fread(&ptr->page, 2, 1, (*param)->fp_i); - fread(&ptr->unknown, 8, 1, (*param)->fp_i); + fread(&ptr->unknown, 4, 1, (*param)->fp_i); + fread(&ptr->address_next, 4, 1, (*param)->fp_i); ptr->text = NULL; ptr->image_data = NULL; @@ -62,66 +64,76 @@ cnki_hn(cnki_t **param) ptr = (*param)->object_hn; while (ptr != NULL) { - ptr->text = malloc(ptr->text_size); + if 
(ptr->text_size > 0) { + ptr->text = malloc(ptr->text_size); - if (ptr->text == NULL) - return 1; + if (ptr->text == NULL) + return 1; - fseek((*param)->fp_i, ptr->address, SEEK_SET); - fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); + fseek((*param)->fp_i, ptr->address, SEEK_SET); + fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); + } if ((*param)->stat > 1) - printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}", + printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x", ptr->address, ptr->text_size, ptr->image_length, ptr->page, - ptr->unknown[0], - ptr->unknown[1]); + ptr->unknown, + ptr->address_next); - ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); + if (ptr->image_length > 0) { + ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); - if (ptr->image_data == NULL) - return 1; - - for (int i = 0; i < ptr->image_length; i++) { - fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); - fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); - fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); - fseek((*param)->fp_i, - ptr->image_data[i].address + ptr->image_data[i].size, - SEEK_SET); - } - - for (int i = 0; i < ptr->image_length; i++) { - ptr->image_data[i].image = malloc(ptr->image_data[i].size); - - if (ptr->image_data[i].image == NULL) + if (ptr->image_data == NULL) return 1; - fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); - fread(ptr->image_data[i].image, - ptr->image_data[i].size, 1, - (*param)->fp_i); + for (int i = 0; i < ptr->image_length; i++) { + fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); + fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); + fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); + fseek((*param)->fp_i, + ptr->image_data[i].address + ptr->image_data[i].size, + SEEK_SET); + } - if ((*param)->stat > 1) { - if (i == 0) { - printf("\t%4d\t%08x\t%8d\n", - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); - } else { - 
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n", - "", - "", - "", - "", - "", - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); + for (int i = 0; i < ptr->image_length; i++) { + ptr->image_data[i].image = malloc(ptr->image_data[i].size); + + if (ptr->image_data[i].image == NULL) + return 1; + + fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); + fread(ptr->image_data[i].image, + ptr->image_data[i].size, 1, + (*param)->fp_i); + + if ((*param)->stat > 1) { + if (i == 0) { + printf("\t%4d\t%08x\t%8d\n", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } else { + printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n", + "", + "", + "", + "", + "", + "", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } } } + } else if ((*param)->stat > 1) { + printf("\t%4s\t%8s\t%8s\n", + "", + "", + ""); } ptr = ptr->next; diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index c56a45f..603ce01 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -481,7 +481,10 @@ cnki_pdf_hn(cnki_t **param) * page object */ int *ids = NULL; - pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); + if (ptr->image_length > 0) + pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); + else + pdf_get_free_ids(&pdf, &ids, 2); int bitmap_size; char *bitmap; @@ -489,27 +492,31 @@ cnki_pdf_hn(cnki_t **param) int stream_size; char *stream; - int *dim = malloc(2 * ptr->image_length * sizeof(int)); + int *dim; + + if (ptr->image_length > 0) { + dim = malloc(2 * ptr->image_length * sizeof(int)); + + if (dim == NULL) { + free(root_kid); + free(ids); + return 1; + } + + dictionary_size = 256; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } + } int ret; int info[3]; - if (dim == NULL) { - free(root_kid); - free(ids); - return 1; - } - - dictionary_size = 256; - dictionary = malloc(dictionary_size); - - if 
(dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; - } - for (int i = 0; i < ptr->image_length; i++) { memset(dictionary, 0, dictionary_size); @@ -684,66 +691,42 @@ cnki_pdf_hn(cnki_t **param) } } - memset(dictionary, 0, dictionary_size); + if (ptr->image_length > 0) { + memset(dictionary, 0, dictionary_size); - strcat(dictionary, "<<\n/XObject <<"); + strcat(dictionary, "<<\n/XObject <<"); - for (int i = 0; i < ptr->image_length; i++) { - snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); - strcat(dictionary, buf); + for (int i = 0; i < ptr->image_length; i++) { + snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); + strcat(dictionary, buf); - if (i + 1 < ptr->image_length) - strcat(dictionary, " "); + if (i + 1 < ptr->image_length) + strcat(dictionary, " "); + } + + strcat(dictionary, ">>\n>>"); + + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + + free(dictionary); } - strcat(dictionary, ">>\n>>"); - - pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); - - free(dictionary); - int conv_size; char *conv_dst; char conv_src[2]; char conv_hex[3]; - if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { - cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); + if (ptr->text_size > 0) { + if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0 || + strncmp(ptr->text, "COMPRESSTEXT", 12) == 0) { + cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); - dictionary_size = 64 + 2 * stream_size; - dictionary = malloc(dictionary_size); + free(ptr->text); - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; + ptr->text_size = stream_size; + ptr->text = stream; } - memset(dictionary, 0, dictionary_size); - - strcat(dictionary, ""); - } else { dictionary_size = 64 + 2 * ptr->text_size; dictionary = malloc(dictionary_size); @@ -758,9 +741,26 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "text_size; i += 4) { - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i 
+ 2]; + for (int i = 0; i < ptr->text_size; i += 6) { + if (i + 5 >= ptr->text_size) + break; + + conv_src[0] = ptr->text[i + 5]; + conv_src[1] = ptr->text[i + 4]; + + if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) { + strcat(dictionary, "a389"); + continue; + } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) { + strcat(dictionary, "a38a"); + continue; + } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) { + strcat(dictionary, "a38d"); + continue; + } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) { + strcat(dictionary, "a3a0"); + continue; + } conv_size = 6; @@ -776,12 +776,12 @@ cnki_pdf_hn(cnki_t **param) } strcat(dictionary, ">"); + + /* FIXME: Use the text somehow? */ + free(dictionary); } - /* FIXME: Use the text somehow? */ - free(dictionary); - - dictionary_size = 64 + 64 * ptr->image_length; + dictionary_size = 64 + 128 * ptr->image_length; dictionary = malloc(dictionary_size); if (dictionary == NULL) { @@ -791,96 +791,109 @@ cnki_pdf_hn(cnki_t **param) return 1; } - memset(dictionary, 0, dictionary_size); + if (ptr->image_length > 0) { + memset(dictionary, 0, dictionary_size); - strcat(dictionary, "q\n"); + strcat(dictionary, "q\n"); - strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); + strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); - double resize_x; - double resize_y; + double resize_x; + double resize_y; - for (int i = 0; i < ptr->image_length; i++) { - if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) - continue; + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; - /* Scale within bound of A4 paper */ - resize_x = 595.276 * 4 / dim[i * 2]; - resize_y = 841.89 * 4 / dim[i * 2 + 1]; + /* Scale within bound of A4 paper */ + resize_x = 595.276 * 4 / dim[i * 2]; + resize_y = 841.89 * 4 / dim[i * 2 + 1]; - if (resize_y < resize_x) - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", - resize_y, resize_y); - else - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", - resize_x, resize_x); - strcat(dictionary, buf); - - /* 
Apply transformation matrix */ - if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { - snprintf(buf, 64, "1 0 0 1 0 %d cm\n", - dim[i * 2 + 1]); + if (resize_y < resize_x) + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + resize_y, resize_y); + else + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + resize_x, resize_x); strcat(dictionary, buf); - strcat(dictionary, "1 0 0 -1 0 0 cm\n"); + /* Apply transformation matrix */ + if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { + snprintf(buf, 64, "1 0 0 1 0 %d cm\n", + dim[i * 2 + 1]); + strcat(dictionary, buf); + + strcat(dictionary, "1 0 0 -1 0 0 cm\n"); + } + + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", + dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Im%d Do\n", i); + strcat(dictionary, buf); } - snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", - dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, "Q"); + + if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { + free(root_kid); + free(ids); + free(dim); + free(dictionary); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n"); + + snprintf(buf, 64, "/Length %d\n", stream_size); strcat(dictionary, buf); - snprintf(buf, 64, "/Im%d Do\n", i); - strcat(dictionary, buf); + strcat(dictionary, "/Filter /FlateDecode\n"); + + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + NULL, dictionary, stream, stream_size); + + free(stream); } - strcat(dictionary, "Q"); - - if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { - free(root_kid); - free(ids); - free(dim); - free(dictionary); - return 1; - } - - memset(dictionary, 0, dictionary_size); - - strcat(dictionary, "<<\n"); - - snprintf(buf, 64, "/Length %d\n", stream_size); - strcat(dictionary, buf); - - strcat(dictionary, "/Filter /FlateDecode\n"); - - strcat(dictionary, ">>"); - - pdf_obj_append(&pdf, ids[ptr->image_length + 1], - NULL, dictionary, 
stream, stream_size); - - free(stream); - memset(dictionary, 0, dictionary_size); strcat(dictionary, "<<\n/Type /Page\n"); - snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); - strcat(dictionary, buf); - - snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); - strcat(dictionary, buf); - /* A4 paper */ strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n"); - /* Add /Parent when we know root */ - pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); + if (ptr->image_length > 0) { + free(dim); + + snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); + strcat(dictionary, buf); + + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); + + root_kid[cnt++] = ids[ptr->image_length + 2]; + } else { + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); + + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 1], NULL, dictionary, NULL, 0); + + root_kid[cnt++] = ids[ptr->image_length + 1]; + } free(dictionary); - root_kid[cnt++] = ids[ptr->image_length + 2]; - free(ids); - free(dim); ptr = ptr->next; } diff --git a/src/cnki_zlib.c b/src/cnki_zlib.c index edff141..075456b 100644 --- a/src/cnki_zlib.c +++ b/src/cnki_zlib.c @@ -13,12 +13,17 @@ int cnki_zlib(char **dst, int *dst_size, const char * restrict src, int src_size) { + uint8_t padding = 0; int32_t size; - memcpy(&size, src + 20, 4); + + if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0) + padding = 8; + + memcpy(&size, src + 12 + padding, 4); *dst_size = size; - if (strinflate(dst, size, src + 24, src_size - 24) != 0) + if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0) return 1; return 0; diff --git a/src/melon.c b/src/melon.c index af6aaf4..f8bb645 100644 --- a/src/melon.c +++ b/src/melon.c @@ -98,7 
+98,8 @@ main(int argc, char **argv) strerror(errno)); return EXIT_FAILURE; } - } else if (strncmp(param->file_stat->type, "HN", 2) == 0) { + } else if (strncmp(param->file_stat->type, "HN", 2) == 0 || + (unsigned char) param->file_stat->type[0] == 0xc8) { if (cnki_hn(¶m) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); From 224a09a015de72f56ee86841504575fe474c6260 Mon Sep 17 00:00:00 2001 From: yzrh Date: Mon, 26 Dec 2022 00:12:46 +0000 Subject: [PATCH 04/41] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index b4d1136..c4ce375 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -1,12 +1,15 @@ -0.3.0 (2022-XX-XX) +0.3.0 (2023-XX-XX) ================== * Support JPEG 2000 for HN. +* Support HN figure placement. +* Support HN text overlay. -0.2.1 (2022-12-XX) +0.2.1 (2022-12-26) ================== * Handle different JPEG colour component. +* Handle headless HN and page with no image. 0.2.0 (2022-12-22) ================== From abce2fd2e4f8089779fb9b1dce94133716b0bb39 Mon Sep 17 00:00:00 2001 From: yzrh Date: Mon, 26 Dec 2022 03:46:01 +0000 Subject: [PATCH 05/41] Add preliminary support for HN figure placement. 
Signed-off-by: yzrh --- README.md | 4 +- src/cnki.c | 2 +- src/cnki.h | 4 + src/cnki_hn.c | 4 + src/cnki_pdf.c | 193 ++++++++++++++++++++++++++++++++++++------------- src/version.h | 2 +- 6 files changed, 156 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 471282e..b94a3e7 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,9 @@ Specify output file Set buffer size (default 512k) -v, --verbose -Print more information (twice for even more, three times for HN image decoding information as well) +Print more information (twice for even more, three times for HN image processing information as well) Thanks ====== -This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf) +This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf) diff --git a/src/cnki.c b/src/cnki.c index cc49d73..8c2e6e6 100644 --- a/src/cnki.c +++ b/src/cnki.c @@ -138,7 +138,7 @@ cnki_info(cnki_t **param) if ((*param)->file_stat->outline > 0) { if ((*param)->stat > 1) { printf("Loading outline(s)\n"); - printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n", + printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n", "title", "hierarchy", "page", diff --git a/src/cnki.h b/src/cnki.h index 193e69b..e9cc5d1 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -58,6 +58,10 @@ typedef struct _hn_image_t { int32_t format; /* hn_code */ int32_t address; int32_t size; + int16_t x; + int16_t y; + int16_t w; + int16_t h; char *image; } hn_image_t; diff --git a/src/cnki_hn.c b/src/cnki_hn.c index 4d32092..c2f76ec 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -93,6 +93,10 @@ cnki_hn(cnki_t **param) fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); + ptr->image_data[i].x = 0; + ptr->image_data[i].y = 0; + ptr->image_data[i].w = 0; + ptr->image_data[i].h = 0; fseek((*param)->fp_i, ptr->image_data[i].address + 
ptr->image_data[i].size, SEEK_SET); diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 603ce01..0cb30ca 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -524,7 +524,7 @@ cnki_pdf_hn(cnki_t **param) "/Subtype /Image\n"); if ((*param)->stat > 2) - printf("\tDecoding data, page %04d item %02d format %d... ", + printf("\tProcessing image, page %04d item %d format %d... ", ptr->page, i, ptr->image_data[i].format); switch (ptr->image_data[i].format) { @@ -700,7 +700,7 @@ cnki_pdf_hn(cnki_t **param) snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); strcat(dictionary, buf); - if (i + 1 < ptr->image_length) + if (i < ptr->image_length - 1) strcat(dictionary, " "); } @@ -739,43 +739,112 @@ cnki_pdf_hn(cnki_t **param) memset(dictionary, 0, dictionary_size); - strcat(dictionary, "text_size; i += 6) { - if (i + 5 >= ptr->text_size) - break; + for (int i = 0, j = 0; i < ptr->text_size - 1;) { + switch ((uint16_t) (ptr->text[i + 1] << 8 | ptr->text[i])) { + case 0x8001: + if (ptr->address_next <= ptr->address) { + i += 2; + break; + } - conv_src[0] = ptr->text[i + 5]; - conv_src[1] = ptr->text[i + 4]; + strcat(dictionary, "T*\n"); + case 0x8070: + if (ptr->address_next > ptr->address) { + i += 4; - if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) { - strcat(dictionary, "a389"); - continue; - } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) { - strcat(dictionary, "a38a"); - continue; - } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) { - strcat(dictionary, "a38d"); - continue; - } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) { - strcat(dictionary, "a3a0"); - continue; - } + for (;;) { + if (i + 3 >= ptr->text_size || + (unsigned char) ptr->text[i + 1] == 0x80) + break; - conv_size = 6; + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - for (int j = 0; j < conv_size - 2; j++) { - snprintf(conv_hex, 3, - "%02x", (unsigned char) conv_dst[j]); - strcat(dictionary, 
conv_hex); - } - free(conv_dst); + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, " Tj\n"); + } + free(conv_dst); + } + + i += 4; + } + + break; + } + + if (i + 7 >= ptr->text_size) { + i += 2; + break; + } + + conv_src[0] = ptr->text[i + 7]; + conv_src[1] = ptr->text[i + 6]; + + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, " Tj\n"); + } + free(conv_dst); + } + + i += 8; + break; + case 0x800a: + if (i + 27 >= ptr->text_size || j >= ptr->image_length) { + i += 2; + break; + } + + if (ptr->image_length > 0) { + ptr->image_data[j].x = + ptr->text[i + 5] << 8 | ptr->text[i + 4]; + ptr->image_data[j].y = + ptr->text[i + 7] << 8 | ptr->text[i + 6]; + ptr->image_data[j].w = + ptr->text[i + 9] << 8 | ptr->text[i + 8]; + ptr->image_data[j].h = + ptr->text[i + 11] << 8 | ptr->text[i + 10]; + + if ((*param)->stat > 2) + printf("\tItem %d: origin (%4d, %4d), width %4d, height %4d\n", + j, + ptr->image_data[j].x, + ptr->image_data[j].y, + ptr->image_data[j].w, + ptr->image_data[j].h); + } + + i += 28; + j++; + break; + default: + i += 4; + break; } } - strcat(dictionary, ">"); + strcat(dictionary, "ET"); /* FIXME: Use the text somehow? 
*/ free(dictionary); @@ -794,20 +863,14 @@ cnki_pdf_hn(cnki_t **param) if (ptr->image_length > 0) { memset(dictionary, 0, dictionary_size); - strcat(dictionary, "q\n"); - - strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); - - double resize_x; - double resize_y; - - for (int i = 0; i < ptr->image_length; i++) { - if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) - continue; + char resize_str[64] = "0.25 0 0 0.25 0 0 cm\n"; + double resize_x = 1; + double resize_y = 1; + if (dim[0] > 0 && dim[1] > 0) { /* Scale within bound of A4 paper */ - resize_x = 595.276 * 4 / dim[i * 2]; - resize_y = 841.89 * 4 / dim[i * 2 + 1]; + resize_x = 4 * 595.2756 / dim[0]; + resize_y = 4 * 841.8898 / dim[1]; if (resize_y < resize_x) snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", @@ -815,9 +878,18 @@ cnki_pdf_hn(cnki_t **param) else snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", resize_x, resize_x); - strcat(dictionary, buf); + strcat(resize_str, buf); + } - /* Apply transformation matrix */ + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; + + strcat(dictionary, "q\n"); + + strcat(dictionary, resize_str); + + /* Rotate image */ if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { snprintf(buf, 64, "1 0 0 1 0 %d cm\n", dim[i * 2 + 1]); @@ -826,15 +898,38 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "1 0 0 -1 0 0 cm\n"); } + /* Translate figure */ + if (i > 0) { + double origin_x = 0.4043745 * ptr->image_data[i].x; + double origin_y = 0.4043561 * ptr->image_data[i].y; + + if (origin_x < 0) + origin_x += (2381.102 - dim[i * 2]) / 2; + + if (origin_y < 0) + origin_y += (3367.559 + dim[i * 2 + 1]) / 2; + + if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) + origin_y = -3367.559 + origin_y + dim[i * 2 + 1]; + else + origin_y = 3367.559 - origin_y - dim[i * 2 + 1]; + + snprintf(buf, 64, "1 0 0 1 %f %f cm\n", origin_x, origin_y); + strcat(dictionary, buf); + } + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", 
dim[i * 2], dim[i * 2 + 1]); strcat(dictionary, buf); snprintf(buf, 64, "/Im%d Do\n", i); strcat(dictionary, buf); - } - strcat(dictionary, "Q"); + strcat(dictionary, "Q"); + + if (i < ptr->image_length - 1) + strcat(dictionary, "\n"); + } if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { free(root_kid); @@ -866,7 +961,7 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "<<\n/Type /Page\n"); /* A4 paper */ - strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n"); + strcat(dictionary, "/MediaBox [0 0 595.2756 841.8898]\n"); if (ptr->image_length > 0) { free(dim); @@ -946,7 +1041,7 @@ cnki_pdf_hn(cnki_t **param) for (int i = 0; i < (*param)->file_stat->page; i++) { snprintf(buf, 64, "%d 0 R", root_kid[i]); strcat(dictionary, buf); - if (i + 1 < (*param)->file_stat->page) + if (i < (*param)->file_stat->page - 1) strcat(dictionary, " "); } diff --git a/src/version.h b/src/version.h index 7c1ca3b..db25a27 100644 --- a/src/version.h +++ b/src/version.h @@ -6,5 +6,5 @@ #define VERSION "0" #define RELEASE "2" -#define PATCH "1" +#define PATCH "2" #define EXTRA "" From 8083b30530a37d3529d453515465a00aac74a154 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 00:18:07 +0000 Subject: [PATCH 06/41] Add JPEG 2000 support. Signed-off-by: yzrh --- README.md | 5 +-- src/Makefile | 6 +-- src/cnki_pdf.c | 36 +++++++++++++++ src/jp2.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++ src/jp2.h | 8 ++++ 5 files changed, 168 insertions(+), 6 deletions(-) create mode 100644 src/jp2.c create mode 100644 src/jp2.h diff --git a/README.md b/README.md index b94a3e7..4e36fbd 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ Development Currently, CAJ, KDH, and HN can be converted. Please report any failures with a sample that can reproduce the behaviour. -HN support does not support JPEG 2000 yet. - Dependency ---------- @@ -19,6 +17,7 @@ Dependency 3. zlib 4. jbig2dec 5. libjpeg-turbo +6. 
openjpeg Usage ===== @@ -36,7 +35,7 @@ Options Specify output file -b, --buffer -Set buffer size (default 512k) +Set input buffer size (default 512k) -v, --verbose Print more information (twice for even more, three times for HN image processing information as well) diff --git a/src/Makefile b/src/Makefile index 6943af3..065a8a5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,11 +4,11 @@ # SPDX-License-Identifier: Apache-2.0 # -src = melon.c iconv.c zlib.c jbig.c jbig2.c jpeg.c \ +src = melon.c iconv.c zlib.c jbig.c jbig2.c jpeg.c jp2.c \ cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c \ cnki_pdf.c cnki_zlib.c cnki_jbig.c cnki_jbig2.c cnki.c \ pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c -inc = extern.h version.h iconv.h zlib.h jbig.h jbig2.h jpeg.h \ +inc = extern.h version.h iconv.h zlib.h jbig.h jbig2.h jpeg.h jp2.h \ cnki.h pdf_cnki.h cnki_jbig.h pdf.h obj = ${src:.c=.o} @@ -16,7 +16,7 @@ obj = ${src:.c=.o} PREFIX = /usr/local CFLAGS = -O2 -pipe -flto -Wall -Wextra -LDFLAGS = -Wl,-O2 -lcrypto -liconv -lz -ljbig2dec -ljpeg -Wl,--as-needed +LDFLAGS = -Wl,-O2 -lcrypto -liconv -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed CFLAGS += -I/usr/local/include LDFLAGS += -L/usr/local/lib diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 0cb30ca..887e5f4 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -11,6 +11,7 @@ #include "iconv.h" #include "zlib.h" #include "jpeg.h" +#include "jp2.h" #include "pdf.h" #include "pdf_cnki.h" @@ -660,6 +661,41 @@ cnki_pdf_hn(cnki_t **param) dim[i * 2 + 1] = info[1]; break; case JPX: + ret = strinfo_jp2_dim(&info[0], + &info[1], + ptr->image_data[i].image, + ptr->image_data[i].size); + + if (ret != 0) { + dim[i * 2] = 0; + dim[i * 2 + 1] = 0; + break; + } + + stream_size = ptr->image_data[i].size; + stream = malloc(stream_size); + if (stream == NULL) { + free(root_kid); + free(ids); + free(dim); + free(dictionary); + return 1; + } + memcpy(stream, ptr->image_data[i].image, stream_size); + + snprintf(buf, 64, "/Width 
%d\n/Height %d\n", + info[0], info[1]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Length %d\n", + stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /JPXDecode\n"); + + dim[i * 2] = info[0]; + dim[i * 2 + 1] = info[1]; + break; default: ret = -1; dim[i * 2] = -1; diff --git a/src/jp2.c b/src/jp2.c new file mode 100644 index 0000000..9420b48 --- /dev/null +++ b/src/jp2.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#ifdef __linux__ +#include +#else +#include +#endif + +typedef struct _stream_user_data { + OPJ_SIZE_T pos; + OPJ_SIZE_T size; + const unsigned char *data; +} stream_user_data; + +static OPJ_SIZE_T +_opj_stream_read(void *p_buffer, OPJ_SIZE_T p_nb_bytes, void *p_user_data) +{ + stream_user_data *d = (stream_user_data *) p_user_data; + + if (d->pos >= d->size) + return (OPJ_SIZE_T) - 1; + + OPJ_SIZE_T ret_size = p_nb_bytes; + + if (d->pos + ret_size > d->size) + ret_size = d->size - d->pos; + + memcpy(p_buffer, d->data + d->pos, ret_size); + + d->pos += ret_size; + + return ret_size; +} + +static OPJ_OFF_T +_opj_stream_skip(OPJ_OFF_T p_nb_bytes, void *p_user_data) +{ + stream_user_data *d = (stream_user_data *) p_user_data; + + if (d->pos + p_nb_bytes <= d->size) + d->pos += p_nb_bytes; + else + d->pos = d->size; + + return d->pos; +} + +static OPJ_BOOL +_opj_stream_seek(OPJ_OFF_T p_nb_bytes, void *p_user_data) +{ + stream_user_data *d = (stream_user_data *) p_user_data; + + if (p_nb_bytes <= (OPJ_OFF_T) d->size) { + d->pos = p_nb_bytes; + return OPJ_TRUE; + } + + return OPJ_FALSE; +} + +int +strinfo_jp2_dim(int *jp2_width, int *jp2_height, + const char * restrict data, int data_size) +{ + opj_codec_t *codec; + opj_dparameters_t param; + opj_stream_t *stream; + opj_image_t *image; + stream_user_data d; + + if (data_size < 2) + return 1; + + opj_set_default_decoder_parameters(&param); + + if ((unsigned char) data[0] == 0xff && (unsigned char) data[1] == 0x4f) + 
codec = opj_create_decompress(OPJ_CODEC_J2K); + else + codec = opj_create_decompress(OPJ_CODEC_JP2); + + if (!opj_setup_decoder(codec, &param)) { + opj_destroy_codec(codec); + return 1; + } + + stream = opj_stream_default_create(OPJ_TRUE); + + d.pos = 0; + d.size = data_size; + d.data = (unsigned char *) data; + + opj_stream_set_read_function(stream, _opj_stream_read); + opj_stream_set_skip_function(stream, _opj_stream_skip); + opj_stream_set_seek_function(stream, _opj_stream_seek); + opj_stream_set_user_data(stream, &d, NULL); + opj_stream_set_user_data_length(stream, data_size); + + if (!opj_read_header(stream, codec, &image)) { + opj_destroy_codec(codec); + opj_stream_destroy(stream); + return 1; + } + + opj_destroy_codec(codec); + opj_stream_destroy(stream); + + *jp2_width = image->x1 - image->x0; + *jp2_height = image->y1 - image->y0; + + opj_image_destroy(image); + + return 0; +} diff --git a/src/jp2.h b/src/jp2.h new file mode 100644 index 0000000..5644938 --- /dev/null +++ b/src/jp2.h @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2022, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +int strinfo_jp2_dim(int *jp2_width, int *jp2_height, + const char * restrict data, int data_size); From 988a751c15b43942b20ed437a15d6da6945aa883 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 02:49:05 +0000 Subject: [PATCH 07/41] Handle missing root object which is parent of others. Signed-off-by: yzrh --- CHANGE.md | 1 + src/cnki_pdf.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index c4ce375..a832777 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -1,6 +1,7 @@ 0.3.0 (2023-XX-XX) ================== +* Handle missing but referenced root object. * Support JPEG 2000 for HN. * Support HN figure placement. * Support HN text overlay. 
diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 887e5f4..502a0ce 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -309,7 +309,9 @@ cnki_pdf(cnki_t **param) if (parent_missing[i] == 1) root_kid++; - if (root_kid <= 1) { + if (root_kid > 1) { + root = pdf_get_free_id(&pdf); + } else { if (root_kid == 0) { for (int i = 1; i <= parent[0]; i++) if (root == 0 || root < parent[i]) @@ -323,15 +325,15 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Root object is %d.\n", root); - } else { + } + + if (pdf_get_kid_count(&pdf, root) == 0) { if ((*param)->stat > 0) printf("Root object is missing\n"); if ((*param)->stat > 1) printf("Generating root object\n"); - root = pdf_get_free_id(&pdf); - snprintf(buf, 64, "<<\n/Type /Pages\n/Kids "); strcat(dictionary, buf); @@ -886,7 +888,7 @@ cnki_pdf_hn(cnki_t **param) free(dictionary); } - dictionary_size = 64 + 128 * ptr->image_length; + dictionary_size = 128 + 128 * ptr->image_length; dictionary = malloc(dictionary_size); if (dictionary == NULL) { From cd0af5ba3ceee50b1c8a287149b48b2f4cb7ce0b Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 03:58:22 +0000 Subject: [PATCH 08/41] Fix buffer overflow when object size is less than 8 bytes. 
Signed-off-by: yzrh --- src/pdf_parser.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 3b29c52..b4470f9 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -126,6 +126,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) pdf_object_t *ptr = (*pdf)->next; + char str[8]; char *buf; char *head; char *tail; @@ -140,11 +141,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) memset(buf, 0, ptr->size); fseek(*fp, ptr->address - 12, SEEK_SET); - fread(buf, 8, 1, *fp); + fread(str, 8, 1, *fp); for (int i = 0; i < 8; i++) { - if (buf[i] >= '0' && buf[i] <= '9') { - ptr->id = atoi(buf + i); + if (str[i] >= '0' && str[i] <= '9') { + ptr->id = atoi(str + i); break; } } From 97931e1470880f644ec6efefcce8f2bab33f8b30 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 05:23:04 +0000 Subject: [PATCH 09/41] Fix PDF object check. Signed-off-by: yzrh --- CHANGE.md | 3 ++- src/cnki_pdf.c | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index a832777..63a76c3 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -1,10 +1,11 @@ 0.3.0 (2023-XX-XX) ================== -* Handle missing but referenced root object. * Support JPEG 2000 for HN. * Support HN figure placement. * Support HN text overlay. +* Handle missing but referenced root object. +* Fix buffer overflow. 
0.2.1 (2022-12-26) ================== diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 502a0ce..328a989 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -322,12 +322,13 @@ cnki_pdf(cnki_t **param) root = i; } - if ((*param)->stat > 0) - printf("Root object is %d.\n", - root); + if (root == 0) + root = pdf_get_free_id(&pdf); + else if ((*param)->stat > 0) + printf("Root object is %d.\n", root); } - if (pdf_get_kid_count(&pdf, root) == 0) { + if (pdf_get_obj(&pdf, root, NULL) != 0) { if ((*param)->stat > 0) printf("Root object is missing\n"); From 060bc00a0d68de72299d7084669b3b564248e205 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 06:30:59 +0000 Subject: [PATCH 10/41] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGE.md b/CHANGE.md index 63a76c3..1069f8e 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -1,10 +1,15 @@ 0.3.0 (2023-XX-XX) ================== -* Support JPEG 2000 for HN. * Support HN figure placement. * Support HN text overlay. + +0.2.2 (2022-12-29) +================== + +* Support JPEG 2000 for HN. * Handle missing but referenced root object. +* Handle HN with more than one image per page. * Fix buffer overflow. 0.2.1 (2022-12-26) From 5a1afb00567aeacca9ead58f5383c3f502317a69 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 16:32:49 +0000 Subject: [PATCH 11/41] Link against libc for iconv, find openjpeg header with pkgconf. Signed-off-by: yzrh --- README.md | 12 ++++++------ src/Makefile | 7 ++++++- src/jp2.c | 4 ---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 4e36fbd..1fdcf6c 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,12 @@ any failures with a sample that can reproduce the behaviour. Dependency ---------- -1. OpenSSL -2. libiconv -3. zlib -4. jbig2dec -5. libjpeg-turbo -6. openjpeg +1. libcrypto (OpenSSL) +2. zlib +3. jbig2dec +4. libjpeg-turbo +5. openjpeg +6. 
pkgconf Usage ===== diff --git a/src/Makefile b/src/Makefile index 065a8a5..5442ff0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -16,11 +16,16 @@ obj = ${src:.c=.o} PREFIX = /usr/local CFLAGS = -O2 -pipe -flto -Wall -Wextra -LDFLAGS = -Wl,-O2 -lcrypto -liconv -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed +LDFLAGS = -Wl,-O2 -lcrypto -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed CFLAGS += -I/usr/local/include LDFLAGS += -L/usr/local/lib +OPENJPEG_CFLAGS != pkgconf --cflags libopenjp2 + +CFLAGS += ${OPENJPEG_CFLAGS} +CFLAGS += -DLIBICONV_PLUG + all: ${obj} ${inc} ${CC} ${LDFLAGS} -o melon ${obj} diff --git a/src/jp2.c b/src/jp2.c index 9420b48..a9d4429 100644 --- a/src/jp2.c +++ b/src/jp2.c @@ -6,11 +6,7 @@ #include -#ifdef __linux__ #include -#else -#include -#endif typedef struct _stream_user_data { OPJ_SIZE_T pos; From 1ce3f89574fa7256ab019eefc96a7362165cca52 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 29 Dec 2022 21:10:03 +0000 Subject: [PATCH 12/41] Handle combination of text and image in page content. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 125 ++++++++++++++++++++++++------------------------- src/version.h | 2 +- 2 files changed, 63 insertions(+), 64 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 328a989..3d446e4 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -460,6 +460,9 @@ cnki_pdf_hn(cnki_t **param) if (pdf_obj_create(&pdf) != 0) return 1; + int font = pdf_get_free_id(&pdf); + pdf_obj_append(&pdf, font, NULL, "<<\n/Type /Font\n/Subtype /TrueType\n/BaseFont /NotoSansCJKSC\n>>", NULL, 0); + if ((*param)->stat > 1) printf("Generating PDF object(s)\n"); @@ -480,15 +483,12 @@ cnki_pdf_hn(cnki_t **param) while (ptr != NULL) { /* * External object (ptr->image_length) + - * content object + * resource object + + * content object * page object */ int *ids = NULL; - if (ptr->image_length > 0) - pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); - else - pdf_get_free_ids(&pdf, &ids, 2); + pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); int bitmap_size; char *bitmap; @@ -721,19 +721,39 @@ cnki_pdf_hn(cnki_t **param) if ((*param)->stat > 2) printf("Not extracted.\n"); - pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); + pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0); } else { if ((*param)->stat > 2) printf("Unsupported format.\n"); - pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); + pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0); } } - if (ptr->image_length > 0) { - memset(dictionary, 0, dictionary_size); + if (ptr->image_length > 0) + free(dictionary); - strcat(dictionary, "<<\n/XObject <<"); + dictionary_size = 128 + 2 * ptr->text_size + 128 * ptr->image_length; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n"); + + if (ptr->text_size > 0) { + snprintf(buf, 64, "/Font <</F0 %d 0 R>>\n", font); + strcat(dictionary, buf); + } + + if (ptr->image_length > 0) { + strcat(dictionary, 
"/XObject <<"); for (int i = 0; i < ptr->image_length; i++) { snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); @@ -743,13 +763,15 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, " "); } - strcat(dictionary, ">>\n>>"); - - pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); - - free(dictionary); + strcat(dictionary, ">>\n"); } + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + + memset(dictionary, 0, dictionary_size); + int conv_size; char *conv_dst; char conv_src[2]; @@ -766,20 +788,10 @@ cnki_pdf_hn(cnki_t **param) ptr->text = stream; } - dictionary_size = 64 + 2 * ptr->text_size; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; - } - - memset(dictionary, 0, dictionary_size); - strcat(dictionary, "BT\n"); + strcat(dictionary, "/F0 10 Tf\n"); + for (int i = 0, j = 0; i < ptr->text_size - 1;) { switch ((uint16_t) (ptr->text[i + 1] << 8 | ptr->text[i])) { case 0x8001: @@ -801,6 +813,9 @@ cnki_pdf_hn(cnki_t **param) conv_src[0] = ptr->text[i + 3]; conv_src[1] = ptr->text[i + 2]; + //snprintf(buf, 64, "%f %f Td\n"); + //strcat(dictionary, buf); + conv_size = 6; if (strconv(&conv_dst, "UTF-16BE", @@ -831,6 +846,9 @@ cnki_pdf_hn(cnki_t **param) conv_src[0] = ptr->text[i + 7]; conv_src[1] = ptr->text[i + 6]; + //snprintf(buf, 64, "%f %f Td\n"); + //strcat(dictionary, buf); + conv_size = 6; if (strconv(&conv_dst, "UTF-16BE", @@ -885,23 +903,11 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "ET"); - /* FIXME: Use the text somehow? 
*/ - free(dictionary); - } - - dictionary_size = 128 + 128 * ptr->image_length; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; + if (ptr->image_length > 0) + strcat(dictionary, "\n"); } if (ptr->image_length > 0) { - memset(dictionary, 0, dictionary_size); - char resize_str[64] = "0.25 0 0 0.25 0 0 cm\n"; double resize_x = 1; double resize_y = 1; @@ -970,10 +976,13 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "\n"); } + free(dim); + } + + if (strlen(dictionary) > 0) { if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { free(root_kid); free(ids); - free(dim); free(dictionary); return 1; } @@ -993,6 +1002,9 @@ cnki_pdf_hn(cnki_t **param) NULL, dictionary, stream, stream_size); free(stream); + } else { + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + "null", NULL, NULL, 0); } memset(dictionary, 0, dictionary_size); @@ -1002,32 +1014,19 @@ cnki_pdf_hn(cnki_t **param) /* A4 paper */ strcat(dictionary, "/MediaBox [0 0 595.2756 841.8898]\n"); - if (ptr->image_length > 0) { - free(dim); + snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); - snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); - strcat(dictionary, buf); + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); + strcat(dictionary, buf); - snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); - strcat(dictionary, buf); + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); - /* Add /Parent when we know root */ - pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); - - root_kid[cnt++] = ids[ptr->image_length + 2]; - } else { - snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length]); - strcat(dictionary, buf); - - /* Add /Parent when we know root */ - pdf_obj_append(&pdf, ids[ptr->image_length + 1], NULL, dictionary, NULL, 
0); - - root_kid[cnt++] = ids[ptr->image_length + 1]; - } - - free(dictionary); + root_kid[cnt++] = ids[ptr->image_length + 2]; free(ids); + free(dictionary); ptr = ptr->next; } diff --git a/src/version.h b/src/version.h index db25a27..08b70ea 100644 --- a/src/version.h +++ b/src/version.h @@ -6,5 +6,5 @@ #define VERSION "0" #define RELEASE "2" -#define PATCH "2" +#define PATCH "3" #define EXTRA "" From 5466a441dfad627063ae498ed7359f1af9fb8de7 Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 30 Dec 2022 02:00:12 +0000 Subject: [PATCH 13/41] Fix type casting when processing data. Signed-off-by: yzrh --- src/cnki.h | 8 ++++---- src/cnki_jbig.h | 4 ++-- src/cnki_pdf.c | 50 ++++++++++++++++++++++++------------------------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/cnki.h b/src/cnki.h index e9cc5d1..7d7d15e 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -58,10 +58,10 @@ typedef struct _hn_image_t { int32_t format; /* hn_code */ int32_t address; int32_t size; - int16_t x; - int16_t y; - int16_t w; - int16_t h; + uint16_t x; + uint16_t y; + uint16_t w; + uint16_t h; char *image; } hn_image_t; diff --git a/src/cnki_jbig.h b/src/cnki_jbig.h index 701b4df..2983607 100644 --- a/src/cnki_jbig.h +++ b/src/cnki_jbig.h @@ -27,8 +27,8 @@ typedef struct _dib_t { uint16_t depth; uint32_t compression; /* dib_compression_code */ uint32_t size; - uint32_t resolution_h; - uint32_t resolution_v; + int32_t resolution_h; + int32_t resolution_v; uint32_t colour; uint32_t colour_used; } dib_t; diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 3d446e4..372c5da 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -793,15 +793,15 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/F0 10 Tf\n"); for (int i = 0, j = 0; i < ptr->text_size - 1;) { - switch ((uint16_t) (ptr->text[i + 1] << 8 | ptr->text[i])) { - case 0x8001: + switch (ptr->text[i]) { + case 0x01: if (ptr->address_next <= ptr->address) { i += 2; break; } strcat(dictionary, "T*\n"); - case 0x8070: + case 0x70: if 
(ptr->address_next > ptr->address) { i += 4; @@ -867,21 +867,24 @@ cnki_pdf_hn(cnki_t **param) i += 8; break; - case 0x800a: + case 0x0a: if (i + 27 >= ptr->text_size || j >= ptr->image_length) { i += 2; break; } if (ptr->image_length > 0) { - ptr->image_data[j].x = - ptr->text[i + 5] << 8 | ptr->text[i + 4]; - ptr->image_data[j].y = - ptr->text[i + 7] << 8 | ptr->text[i + 6]; - ptr->image_data[j].w = - ptr->text[i + 9] << 8 | ptr->text[i + 8]; - ptr->image_data[j].h = - ptr->text[i + 11] << 8 | ptr->text[i + 10]; + ptr->image_data[j].x += (unsigned char) ptr->text[i + 5] << 8; + ptr->image_data[j].x += (unsigned char) ptr->text[i + 4]; + + ptr->image_data[j].y += (unsigned char) ptr->text[i + 7] << 8; + ptr->image_data[j].y += (unsigned char) ptr->text[i + 6]; + + ptr->image_data[j].w += (unsigned char) ptr->text[i + 9] << 8; + ptr->image_data[j].w += (unsigned char) ptr->text[i + 8]; + + ptr->image_data[j].h += (unsigned char) ptr->text[i + 11] << 8; + ptr->image_data[j].h += (unsigned char) ptr->text[i + 10]; if ((*param)->stat > 2) printf("\tItem %d: origin (%4d, %4d), width %4d, height %4d\n", @@ -908,7 +911,7 @@ cnki_pdf_hn(cnki_t **param) } if (ptr->image_length > 0) { - char resize_str[64] = "0.25 0 0 0.25 0 0 cm\n"; + char resize_str[64]; double resize_x = 1; double resize_y = 1; @@ -918,12 +921,13 @@ cnki_pdf_hn(cnki_t **param) resize_y = 4 * 841.8898 / dim[1]; if (resize_y < resize_x) - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + snprintf(resize_str, 64, "%f 0 0 %f 0 0 cm\n", resize_y, resize_y); else - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + snprintf(resize_str, 64, "%f 0 0 %f 0 0 cm\n", resize_x, resize_x); - strcat(resize_str, buf); + } else { + memset(resize_str, 0, 64); } for (int i = 0; i < ptr->image_length; i++) { @@ -932,7 +936,7 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "q\n"); - strcat(dictionary, resize_str); + strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); /* Rotate image */ if (ptr->image_data[i].format == JBIG || 
ptr->image_data[i].format == DCT_1) { @@ -945,14 +949,8 @@ cnki_pdf_hn(cnki_t **param) /* Translate figure */ if (i > 0) { - double origin_x = 0.4043745 * ptr->image_data[i].x; - double origin_y = 0.4043561 * ptr->image_data[i].y; - - if (origin_x < 0) - origin_x += (2381.102 - dim[i * 2]) / 2; - - if (origin_y < 0) - origin_y += (3367.559 + dim[i * 2 + 1]) / 2; + double origin_x = 0.4043339 * ptr->image_data[i].x; + double origin_y = 0.4043273 * ptr->image_data[i].y; if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) origin_y = -3367.559 + origin_y + dim[i * 2 + 1]; @@ -967,6 +965,8 @@ cnki_pdf_hn(cnki_t **param) dim[i * 2], dim[i * 2 + 1]); strcat(dictionary, buf); + strcat(dictionary, resize_str); + snprintf(buf, 64, "/Im%d Do\n", i); strcat(dictionary, buf); From 9646ee61c355a97ad47d20ec3e3d77bceee34b07 Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 30 Dec 2022 02:04:43 +0000 Subject: [PATCH 14/41] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGE.md b/CHANGE.md index 1069f8e..285ad1b 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -1,9 +1,13 @@ 0.3.0 (2023-XX-XX) ================== -* Support HN figure placement. * Support HN text overlay. +0.2.3 (2022-12-30) +================== + +* Support HN figure placement. + 0.2.2 (2022-12-29) ================== From 226f16ddf41619caea347228322851835b4d1045 Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 30 Dec 2022 12:20:49 +0000 Subject: [PATCH 15/41] Handle HN page with figure only. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 39 ++++++++++++++++++++------------------- src/jbig.c | 10 +++++----- src/pdf_get.c | 4 ++-- src/version.h | 2 +- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 372c5da..45f5324 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -218,7 +218,7 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); - int *parent_missing = malloc(parent[0] * sizeof(int)); + int8_t *parent_missing = malloc(parent[0] * sizeof(int8_t)); if (parent_missing == NULL) return 1; @@ -343,7 +343,7 @@ cnki_pdf(cnki_t **param) strcat(dictionary, "["); for (int i = 0, j = 0; i < parent[0]; i++) { - if (parent_missing[i]) { + if (parent_missing[i] == 1) { snprintf(buf, 64, "%d 0 R", parent[i + 1]); strcat(dictionary, buf); @@ -793,15 +793,11 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/F0 10 Tf\n"); for (int i = 0, j = 0; i < ptr->text_size - 1;) { - switch (ptr->text[i]) { - case 0x01: - if (ptr->address_next <= ptr->address) { - i += 2; - break; - } - - strcat(dictionary, "T*\n"); - case 0x70: + switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { + case 0x8001: + if (ptr->address_next > ptr->address) + strcat(dictionary, "T*\n"); + case 0x8070: if (ptr->address_next > ptr->address) { i += 4; @@ -867,9 +863,12 @@ cnki_pdf_hn(cnki_t **param) i += 8; break; - case 0x0a: + case 0x800a: if (i + 27 >= ptr->text_size || j >= ptr->image_length) { i += 2; + + if (j >= ptr->image_length) + i += 26; break; } @@ -896,7 +895,9 @@ cnki_pdf_hn(cnki_t **param) } i += 28; - j++; + + if (j == 0 || ptr->image_data[j].x > 0 || ptr->image_data[j].y > 0) + j++; break; default: i += 4; @@ -915,10 +916,10 @@ cnki_pdf_hn(cnki_t **param) double resize_x = 1; double resize_y = 1; - if (dim[0] > 0 && dim[1] > 0) { + if (ptr->image_data[0].x == 0 && ptr->image_data[0].y == 0 && dim[0] > 0 && dim[1] > 0) { /* Scale within bound of A4 paper */ 
- resize_x = 4 * 595.2756 / dim[0]; - resize_y = 4 * 841.8898 / dim[1]; + resize_x = 2381.102 / dim[0]; + resize_y = 3367.559 / dim[1]; if (resize_y < resize_x) snprintf(resize_str, 64, "%f 0 0 %f 0 0 cm\n", @@ -948,9 +949,9 @@ cnki_pdf_hn(cnki_t **param) } /* Translate figure */ - if (i > 0) { - double origin_x = 0.4043339 * ptr->image_data[i].x; - double origin_y = 0.4043273 * ptr->image_data[i].y; + if (ptr->image_data[i].x > 0 || ptr->image_data[i].y > 0) { + double origin_x = ptr->image_data[i].x * dim[i * 2] / ptr->image_data[i].w; + double origin_y = ptr->image_data[i].y * dim[i * 2 + 1] / ptr->image_data[i].h; if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) origin_y = -3367.559 + origin_y + dim[i * 2 + 1]; diff --git a/src/jbig.c b/src/jbig.c index fce8e02..b466411 100644 --- a/src/jbig.c +++ b/src/jbig.c @@ -108,7 +108,7 @@ static void _bytein(void) { if (_ret_pos < _scd_size) - _reg_c += *(_scd + _ret_pos++) << 8; + _reg_c += _scd[_ret_pos++] << 8; _ct = 8; } @@ -215,19 +215,19 @@ _procline(int line, char *a, char *b, char *c) cx >>= 1; if (_pix == 1) { - *(_ret + _width_padded * (_height - line - 1) + i / 8) |= _pix << (7 - (i & 0x07)); - *(c + i) = 1; + _ret[_width_padded * (_height - line - 1) + i / 8] |= _pix << (7 - (i & 0x07)); + c[i] = 1; cx |= 0x0200; } else { cx &= 0xfdff; } - if (i + 2 < _width && *(a + i + 2) == 1) + if (i + 2 < _width && a[i + 2] == 1) cx |= 0x0004; else cx &= 0xfffb; - if (i + 3 < _width && *(b + i + 3) == 1) + if (i + 3 < _width && b[i + 3] == 1) cx |= 0x0080; else cx &= 0xff7f; diff --git a/src/pdf_get.c b/src/pdf_get.c index a72c68d..bde5bf2 100644 --- a/src/pdf_get.c +++ b/src/pdf_get.c @@ -89,7 +89,7 @@ pdf_get_free_id(pdf_object_t **pdf) int id = 0; - for (int i = 1; i < 99999999; i++) { + for (int i = 1; i < 100000000; i++) { ptr = (*pdf)->next; while (ptr != NULL) { if (ptr->id == i) { @@ -123,7 +123,7 @@ pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count) int id = 0; pdf_object_t 
*ptr; - for (int i = 1; i < 99999999; i++) { + for (int i = 1; i < 100000000; i++) { ptr = (*pdf)->next; while (ptr != NULL) { if (ptr->id == i) { diff --git a/src/version.h b/src/version.h index 08b70ea..53be5ba 100644 --- a/src/version.h +++ b/src/version.h @@ -6,5 +6,5 @@ #define VERSION "0" #define RELEASE "2" -#define PATCH "3" +#define PATCH "4" #define EXTRA "" From 1d899d934d26a5e19333712deb50c39b3843076d Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 30 Dec 2022 20:16:53 +0000 Subject: [PATCH 16/41] Fix PDF object check. Signed-off-by: yzrh --- src/cnki_pdf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 45f5324..72500ab 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -328,7 +328,9 @@ cnki_pdf(cnki_t **param) printf("Root object is %d.\n", root); } - if (pdf_get_obj(&pdf, root, NULL) != 0) { + pdf_object_t *tmp; + + if (pdf_get_obj(&pdf, root, &tmp) != 0) { if ((*param)->stat > 0) printf("Root object is missing\n"); @@ -423,8 +425,6 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Deleting xref object\n"); - pdf_object_t *tmp; - pdf_get_obj(&pdf, xref, &tmp); pdf_obj_del(&pdf, xref); @@ -1106,7 +1106,7 @@ cnki_pdf_hn(cnki_t **param) return 1; } - pdf_object_t *tmp = NULL; + pdf_object_t *tmp; /* Add /Parent to page object */ for (int i = 0; i < (*param)->file_stat->page; i++) { From 220a81c2adc0fb51137068c46cbcbe7641115a03 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 31 Dec 2022 10:48:29 +0000 Subject: [PATCH 17/41] Fix HN image compositing. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 75 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 72500ab..6cb8c9f 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -496,10 +496,10 @@ cnki_pdf_hn(cnki_t **param) int stream_size; char *stream; - int *dim; + double *dim; if (ptr->image_length > 0) { - dim = malloc(2 * ptr->image_length * sizeof(int)); + dim = malloc(2 * ptr->image_length * sizeof(double)); if (dim == NULL) { free(root_kid); @@ -911,24 +911,35 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "\n"); } + /* FIXME: Use the text somehow? */ + memset(dictionary, 0, dictionary_size); + if (ptr->image_length > 0) { - char resize_str[64]; double resize_x = 1; double resize_y = 1; + double margin_x = 0; + double margin_y = 0; + if (ptr->image_data[0].x == 0 && ptr->image_data[0].y == 0 && dim[0] > 0 && dim[1] > 0) { /* Scale within bound of A4 paper */ - resize_x = 2381.102 / dim[0]; - resize_y = 3367.559 / dim[1]; + resize_x = 2480.315 / dim[0]; + resize_y = 3507.874 / dim[1]; - if (resize_y < resize_x) - snprintf(resize_str, 64, "%f 0 0 %f 0 0 cm\n", - resize_y, resize_y); - else - snprintf(resize_str, 64, "%f 0 0 %f 0 0 cm\n", - resize_x, resize_x); - } else { - memset(resize_str, 0, 64); + if (resize_y < resize_x) { + for (int i = 0; i < ptr->image_length; i++) { + dim[i * 2] *= resize_y; + dim[i * 2 + 1] *= resize_y; + } + } else { + for (int i = 0; i < ptr->image_length; i++) { + dim[i * 2] *= resize_x; + dim[i * 2 + 1] *= resize_x; + } + } + + margin_x = (2480.315 - dim[0]) / 2; + margin_y = (3507.874 - dim[1]) / 2; } for (int i = 0; i < ptr->image_length; i++) { @@ -937,36 +948,48 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "q\n"); - strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); + strcat(dictionary, "0.24 0 0 0.24 0 0 cm\n"); /* Rotate image */ if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { - snprintf(buf, 64, "1 0 
0 1 0 %d cm\n", - dim[i * 2 + 1]); + snprintf(buf, 64, "1 0 0 -1 0 %f cm\n", dim[i * 2 + 1]); strcat(dictionary, buf); - - strcat(dictionary, "1 0 0 -1 0 0 cm\n"); } /* Translate figure */ if (ptr->image_data[i].x > 0 || ptr->image_data[i].y > 0) { - double origin_x = ptr->image_data[i].x * dim[i * 2] / ptr->image_data[i].w; - double origin_y = ptr->image_data[i].y * dim[i * 2 + 1] / ptr->image_data[i].h; + double origin_x = ptr->image_data[i].x * 0.40433; + double origin_y = ptr->image_data[i].y * 0.40433; + + if (resize_y < resize_x) { + origin_x *= resize_y; + origin_y *= resize_y; + } else { + origin_x *= resize_x; + origin_y *= resize_x; + } if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) - origin_y = -3367.559 + origin_y + dim[i * 2 + 1]; + origin_y = -3507.874 + origin_y + dim[i * 2 + 1]; else - origin_y = 3367.559 - origin_y - dim[i * 2 + 1]; + origin_y = 3507.874 - origin_y - dim[i * 2 + 1]; snprintf(buf, 64, "1 0 0 1 %f %f cm\n", origin_x, origin_y); strcat(dictionary, buf); } - snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", - dim[i * 2], dim[i * 2 + 1]); - strcat(dictionary, buf); + if (margin_x > 0 || margin_y > 0) { + if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { + snprintf(buf, 64, "1 0 0 1 %f %f cm\n", margin_x, -margin_y); + strcat(dictionary, buf); + } else { + snprintf(buf, 64, "1 0 0 1 %f %f cm\n", margin_x, margin_y); + strcat(dictionary, buf); + } + } - strcat(dictionary, resize_str); + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, buf); snprintf(buf, 64, "/Im%d Do\n", i); strcat(dictionary, buf); From 0bbf8e65dd712d91ffc63493e4c2d16599d1685e Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 31 Dec 2022 11:28:03 +0000 Subject: [PATCH 18/41] Update CHANGE. 
Signed-off-by: yzrh --- CHANGE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGE.md b/CHANGE.md index 285ad1b..7a384ea 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -3,6 +3,12 @@ * Support HN text overlay. +0.2.4 (2022-12-31) +================== + +* Fix HN image compositing. +* Fix PDF object check. + 0.2.3 (2022-12-30) ================== From 3ac51d66b9e7b7169c99676f39e7dab457f6b979 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 31 Dec 2022 18:52:06 +0000 Subject: [PATCH 19/41] Fix JBIG table length. Signed-off-by: yzrh --- src/jbig.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/jbig.c b/src/jbig.c index b466411..37afb50 100644 --- a/src/jbig.c +++ b/src/jbig.c @@ -9,7 +9,7 @@ #include #include -static const uint16_t _LSZ[256] = { +static const uint16_t _LSZ[0x71] = { 0x5a1d, 0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036, 0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2, @@ -28,7 +28,7 @@ static const uint16_t _LSZ[256] = { 0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb }; -static const uint8_t _NLPS[256] = { +static const uint8_t _NLPS[0x71] = { 1, 14, 16, 18, 20, 23, 25, 28, 30, 33, 35, 9, 10, 12, 15, 36, 38, @@ -47,7 +47,7 @@ static const uint8_t _NLPS[256] = { 105, 108, 109, 110, 111, 110, 112, 112 }; -static const uint8_t _NMPS[256] = { +static const uint8_t _NMPS[0x71] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 15, 16, 17, @@ -66,7 +66,7 @@ static const uint8_t _NMPS[256] = { 106, 107, 103, 109, 107, 111, 109, 111 }; -static const bool _SWTCH[256] = { +static const bool _SWTCH[0x71] = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, From bffb8ce8a4f21f3e20513e7f76a949acb190e14a Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 31 Dec 2022 21:17:28 +0000 Subject: [PATCH 20/41] Fix JBIG decoder. 
Signed-off-by: yzrh --- src/jbig.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/jbig.c b/src/jbig.c index 37afb50..c3c501b 100644 --- a/src/jbig.c +++ b/src/jbig.c @@ -209,6 +209,17 @@ _procline(int line, char *a, char *b, char *c) /* The encoder must be erroneous */ uint16_t cx = 0; + if (line > 0) { + cx += (_ret[_width_padded * (_height - line)] & 0x20) << 2; + cx += _ret[_width_padded * (_height - line)] & 0x40; + cx += (_ret[_width_padded * (_height - line)] & 0x80) >> 2; + } + + if (line > 1) { + cx += (_ret[_width_padded * (_height - line + 1)] & 0x40) >> 4; + cx += (_ret[_width_padded * (_height - line + 1)] & 0x80) >> 6; + } + for (int i = 0; i < _width; i++) { _decode(cx); From 70e1e7ea97e6bb7ac9714f75fcc860fd97f34a45 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 00:42:20 +0000 Subject: [PATCH 21/41] Fix JBIG decoder data type. Signed-off-by: yzrh --- src/jbig.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jbig.c b/src/jbig.c index c3c501b..5b262e6 100644 --- a/src/jbig.c +++ b/src/jbig.c @@ -99,7 +99,7 @@ static int _height; static int _width_padded; static int _ret_pos; -static char *_ret; +static unsigned char *_ret; static int _scd_size; static unsigned char *_scd; @@ -304,7 +304,7 @@ strdec_jbig(char **bitmap, int width, int height, memset(*bitmap, 0, _height * _width_padded); _ret_pos = 0; - _ret = *bitmap; + _ret = (unsigned char *) *bitmap; _scd_size = jbig_size; _scd = (unsigned char *) jbig; From a18de8f2ef4be4a92b2e08161ce4b8f164ad4f7f Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 10:09:08 +0000 Subject: [PATCH 22/41] Rename JBIG decoder. 
Signed-off-by: yzrh --- src/Makefile | 10 +++++----- src/cnki_jbig.c | 2 +- src/{jbig.c => cnki_jbig_dec.c} | 0 src/{jbig.h => cnki_jbig_dec.h} | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename src/{jbig.c => cnki_jbig_dec.c} (100%) rename src/{jbig.h => cnki_jbig_dec.h} (100%) diff --git a/src/Makefile b/src/Makefile index 5442ff0..74aff8e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,12 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 # -src = melon.c iconv.c zlib.c jbig.c jbig2.c jpeg.c jp2.c \ - cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c \ - cnki_pdf.c cnki_zlib.c cnki_jbig.c cnki_jbig2.c cnki.c \ +src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c \ + cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c cnki_pdf.c \ + cnki_zlib.c cnki_jbig.c cnki_jbig_dec.c cnki_jbig2.c cnki.c \ pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c -inc = extern.h version.h iconv.h zlib.h jbig.h jbig2.h jpeg.h jp2.h \ - cnki.h pdf_cnki.h cnki_jbig.h pdf.h +inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h \ + cnki.h pdf_cnki.h cnki_jbig.h cnki_jbig_dec.h pdf.h obj = ${src:.c=.o} diff --git a/src/cnki_jbig.c b/src/cnki_jbig.c index f35d1d5..acc43eb 100644 --- a/src/cnki_jbig.c +++ b/src/cnki_jbig.c @@ -8,7 +8,7 @@ #include #include "cnki_jbig.h" -#include "jbig.h" +#include "cnki_jbig_dec.h" int cnki_jbig(char **bitmap, int *bitmap_size, diff --git a/src/jbig.c b/src/cnki_jbig_dec.c similarity index 100% rename from src/jbig.c rename to src/cnki_jbig_dec.c diff --git a/src/jbig.h b/src/cnki_jbig_dec.h similarity index 100% rename from src/jbig.h rename to src/cnki_jbig_dec.h From 9019a184494e6fc220bcc1eb8f47f33fe0f3e506 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 10:44:27 +0000 Subject: [PATCH 23/41] Split md5 function. 
Signed-off-by: yzrh --- src/Makefile | 4 ++-- src/md5.c | 24 ++++++++++++++++++++++++ src/md5.h | 9 +++++++++ src/pdf_writer.c | 16 ++++++++-------- 4 files changed, 43 insertions(+), 10 deletions(-) create mode 100644 src/md5.c create mode 100644 src/md5.h diff --git a/src/Makefile b/src/Makefile index 74aff8e..b2346a7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,11 +4,11 @@ # SPDX-License-Identifier: Apache-2.0 # -src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c \ +src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c md5.c \ cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c cnki_pdf.c \ cnki_zlib.c cnki_jbig.c cnki_jbig_dec.c cnki_jbig2.c cnki.c \ pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c -inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h \ +inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h md5.h \ cnki.h pdf_cnki.h cnki_jbig.h cnki_jbig_dec.h pdf.h obj = ${src:.c=.o} diff --git a/src/md5.c b/src/md5.c new file mode 100644 index 0000000..e5ab95e --- /dev/null +++ b/src/md5.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +int +strmd5(unsigned char **dst, int *dst_size, + const unsigned char * restrict src, int src_size) +{ + *dst_size = MD5_DIGEST_LENGTH; + *dst = malloc(*dst_size); + + if (*dst == NULL) + return 1; + + MD5(src, src_size, *dst); + + return 0; +} diff --git a/src/md5.h b/src/md5.h new file mode 100644 index 0000000..9c1745d --- /dev/null +++ b/src/md5.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2023, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +int +strmd5(unsigned char **dst, int *dst_size, + const unsigned char * restrict src, int src_size); diff --git a/src/pdf_writer.c b/src/pdf_writer.c index be64e49..465d26b 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -5,11 +5,9 @@ */ #include -#include #include -#include - +#include "md5.h" #include "pdf.h" int @@ -144,11 +142,11 @@ pdf_dump_trailer(pdf_object_t **pdf, 
FILE **fp, int xref) buf_size = snprintf(buf, 64, "%lx%x", timestamp, size); #endif - unsigned char str[64]; - memcpy(str, buf, 64); + int fid_size; + unsigned char *fid; - unsigned char fid[MD5_DIGEST_LENGTH]; - MD5(str, buf_size, fid); + if (strmd5(&fid, &fid_size, (unsigned char *) buf, buf_size) != 0) + return 1; pdf_object_t *ptr = *pdf; while (ptr->next != NULL) @@ -172,7 +170,7 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) for (int i = 0; i < 2; i++) { fputs("<", *fp); - for (int j = 0; j < MD5_DIGEST_LENGTH; j++) + for (int j = 0; j < fid_size; j++) fprintf(*fp, "%02x", fid[j]); fputs(">", *fp); @@ -191,5 +189,7 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) fputs("%%EOF\n", *fp); + free(fid); + return 0; } From cde014cffbe2e8d94de144008ad00bbccbb3a8ab Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 18:58:43 +0000 Subject: [PATCH 24/41] Improve PDF parser. Signed-off-by: yzrh --- src/pdf_parser.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index b4470f9..54c7fb4 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -19,12 +19,12 @@ static void * _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) { const char whitespace[6] = { + ' ', '\r', '\n', '\f', '\t', - '\0', - ' ' + '\0' }; char tmp[s1 + 1]; @@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) for (int i = 0; i < 6; i++) { tmp[s1] = whitespace[i]; - if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) + if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) return ret; } @@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) end = ftell(*fp); fseek(*fp, cur, SEEK_SET); - int head = 0; - int tail = 0; + long head = 0; + long tail = 0; char *pos; char *tmp; for (;;) 
{ - fread(buf, size_buf, 1, *fp); + if (cur + size_buf < end) { + fread(buf, size_buf, 1, *fp); + } else { + fread(buf, end - cur, 1, *fp); + memset(buf + end - cur, 0, size_buf - end + cur); + } if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) head = cur + (pos - buf) + 7; @@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) /* We need to check if it is the object stored in stream */ while (memcmp(pos + 7, "\r\nendstream", 11) == 0 && - (tmp = _memmem_whitespace(pos + 6, - size_buf - (pos - buf) - 6, + (tmp = _memmem_whitespace(pos + 7, + size_buf - (pos - buf) - 7, "endobj", 6)) != NULL) pos = tmp; @@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) ptr->address = head; ptr->size = tail - head; - fseek(*fp, tail + 6, SEEK_SET); + fseek(*fp, tail + 7, SEEK_SET); head = tail = 0; + } else if (head > 0 && tail > 0) { + fseek(*fp, head, SEEK_SET); + tail = 0; } else { - fseek(*fp, -6, SEEK_CUR); + fseek(*fp, -7, SEEK_CUR); } - if ((cur = ftell(*fp)) + 6 >= end) + if ((cur = ftell(*fp)) + 7 >= end) break; } @@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) * A dictionary object may have nested dictionary, * but it should not be in a stream */ - while ((tmp = _memmem_whitespace(tail + 2, - ptr->size - (tail - buf) - 2, + while ((tmp = _memmem_whitespace(tail + 3, + ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && - memmem(tail + 2, - ptr->size - (tail - buf) - 2, + memmem(tail + 3, + ptr->size - (tail - buf) - 3, "stream\r\n", 8) == NULL) tail = tmp; @@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) while (_memmem_whitespace(tail, ptr->size - (tail - buf), "endobj", 6) != NULL && - (tmp = _memmem_whitespace(tail + 9, - ptr->size - (tail - buf) - 9, + (tmp = _memmem_whitespace(tail + 10, + ptr->size - (tail - buf) - 10, "endstream", 9)) != NULL) tail = tmp; From 1a1fee1034b7d143a3ad77707ef930f2a8d1e3d8 Mon Sep 17 00:00:00 2001 From: yzrh Date: 
Sun, 1 Jan 2023 19:31:33 +0000 Subject: [PATCH 25/41] Handle duplicated object in CAJ. Signed-off-by: yzrh --- src/cnki_pdf.c | 73 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 6cb8c9f..90ba28e 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -145,11 +145,46 @@ _pdf_obj_sort(cnki_t **param, pdf_object_t **pdf) ret = pdf_obj_sort(pdf); + if ((*param)->stat > 0) + printf("Sorted object(s)\n"); + + return ret; +} + +static int +_pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) +{ + int ret = 0; + + pdf_object_t *tmp; + pdf_object_t *ptr; + + if ((*param)->stat > 1) + printf("Deleting duplicated object\n\t%8s\n", "id"); + + ptr = *pdf; + while (ptr->next != NULL) { + if (ptr->id == ptr->next->id) { + pdf_get_obj(&ptr, ptr->id, &tmp); + pdf_obj_del(&ptr, ptr->id); + + tmp->next = NULL; + pdf_obj_destroy(&tmp); + + ret++; + + if ((*param)->stat > 1) + printf("\t%8d\n", ptr->id); + } + + ptr = ptr->next; + } + if ((*param)->stat > 0) { if (ret == 0) - printf("Sorted object(s)\n"); + printf("No duplicated object\n"); else - printf("Object(s) not sorted\n"); + printf("Deleted %d duplicated object(s)\n", ret); } return ret; @@ -338,12 +373,9 @@ cnki_pdf(cnki_t **param) printf("Generating root object\n"); snprintf(buf, 64, - "<<\n/Type /Pages\n/Kids "); + "<<\n/Type /Pages\n/Kids ["); strcat(dictionary, buf); - if (parent[0] > 1) - strcat(dictionary, "["); - for (int i = 0, j = 0; i < parent[0]; i++) { if (parent_missing[i] == 1) { snprintf(buf, 64, "%d 0 R", parent[i + 1]); @@ -354,12 +386,7 @@ cnki_pdf(cnki_t **param) } } - if (parent[0] > 1) - strcat(dictionary, "]"); - - strcat(dictionary, "\n"); - - snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); + snprintf(buf, 64, "]\n/Count %d\n", (*param)->file_stat->page); 
strcat(dictionary, buf); strcat(dictionary, ">>"); @@ -442,6 +469,8 @@ cnki_pdf(cnki_t **param) _pdf_obj_sort(param, &pdf); + _pdf_obj_dedup(param, &pdf); + _pdf_dump(param, &pdf); pdf_obj_destroy(&pdf); @@ -721,12 +750,12 @@ cnki_pdf_hn(cnki_t **param) if ((*param)->stat > 2) printf("Not extracted.\n"); - pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0); + pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); } else { if ((*param)->stat > 2) printf("Unsupported format.\n"); - pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0); + pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); } } @@ -1028,7 +1057,7 @@ cnki_pdf_hn(cnki_t **param) free(stream); } else { pdf_obj_append(&pdf, ids[ptr->image_length + 1], - "null", NULL, NULL, 0); + NULL, NULL, NULL, 0); } memset(dictionary, 0, dictionary_size); @@ -1094,12 +1123,9 @@ cnki_pdf_hn(cnki_t **param) int root = pdf_get_free_id(&pdf); - snprintf(buf, 64, "<<\n/Type /Pages\n/Kids "); + snprintf(buf, 64, "<<\n/Type /Pages\n/Kids ["); strcat(dictionary, buf); - if ((*param)->file_stat->page > 1) - strcat(dictionary, "["); - for (int i = 0; i < (*param)->file_stat->page; i++) { snprintf(buf, 64, "%d 0 R", root_kid[i]); strcat(dictionary, buf); @@ -1107,12 +1133,7 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, " "); } - if ((*param)->file_stat->page > 1) - strcat(dictionary, "]"); - - strcat(dictionary, "\n"); - - snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); + snprintf(buf, 64, "]\n/Count %d\n", (*param)->file_stat->page); strcat(dictionary, buf); strcat(dictionary, ">>"); From d6fa934b5f1a418ea4821a6562773b9ff1aaf6e8 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 20:36:17 +0000 Subject: [PATCH 26/41] Handle incomplete PDF object in parser. 
Signed-off-by: yzrh --- src/pdf_parser.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 54c7fb4..d0affb6 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -148,12 +148,16 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) memset(buf, 0, ptr->size); - fseek(*fp, ptr->address - 12, SEEK_SET); + fseek(*fp, ptr->address - 15, SEEK_SET); fread(str, 8, 1, *fp); - for (int i = 0; i < 8; i++) { - if (str[i] >= '0' && str[i] <= '9') { - ptr->id = atoi(str + i); + for (int i = 7; i >= 0; i--) { + if (str[i] < '0' || str[i] > '9') { + if (i < 7) + ptr->id = atoi(str + i + 1); + else + ptr->id = 0; + break; } } @@ -181,8 +185,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (ptr->dictionary == NULL) return 1; - memset(ptr->dictionary, 0, ptr->dictionary_size + 1); memcpy(ptr->dictionary, head, ptr->dictionary_size); + memset(ptr->dictionary + ptr->dictionary_size, 0, 1); if ((head = memmem(tail, ptr->size - (tail - buf), @@ -195,8 +199,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) * contains another object that * contains another stream */ - while (_memmem_whitespace(tail, - ptr->size - (tail - buf), + while (_memmem_whitespace(tail + 10, + ptr->size - (tail - buf) - 10, "endobj", 6) != NULL && (tmp = _memmem_whitespace(tail + 10, ptr->size - (tail - buf) - 10, @@ -211,19 +215,13 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) memcpy(ptr->stream, head + 8, ptr->stream_size); } + + free(buf); } else { ptr->object_size = ptr->size; - ptr->object = malloc(ptr->object_size + 1); - - if (ptr->object == NULL) - return 1; - - memset(ptr->object, 0, ptr->object_size + 1); - memcpy(ptr->object, buf, ptr->object_size); + ptr->object = buf; } - free(buf); - ptr = ptr->next; } From 000405693ead8ef950558b15de576e36e184b680 Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 21:26:44 +0000 Subject: [PATCH 27/41] Update CHANGE. 
Signed-off-by: yzrh --- CHANGE.md | 7 +++++++ src/pdf_parser.c | 2 -- src/version.h | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index 7a384ea..a2bfe9f 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -3,6 +3,13 @@ * Support HN text overlay. +0.2.5 (2023-01-XX) +================== + +* Improve PDF parser. +* Handle duplicated object in CAJ. +* Fix JBIG decoder. + 0.2.4 (2022-12-31) ================== diff --git a/src/pdf_parser.c b/src/pdf_parser.c index d0affb6..bb45e63 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -146,8 +146,6 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (buf == NULL) return 1; - memset(buf, 0, ptr->size); - fseek(*fp, ptr->address - 15, SEEK_SET); fread(str, 8, 1, *fp); diff --git a/src/version.h b/src/version.h index 53be5ba..46eeb34 100644 --- a/src/version.h +++ b/src/version.h @@ -1,10 +1,10 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ #define VERSION "0" #define RELEASE "2" -#define PATCH "4" +#define PATCH "5" #define EXTRA "" From 7d9d658461ed5b0118a1bff8f1df29fb00165a25 Mon Sep 17 00:00:00 2001 From: yzrh Date: Mon, 2 Jan 2023 15:38:45 +0000 Subject: [PATCH 28/41] Handle duplicated image in HN. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 90ba28e..cfcba25 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -500,6 +500,8 @@ cnki_pdf_hn(cnki_t **param) char buf[64]; + pdf_object_t *tmp; + int cnt = 0; int *root_kid = malloc((*param)->file_stat->page * sizeof(int)); @@ -971,6 +973,28 @@ cnki_pdf_hn(cnki_t **param) margin_y = (3507.874 - dim[1]) / 2; } + /* Remove duplicated image, ptr->image_length is sometimes squared */ + for (int i = 1; i < ptr->image_length; i++) { + if ((ptr->image_data[i].x > 0 || ptr->image_data[i].y > 0) && + dim[i * 2] < dim[0] && dim[i * 2 + 1] < dim[1]) + continue; + + for (int j = i; j < ptr->image_length; j++) { + pdf_get_obj(&pdf, ids[j], &tmp); + pdf_obj_del(&pdf, ids[j]); + + tmp->next = NULL; + pdf_obj_destroy(&tmp); + + dim[j * 2] = -1; + dim[j * 2 + 1] = -1; + + pdf_obj_append(&pdf, ids[j], NULL, NULL, NULL, 0); + } + + break; + } + for (int i = 0; i < ptr->image_length; i++) { if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) continue; @@ -1150,8 +1174,6 @@ cnki_pdf_hn(cnki_t **param) return 1; } - pdf_object_t *tmp; - /* Add /Parent to page object */ for (int i = 0; i < (*param)->file_stat->page; i++) { if (pdf_get_obj(&pdf, root_kid[i], &tmp) != 0) { From 4a02b8bfc74920291a62f06fff9cf6e6c4f23ace Mon Sep 17 00:00:00 2001 From: yzrh Date: Mon, 2 Jan 2023 23:40:54 +0000 Subject: [PATCH 29/41] Fix inconsistent whitespace detection in PDF parser. Signed-off-by: yzrh --- CHANGE.md | 1 + src/pdf_parser.c | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index a2bfe9f..063d93a 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -8,6 +8,7 @@ * Improve PDF parser. * Handle duplicated object in CAJ. +* Handle duplicated image in HN. * Fix JBIG decoder. 
0.2.4 (2022-12-31) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index bb45e63..2585e6f 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -19,26 +19,35 @@ static void * _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) { const char whitespace[6] = { - ' ', - '\r', - '\n', - '\f', - '\t', - '\0' + 0x00, + 0x09, + 0x0a, + 0x0c, + 0x0d, + 0x20 }; - char tmp[s1 + 1]; - memcpy(tmp, p1, s1); + char *ret = NULL; - char *ret; + char str[s1 + 1]; + memcpy(str, p1, s1); + + size_t tmp_size = 0; + char *tmp; for (int i = 0; i < 6; i++) { - tmp[s1] = whitespace[i]; - if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) - return ret; + str[s1] = whitespace[i]; + + if ((tmp = memmem(p0, s0, str, s1 + 1)) == NULL) + continue; + + if (tmp_size == 0 || (size_t) (tmp - (char *) p0) < tmp_size) { + tmp_size = tmp - (char *) p0; + ret = tmp; + } } - return NULL; + return ret; } static int From e0fe937e1a3c61581f80e27ad5d2c510e0901755 Mon Sep 17 00:00:00 2001 From: yzrh Date: Tue, 3 Jan 2023 12:12:42 +0000 Subject: [PATCH 30/41] Fix KDH decryption. 
Signed-off-by: yzrh --- src/cnki_kdh.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/cnki_kdh.c b/src/cnki_kdh.c index b13434d..af453a7 100644 --- a/src/cnki_kdh.c +++ b/src/cnki_kdh.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -15,16 +15,18 @@ cnki_kdh(cnki_t **param) if ((*param)->stat > 0) printf("Begin 'KDH' decryption\n"); + long cur = ADDRESS_KDH_BODY; + long end; + fseek((*param)->fp_i, 0, SEEK_END); - - long size = ftell((*param)->fp_i); - - fseek((*param)->fp_i, ADDRESS_KDH_BODY, SEEK_SET); + end = ftell((*param)->fp_i); + fseek((*param)->fp_i, cur, SEEK_SET); const char key[] = KEY_KDH; const int key_len = KEY_KDH_LENGTH; long key_cur = 0; + int buf_size; char buf[(*param)->size_buf]; FILE *tmp = tmpfile(); @@ -33,32 +35,32 @@ cnki_kdh(cnki_t **param) return 1; for (;;) { - fread(buf, (*param)->size_buf, 1, (*param)->fp_i); + if (cur + (*param)->size_buf < end) + buf_size = (*param)->size_buf; + else + buf_size = end - cur; - for (int i = 0; i < (*param)->size_buf; i++) { - buf[i] ^= key[key_cur % key_len]; - key_cur++; - } + fread(buf, buf_size, 1, (*param)->fp_i); - fwrite(buf, (*param)->size_buf, 1, tmp); + for (int i = 0; i < buf_size; i++) + buf[i] ^= key[key_cur++ % key_len]; - if (ftell((*param)->fp_i) == size) + fwrite(buf, buf_size, 1, tmp); + + if ((cur = ftell((*param)->fp_i)) >= end) break; } if ((*param)->stat > 0) printf("Decrypted %ld byte(s)\n", ftell(tmp)); - fseek(tmp, 0, SEEK_SET); + fclose((*param)->fp_i); - FILE *orig = (*param)->fp_i; + fseek(tmp, 0, SEEK_SET); (*param)->fp_i = tmp; cnki_pdf(param); - (*param)->fp_i = orig; - fclose(tmp); - if ((*param)->stat > 0) printf("Conversion ended\n"); From 7ac0971a1711233bc0eaa5e8191590612959867b Mon Sep 17 00:00:00 2001 From: yzrh Date: Tue, 3 Jan 2023 15:39:53 +0000 Subject: [PATCH 31/41] Handle invalid result from PDF parser. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 55 +++++++++++++++++++++++++++++------------------- src/pdf_parser.c | 3 ++- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index cfcba25..af38aa6 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -160,10 +160,10 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) pdf_object_t *ptr; if ((*param)->stat > 1) - printf("Deleting duplicated object\n\t%8s\n", "id"); + printf("Deleting duplicated object\n"); ptr = *pdf; - while (ptr->next != NULL) { + while (ptr != NULL && ptr->next != NULL) { if (ptr->id == ptr->next->id) { pdf_get_obj(&ptr, ptr->id, &tmp); pdf_obj_del(&ptr, ptr->id); @@ -174,7 +174,7 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) ret++; if ((*param)->stat > 1) - printf("\t%8d\n", ptr->id); + printf("Deleted duplicated object %d.\n", ptr->id); } ptr = ptr->next; @@ -247,19 +247,19 @@ cnki_pdf(cnki_t **param) int *parent = NULL; pdf_get_parent_id(&pdf, &parent); - if (parent[0] == 0) - return 1; - if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); - int8_t *parent_missing = malloc(parent[0] * sizeof(int8_t)); - - if (parent_missing == NULL) - return 1; - + int8_t *parent_missing; int *kid; + if (parent[0] > 0) { + parent_missing = malloc(parent[0] * sizeof(int8_t)); + + if (parent_missing == NULL) + return 1; + } + for (int i = 1; i <= parent[0]; i++) { if ((*param)->stat > 1) printf("Searching for object %d\n", parent[i]); @@ -326,7 +326,7 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Searching for root object\n"); - dictionary_size = 128; + dictionary_size = 128 + 12 * parent[0]; dictionary = malloc(dictionary_size); if (dictionary == NULL) { @@ -400,8 +400,10 @@ cnki_pdf(cnki_t **param) root); } + if (parent[0] > 0) + free(parent_missing); + free(parent); - free(parent_missing); int outline = _pdf_cnki_outline(param, &pdf); @@ -1166,14 +1168,6 @@ cnki_pdf_hn(cnki_t **param) free(dictionary); - dictionary_size = 
256; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - return 1; - } - /* Add /Parent to page object */ for (int i = 0; i < (*param)->file_stat->page; i++) { if (pdf_get_obj(&pdf, root_kid[i], &tmp) != 0) { @@ -1182,9 +1176,16 @@ cnki_pdf_hn(cnki_t **param) return 1; } - memset(dictionary, 0, dictionary_size); + dictionary_size = tmp->dictionary_size + 24; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + return 1; + } memcpy(dictionary, tmp->dictionary, tmp->dictionary_size); + memset(dictionary + tmp->dictionary_size, 0, 24); snprintf(buf, 64, "/Parent %d 0 R\n>>", root); strcat(dictionary, buf); @@ -1194,10 +1195,20 @@ cnki_pdf_hn(cnki_t **param) free(root_kid); return 1; } + + free(dictionary); } free(root_kid); + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + return 1; + } + memset(dictionary, 0, dictionary_size); if ((*param)->stat > 0) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 2585e6f..781bafa 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -119,7 +119,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) fseek(*fp, tail + 7, SEEK_SET); head = tail = 0; } else if (head > 0 && tail > 0) { - fseek(*fp, head, SEEK_SET); + if (cur + size_buf < end) + fseek(*fp, head, SEEK_SET); tail = 0; } else { fseek(*fp, -7, SEEK_CUR); From 8276423eb8395eae3e3002442307272eff1c9e8f Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 13:51:13 +0000 Subject: [PATCH 32/41] Prioritise incomplete object during deduplication. Signed-off-by: yzrh --- CHANGE.md | 3 +++ src/cnki_pdf.c | 30 ++++++++++++++++++++---------- src/pdf_parser.c | 25 ++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index 063d93a..3e05e10 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,6 +2,8 @@ ================== * Support HN text overlay. 
+* Handle invalid PDF object token in CAJ and KDH. +* Handle inaccuracy page count in CAJ and KDH. 0.2.5 (2023-01-XX) ================== @@ -9,6 +11,7 @@ * Improve PDF parser. * Handle duplicated object in CAJ. * Handle duplicated image in HN. +* Handle incomplete PDF object in CAJ and KDH. * Fix JBIG decoder. 0.2.4 (2022-12-31) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index af38aa6..76931ea 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) printf("Deleting duplicated object\n"); ptr = *pdf; - while (ptr != NULL && ptr->next != NULL) { - if (ptr->id == ptr->next->id) { - pdf_get_obj(&ptr, ptr->id, &tmp); - pdf_obj_del(&ptr, ptr->id); + while (ptr->next != NULL && ptr->next->next != NULL) { + if (ptr->next->id == ptr->next->next->id) { + /* Keep the bigger one, the smaller one is usually incomplete */ + if (ptr->next->size < ptr->next->next->size) { + pdf_get_obj(&ptr, ptr->next->id, &tmp); + pdf_obj_del(&ptr, ptr->next->id); + } else { + pdf_get_obj(&ptr->next, ptr->next->id, &tmp); + pdf_obj_del(&ptr->next, ptr->next->id); + } tmp->next = NULL; pdf_obj_destroy(&tmp); @@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) ret++; if ((*param)->stat > 1) - printf("Deleted duplicated object %d.\n", ptr->id); + printf("Deleted duplicated object %d.\n", ptr->next->id); + + continue; } ptr = ptr->next; @@ -236,6 +244,10 @@ cnki_pdf(cnki_t **param) printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); + pdf_obj_sort(&pdf); + + _pdf_obj_dedup(param, &pdf); + int dictionary_size; char *dictionary; @@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "]\n/Count %d\n>>", - pdf_get_kid_count(&pdf, parent[i])); + pdf_get_kid_count(&pdf, parent[i]) > 0 ? 
pdf_get_kid_count(&pdf, parent[i]) : kid[0]); strcat(dictionary, buf); pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); @@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param) } else { for (int i = 0; i < parent[0]; i++) if (parent_missing[i] == 1) - root = i; + root = parent[i + 1]; } if (root == 0) @@ -471,8 +483,6 @@ cnki_pdf(cnki_t **param) _pdf_obj_sort(param, &pdf); - _pdf_obj_dedup(param, &pdf); - _pdf_dump(param, &pdf); pdf_obj_destroy(&pdf); @@ -510,7 +520,7 @@ cnki_pdf_hn(cnki_t **param) if (root_kid == NULL) return 1; - memset(root_kid, 0, (*param)->file_stat->page); + memset(root_kid, 0, (*param)->file_stat->page * sizeof(int)); object_hn_t *ptr = (*param)->object_hn; while (ptr != NULL) { diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 781bafa..ed7bfba 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -183,7 +183,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && memmem(tail + 3, - ptr->size - (tail - buf) - 3, + (tmp - tail) - 3, "stream\r\n", 8) == NULL) tail = tmp; @@ -226,8 +226,27 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) free(buf); } else { - ptr->object_size = ptr->size; - ptr->object = buf; + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + if (head - buf > 0) { + ptr->object_size = ptr->size - (head - buf); + ptr->object = malloc(ptr->object_size); + + if (ptr->object == NULL) + return 1; + + memcpy(ptr->object, head, ptr->object_size); + + free(buf); + } else { + ptr->object_size = ptr->size; + ptr->object = buf; + } } ptr = ptr->next; From 8cd8a8fbbadaeee6563d6cb5d7c648570d78b2fc Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:07:57 +0000 Subject: [PATCH 33/41] Replace catalog object if found. 
Signed-off-by: yzrh --- src/cnki_pdf.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 76931ea..87fe3f6 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -244,10 +244,6 @@ cnki_pdf(cnki_t **param) printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); - pdf_obj_sort(&pdf); - - _pdf_obj_dedup(param, &pdf); - int dictionary_size; char *dictionary; @@ -262,6 +258,10 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); + pdf_obj_sort(&pdf); + + _pdf_obj_dedup(param, &pdf); + int8_t *parent_missing; int *kid; @@ -419,6 +419,20 @@ cnki_pdf(cnki_t **param) int outline = _pdf_cnki_outline(param, &pdf); + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (outline != -1) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + outline); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>"); + if ((*param)->stat > 1) printf("Searching for catalog object\n"); @@ -427,6 +441,14 @@ cnki_pdf(cnki_t **param) if (catalog != 0) { if ((*param)->stat > 0) printf("Catalog object is %d.\n", catalog); + + if ((*param)->stat > 1) + printf("Replacing catalog object\n"); + + pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); + + if ((*param)->stat > 0) + printf("Replaced catalog object\n"); } else { if ((*param)->stat > 0) printf("Catalog object is missing\n"); @@ -434,20 +456,6 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Generating catalog object\n"); - snprintf(buf, 64, - "<<\n/Type /Catalog\n/Pages %d 0 R\n", - root); - strcat(dictionary, buf); - - if (outline != -1) { - snprintf(buf, 64, - "/Outlines %d 0 R\n/PageMode /UseOutlines\n", - outline); - strcat(dictionary, buf); - } - - strcat(dictionary, ">>"); - pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); if ((*param)->stat > 0) From c2afbb3cbc947dec4d2878c9c3608306039f9c8b Mon Sep 
17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:19:06 +0000 Subject: [PATCH 34/41] Handle invalid PDF object. Signed-off-by: yzrh --- src/pdf_parser.c | 99 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index ed7bfba..70d72d5 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -79,8 +79,25 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) memset(buf + end - cur, 0, size_buf - end + cur); } - if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) - head = cur + (pos - buf) + 7; + if (head == 0) { + /* Hack needed for invalid object */ + pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6); + tmp = memmem(buf, size_buf, " 0 obj", 6); + + while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b) + tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6); + + if (pos != NULL && tmp != NULL) { + if (pos - buf < tmp - buf) + head = cur + (pos - buf) + 7; + else + head = cur + (tmp - buf) + 6; + } else if (pos != NULL) { + head = cur + (pos - buf) + 7; + } else if (tmp != NULL) { + head = cur + (tmp - buf) + 6; + } + } if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { /* We need to check if it is the object stored in stream */ @@ -156,9 +173,46 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (buf == NULL) return 1; - fseek(*fp, ptr->address - 15, SEEK_SET); + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + + /* Handle incomplete object */ + head = buf; + while ((tmp = _memmem_whitespace(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 7; + + /* Hack needed for invalid object */ + while ((tmp = memmem(head, + ptr->size - (head - buf), + " 0 obj", 6)) != NULL) + head = tmp + 6; + + if (head - buf > 0) { + ptr->address += head - buf; + ptr->size -= head - buf; + + tmp = realloc(buf, ptr->size); + + if (tmp == NULL) + return 1; + + buf = 
tmp; + + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + } + + /* Hack needed for invalid object */ + fseek(*fp, ptr->address - 14, SEEK_SET); fread(str, 8, 1, *fp); + if (str[7] < '0' || str[7] > '9') { + fseek(*fp, ptr->address - 15, SEEK_SET); + fread(str, 8, 1, *fp); + } + for (int i = 7; i >= 0; i--) { if (str[i] < '0' || str[i] > '9') { if (i < 7) @@ -170,11 +224,10 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) } } - fseek(*fp, ptr->address, SEEK_SET); - fread(buf, ptr->size, 1, *fp); - if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && - (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { + ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || + /* Hack needed for invalid object */ + (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { /* * A dictionary object may have nested dictionary, * but it should not be in a stream @@ -187,6 +240,15 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) "stream\r\n", 8) == NULL) tail = tmp; + /* Hack needed for invalid object */ + while ((tmp = memmem(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + (tmp - tail) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; + ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); @@ -226,27 +288,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) free(buf); } else { - /* Handle incomplete object */ - head = buf; - while ((tmp = _memmem_whitespace(head, - ptr->size - (head - buf), - " 0 obj", 6)) != NULL) - head = tmp + 7; - - if (head - buf > 0) { - ptr->object_size = ptr->size - (head - buf); - ptr->object = malloc(ptr->object_size); - - if (ptr->object == NULL) - return 1; - - memcpy(ptr->object, head, ptr->object_size); - - free(buf); - } else { - ptr->object_size = ptr->size; - ptr->object = buf; - } + ptr->object_size = ptr->size; + ptr->object = buf; } ptr = ptr->next; From 56ffe14d5a8c10163850d3e682c67a282d9b8abc Mon Sep 17 00:00:00 
2001 From: yzrh Date: Wed, 4 Jan 2023 17:29:07 +0000 Subject: [PATCH 35/41] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGE.md b/CHANGE.md index 3e05e10..4b5a830 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,7 +2,6 @@ ================== * Support HN text overlay. -* Handle invalid PDF object token in CAJ and KDH. * Handle inaccuracy page count in CAJ and KDH. 0.2.5 (2023-01-XX) @@ -12,6 +11,7 @@ * Handle duplicated object in CAJ. * Handle duplicated image in HN. * Handle incomplete PDF object in CAJ and KDH. +* Handle invalid PDF object token in CAJ and KDH. * Fix JBIG decoder. 0.2.4 (2022-12-31) From a7ecc156141b15cfae8b309697e12deae2740841 Mon Sep 17 00:00:00 2001 From: yzrh Date: Wed, 4 Jan 2023 17:50:25 +0000 Subject: [PATCH 36/41] Replace catalog object only if root object does not exist. Signed-off-by: yzrh --- src/cnki_pdf.c | 16 ++++++++++------ src/pdf_parser.c | 6 ++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 87fe3f6..0c1ebb0 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -375,9 +375,11 @@ cnki_pdf(cnki_t **param) printf("Root object is %d.\n", root); } + int root_gen; + pdf_object_t *tmp; - if (pdf_get_obj(&pdf, root, &tmp) != 0) { + if ((root_gen = pdf_get_obj(&pdf, root, &tmp)) != 0) { if ((*param)->stat > 0) printf("Root object is missing\n"); @@ -442,13 +444,15 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Catalog object is %d.\n", catalog); - if ((*param)->stat > 1) - printf("Replacing catalog object\n"); + if (root_gen != 0) { + if ((*param)->stat > 1) + printf("Replacing catalog object\n"); - pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); + pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); - if ((*param)->stat > 0) - printf("Replaced catalog object\n"); + if ((*param)->stat > 0) + printf("Replaced catalog object\n"); + } } else { if ((*param)->stat > 0) printf("Catalog object 
is missing\n"); diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 70d72d5..6520fd5 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -232,7 +232,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) * A dictionary object may have nested dictionary, * but it should not be in a stream */ - while ((tmp = _memmem_whitespace(tail + 3, + while (ptr->size - (tail - buf) > 3 && + (tmp = _memmem_whitespace(tail + 3, ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && memmem(tail + 3, @@ -241,7 +242,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) tail = tmp; /* Hack needed for invalid object */ - while ((tmp = memmem(tail + 2, + while (ptr->size - (tail - buf) > 2 && + (tmp = memmem(tail + 2, ptr->size - (tail - buf) - 2, ">>", 2)) != NULL && memmem(tail + 2, From 13cb0a1b8dd7cdee9af519dab10d6b1c1036c321 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 5 Jan 2023 11:21:54 +0000 Subject: [PATCH 37/41] Fix invalid token parsing. Signed-off-by: yzrh --- src/pdf_parser.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 6520fd5..e6d8ac6 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -228,28 +228,31 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || /* Hack needed for invalid object */ (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { - /* - * A dictionary object may have nested dictionary, - * but it should not be in a stream - */ - while (ptr->size - (tail - buf) > 3 && - (tmp = _memmem_whitespace(tail + 3, - ptr->size - (tail - buf) - 3, - ">>", 2)) != NULL && - memmem(tail + 3, - (tmp - tail) - 3, - "stream\r\n", 8) == NULL) - tail = tmp; + if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) { + tail = memmem(buf, ptr->size, ">>", 2); - /* Hack needed for invalid object */ - while (ptr->size - (tail - buf) > 2 && - (tmp = memmem(tail + 2, - ptr->size - (tail - buf) 
- 2, - ">>", 2)) != NULL && - memmem(tail + 2, - (tmp - tail) - 2, - "stream\r\n", 8) == NULL) - tail = tmp; + while (ptr->size - (tail - buf) > 2 && + (tmp = memmem(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + (tmp - tail) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; + } else { + /* + * A dictionary object may have nested dictionary, + * but it should not be in a stream + */ + while (ptr->size - (tail - buf) > 3 && + (tmp = _memmem_whitespace(tail + 3, + ptr->size - (tail - buf) - 3, + ">>", 2)) != NULL && + memmem(tail + 3, + (tmp - tail) - 3, + "stream\r\n", 8) == NULL) + tail = tmp; + } ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); From 283446dba5ce6e1a61b46377999a45f0b85a6937 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 5 Jan 2023 17:32:13 +0000 Subject: [PATCH 38/41] Update CHANGE. Signed-off-by: yzrh --- CHANGE.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGE.md b/CHANGE.md index 4b5a830..9071c91 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -2,9 +2,10 @@ ================== * Support HN text overlay. -* Handle inaccuracy page count in CAJ and KDH. +* Support HN page with text. +* Handle inaccurate page count in CAJ and KDH. -0.2.5 (2023-01-XX) +0.2.5 (2023-01-05) ================== * Improve PDF parser. From 123d62141cce0cbeb2ae6eb80b669af7db1c8c72 Mon Sep 17 00:00:00 2001 From: yzrh Date: Thu, 5 Jan 2023 19:13:37 +0000 Subject: [PATCH 39/41] Add document information dictionary to output. 
Signed-off-by: yzrh --- src/pdf_writer.c | 37 +++++++++++++++++++++++++------------ src/version.h | 4 ++-- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/pdf_writer.c b/src/pdf_writer.c index 465d26b..6afa89b 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -1,19 +1,39 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ #include +#include #include +#include "version.h" #include "md5.h" #include "pdf.h" +static int +_info_obj(pdf_object_t **pdf) +{ + char dictionary[128] = "<<\n" + "/Producer (Melon " VERSION "." RELEASE "." PATCH EXTRA ")\n" + "/CreationDate (D:"; + + char buf[64]; + + time_t timestamp = time(NULL); + + strftime(buf, 64, "%Y%m%d%H%M%S", gmtime(&timestamp)); + strcat(dictionary, buf); + strcat(dictionary, "+00'00')\n>>"); + + return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0); +} + int pdf_dump_obj(pdf_object_t **pdf, FILE **fp) { - if (*pdf == NULL || *fp == NULL) + if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0) return 1; long cur; @@ -152,18 +172,11 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) while (ptr->next != NULL) ptr = ptr->next; - /* - * TODO: Document information dictionary - * `"/Producer (Melon)"' - * `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"' - * - * Trailer dictionary - * `"/Info %d 0 R"' - */ fprintf(*fp, - "/Size %d\n/Root %d 0 R\n", + "/Size %d\n/Root %d 0 R\n/Info %d 0 R\n", ptr->id + 1, - pdf_get_catalog_id(pdf)); + pdf_get_catalog_id(pdf), + ptr->id); fputs("/ID [", *fp); diff --git a/src/version.h b/src/version.h index 46eeb34..c3ff314 100644 --- a/src/version.h +++ b/src/version.h @@ -5,6 +5,6 @@ */ #define VERSION "0" -#define RELEASE "2" -#define PATCH "5" +#define RELEASE "3" +#define PATCH "0" #define EXTRA "" From dd5854678cfe7bab499925175b5b40314d71fede Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 6 Jan 2023 12:00:01 +0000 Subject: [PATCH 40/41] Fix JBIG2 allocation.
Signed-off-by: yzrh --- src/jbig2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/jbig2.c b/src/jbig2.c index 9b3a9be..ea9233c 100644 --- a/src/jbig2.c +++ b/src/jbig2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, yzrh + * Copyright (c) 2022-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -31,5 +31,6 @@ strdec_jbig2(char **bitmap, } jbig2_release_page(ctx, image); + jbig2_ctx_free(ctx); return 0; } From 2fa2b760aef552982250dad346bd255be08cd9bb Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 14 Jan 2023 23:52:28 +0000 Subject: [PATCH 41/41] Fix HN text parsing. Signed-off-by: yzrh --- src/cnki_pdf.c | 96 ++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 0c1ebb0..d96ea49 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -850,73 +850,75 @@ cnki_pdf_hn(cnki_t **param) for (int i = 0, j = 0; i < ptr->text_size - 1;) { switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { case 0x8001: - if (ptr->address_next > ptr->address) - strcat(dictionary, "T*\n"); - case 0x8070: - if (ptr->address_next > ptr->address) { - i += 4; + if (ptr->address_next <= ptr->address) { + if (i + 7 >= ptr->text_size) { + i += 2; + break; + } - for (;;) { - if (i + 3 >= ptr->text_size || - (unsigned char) ptr->text[i + 1] == 0x80) - break; + conv_src[0] = ptr->text[i + 7]; + conv_src[1] = ptr->text[i + 6]; - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_size = 6; - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " Tj\n"); + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, "<"); + for (int k = 0; k < 
conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); } - free(conv_dst); + strcat(dictionary, "> Tj\n"); } - - i += 4; + free(conv_dst); } + i += 8; break; } - if (i + 7 >= ptr->text_size) { - i += 2; + strcat(dictionary, "T*\n"); + case 0x8070: + i += 4; + + if (ptr->address_next <= ptr->address) break; - } - conv_src[0] = ptr->text[i + 7]; - conv_src[1] = ptr->text[i + 6]; + for (;;) { + if (i + 3 >= ptr->text_size || + (unsigned char) ptr->text[i + 1] == 0x80) + break; - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; - conv_size = 6; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); + } + strcat(dictionary, "> Tj\n"); } - strcat(dictionary, "> Tj\n"); + free(conv_dst); } - free(conv_dst); + + i += 4; } - i += 8; break; case 0x800a: if (i + 27 >= ptr->text_size || j >= ptr->image_length) {