diff --git a/CHANGE.md b/CHANGE.md index 9071c91..e4217a5 100644 --- a/CHANGE.md +++ b/CHANGE.md @@ -1,44 +1,7 @@ -0.3.0 (2023-XX-XX) -================== - -* Support HN text overlay. -* Support HN page with text. -* Handle inaccurate page count in CAJ and KDH. - -0.2.5 (2023-01-05) -================== - -* Improve PDF parser. -* Handle duplicated object in CAJ. -* Handle duplicated image in HN. -* Handle incomplete PDF object in CAJ and KDH. -* Handle invalid PDF object token in CAJ and KDH. -* Fix JBIG decoder. - -0.2.4 (2022-12-31) -================== - -* Fix HN image compositing. -* Fix PDF object check. - -0.2.3 (2022-12-30) -================== - -* Support HN figure placement. - -0.2.2 (2022-12-29) +0.3.0 (2022-XX-XX) ================== * Support JPEG 2000 for HN. -* Handle missing but referenced root object. -* Handle HN with more than one image per page. -* Fix buffer overflow. - -0.2.1 (2022-12-26) -================== - -* Handle different JPEG colour component. -* Handle headless HN and page with no image. 0.2.0 (2022-12-22) ================== diff --git a/README.md b/README.md index 1fdcf6c..471282e 100644 --- a/README.md +++ b/README.md @@ -9,15 +9,16 @@ Development Currently, CAJ, KDH, and HN can be converted. Please report any failures with a sample that can reproduce the behaviour. +HN support does not support JPEG 2000 yet. + Dependency ---------- -1. libcrypto (OpenSSL) -2. zlib -3. jbig2dec -4. libjpeg-turbo -5. openjpeg -6. pkgconf +1. OpenSSL +2. libiconv +3. zlib +4. jbig2dec +5. libjpeg-turbo Usage ===== @@ -35,12 +36,12 @@ Options Specify output file -b, --buffer -Set input buffer size (default 512k) +Set buffer size (default 512k) -v, --verbose -Print more information (twice for even more, three times for HN image processing information as well) +Print more information (twice for even more, three times for HN image decoding information as well) Thanks ====== -This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf) +This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf) diff --git a/src/Makefile b/src/Makefile index b2346a7..6943af3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,28 +4,23 @@ # SPDX-License-Identifier: Apache-2.0 # -src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c md5.c \ - cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c cnki_pdf.c \ - cnki_zlib.c cnki_jbig.c cnki_jbig_dec.c cnki_jbig2.c cnki.c \ +src = melon.c iconv.c zlib.c jbig.c jbig2.c jpeg.c \ + cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c \ + cnki_pdf.c cnki_zlib.c cnki_jbig.c cnki_jbig2.c cnki.c \ pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c -inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h md5.h \ - cnki.h pdf_cnki.h cnki_jbig.h cnki_jbig_dec.h pdf.h +inc = extern.h version.h iconv.h zlib.h jbig.h jbig2.h jpeg.h \ + cnki.h pdf_cnki.h cnki_jbig.h pdf.h obj = ${src:.c=.o} PREFIX = /usr/local CFLAGS = -O2 -pipe -flto -Wall -Wextra -LDFLAGS = -Wl,-O2 -lcrypto -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed +LDFLAGS = -Wl,-O2 -lcrypto -liconv -lz -ljbig2dec -ljpeg -Wl,--as-needed CFLAGS += -I/usr/local/include LDFLAGS += -L/usr/local/lib -OPENJPEG_CFLAGS != pkgconf --cflags libopenjp2 - -CFLAGS += ${OPENJPEG_CFLAGS} -CFLAGS += -DLIBICONV_PLUG - all: ${obj} ${inc} ${CC} ${LDFLAGS} -o melon ${obj} diff --git a/src/cnki.c b/src/cnki.c index 8c2e6e6..5f120d0 100644 --- a/src/cnki.c +++ b/src/cnki.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -54,11 +54,6 @@ cnki_destroy(cnki_t **param) object_hn_t *ptr_hn; while ((ptr_hn = (*param)->object_hn) != NULL) { (*param)->object_hn = (*param)->object_hn->next; - free(ptr_hn->text); - if (ptr_hn->image_data != NULL) - for (int i = 0; i < ptr_hn->image_length; i++) - free(ptr_hn->image_data[i].image); - free(ptr_hn->image_data); free(ptr_hn); } @@ -76,19 +71,12 @@ cnki_info(cnki_t **param) printf("Reading file header at 0x%x\n", ADDRESS_HEAD); int addr[2]; - unsigned char str[2]; fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET); fread((*param)->file_stat->type, 4, 1, (*param)->fp_i); - fread(str, 2, 1, (*param)->fp_i); - - if ((*param)->stat > 0) { - if ((unsigned char) (*param)->file_stat->type[0] > 0x7f) - printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]); - else - printf("File type is '%s'\n", (*param)->file_stat->type); - } + if ((*param)->stat > 0) + printf("File type is '%s'\n", (*param)->file_stat->type); if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) { return 0; @@ -98,9 +86,6 @@ cnki_info(cnki_t **param) } else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) { addr[0] = ADDRESS_HN_PAGE; addr[1] = ADDRESS_HN_OUTLINE; - } else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) { - addr[0] = ADDRESS_C8_PAGE; - addr[1] = ADDRESS_HN_OUTLINE; } else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) { return 0; } else { @@ -117,14 +102,6 @@ cnki_info(cnki_t **param) printf("Advised %d page(s)\n", (*param)->file_stat->page); - if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) { - fseek((*param)->fp_i, 0xd8, SEEK_SET); - return 0; - } else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) { - fseek((*param)->fp_i, 0x50, SEEK_SET); - return 0; - } - if ((*param)->stat > 1) printf("Reading outline count at 0x%x\n", addr[1]); @@ -138,7 +115,7 @@ cnki_info(cnki_t **param) if ((*param)->file_stat->outline > 0) { if ((*param)->stat > 1) { printf("Loading outline(s)\n"); - printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n", + printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n", "title", "hierarchy", "page", diff --git a/src/cnki.h b/src/cnki.h index 7d7d15e..237a2c1 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -16,8 +16,6 @@ #define ADDRESS_HN_PAGE 0x0090 #define ADDRESS_HN_OUTLINE 0x0158 -#define ADDRESS_C8_PAGE 0x0008 - #define ADDRESS_KDH_BODY 0x00fe #define KEY_KDH "FZHMEI" @@ -58,10 +56,6 @@ typedef struct _hn_image_t { int32_t format; /* hn_code */ int32_t address; int32_t size; - uint16_t x; - uint16_t y; - uint16_t w; - uint16_t h; char *image; } hn_image_t; @@ -70,8 +64,7 @@ typedef struct _object_hn_t { int32_t text_size; int16_t image_length; int16_t page; - int32_t unknown; /* TODO: what is it? */ - int32_t address_next; + int32_t unknown[2]; /* TODO: what is it? */ char *text; struct _hn_image_t *image_data; struct _object_hn_t *next; diff --git a/src/cnki_hn.c b/src/cnki_hn.c index c2f76ec..feabb48 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -27,13 +27,12 @@ cnki_hn(cnki_t **param) if ((*param)->stat > 1) { printf("Loading page(s)\n"); - printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n", + printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n", "address", "text", "length", "page", "unknown", - "next", "code", "address", "image"); @@ -45,8 +44,7 @@ cnki_hn(cnki_t **param) fread(&ptr->text_size, 4, 1, (*param)->fp_i); fread(&ptr->image_length, 2, 1, (*param)->fp_i); fread(&ptr->page, 2, 1, (*param)->fp_i); - fread(&ptr->unknown, 4, 1, (*param)->fp_i); - fread(&ptr->address_next, 4, 1, (*param)->fp_i); + fread(&ptr->unknown, 8, 1, (*param)->fp_i); ptr->text = NULL; ptr->image_data = NULL; @@ -64,80 +62,66 @@ cnki_hn(cnki_t **param) ptr = (*param)->object_hn; while (ptr != NULL) { - if (ptr->text_size > 0) { - ptr->text = malloc(ptr->text_size); + ptr->text = malloc(ptr->text_size); - if (ptr->text == NULL) - return 1; + if (ptr->text == NULL) + return 1; - fseek((*param)->fp_i, ptr->address, SEEK_SET); - fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); - } + fseek((*param)->fp_i, ptr->address, SEEK_SET); + fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); if ((*param)->stat > 1) - printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x", + printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}", ptr->address, ptr->text_size, ptr->image_length, ptr->page, - ptr->unknown, - ptr->address_next); + ptr->unknown[0], + ptr->unknown[1]); - if (ptr->image_length > 0) { - ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); + ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); - if (ptr->image_data == NULL) + if (ptr->image_data == NULL) + return 1; + + for (int i = 0; i < ptr->image_length; i++) { + fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); + fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); + fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); + fseek((*param)->fp_i, + ptr->image_data[i].address + ptr->image_data[i].size, + SEEK_SET); + } + + for (int i = 0; i < ptr->image_length; i++) { + ptr->image_data[i].image = malloc(ptr->image_data[i].size); + + if (ptr->image_data[i].image == NULL) return 1; - for (int i = 0; i < ptr->image_length; i++) { - fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); - fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); - fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); - ptr->image_data[i].x = 0; - ptr->image_data[i].y = 0; - ptr->image_data[i].w = 0; - ptr->image_data[i].h = 0; - fseek((*param)->fp_i, - ptr->image_data[i].address + ptr->image_data[i].size, - SEEK_SET); - } + fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); + fread(ptr->image_data[i].image, + ptr->image_data[i].size, 1, + (*param)->fp_i); - for (int i = 0; i < ptr->image_length; i++) { - ptr->image_data[i].image = malloc(ptr->image_data[i].size); - - if (ptr->image_data[i].image == NULL) - return 1; - - fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); - fread(ptr->image_data[i].image, - ptr->image_data[i].size, 1, - (*param)->fp_i); - - if ((*param)->stat > 1) { - if (i == 0) { - printf("\t%4d\t%08x\t%8d\n", - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); - } else { - printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n", - "", - "", - "", - "", - "", - "", - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); - } + if ((*param)->stat > 1) { + if (i == 0) { + printf("\t%4d\t%08x\t%8d\n", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } else { + printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n", + "", + "", + "", + "", + "", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); } } - } else if ((*param)->stat > 1) { - printf("\t%4s\t%8s\t%8s\n", - "", - "", - ""); } ptr = ptr->next; diff --git a/src/cnki_jbig.c b/src/cnki_jbig.c index acc43eb..f35d1d5 100644 --- a/src/cnki_jbig.c +++ b/src/cnki_jbig.c @@ -8,7 +8,7 @@ #include #include "cnki_jbig.h" -#include "cnki_jbig_dec.h" +#include "jbig.h" int cnki_jbig(char **bitmap, int *bitmap_size, diff --git a/src/cnki_jbig.h b/src/cnki_jbig.h index 2983607..701b4df 100644 --- a/src/cnki_jbig.h +++ b/src/cnki_jbig.h @@ -27,8 +27,8 @@ typedef struct _dib_t { uint16_t depth; uint32_t compression; /* dib_compression_code */ uint32_t size; - int32_t resolution_h; - int32_t resolution_v; + uint32_t resolution_h; + uint32_t resolution_v; uint32_t colour; uint32_t colour_used; } dib_t; diff --git a/src/cnki_kdh.c b/src/cnki_kdh.c index af453a7..b13434d 100644 --- a/src/cnki_kdh.c +++ b/src/cnki_kdh.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -15,18 +15,16 @@ cnki_kdh(cnki_t **param) if ((*param)->stat > 0) printf("Begin 'KDH' decryption\n"); - long cur = ADDRESS_KDH_BODY; - long end; - fseek((*param)->fp_i, 0, SEEK_END); - end = ftell((*param)->fp_i); - fseek((*param)->fp_i, cur, SEEK_SET); + + long size = ftell((*param)->fp_i); + + fseek((*param)->fp_i, ADDRESS_KDH_BODY, SEEK_SET); const char key[] = KEY_KDH; const int key_len = KEY_KDH_LENGTH; long key_cur = 0; - int buf_size; char buf[(*param)->size_buf]; FILE *tmp = tmpfile(); @@ -35,32 +33,32 @@ cnki_kdh(cnki_t **param) return 1; for (;;) { - if (cur + (*param)->size_buf < end) - buf_size = (*param)->size_buf; - else - buf_size = end - cur; + fread(buf, (*param)->size_buf, 1, (*param)->fp_i); - fread(buf, buf_size, 1, (*param)->fp_i); + for (int i = 0; i < (*param)->size_buf; i++) { + buf[i] ^= key[key_cur % key_len]; + key_cur++; + } - for (int i = 0; i < buf_size; i++) - buf[i] ^= key[key_cur++ % key_len]; + fwrite(buf, (*param)->size_buf, 1, tmp); - fwrite(buf, buf_size, 1, tmp); - - if ((cur = ftell((*param)->fp_i)) >= end) + if (ftell((*param)->fp_i) == size) break; } if ((*param)->stat > 0) printf("Decrypted %ld byte(s)\n", ftell(tmp)); - fclose((*param)->fp_i); - fseek(tmp, 0, SEEK_SET); + + FILE *orig = (*param)->fp_i; (*param)->fp_i = tmp; cnki_pdf(param); + (*param)->fp_i = orig; + fclose(tmp); + if ((*param)->stat > 0) printf("Conversion ended\n"); diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index d96ea49..b59b7c6 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -11,7 +11,6 @@ #include "iconv.h" #include "zlib.h" #include "jpeg.h" -#include "jp2.h" #include "pdf.h" #include "pdf_cnki.h" @@ -145,54 +144,11 @@ _pdf_obj_sort(cnki_t **param, pdf_object_t **pdf) ret = pdf_obj_sort(pdf); - if ((*param)->stat > 0) - printf("Sorted object(s)\n"); - - return ret; -} - -static int -_pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf) -{ - int ret = 0; - - pdf_object_t *tmp; - pdf_object_t *ptr; - - if ((*param)->stat > 1) - printf("Deleting duplicated object\n"); - - ptr = *pdf; - while (ptr->next != NULL && ptr->next->next != NULL) { - if (ptr->next->id == ptr->next->next->id) { - /* Keep the bigger one, the smaller one is usually incomplete */ - if (ptr->next->size < ptr->next->next->size) { - pdf_get_obj(&ptr, ptr->next->id, &tmp); - pdf_obj_del(&ptr, ptr->next->id); - } else { - pdf_get_obj(&ptr->next, ptr->next->id, &tmp); - pdf_obj_del(&ptr->next, ptr->next->id); - } - - tmp->next = NULL; - pdf_obj_destroy(&tmp); - - ret++; - - if ((*param)->stat > 1) - printf("Deleted duplicated object %d.\n", ptr->next->id); - - continue; - } - - ptr = ptr->next; - } - if ((*param)->stat > 0) { if (ret == 0) - printf("No duplicated object\n"); + printf("Sorted object(s)\n"); else - printf("Deleted %d duplicated object(s)\n", ret); + printf("Object(s) not sorted\n"); } return ret; @@ -255,23 +211,19 @@ cnki_pdf(cnki_t **param) int *parent = NULL; pdf_get_parent_id(&pdf, &parent); + if (parent[0] == 0) + return 1; + if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); - pdf_obj_sort(&pdf); + int *parent_missing = malloc(parent[0] * sizeof(int)); - _pdf_obj_dedup(param, &pdf); + if (parent_missing == NULL) + return 1; - int8_t *parent_missing; int *kid; - if (parent[0] > 0) { - parent_missing = malloc(parent[0] * sizeof(int8_t)); - - if (parent_missing == NULL) - return 1; - } - for (int i = 1; i <= parent[0]; i++) { if ((*param)->stat > 1) printf("Searching for object %d\n", parent[i]); @@ -313,7 +265,7 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "]\n/Count %d\n>>", - pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]); + pdf_get_kid_count(&pdf, parent[i])); strcat(dictionary, buf); pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); @@ -338,7 +290,7 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Searching for root object\n"); - dictionary_size = 128 + 12 * parent[0]; + dictionary_size = 128; dictionary = malloc(dictionary_size); if (dictionary == NULL) { @@ -356,9 +308,7 @@ cnki_pdf(cnki_t **param) if (parent_missing[i] == 1) root_kid++; - if (root_kid > 1) { - root = pdf_get_free_id(&pdf); - } else { + if (root_kid <= 1) { if (root_kid == 0) { for (int i = 1; i <= parent[0]; i++) if (root == 0 || root < parent[i]) @@ -366,32 +316,30 @@ cnki_pdf(cnki_t **param) } else { for (int i = 0; i < parent[0]; i++) if (parent_missing[i] == 1) - root = parent[i + 1]; + root = i; } - if (root == 0) - root = pdf_get_free_id(&pdf); - else if ((*param)->stat > 0) - printf("Root object is %d.\n", root); - } - - int root_gen; - - pdf_object_t *tmp; - - if ((root_gen = pdf_get_obj(&pdf, root, &tmp)) != 0) { + if ((*param)->stat > 0) + printf("Root object is %d.\n", + root); + } else { if ((*param)->stat > 0) printf("Root object is missing\n"); if ((*param)->stat > 1) printf("Generating root object\n"); + root = pdf_get_free_id(&pdf); + snprintf(buf, 64, - "<<\n/Type /Pages\n/Kids ["); + "<<\n/Type /Pages\n/Kids "); strcat(dictionary, buf); + if (parent[0] > 1) + strcat(dictionary, "["); + for (int i = 0, j = 0; i < parent[0]; i++) { - if (parent_missing[i] == 1) { + if (parent_missing[i]) { snprintf(buf, 64, "%d 0 R", parent[i + 1]); strcat(dictionary, buf); @@ -400,7 +348,12 @@ cnki_pdf(cnki_t **param) } } - snprintf(buf, 64, "]\n/Count %d\n", (*param)->file_stat->page); + if (parent[0] > 1) + strcat(dictionary, "]"); + + strcat(dictionary, "\n"); + + snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); strcat(dictionary, buf); strcat(dictionary, ">>"); @@ -414,27 +367,11 @@ cnki_pdf(cnki_t **param) root); } - if (parent[0] > 0) - free(parent_missing); - free(parent); + free(parent_missing); int outline = _pdf_cnki_outline(param, &pdf); - snprintf(buf, 64, - "<<\n/Type /Catalog\n/Pages %d 0 R\n", - root); - strcat(dictionary, buf); - - if (outline != -1) { - snprintf(buf, 64, - "/Outlines %d 0 R\n/PageMode /UseOutlines\n", - outline); - strcat(dictionary, buf); - } - - strcat(dictionary, ">>"); - if ((*param)->stat > 1) printf("Searching for catalog object\n"); @@ -443,16 +380,6 @@ cnki_pdf(cnki_t **param) if (catalog != 0) { if ((*param)->stat > 0) printf("Catalog object is %d.\n", catalog); - - if (root_gen != 0) { - if ((*param)->stat > 1) - printf("Replacing catalog object\n"); - - pdf_obj_replace(&pdf, catalog, NULL, dictionary, NULL, 0); - - if ((*param)->stat > 0) - printf("Replaced catalog object\n"); - } } else { if ((*param)->stat > 0) printf("Catalog object is missing\n"); @@ -460,6 +387,20 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Generating catalog object\n"); + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (outline != -1) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + outline); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>"); + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); if ((*param)->stat > 0) @@ -478,6 +419,8 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 1) printf("Deleting xref object\n"); + pdf_object_t *tmp; + pdf_get_obj(&pdf, xref, &tmp); pdf_obj_del(&pdf, xref); @@ -513,9 +456,6 @@ cnki_pdf_hn(cnki_t **param) if (pdf_obj_create(&pdf) != 0) return 1; - int font = pdf_get_free_id(&pdf); - pdf_obj_append(&pdf, font, NULL, "<<\n/Type /Font\n/Subtype /TrueType\n/BaseFont /NotoSansCJKSC\n>>", NULL, 0); - if ((*param)->stat > 1) printf("Generating PDF object(s)\n"); @@ -524,22 +464,20 @@ cnki_pdf_hn(cnki_t **param) char buf[64]; - pdf_object_t *tmp; - int cnt = 0; int *root_kid = malloc((*param)->file_stat->page * sizeof(int)); if (root_kid == NULL) return 1; - memset(root_kid, 0, (*param)->file_stat->page * sizeof(int)); + memset(root_kid, 0, (*param)->file_stat->page); object_hn_t *ptr = (*param)->object_hn; while (ptr != NULL) { /* * External object (ptr->image_length) + - * resource object + * content object + + * resource object + * page object */ int *ids = NULL; @@ -551,30 +489,26 @@ cnki_pdf_hn(cnki_t **param) int stream_size; char *stream; - double *dim; - - if (ptr->image_length > 0) { - dim = malloc(2 * ptr->image_length * sizeof(double)); - - if (dim == NULL) { - free(root_kid); - free(ids); - return 1; - } - - dictionary_size = 256; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; - } - } + int *dim = malloc(2 * ptr->image_length * sizeof(int)); int ret; - int info[3]; + int wh[2]; + + if (dim == NULL) { + free(root_kid); + free(ids); + return 1; + } + + dictionary_size = 256; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } for (int i = 0; i < ptr->image_length; i++) { memset(dictionary, 0, dictionary_size); @@ -583,15 +517,15 @@ cnki_pdf_hn(cnki_t **param) "/Subtype /Image\n"); if ((*param)->stat > 2) - printf("\tProcessing image, page %04d item %d format %d... ", + printf("\tDecoding data, page %04d item %02d format %d... ", ptr->page, i, ptr->image_data[i].format); switch (ptr->image_data[i].format) { case JBIG: ret = cnki_jbig(&bitmap, &bitmap_size, - &info[0], - &info[1], + &wh[0], + &wh[1], ptr->image_data[i].image, ptr->image_data[i].size); @@ -613,7 +547,7 @@ cnki_pdf_hn(cnki_t **param) free(bitmap); snprintf(buf, 64, "/Width %d\n/Height %d\n", - info[0], info[1]); + wh[0], wh[1]); strcat(dictionary, buf); strcat(dictionary, "/ColorSpace /DeviceGray\n" @@ -626,14 +560,13 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/Filter /FlateDecode\n"); - dim[i * 2] = info[0]; - dim[i * 2 + 1] = info[1]; + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; break; case DCT_0: case DCT_1: - ret = strinfo_jpeg_dim(&info[0], - &info[1], - &info[2], + ret = strinfo_jpeg_dim(&wh[0], + &wh[1], ptr->image_data[i].image, ptr->image_data[i].size); @@ -655,17 +588,11 @@ cnki_pdf_hn(cnki_t **param) memcpy(stream, ptr->image_data[i].image, stream_size); snprintf(buf, 64, "/Width %d\n/Height %d\n", - info[0], info[1]); + wh[0], wh[1]); strcat(dictionary, buf); - if (info[2] == 1) - strcat(dictionary, "/ColorSpace /DeviceGray\n"); - else if (info[2] == 3) - strcat(dictionary, "/ColorSpace /DeviceRGB\n"); - else - strcat(dictionary, "/ColorSpace /DeviceCMYK\n"); - - strcat(dictionary, "/BitsPerComponent 8\n"); + strcat(dictionary, "/ColorSpace /DeviceGray\n" + "/BitsPerComponent 8\n"); snprintf(buf, 64, "/Length %d\n", stream_size); @@ -673,14 +600,14 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/Filter /DCTDecode\n"); - dim[i * 2] = info[0]; - dim[i * 2 + 1] = info[1]; + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; break; case JBIG2: ret = cnki_jbig2(&bitmap, &bitmap_size, - &info[0], - &info[1], + &wh[0], + &wh[1], ptr->image_data[i].image, ptr->image_data[i].size); @@ -702,7 +629,7 @@ cnki_pdf_hn(cnki_t **param) free(bitmap); snprintf(buf, 64, "/Width %d\n/Height %d\n", - info[0], info[1]); + wh[0], wh[1]); strcat(dictionary, buf); strcat(dictionary, "/ColorSpace /DeviceGray\n" @@ -715,45 +642,10 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "/Filter /FlateDecode\n"); - dim[i * 2] = info[0]; - dim[i * 2 + 1] = info[1]; + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; break; case JPX: - ret = strinfo_jp2_dim(&info[0], - &info[1], - ptr->image_data[i].image, - ptr->image_data[i].size); - - if (ret != 0) { - dim[i * 2] = 0; - dim[i * 2 + 1] = 0; - break; - } - - stream_size = ptr->image_data[i].size; - stream = malloc(stream_size); - if (stream == NULL) { - free(root_kid); - free(ids); - free(dim); - free(dictionary); - return 1; - } - memcpy(stream, ptr->image_data[i].image, stream_size); - - snprintf(buf, 64, "/Width %d\n/Height %d\n", - info[0], info[1]); - strcat(dictionary, buf); - - snprintf(buf, 64, "/Length %d\n", - stream_size); - strcat(dictionary, buf); - - strcat(dictionary, "/Filter /JPXDecode\n"); - - dim[i * 2] = info[0]; - dim[i * 2 + 1] = info[1]; - break; default: ret = -1; dim[i * 2] = -1; @@ -766,7 +658,7 @@ cnki_pdf_hn(cnki_t **param) if (ret == 0) { if ((*param)->stat > 2) printf("%6d byte(s), width %4d, height %4d.\n", - stream_size, info[0], info[1]); + stream_size, wh[0], wh[1]); pdf_obj_append(&pdf, ids[i], NULL, dictionary, stream, stream_size); @@ -785,10 +677,104 @@ cnki_pdf_hn(cnki_t **param) } } - if (ptr->image_length > 0) - free(dictionary); + memset(dictionary, 0, dictionary_size); - dictionary_size = 128 + 2 * ptr->text_size + 128 * ptr->image_length; + strcat(dictionary, "<<\n/XObject <<"); + + for (int i = 0; i < ptr->image_length; i++) { + snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); + strcat(dictionary, buf); + + if (i + 1 < ptr->image_length) + strcat(dictionary, " "); + } + + strcat(dictionary, ">>\n>>"); + + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + + free(dictionary); + + int conv_size; + char *conv_dst; + char conv_src[2]; + char conv_hex[3]; + + if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { + cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); + + dictionary_size = 64 + 2 * stream_size; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, ""); + } else { + dictionary_size = 64 + 2 * ptr->text_size; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "text_size; i += 4) { + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; + + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + for (int j = 0; j < conv_size - 2; j++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[j]); + strcat(dictionary, conv_hex); + } + free(conv_dst); + } + } + + strcat(dictionary, ">"); + } + + /* FIXME: Use the text somehow? */ + free(dictionary); + + dictionary_size = 64 + 64 * ptr->image_length; dictionary = malloc(dictionary_size); if (dictionary == NULL) { @@ -800,336 +786,94 @@ cnki_pdf_hn(cnki_t **param) memset(dictionary, 0, dictionary_size); - strcat(dictionary, "<<\n"); + strcat(dictionary, "q\n"); - if (ptr->text_size > 0) { - snprintf(buf, 64, "/Font <>\n", font); + strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); + + double resize_x; + double resize_y; + + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; + + /* Scale within bound of A4 paper */ + resize_x = 595.276 * 4 / dim[i * 2]; + resize_y = 841.89 * 4 / dim[i * 2 + 1]; + + if (resize_y < resize_x) + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + resize_y, resize_y); + else + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + resize_x, resize_x); + strcat(dictionary, buf); + + /* Apply transformation matrix */ + if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { + snprintf(buf, 64, "1 0 0 1 0 %d cm\n", + dim[i * 2 + 1]); + strcat(dictionary, buf); + + strcat(dictionary, "1 0 0 -1 0 0 cm\n"); + } + + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", + dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Im%d Do\n", i); strcat(dictionary, buf); } - if (ptr->image_length > 0) { - strcat(dictionary, "/XObject <<"); + strcat(dictionary, "Q"); - for (int i = 0; i < ptr->image_length; i++) { - snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); - strcat(dictionary, buf); - - if (i < ptr->image_length - 1) - strcat(dictionary, " "); - } - - strcat(dictionary, ">>\n"); + if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { + free(root_kid); + free(ids); + free(dim); + free(dictionary); + return 1; } + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n"); + + snprintf(buf, 64, "/Length %d\n", stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /FlateDecode\n"); + strcat(dictionary, ">>"); - pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + NULL, dictionary, stream, stream_size); - memset(dictionary, 0, dictionary_size); - - int conv_size; - char *conv_dst; - char conv_src[2]; - char conv_hex[3]; - - if (ptr->text_size > 0) { - if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0 || - strncmp(ptr->text, "COMPRESSTEXT", 12) == 0) { - cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); - - free(ptr->text); - - ptr->text_size = stream_size; - ptr->text = stream; - } - - strcat(dictionary, "BT\n"); - - strcat(dictionary, "/F0 10 Tf\n"); - - for (int i = 0, j = 0; i < ptr->text_size - 1;) { - switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { - case 0x8001: - if (ptr->address_next <= ptr->address) { - if (i + 7 >= ptr->text_size) { - i += 2; - break; - } - - conv_src[0] = ptr->text[i + 7]; - conv_src[1] = ptr->text[i + 6]; - - //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") - //strcat(dictionary, buf); - - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, "<"); - for (int k = 0; k < conv_size - 2; k++) { - snprintf(conv_hex, 3, - "%02x", (unsigned char) conv_dst[k]); - strcat(dictionary, conv_hex); - } - strcat(dictionary, "> Tj\n"); - } - free(conv_dst); - } - - i += 8; - break; - } - - strcat(dictionary, "T*\n"); - case 0x8070: - i += 4; - - if (ptr->address_next <= ptr->address) - break; - - for (;;) { - if (i + 3 >= ptr->text_size || - (unsigned char) ptr->text[i + 1] == 0x80) - break; - - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; - - //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") - //strcat(dictionary, buf); - - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, "<"); - for (int k = 0; k < conv_size - 2; k++) { - snprintf(conv_hex, 3, - "%02x", (unsigned char) conv_dst[k]); - strcat(dictionary, conv_hex); - } - strcat(dictionary, "> Tj\n"); - } - free(conv_dst); - } - - i += 4; - } - - break; - case 0x800a: - if (i + 27 >= ptr->text_size || j >= ptr->image_length) { - i += 2; - - if (j >= ptr->image_length) - i += 26; - break; - } - - if (ptr->image_length > 0) { - ptr->image_data[j].x += (unsigned char) ptr->text[i + 5] << 8; - ptr->image_data[j].x += (unsigned char) ptr->text[i + 4]; - - ptr->image_data[j].y += (unsigned char) ptr->text[i + 7] << 8; - ptr->image_data[j].y += (unsigned char) ptr->text[i + 6]; - - ptr->image_data[j].w += (unsigned char) ptr->text[i + 9] << 8; - ptr->image_data[j].w += (unsigned char) ptr->text[i + 8]; - - ptr->image_data[j].h += (unsigned char) ptr->text[i + 11] << 8; - ptr->image_data[j].h += (unsigned char) ptr->text[i + 10]; - - if ((*param)->stat > 2) - printf("\tItem %d: origin (%4d, %4d), width %4d, height %4d\n", - j, - ptr->image_data[j].x, - ptr->image_data[j].y, - ptr->image_data[j].w, - ptr->image_data[j].h); - } - - i += 28; - - if (j == 0 || ptr->image_data[j].x > 0 || ptr->image_data[j].y > 0) - j++; - break; - default: - i += 4; - break; - } - } - - strcat(dictionary, "ET"); - - if (ptr->image_length > 0) - strcat(dictionary, "\n"); - } - - /* FIXME: Use the text somehow? */ - memset(dictionary, 0, dictionary_size); - - if (ptr->image_length > 0) { - double resize_x = 1; - double resize_y = 1; - - double margin_x = 0; - double margin_y = 0; - - if (ptr->image_data[0].x == 0 && ptr->image_data[0].y == 0 && dim[0] > 0 && dim[1] > 0) { - /* Scale within bound of A4 paper */ - resize_x = 2480.315 / dim[0]; - resize_y = 3507.874 / dim[1]; - - if (resize_y < resize_x) { - for (int i = 0; i < ptr->image_length; i++) { - dim[i * 2] *= resize_y; - dim[i * 2 + 1] *= resize_y; - } - } else { - for (int i = 0; i < ptr->image_length; i++) { - dim[i * 2] *= resize_x; - dim[i * 2 + 1] *= resize_x; - } - } - - margin_x = (2480.315 - dim[0]) / 2; - margin_y = (3507.874 - dim[1]) / 2; - } - - /* Remove duplicated image, ptr->image_length is sometimes squared */ - for (int i = 1; i < ptr->image_length; i++) { - if ((ptr->image_data[i].x > 0 || ptr->image_data[i].y > 0) && - dim[i * 2] < dim[0] && dim[i * 2 + 1] < dim[1]) - continue; - - for (int j = i; j < ptr->image_length; j++) { - pdf_get_obj(&pdf, ids[j], &tmp); - pdf_obj_del(&pdf, ids[j]); - - tmp->next = NULL; - pdf_obj_destroy(&tmp); - - dim[j * 2] = -1; - dim[j * 2 + 1] = -1; - - pdf_obj_append(&pdf, ids[j], NULL, NULL, NULL, 0); - } - - break; - } - - for (int i = 0; i < ptr->image_length; i++) { - if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) - continue; - - strcat(dictionary, "q\n"); - - strcat(dictionary, "0.24 0 0 0.24 0 0 cm\n"); - - /* Rotate image */ - if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { - snprintf(buf, 64, "1 0 0 -1 0 %f cm\n", dim[i * 2 + 1]); - strcat(dictionary, buf); - } - - /* Translate figure */ - if (ptr->image_data[i].x > 0 || ptr->image_data[i].y > 0) { - double origin_x = ptr->image_data[i].x * 0.40433; - double origin_y = ptr->image_data[i].y * 0.40433; - - if (resize_y < resize_x) { - origin_x *= resize_y; - origin_y *= resize_y; - } else { - origin_x *= resize_x; - origin_y *= resize_x; - } - - if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) - origin_y = -3507.874 + origin_y + dim[i * 2 + 1]; - else - origin_y = 3507.874 - origin_y - dim[i * 2 + 1]; - - snprintf(buf, 64, "1 0 0 1 %f %f cm\n", origin_x, origin_y); - strcat(dictionary, buf); - } - - if (margin_x > 0 || margin_y > 0) { - if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { - snprintf(buf, 64, "1 0 0 1 %f %f cm\n", margin_x, -margin_y); - strcat(dictionary, buf); - } else { - snprintf(buf, 64, "1 0 0 1 %f %f cm\n", margin_x, margin_y); - strcat(dictionary, buf); - } - } - - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", dim[i * 2], dim[i * 2 + 1]); - strcat(dictionary, buf); - - snprintf(buf, 64, "/Im%d Do\n", i); - strcat(dictionary, buf); - - strcat(dictionary, "Q"); - - if (i < ptr->image_length - 1) - strcat(dictionary, "\n"); - } - - free(dim); - } - - if (strlen(dictionary) > 0) { - if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { - free(root_kid); - free(ids); - free(dictionary); - return 1; - } - - memset(dictionary, 0, dictionary_size); - - strcat(dictionary, "<<\n"); - - snprintf(buf, 64, "/Length %d\n", stream_size); - strcat(dictionary, buf); - - strcat(dictionary, "/Filter /FlateDecode\n"); - - strcat(dictionary, ">>"); - - pdf_obj_append(&pdf, ids[ptr->image_length + 1], - NULL, dictionary, stream, stream_size); - - free(stream); - } else { - pdf_obj_append(&pdf, ids[ptr->image_length + 1], - NULL, NULL, NULL, 0); - } + free(stream); memset(dictionary, 0, dictionary_size); strcat(dictionary, "<<\n/Type /Page\n"); - /* A4 paper */ - strcat(dictionary, "/MediaBox [0 0 595.2756 841.8898]\n"); - snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); strcat(dictionary, buf); snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); strcat(dictionary, buf); + /* A4 paper */ + strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n"); + /* Add /Parent when we know root */ pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); + free(dictionary); + root_kid[cnt++] = ids[ptr->image_length + 2]; free(ids); - free(dictionary); + free(dim); ptr = ptr->next; } @@ -1173,17 +917,25 @@ cnki_pdf_hn(cnki_t **param) int root = pdf_get_free_id(&pdf); - snprintf(buf, 64, "<<\n/Type /Pages\n/Kids ["); + snprintf(buf, 64, "<<\n/Type /Pages\n/Kids "); strcat(dictionary, buf); + if ((*param)->file_stat->page > 1) + strcat(dictionary, "["); + for (int i = 0; i < (*param)->file_stat->page; i++) { snprintf(buf, 64, "%d 0 R", root_kid[i]); strcat(dictionary, buf); - if (i < (*param)->file_stat->page - 1) + if (i + 1 < (*param)->file_stat->page) strcat(dictionary, " "); } - snprintf(buf, 64, "]\n/Count %d\n", (*param)->file_stat->page); + if ((*param)->file_stat->page > 1) + strcat(dictionary, "]"); + + strcat(dictionary, "\n"); + + snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); strcat(dictionary, buf); strcat(dictionary, ">>"); @@ -1192,6 +944,16 @@ cnki_pdf_hn(cnki_t **param) free(dictionary); + dictionary_size = 256; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + return 1; + } + + pdf_object_t *tmp = NULL; + /* Add /Parent to page object */ for (int i = 0; i < (*param)->file_stat->page; i++) { if (pdf_get_obj(&pdf, root_kid[i], &tmp) != 0) { @@ -1200,16 +962,9 @@ cnki_pdf_hn(cnki_t **param) return 1; } - dictionary_size = tmp->dictionary_size + 24; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - return 1; - } + memset(dictionary, 0, dictionary_size); memcpy(dictionary, tmp->dictionary, tmp->dictionary_size); - memset(dictionary + tmp->dictionary_size, 0, 24); snprintf(buf, 64, "/Parent %d 0 R\n>>", root); strcat(dictionary, buf); @@ -1219,20 +974,10 @@ cnki_pdf_hn(cnki_t **param) free(root_kid); return 1; } - - free(dictionary); } free(root_kid); - dictionary_size = 128; - dictionary = malloc(dictionary_size); - - if (dictionary == NULL) { - free(root_kid); - return 1; - } - memset(dictionary, 0, dictionary_size); if ((*param)->stat > 0) diff --git a/src/cnki_zlib.c b/src/cnki_zlib.c index 075456b..edff141 100644 --- a/src/cnki_zlib.c +++ b/src/cnki_zlib.c @@ -13,17 +13,12 @@ int cnki_zlib(char **dst, int *dst_size, const char * restrict src, int src_size) { - uint8_t padding = 0; int32_t size; - - if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0) - padding = 8; - - memcpy(&size, src + 12 + padding, 4); + memcpy(&size, src + 20, 4); *dst_size = size; - if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0) + if (strinflate(dst, size, src + 24, src_size - 24) != 0) return 1; return 0; diff --git a/src/cnki_jbig_dec.c b/src/jbig.c similarity index 87% rename from src/cnki_jbig_dec.c rename to src/jbig.c index 5b262e6..09a3d92 100644 --- a/src/cnki_jbig_dec.c +++ b/src/jbig.c @@ -9,7 +9,7 @@ #include #include -static const uint16_t _LSZ[0x71] = { +static const uint16_t _LSZ[256] = { 0x5a1d, 0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036, 0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2, @@ -28,7 +28,7 @@ static const uint16_t _LSZ[0x71] = { 0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb }; -static const uint8_t _NLPS[0x71] = { +static const uint8_t _NLPS[256] = { 1, 14, 16, 18, 20, 23, 25, 28, 30, 33, 35, 9, 10, 12, 15, 36, 38, @@ -47,7 +47,7 @@ static const uint8_t _NLPS[0x71] = { 105, 108, 109, 110, 111, 110, 112, 112 }; -static const uint8_t _NMPS[0x71] = { +static const uint8_t _NMPS[256] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 15, 16, 17, @@ -66,7 +66,7 @@ static const uint8_t _NMPS[0x71] = { 106, 107, 103, 109, 107, 111, 109, 111 }; -static const bool _SWTCH[0x71] = { +static const bool _SWTCH[256] = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, @@ -99,7 +99,7 @@ static int _height; static int _width_padded; static int _ret_pos; -static unsigned char *_ret; +static char *_ret; static int _scd_size; static unsigned char *_scd; @@ -108,7 +108,7 @@ static void _bytein(void) { if (_ret_pos < _scd_size) - _reg_c += _scd[_ret_pos++] << 8; + _reg_c += *(_scd + _ret_pos++) << 8; _ct = 8; } @@ -207,18 +207,7 @@ static void _procline(int line, char *a, char *b, char *c) { /* The encoder must be erroneous */ - uint16_t cx = 0; - - if (line > 0) { - cx += (_ret[_width_padded * (_height - line)] & 0x20) << 2; - cx += _ret[_width_padded * (_height - line)] & 0x40; - cx += (_ret[_width_padded * (_height - line)] & 0x80) >> 2; - } - - if (line > 1) { - cx += (_ret[_width_padded * (_height - line + 1)] & 0x40) >> 4; - cx += (_ret[_width_padded * (_height - line + 1)] & 0x80) >> 6; - } + uint16_t cx = (*b & 0x01) << 2; for (int i = 0; i < _width; i++) { _decode(cx); @@ -226,19 +215,19 @@ _procline(int line, char *a, char *b, char *c) cx >>= 1; if (_pix == 1) { - _ret[_width_padded * (_height - line - 1) + i / 8] |= _pix << (7 - (i & 0x07)); - c[i] = 1; + *(_ret + _width_padded * (_height - line - 1) + i / 8) |= _pix << (7 - (i & 0x07)); + *(c + i) = 1; cx |= 0x0200; } else { cx &= 0xfdff; } - if (i + 2 < _width && a[i + 2] == 1) + if (i + 2 < _width && *(a + i + 2) == 1) cx |= 0x0004; else cx &= 0xfffb; - if (i + 3 < _width && b[i + 3] == 1) + if (i + 3 < _width && *(b + i + 3) == 1) cx |= 0x0080; else cx &= 0xff7f; @@ -304,7 +293,7 @@ strdec_jbig(char **bitmap, int width, int height, memset(*bitmap, 0, _height * _width_padded); _ret_pos = 0; - _ret = (unsigned char *) *bitmap; + _ret = *bitmap; _scd_size = jbig_size; _scd = (unsigned char *) jbig; diff --git a/src/cnki_jbig_dec.h b/src/jbig.h similarity index 100% rename from src/cnki_jbig_dec.h rename to src/jbig.h diff --git a/src/jbig2.c b/src/jbig2.c index ea9233c..9b3a9be 100644 --- a/src/jbig2.c +++ b/src/jbig2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, yzrh + * Copyright (c) 2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -31,6 +31,5 @@ strdec_jbig2(char **bitmap, } jbig2_release_page(ctx, image); - jbig2_ctx_free(ctx); return 0; } diff --git a/src/jp2.c b/src/jp2.c deleted file mode 100644 index a9d4429..0000000 --- a/src/jp2.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2022, yzrh - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#include - -#include - -typedef struct _stream_user_data { - OPJ_SIZE_T pos; - OPJ_SIZE_T size; - const unsigned char *data; -} stream_user_data; - -static OPJ_SIZE_T -_opj_stream_read(void *p_buffer, OPJ_SIZE_T p_nb_bytes, void *p_user_data) -{ - stream_user_data *d = (stream_user_data *) p_user_data; - - if (d->pos >= d->size) - return (OPJ_SIZE_T) - 1; - - OPJ_SIZE_T ret_size = p_nb_bytes; - - if (d->pos + ret_size > d->size) - ret_size = d->size - d->pos; - - memcpy(p_buffer, d->data + d->pos, ret_size); - - d->pos += ret_size; - - return ret_size; -} - -static OPJ_OFF_T -_opj_stream_skip(OPJ_OFF_T p_nb_bytes, void *p_user_data) -{ - stream_user_data *d = (stream_user_data *) p_user_data; - - if (d->pos + p_nb_bytes <= d->size) - d->pos += p_nb_bytes; - else - d->pos = d->size; - - return d->pos; -} - -static OPJ_BOOL -_opj_stream_seek(OPJ_OFF_T p_nb_bytes, void *p_user_data) -{ - stream_user_data *d = (stream_user_data *) p_user_data; - - if (p_nb_bytes <= (OPJ_OFF_T) d->size) { - d->pos = p_nb_bytes; - return OPJ_TRUE; - } - - return OPJ_FALSE; -} - -int -strinfo_jp2_dim(int *jp2_width, int *jp2_height, - const char * restrict data, int data_size) -{ - opj_codec_t *codec; - opj_dparameters_t param; - opj_stream_t *stream; - opj_image_t *image; - stream_user_data d; - - if (data_size < 2) - return 1; - - opj_set_default_decoder_parameters(¶m); - - if ((unsigned char) data[0] == 0xff && (unsigned char) data[1] == 0x4f) - codec = opj_create_decompress(OPJ_CODEC_J2K); - else - codec = opj_create_decompress(OPJ_CODEC_JP2); - - if (!opj_setup_decoder(codec, ¶m)) { - opj_destroy_codec(codec); - return 1; - } - - stream = opj_stream_default_create(OPJ_TRUE); - - d.pos = 0; - d.size = data_size; - d.data = (unsigned char *) data; - - opj_stream_set_read_function(stream, _opj_stream_read); - opj_stream_set_skip_function(stream, _opj_stream_skip); - opj_stream_set_seek_function(stream, _opj_stream_seek); - opj_stream_set_user_data(stream, &d, NULL); - opj_stream_set_user_data_length(stream, data_size); - - if (!opj_read_header(stream, codec, &image)) { - opj_destroy_codec(codec); - opj_stream_destroy(stream); - return 1; - } - - opj_destroy_codec(codec); - opj_stream_destroy(stream); - - *jp2_width = image->x1 - image->x0; - *jp2_height = image->y1 - image->y0; - - opj_image_destroy(image); - - return 0; -} diff --git a/src/jp2.h b/src/jp2.h deleted file mode 100644 index 5644938..0000000 --- a/src/jp2.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * Copyright (c) 2022, yzrh - * - * SPDX-License-Identifier: Apache-2.0 - */ - -int strinfo_jp2_dim(int *jp2_width, int *jp2_height, - const char * restrict data, int data_size); diff --git a/src/jpeg.c b/src/jpeg.c index cdcae7b..4ea4d7f 100644 --- a/src/jpeg.c +++ b/src/jpeg.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -9,7 +9,7 @@ #include int -strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components, +strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, const char * restrict data, int data_size) { struct jpeg_decompress_struct cinfo; @@ -27,7 +27,6 @@ strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components, *jpeg_width = cinfo.output_width; *jpeg_height = cinfo.output_height; - *jpeg_components = cinfo.output_components; jpeg_destroy((struct jpeg_common_struct *) &cinfo); diff --git a/src/jpeg.h b/src/jpeg.h index 1f5caa7..db35d94 100644 --- a/src/jpeg.h +++ b/src/jpeg.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ -int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components, +int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, const char * restrict data, int data_size); diff --git a/src/md5.c b/src/md5.c deleted file mode 100644 index e5ab95e..0000000 --- a/src/md5.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2023, yzrh - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#include - -#include - -int -strmd5(unsigned char **dst, int *dst_size, - const unsigned char * restrict src, int src_size) -{ - *dst_size = MD5_DIGEST_LENGTH; - *dst = malloc(*dst_size); - - if (*dst == NULL) - return 1; - - MD5(src, src_size, *dst); - - return 0; -} diff --git a/src/md5.h b/src/md5.h deleted file mode 100644 index 9c1745d..0000000 --- a/src/md5.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Copyright (c) 2023, yzrh - * - * SPDX-License-Identifier: Apache-2.0 - */ - -int -strmd5(unsigned char **dst, int *dst_size, - const unsigned char * restrict src, int src_size); diff --git a/src/melon.c b/src/melon.c index f8bb645..af6aaf4 100644 --- a/src/melon.c +++ b/src/melon.c @@ -98,8 +98,7 @@ main(int argc, char **argv) strerror(errno)); return EXIT_FAILURE; } - } else if (strncmp(param->file_stat->type, "HN", 2) == 0 || - (unsigned char) param->file_stat->type[0] == 0xc8) { + } else if (strncmp(param->file_stat->type, "HN", 2) == 0) { if (cnki_hn(¶m) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); diff --git a/src/pdf_get.c b/src/pdf_get.c index bde5bf2..a72c68d 100644 --- a/src/pdf_get.c +++ b/src/pdf_get.c @@ -89,7 +89,7 @@ pdf_get_free_id(pdf_object_t **pdf) int id = 0; - for (int i = 1; i < 100000000; i++) { + for (int i = 1; i < 99999999; i++) { ptr = (*pdf)->next; while (ptr != NULL) { if (ptr->id == i) { @@ -123,7 +123,7 @@ pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count) int id = 0; pdf_object_t *ptr; - for (int i = 1; i < 100000000; i++) { + for (int i = 1; i < 99999999; i++) { ptr = (*pdf)->next; while (ptr != NULL) { if (ptr->id == i) { diff --git a/src/pdf_parser.c b/src/pdf_parser.c index e6d8ac6..3b29c52 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -19,35 +19,26 @@ static void * _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) { const char whitespace[6] = { - 0x00, - 0x09, - 0x0a, - 0x0c, - 0x0d, - 0x20 + '\r', + '\n', + '\f', + '\t', + '\0', + ' ' }; - char *ret = NULL; + char tmp[s1 + 1]; + memcpy(tmp, p1, s1); - char str[s1 + 1]; - memcpy(str, p1, s1); - - size_t tmp_size = 0; - char *tmp; + char *ret; for (int i = 0; i < 6; i++) { - str[s1] = whitespace[i]; - - if ((tmp = memmem(p0, s0, str, s1 + 1)) == NULL) - continue; - - if (tmp_size == 0 || (size_t) (tmp - (char *) p0) < tmp_size) { - tmp_size = tmp - (char *) p0; - ret = tmp; - } + tmp[s1] = whitespace[i]; + if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) + return ret; } - return ret; + return NULL; } static int @@ -66,45 +57,23 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) end = ftell(*fp); fseek(*fp, cur, SEEK_SET); - long head = 0; - long tail = 0; + int head = 0; + int tail = 0; char *pos; char *tmp; for (;;) { - if (cur + size_buf < end) { - fread(buf, size_buf, 1, *fp); - } else { - fread(buf, end - cur, 1, *fp); - memset(buf + end - cur, 0, size_buf - end + cur); - } + fread(buf, size_buf, 1, *fp); - if (head == 0) { - /* Hack needed for invalid object */ - pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6); - tmp = memmem(buf, size_buf, " 0 obj", 6); - - while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b) - tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6); - - if (pos != NULL && tmp != NULL) { - if (pos - buf < tmp - buf) - head = cur + (pos - buf) + 7; - else - head = cur + (tmp - buf) + 6; - } else if (pos != NULL) { - head = cur + (pos - buf) + 7; - } else if (tmp != NULL) { - head = cur + (tmp - buf) + 6; - } - } + if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) + head = cur + (pos - buf) + 7; if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { /* We need to check if it is the object stored in stream */ while (memcmp(pos + 7, "\r\nendstream", 11) == 0 && - (tmp = _memmem_whitespace(pos + 7, - size_buf - (pos - buf) - 7, + (tmp = _memmem_whitespace(pos + 6, + size_buf - (pos - buf) - 6, "endobj", 6)) != NULL) pos = tmp; @@ -133,17 +102,13 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) ptr->address = head; ptr->size = tail - head; - fseek(*fp, tail + 7, SEEK_SET); + fseek(*fp, tail + 6, SEEK_SET); head = tail = 0; - } else if (head > 0 && tail > 0) { - if (cur + size_buf < end) - fseek(*fp, head, SEEK_SET); - tail = 0; } else { - fseek(*fp, -7, SEEK_CUR); + fseek(*fp, -6, SEEK_CUR); } - if ((cur = ftell(*fp)) + 7 >= end) + if ((cur = ftell(*fp)) + 6 >= end) break; } @@ -161,7 +126,6 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) pdf_object_t *ptr = (*pdf)->next; - char str[8]; char *buf; char *head; char *tail; @@ -173,86 +137,34 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (buf == NULL) return 1; - fseek(*fp, ptr->address, SEEK_SET); - fread(buf, ptr->size, 1, *fp); + memset(buf, 0, ptr->size); - /* Handle incomplete object */ - head = buf; - while ((tmp = _memmem_whitespace(head, - ptr->size - (head - buf), - " 0 obj", 6)) != NULL) - head = tmp + 7; - - /* Hack needed for invalid object */ - while ((tmp = memmem(head, - ptr->size - (head - buf), - " 0 obj", 6)) != NULL) - head = tmp + 6; - - if (head - buf > 0) { - ptr->address += head - buf; - ptr->size -= head - buf; - - tmp = realloc(buf, ptr->size); - - if (tmp == NULL) - return 1; - - buf = tmp; - - fseek(*fp, ptr->address, SEEK_SET); - fread(buf, ptr->size, 1, *fp); - } - - /* Hack needed for invalid object */ - fseek(*fp, ptr->address - 14, SEEK_SET); - fread(str, 8, 1, *fp); - - if (str[7] < '0' || str[7] > '9') { - fseek(*fp, ptr->address - 15, SEEK_SET); - fread(str, 8, 1, *fp); - } - - for (int i = 7; i >= 0; i--) { - if (str[i] < '0' || str[i] > '9') { - if (i < 7) - ptr->id = atoi(str + i + 1); - else - ptr->id = 0; + fseek(*fp, ptr->address - 12, SEEK_SET); + fread(buf, 8, 1, *fp); + for (int i = 0; i < 8; i++) { + if (buf[i] >= '0' && buf[i] <= '9') { + ptr->id = atoi(buf + i); break; } } - if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && - ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL || - /* Hack needed for invalid object */ - (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) { - if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) { - tail = memmem(buf, ptr->size, ">>", 2); + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); - while (ptr->size - (tail - buf) > 2 && - (tmp = memmem(tail + 2, - ptr->size - (tail - buf) - 2, - ">>", 2)) != NULL && - memmem(tail + 2, - (tmp - tail) - 2, - "stream\r\n", 8) == NULL) - tail = tmp; - } else { - /* - * A dictionary object may have nested dictionary, - * but it should not be in a stream - */ - while (ptr->size - (tail - buf) > 3 && - (tmp = _memmem_whitespace(tail + 3, - ptr->size - (tail - buf) - 3, - ">>", 2)) != NULL && - memmem(tail + 3, - (tmp - tail) - 3, - "stream\r\n", 8) == NULL) - tail = tmp; - } + if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && + (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { + /* + * A dictionary object may have nested dictionary, + * but it should not be in a stream + */ + while ((tmp = _memmem_whitespace(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL && + memmem(tail + 2, + ptr->size - (tail - buf) - 2, + "stream\r\n", 8) == NULL) + tail = tmp; ptr->dictionary_size = tail - head + 2; ptr->dictionary = malloc(ptr->dictionary_size + 1); @@ -260,8 +172,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) if (ptr->dictionary == NULL) return 1; + memset(ptr->dictionary, 0, ptr->dictionary_size + 1); memcpy(ptr->dictionary, head, ptr->dictionary_size); - memset(ptr->dictionary + ptr->dictionary_size, 0, 1); if ((head = memmem(tail, ptr->size - (tail - buf), @@ -274,11 +186,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) * contains another object that * contains another stream */ - while (_memmem_whitespace(tail + 10, - ptr->size - (tail - buf) - 10, + while (_memmem_whitespace(tail, + ptr->size - (tail - buf), "endobj", 6) != NULL && - (tmp = _memmem_whitespace(tail + 10, - ptr->size - (tail - buf) - 10, + (tmp = _memmem_whitespace(tail + 9, + ptr->size - (tail - buf) - 9, "endstream", 9)) != NULL) tail = tmp; @@ -290,13 +202,19 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) memcpy(ptr->stream, head + 8, ptr->stream_size); } - - free(buf); } else { ptr->object_size = ptr->size; - ptr->object = buf; + ptr->object = malloc(ptr->object_size + 1); + + if (ptr->object == NULL) + return 1; + + memset(ptr->object, 0, ptr->object_size + 1); + memcpy(ptr->object, buf, ptr->object_size); } + free(buf); + ptr = ptr->next; } diff --git a/src/pdf_writer.c b/src/pdf_writer.c index 6afa89b..be64e49 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -8,32 +8,14 @@ #include #include -#include "version.h" -#include "md5.h" +#include + #include "pdf.h" -static int -_info_obj(pdf_object_t **pdf) -{ - char dictionary[128] = "<<\n" - "/Producer (Melon " VERSION "." RELEASE "." PATCH EXTRA ")\n" - "/CreationDate (D:"; - - char buf[64]; - - time_t timestamp = time(NULL); - - strftime(buf, 64, "%Y%m%d%H%M%S", gmtime(×tamp)); - strcat(dictionary, buf); - strcat(dictionary, "+00'00')\n>>"); - - return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0); -} - int pdf_dump_obj(pdf_object_t **pdf, FILE **fp) { - if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0) + if (*pdf == NULL || *fp == NULL) return 1; long cur; @@ -162,28 +144,35 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) buf_size = snprintf(buf, 64, "%lx%x", timestamp, size); #endif - int fid_size; - unsigned char *fid; + unsigned char str[64]; + memcpy(str, buf, 64); - if (strmd5(&fid, &fid_size, (unsigned char *) buf, buf_size) != 0) - return 1; + unsigned char fid[MD5_DIGEST_LENGTH]; + MD5(str, buf_size, fid); pdf_object_t *ptr = *pdf; while (ptr->next != NULL) ptr = ptr->next; + /* + * TODO: Document information dictionary + * `"/Producer (Melon)"' + * `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"' + * + * Trailer dictionary + * `"/Info %d 0 R"' + */ fprintf(*fp, - "/Size %d\n/Root %d 0 R\n/Info %d 0 R\n", + "/Size %d\n/Root %d 0 R\n", ptr->id + 1, - pdf_get_catalog_id(pdf), - ptr->id); + pdf_get_catalog_id(pdf)); fputs("/ID [", *fp); for (int i = 0; i < 2; i++) { fputs("<", *fp); - for (int j = 0; j < fid_size; j++) + for (int j = 0; j < MD5_DIGEST_LENGTH; j++) fprintf(*fp, "%02x", fid[j]); fputs(">", *fp); @@ -202,7 +191,5 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) fputs("%%EOF\n", *fp); - free(fid); - return 0; } diff --git a/src/version.h b/src/version.h index c3ff314..4e5cfa6 100644 --- a/src/version.h +++ b/src/version.h @@ -1,10 +1,10 @@ /* - * Copyright (c) 2020-2023, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ #define VERSION "0" -#define RELEASE "3" +#define RELEASE "2" #define PATCH "0" #define EXTRA ""