diff --git a/README.md b/README.md index c86b965..78a4107 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ Dependency 1. OpenSSL 2. libiconv 3. zlib +4. JBIG-KIT +5. libjpeg-turbo Usage ===== diff --git a/src/Makefile b/src/Makefile index f8c1e34..005f6da 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,22 +1,22 @@ # -# Copyright (c) 2020, yzrh +# Copyright (c) 2020-2021, yzrh # # SPDX-License-Identifier: Apache-2.0 # -src = melon.c iconv.c zlib.c \ +src = melon.c iconv.c zlib.c jbig.c jpeg.c \ cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c \ - cnki_pdf.c cnki_xml.c cnki_zlib.c cnki.c \ + cnki_pdf.c cnki_xml.c cnki_zlib.c cnki_jbig.c cnki.c \ pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c -inc = extern.h version.h iconv.h zlib.h \ - cnki.h pdf_cnki.h pdf.h +inc = extern.h version.h iconv.h zlib.h jbig.h jpeg.h \ + cnki.h pdf_cnki.h cnki_jbig.h pdf.h obj = ${src:.c=.o} PREFIX = /usr/local -CFLAGS = -O3 -march=native -pipe -flto=thin -Wall -LDFLAGS = -Wl,-O3 -lcrypto -liconv -lz -Wl,--as-needed +CFLAGS = -O3 -march=native -pipe -flto=thin -Wall -Wextra -Wno-unused-parameter +LDFLAGS = -Wl,-O3 -lcrypto -liconv -lz -ljbig -ljpeg -Wl,--as-needed CFLAGS += -I/usr/local/include LDFLAGS += -L/usr/local/lib diff --git a/src/cnki.c b/src/cnki.c index 3234403..6866f8e 100644 --- a/src/cnki.c +++ b/src/cnki.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/cnki.h b/src/cnki.h index 30557c1..816bcc8 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -82,6 +82,7 @@ typedef struct _cnki_t { /* cnki_pdf.c */ int cnki_pdf(cnki_t **param); +int cnki_pdf_hn(cnki_t **param); /* cnki_outline_tree.c */ int cnki_outline_tree(object_outline_tree_t **outline_tree, @@ -91,5 +92,10 @@ int cnki_outline_tree(object_outline_tree_t **outline_tree, int cnki_zlib(char 
**dst, int *dst_size, const char * restrict src, int src_size); +/* cnki_jbig.c */ +int cnki_jbig(char **bitmap, int *bitmap_size, + int *bitmap_width, int *bitmap_height, + const char * restrict jbig, int jbig_size); + /* cnki_xml.c */ int cnki_xml(char **xml, FILE **fp); diff --git a/src/cnki_caj.c b/src/cnki_caj.c index 1e3bd8b..3c0b1b9 100644 --- a/src/cnki_caj.c +++ b/src/cnki_caj.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/cnki_hn.c b/src/cnki_hn.c index f6a4c24..0e16a76 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,8 @@ #include "cnki.h" #include "iconv.h" +#include "zlib.h" +#include "jpeg.h" #include "pdf.h" #include "pdf_cnki.h" @@ -131,231 +133,13 @@ cnki_hn(cnki_t **param) ptr = ptr->next; } - if ((*param)->stat > 1) + if ((*param)->stat > 0) printf("Loaded %d page(s)\n", (*param)->file_stat->page); - if ((*param)->stat > 1) - printf("Generating PDF object(s)\n"); - - pdf_object_t *pdf = NULL; - - if (pdf_obj_create(&pdf) != 0) - return 1; - - int buf_size; - char *buf; - - int str_size; - char *str; - - int conv_size; - char *conv_dst; - char conv_src[2]; - char conv_hex[3]; - - ptr = (*param)->object_hn; - while (ptr != NULL) { - if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { - cnki_zlib(&buf, &buf_size, ptr->text, ptr->text_size); - - str_size = buf_size / 8 + 7; - str = malloc(str_size); - - if (str == NULL) - return 1; - - memset(str, 0, str_size); - - strcat(str, ""); - } else { - str_size = ptr->text_size; - str = malloc(str_size); - - if (str == NULL) - return 1; - - memset(str, 0, str_size); - - strcat(str, "text_size; i += 4) { - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; - - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - for 
(int j = 0; j < conv_size - 2; j++) { - snprintf(conv_hex, 3, - "%02x", (unsigned char) conv_dst[j]); - strcat(str, conv_hex); - } - free(conv_dst); - } - } - - strcat(str, ">"); - } - - pdf_obj_append(&pdf, 0, str, NULL, NULL); - - free(str); - - ptr = ptr->next; - } - - if ((*param)->stat > 1) { - printf("\t%8s\t%12s\t%12s\t%12s\n", - "id", - "object", - "dictionary", - "stream"); - - pdf_object_t *ptr = pdf->next; - while (ptr != NULL) { - printf("\t%8d\t%12d\t%12d\t%12d\n", - ptr->id, - ptr->object_size, - ptr->dictionary_size, - ptr->stream_size); - ptr = ptr->next; - } - } + cnki_pdf_hn(param); if ((*param)->stat > 0) - printf("Generated %d object(s)\n", - pdf_get_count(&pdf)); + printf("Conversion ended\n"); - int *ids = NULL; - - if ((*param)->file_stat->outline > 0) { - if ((*param)->stat > 1) - printf("Generating outline object(s)\n\t%8s\n", "id"); - - pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1); - int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids); - - if ((*param)->stat > 1) - for (int i = 0; i < (*param)->file_stat->outline + 1; i++) - printf("\t%8d\n", ids[i]); - - if ((*param)->stat > 0) { - if (outline != 0) - printf("No outline information\n"); - else - printf("Generated %d outline object(s)\n", - (*param)->file_stat->outline + 1); - } - } - - if ((*param)->stat > 1) - printf("Writing header\n"); - - long cur = 0; - - if ((*param)->stat > 0) - cur = ftell((*param)->fp_o); - - if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) { - fprintf(stderr, "Header not written\n"); - return 1; - } else { - if ((*param)->stat > 0) - printf("Header %ld byte(s) written\n", - ftell((*param)->fp_o) - cur); - } - - if ((*param)->stat > 1) - printf("Writing object(s)\n"); - - pdf_dump_obj(&pdf, &(*param)->fp_o); - - if ((*param)->stat > 1) { - printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n", - "address", - "size", - "id", - "object", - "dictionary", - "stream"); - - pdf_object_t *ptr = pdf->next; - while (ptr != NULL) { - 
printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n", - ptr->address, - ptr->size, - ptr->id, - ptr->object_size, - ptr->dictionary_size, - ptr->stream_size); - ptr = ptr->next; - } - } - - if ((*param)->stat > 0) - printf("%d object(s) %ld byte(s) written\n", - pdf_get_count(&pdf), - ftell((*param)->fp_o)); - - long xref = ftell((*param)->fp_o); - - if ((*param)->stat > 1) - printf("Writing cross-reference table\n"); - - if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) { - if ((*param)->stat > 0) - printf("Cross-reference table not written\n"); - } else { - if ((*param)->stat > 0) - printf("Cross-reference table %ld byte(s) written\n", - ftell((*param)->fp_o) - xref); - } - - if ((*param)->stat > 1) - printf("Writing trailer\n"); - - if ((*param)->stat > 0) - cur = ftell((*param)->fp_o); - - if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) { - if ((*param)->stat > 0) - printf("Trailer not written\n"); - } else { - if ((*param)->stat > 0) - printf("Trailer %ld byte(s) written\n", - ftell((*param)->fp_o) - cur); - } - - if ((*param)->stat > 0) - printf("Total %ld byte(s) written\n", - ftell((*param)->fp_o)); - - pdf_obj_destroy(&pdf); - - if ((*param)->stat > 0) - printf("Conversion ended (partial)\n"); - - /* TODO: Finish me please :) */ return 0; } diff --git a/src/cnki_jbig.c b/src/cnki_jbig.c new file mode 100644 index 0000000..02040be --- /dev/null +++ b/src/cnki_jbig.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020-2021, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "cnki_jbig.h" +#include "jbig.h" + +int +cnki_jbig(char **bitmap, int *bitmap_size, + int *bitmap_width, int *bitmap_height, + const char * restrict jbig, int jbig_size) +{ + dib_t *dib = malloc(sizeof(dib_t)); + + if (dib == NULL) + return 1; + + memcpy(dib, jbig, 40); + + bih_t *bih = malloc(sizeof(bih_t)); + + if (bih == NULL) { + free(dib); + return 1; + } + + memset(bih, 0, sizeof(bih_t)); + + bih->d_l = 0; + bih->d = 0; + + bih->p = 1; + + bih->fill = 0; + 
+ bih->x_d = dib->width; + bih->y_d = dib->height; + bih->l_0 = bih->y_d / 35; + + while (bih->l_0 > 128) + bih->l_0--; + if (bih->l_0 < 2) + bih->l_0 = 2; + + bih->m_x = 8; + bih->m_y = 0; + + bih->order |= 1 << 1; + bih->order |= 1 << 0; + + bih->options |= 1 << 4; + bih->options |= 1 << 3; + bih->options |= 1 << 2; + + bih->dptable = NULL; + + int bie_size = jbig_size - 28; /* - 40 - 8 + 20 */ + char *bie = malloc(bie_size); + + if (bie == NULL) { + free(dib); + free(bih); + return 1; + } + + memcpy(bie, bih, 20); + memcpy(bie + 20, jbig + 48, jbig_size - 48); + + int ret = strdec_jbig(bitmap, bitmap_size, bie, bie_size); + + if (ret == 0) { + *bitmap_width = bih->x_d; + *bitmap_height = bih->y_d; + } + + free(dib); + free(bih); + free(bie); + + if (ret != 0) + return 1; + + return 0; +} diff --git a/src/cnki_jbig.h b/src/cnki_jbig.h new file mode 100644 index 0000000..96e4ea8 --- /dev/null +++ b/src/cnki_jbig.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020-2021, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +/* + * order (MSB first): + * 0 + * 0 + * 0 + * 0 + * HITOLO + * SEQ + * ILEAVE (default) + * SMID (default) + * + * options (MSB first): + * 0 + * LRLTWO + * VLENGTH + * TPDON (default) + * TPBON (default) + * DPON (default) + * DPPRIV + * DPLAST + */ +typedef struct _bih_t { + char d_l; /* Initial resolution layer */ + char d; /* Final resolution layer */ + char p; /* Number of bit-planes, for bi-level image, always 1 */ + char fill; /* Always 0 */ + /* MSB first */ + int32_t x_d; /* Horizontal dimension at highestresolution */ + int32_t y_d; /* Vertical dimension at highest resolution */ + int32_t l_0; /* Number of lines per stripe at lowest resolution */ + char m_x; /* Maximum horizontal offsets (default: 8) */ + char m_y; /* Maximum vertical offsets (default: 0) */ + char order; + char options; + char *dptable; /* 0 or 1728 */ +} bih_t; + +typedef enum _dib_compression_code { + BI_RGB, + BI_RLE8, + BI_RLE4, + BI_BITFIELDS, + 
BI_JPEG, + BI_PNG, + BI_ALPHABITFIELDS, + BI_CMYK = 11, + BI_CMYKRLE8 = 12, + BI_CMYKRLE4 = 13 +} dib_compression_code; + +typedef struct _dib_t { + uint32_t dib_size; /* Always 40 */ + int32_t width; + int32_t height; + uint16_t plane; /* Always 1 */ + uint16_t depth; + uint32_t compression; /* dib_compression_code */ + uint32_t size; + uint32_t resolution_h; + uint32_t resolution_v; + uint32_t colour; + uint32_t colour_used; +} dib_t; + +typedef struct _colour_table { + uint16_t blue; + uint16_t green; + uint16_t red; + uint16_t fill; /* Always 0 */ +} colour_table; diff --git a/src/cnki_kdh.c b/src/cnki_kdh.c index 78e8957..b13434d 100644 --- a/src/cnki_kdh.c +++ b/src/cnki_kdh.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/cnki_outline_tree.c b/src/cnki_outline_tree.c index 58294c3..cf59d10 100644 --- a/src/cnki_outline_tree.c +++ b/src/cnki_outline_tree.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 8c9fbb8..bb76a04 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,9 @@ #include #include "cnki.h" +#include "iconv.h" +#include "zlib.h" +#include "jpeg.h" #include "pdf.h" #include "pdf_cnki.h" @@ -57,6 +60,11 @@ cnki_pdf(cnki_t **param) printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); + int dictionary_size; + char *dictionary; + + char buf[64]; + if ((*param)->stat > 1) printf("Searching for parent object(s)\n"); @@ -69,12 +77,8 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); - char buf[64]; - int parent_missing[parent[0]]; int *kid; - int dictionary_size; - char *dictionary; for (int i = 1; i <= parent[0]; i++) { if ((*param)->stat > 1) @@ 
-101,20 +105,23 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "<<\n/Type /Pages\n/Kids ["); strcat(dictionary, buf); + for (int j = 1; j <= kid[0]; j++) { snprintf(buf, 64, "%d 0 R", kid[j]); strcat(dictionary, buf); + if (j < kid[0]) strcat(dictionary, " "); } + snprintf(buf, 64, "]\n/Count %d\n>>", pdf_get_kid_count(&pdf, parent[i])); strcat(dictionary, buf); - pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL); + pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); parent_missing[i - 1] = 1; @@ -185,6 +192,7 @@ cnki_pdf(cnki_t **param) if (parent_missing[i]) { snprintf(buf, 64, "%d 0 R", parent[i + 1]); strcat(dictionary, buf); + if (i < root_kid) strcat(dictionary, " "); } @@ -200,7 +208,7 @@ cnki_pdf(cnki_t **param) strcat(dictionary, ">>"); - pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL); + pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL, 0); memset(dictionary, 0, dictionary_size); @@ -260,7 +268,7 @@ cnki_pdf(cnki_t **param) strcat(dictionary, ">>"); - pdf_obj_append(&pdf, 0, NULL, dictionary, NULL); + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); if ((*param)->stat > 0) printf("Generated catalog object\n"); @@ -383,3 +391,611 @@ cnki_pdf(cnki_t **param) return 0; } + +int +cnki_pdf_hn(cnki_t **param) +{ + if (*param == NULL) + return 1; + + pdf_object_t *pdf = NULL; + + if (pdf_obj_create(&pdf) != 0) + return 1; + + if ((*param)->stat > 1) + printf("Generating PDF object(s)\n"); + + int dictionary_size; + char *dictionary; + + char buf[64]; + + int *ids = NULL; + + int cnt = 0; + int *root_kid = malloc((*param)->file_stat->page * sizeof(int)); + + if (root_kid == NULL) + return 1; + + memset(root_kid, 0, (*param)->file_stat->page); + + object_hn_t *ptr = (*param)->object_hn; + while (ptr != NULL) { + /* + * External object (ptr->image_length) + + * content object + + * resource object + + * page object + */ + pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); + + int stream_size; + char *stream; + + int *dim = malloc(2 * 
ptr->image_length * sizeof(int)); + + int ret; + int wh[2]; + + if (dim == NULL) { + free(root_kid); + return 1; + } + + for (int i = 0; i < ptr->image_length; i++) { + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n/Type /XObject\n" + "/Subtype /Image\n"); + + if ((*param)->stat > 2) + printf("\tDecoding data, page %04d item %02d... ", + ptr->page, i); + + switch (ptr->image_data[i].format) { + case JBIG: + ret = cnki_jbig(&stream, + &stream_size, + &wh[0], + &wh[1], + ptr->image_data[i].image, + ptr->image_data[i].size); + + if (ret != 0) { + dim[i * 2] = 0; + dim[i * 2 + 1] = 0; + break; + } + + snprintf(buf, 64, "/Width %d\n/Height %d\n", + wh[0], wh[1]); + strcat(dictionary, buf); + + strcat(dictionary, "/ColorSpace /DeviceGray\n" + "/BitsPerComponent 1\n"); + + snprintf(buf, 64, "/Length %d\n", + stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /CCITTFaxDecode\n"); + + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; + break; + case DCT_0: + case DCT_1: + ret = strinfo_jpeg_dim(&wh[0], + &wh[1], + ptr->image_data[i].image, + ptr->image_data[i].size); + + if (ret != 0) { + dim[i * 2] = 0; + dim[i * 2 + 1] = 0; + break; + } + + stream_size = ptr->image_data[i].size; + stream = malloc(stream_size); + if (stream == NULL) { + free(dictionary); + free(root_kid); + free(dim); + return 1; + } + memcpy(stream, ptr->image_data[i].image, stream_size); + + snprintf(buf, 64, "/Width %d\n/Height %d\n", + wh[0], wh[1]); + strcat(dictionary, buf); + + strcat(dictionary, "/ColorSpace /DeviceRGB\n" + "/BitsPerComponent 8\n"); + + snprintf(buf, 64, "/Length %d\n", + stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /DCTDecode\n"); + + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; + break; + case JBIG2: + case JPX: + default: + ret = -1; + dim[i * 2] = -1; + dim[i * 2 + 1] 
= -1; + break; + } + + strcat(dictionary, ">>"); + + if (ret == 0) { + if ((*param)->stat > 2) + printf("Succeed\n"); + + pdf_obj_append(&pdf, ids[i], + NULL, dictionary, stream, stream_size); + + free(dictionary); + free(stream); + } else if (ret == 1) { + if ((*param)->stat > 2) + printf("; Failed\n"); + + free(dictionary); + + pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); + } else { + free(dictionary); + } + } + + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n/XObject <<"); + + for (int i = 0; i < ptr->image_length; i++) { + snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); + strcat(dictionary, buf); + + if (i + 1 < ptr->image_length) + strcat(dictionary, " "); + } + + strcat(dictionary, ">>\n>>"); + + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + + free(dictionary); + + int conv_size; + char *conv_dst; + char conv_src[2]; + char conv_hex[3]; + + if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { + cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); + + dictionary_size = stream_size / 8 + 7; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, ""); + } else { + dictionary_size = ptr->text_size; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "text_size; i += 4) { + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; + + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + for (int j = 0; j < conv_size - 2; j++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[j]); + strcat(dictionary, conv_hex); + } + free(conv_dst); + } + } 
+ + strcat(dictionary, ">"); + } + + /* FIXME: Use the text somehow? */ + free(dictionary); + + dictionary_size = 64 + 12 * ptr->image_length; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "q\n"); + + strcat(dictionary, "0.120000 0 0 0.120000 0 0 cm\n"); + + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; + + /* Apply transformation matrix */ + if (ptr->image_data[i].format == DCT_1) + strcat(dictionary, "-1 0 0 -1 0 0 cm\n"); + + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", + dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Im%d Do\n", i); + strcat(dictionary, buf); + } + + strcat(dictionary, "Q"); + + if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { + free(dictionary); + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n"); + + snprintf(buf, 64, "/Length %d\n", stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /FlateDecode\n"); + + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + NULL, dictionary, stream, stream_size); + + free(stream); + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n/Type /Page\n"); + + snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); + strcat(dictionary, buf); + + /* A4 paper */ + strcat(dictionary, "/MediaBox [ 0 0 595.276 841.89 ]\n"); + + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); + + free(dictionary); + + root_kid[cnt++] = ids[ptr->image_length + 2]; + + free(ids); + ids = NULL; + + free(dim); + + ptr = ptr->next; + } + + if ((*param)->stat > 1) { + 
printf("\t%8s\t%12s\t%12s\t%12s\n", + "id", + "object", + "dictionary", + "stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%8d\t%12d\t%12d\t%12d\n", + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("Generated %d object(s)\n", + pdf_get_count(&pdf)); + + ids = NULL; + + if ((*param)->file_stat->outline > 0) { + if ((*param)->stat > 1) + printf("Generating outline object(s)\n\t%8s\n", "id"); + + pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1); + int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids); + + if ((*param)->stat > 1) + for (int i = 0; i < (*param)->file_stat->outline + 1; i++) + printf("\t%8d\n", ids[i]); + + if ((*param)->stat > 0) { + if (outline != 0) + printf("No outline information\n"); + else + printf("Generated %d outline object(s)\n", + (*param)->file_stat->outline + 1); + } + } + + if ((*param)->stat > 1) + printf("Generating root object\n"); + + dictionary_size = 64 + 12 * (*param)->file_stat->page; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + int root = pdf_get_free_id(&pdf); + + snprintf(buf, 64, "<<\n/Type /Pages\n/Kids "); + strcat(dictionary, buf); + + if ((*param)->file_stat->page > 1) + strcat(dictionary, "["); + + for (int i = 0; i < (*param)->file_stat->page; i++) { + snprintf(buf, 64, "%d 0 R", root_kid[i]); + strcat(dictionary, buf); + if (i + 1 < (*param)->file_stat->page) + strcat(dictionary, " "); + } + + if ((*param)->file_stat->page > 1) + strcat(dictionary, "]"); + + strcat(dictionary, "\n"); + + snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); + strcat(dictionary, buf); + + strcat(dictionary, ">>"); + + pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL, 0); + + free(dictionary); + + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if 
(dictionary == NULL) { + free(root_kid); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + pdf_object_t *tmp = NULL; + + /* Add /Parent to page object */ + for (int i = 0; i < (*param)->file_stat->page; i++) { + if (pdf_get_obj(&pdf, root_kid[i], &tmp) != 0) { + free(dictionary); + free(root_kid); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, tmp->dictionary); + + snprintf(buf, 64, "/Parent %d 0 R\n>>", root); + strcat(dictionary, buf); + + if (pdf_obj_replace(&pdf, root_kid[i], NULL, dictionary, NULL, 0) != 0) { + free(dictionary); + free(root_kid); + return 1; + } + } + + free(root_kid); + + memset(dictionary, 0, dictionary_size); + + if ((*param)->stat > 0) + printf("Generated root object %d.\n", + root); + + if ((*param)->stat > 1) + printf("Generating catalog object\n"); + + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (ids != NULL) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + ids[0]); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); + + free(dictionary); + + if ((*param)->stat > 0) + printf("Generated catalog object\n"); + + if ((*param)->stat > 1) + printf("Sorting object(s)\n"); + + pdf_obj_sort(&pdf); + + if ((*param)->stat > 0) + printf("Sorted object(s)\n"); + + if ((*param)->stat > 1) + printf("Writing header\n"); + + long cur = 0; + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) { + fprintf(stderr, "Header not written\n"); + return 1; + } else { + if ((*param)->stat > 0) + printf("Header %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 1) + printf("Writing object(s)\n"); + + pdf_dump_obj(&pdf, &(*param)->fp_o); + + if ((*param)->stat > 1) { + printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n", + "address", + "size", + "id", + "object", + "dictionary", + 
"stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n", + ptr->address, + ptr->size, + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("%d object(s) %ld byte(s) written\n", + pdf_get_count(&pdf), + ftell((*param)->fp_o)); + + long xref = ftell((*param)->fp_o); + + if ((*param)->stat > 1) + printf("Writing cross-reference table\n"); + + if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) { + if ((*param)->stat > 0) + printf("Cross-reference table not written\n"); + } else { + if ((*param)->stat > 0) + printf("Cross-reference table %ld byte(s) written\n", + ftell((*param)->fp_o) - xref); + } + + if ((*param)->stat > 1) + printf("Writing trailer\n"); + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) { + if ((*param)->stat > 0) + printf("Trailer not written\n"); + } else { + if ((*param)->stat > 0) + printf("Trailer %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 0) + printf("Total %ld byte(s) written\n", + ftell((*param)->fp_o)); + + pdf_obj_destroy(&pdf); + + return 0; +} diff --git a/src/cnki_xml.c b/src/cnki_xml.c index f8e693c..7f870d1 100644 --- a/src/cnki_xml.c +++ b/src/cnki_xml.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/cnki_zlib.c b/src/cnki_zlib.c index fd4cedf..7731036 100644 --- a/src/cnki_zlib.c +++ b/src/cnki_zlib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/extern.h b/src/extern.h index 4109396..3d71df3 100644 --- a/src/extern.h +++ b/src/extern.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/iconv.c b/src/iconv.c 
index f5a3dbe..ae7764b 100644 --- a/src/iconv.c +++ b/src/iconv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/iconv.h b/src/iconv.h index da7fefa..5a2bb6b 100644 --- a/src/iconv.h +++ b/src/iconv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/jbig.c b/src/jbig.c new file mode 100644 index 0000000..1e9afb3 --- /dev/null +++ b/src/jbig.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2021, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include /* FIXME: test */ +#include +#include + +#include + +int +strdec_jbig(char **bitmap, int *bitmap_size, + const char * restrict data, int data_size) +{ + struct jbg_dec_state sd; + + jbg_dec_init(&sd); + + unsigned char *data_ptr[1] = {(unsigned char *) data}; + + /* FIXME: test */ + int ret; + if ((ret = jbg_dec_in(&sd, (unsigned char *) data_ptr, + data_size, NULL)) != JBG_EOK) { + printf("%s", jbg_strerror(ret)); + jbg_dec_free(&sd); + return 1; + } + + *bitmap_size = jbg_dec_getsize(&sd); + *bitmap = malloc(*bitmap_size); + + if (*bitmap != NULL) + memcpy(*bitmap, jbg_dec_getimage(&sd, 0), *bitmap_size); + + jbg_dec_free(&sd); + + return 0; +} diff --git a/src/jbig.h b/src/jbig.h new file mode 100644 index 0000000..170eda1 --- /dev/null +++ b/src/jbig.h @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2020-2021, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +int strdec_jbig(char **bitmap, int *bitmap_size, + const char * restrict data, int data_size); diff --git a/src/jpeg.c b/src/jpeg.c new file mode 100644 index 0000000..4ea4d7f --- /dev/null +++ b/src/jpeg.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020-2021, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +int +strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, + const char * restrict data, int data_size) +{ + struct jpeg_decompress_struct 
cinfo; + struct jpeg_error_mgr jerr; + + cinfo.err = jpeg_std_error(&jerr); + + jpeg_create_decompress(&cinfo); + + jpeg_mem_src(&cinfo, (unsigned char *) data, data_size); + + jpeg_read_header(&cinfo, TRUE); + + jpeg_calc_output_dimensions(&cinfo); + + *jpeg_width = cinfo.output_width; + *jpeg_height = cinfo.output_height; + + jpeg_destroy((struct jpeg_common_struct *) &cinfo); + + jpeg_destroy_decompress(&cinfo); + + return 0; +} diff --git a/src/jpeg.h b/src/jpeg.h new file mode 100644 index 0000000..db35d94 --- /dev/null +++ b/src/jpeg.h @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2020-2021, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, + const char * restrict data, int data_size); diff --git a/src/melon.c b/src/melon.c index 375cf09..9a90551 100644 --- a/src/melon.c +++ b/src/melon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -82,7 +82,7 @@ main(int argc, char **argv, char **envp) if (param->stat > 0) printf("Melon " VERSION "." RELEASE "." 
PATCH EXTRA "\n" - "Copyright (c) 2020, yzrh \n\n"); + "Copyright (c) 2020-2021, yzrh \n\n"); cnki_info(¶m); diff --git a/src/pdf.c b/src/pdf.c index 202b6d2..6700e5b 100644 --- a/src/pdf.c +++ b/src/pdf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -67,7 +67,8 @@ int pdf_obj_add(pdf_object_t **pdf, int id, const char * restrict object, const char * restrict dictionary, - const char * restrict stream) + const char * restrict stream, + int stream_size) { if (*pdf != NULL || id <= 0 || (object != NULL && dictionary != NULL)) @@ -112,14 +113,15 @@ pdf_obj_add(pdf_object_t **pdf, int id, (*pdf)->dictionary = NULL; } - if (stream != NULL) { - (*pdf)->stream_size = sizeof(stream); + if (stream != NULL && stream_size > 0) { + (*pdf)->stream_size = stream_size + 1; (*pdf)->stream = malloc((*pdf)->stream_size); if ((*pdf)->stream == NULL) return 1; memcpy((*pdf)->stream, stream, (*pdf)->stream_size); + (*pdf)->stream[(*pdf)->stream_size - 1] = '\n'; } else { (*pdf)->stream_size = 0; (*pdf)->stream = NULL; @@ -153,7 +155,8 @@ int pdf_obj_prepend(pdf_object_t **pdf, int id, const char * restrict object, const char * restrict dictionary, - const char * restrict stream) + const char * restrict stream, + int stream_size) { if (*pdf == NULL) return 1; @@ -163,7 +166,8 @@ pdf_obj_prepend(pdf_object_t **pdf, int id, pdf_object_t *ptr = NULL; - if (pdf_obj_add(&ptr, id, object, dictionary, stream) != 0) { + if (pdf_obj_add(&ptr, id, object, dictionary, + stream, stream_size) != 0) { free(ptr); return 1; } @@ -178,7 +182,8 @@ int pdf_obj_append(pdf_object_t **pdf, int id, const char * restrict object, const char * restrict dictionary, - const char * restrict stream) + const char * restrict stream, + int stream_size) { if (*pdf == NULL) return 1; @@ -190,12 +195,67 @@ pdf_obj_append(pdf_object_t **pdf, int id, while (ptr->next != NULL) ptr = ptr->next; - if (pdf_obj_add(&ptr->next, id, object, 
dictionary, stream) != 0) + if (pdf_obj_add(&ptr->next, id, object, dictionary, + stream, stream_size) != 0) return 1; return 0; } +int +pdf_obj_replace(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream, + int stream_size) +{ + pdf_object_t *ptr; + char *ret; + + if (pdf_get_obj(pdf, id, &ptr) != 0) + return 1; + + if (object != NULL && dictionary != NULL) + return 1; + + if (dictionary != NULL) { + ret = realloc(ptr->dictionary, strlen(dictionary)); + + if (ret == NULL) + return 1; + + ptr->dictionary_size = strlen(dictionary); + ptr->dictionary = ret; + + memcpy(ptr->dictionary, dictionary, ptr->dictionary_size); + } else if (object != NULL) { + ret = realloc(ptr->object, strlen(object)); + + if (ret == NULL) + return 1; + + ptr->object_size = strlen(object); + ptr->object = ret; + + memcpy(ptr->object, object, ptr->object_size); + } + + if (stream != NULL && stream_size > 0) { + ret = realloc(ptr->stream, stream_size + 1); + + if (ret == NULL) + return 1; + + ptr->stream_size = stream_size + 1; + ptr->stream = ret; + + memcpy(ptr->stream, stream, ptr->stream_size); + ptr->stream[ptr->stream_size - 1] = '\n'; + } + + return 0; +} + int pdf_obj_sort(pdf_object_t **pdf) { diff --git a/src/pdf.h b/src/pdf.h index a5630d8..f02035a 100644 --- a/src/pdf.h +++ b/src/pdf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -26,16 +26,24 @@ void pdf_obj_destroy(pdf_object_t **pdf); int pdf_obj_add(pdf_object_t **pdf, int id, const char * restrict object, const char * restrict dictionary, - const char * restrict stream); + const char * restrict stream, + int stream_size); int pdf_obj_del(pdf_object_t **pdf, int id); int pdf_obj_prepend(pdf_object_t **pdf, int id, const char * restrict object, const char * restrict dictionary, - const char * restrict stream); + const char * restrict stream, + int stream_size); int 
pdf_obj_append(pdf_object_t **pdf, int id, const char * restrict object, const char * restrict dictionary, - const char * restrict stream); + const char * restrict stream, + int stream_size); +int pdf_obj_replace(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream, + int stream_size); int pdf_obj_sort(pdf_object_t **pdf); /* pdf_parser.c */ diff --git a/src/pdf_cnki.c b/src/pdf_cnki.c index 6e5f810..84274b8 100644 --- a/src/pdf_cnki.c +++ b/src/pdf_cnki.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -93,7 +93,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int * atoi(ptr->item->page) - 1); strcat(dictionary, buf); - pdf_obj_append(pdf, ptr->id, NULL, dictionary, NULL); + pdf_obj_append(pdf, ptr->id, NULL, dictionary, NULL, 0); if (ptr->left == NULL) (*stat)[1] = ptr->id; @@ -128,7 +128,7 @@ pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids) free(ret); - pdf_obj_append(pdf, (*ids)[0], NULL, buf, NULL); + pdf_obj_append(pdf, (*ids)[0], NULL, buf, NULL, 0); return 0; } diff --git a/src/pdf_cnki.h b/src/pdf_cnki.h index 6885d4f..9ec2e1c 100644 --- a/src/pdf_cnki.h +++ b/src/pdf_cnki.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/pdf_get.c b/src/pdf_get.c index c5ab788..f72f4aa 100644 --- a/src/pdf_get.c +++ b/src/pdf_get.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/pdf_parser.c b/src/pdf_parser.c index 9531d28..3b29c52 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/pdf_writer.c b/src/pdf_writer.c index cda998a..cd188fc 
100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/version.h b/src/version.h index ea04c55..4731e6a 100644 --- a/src/version.h +++ b/src/version.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ diff --git a/src/zlib.c b/src/zlib.c index 49004b7..76f049e 100644 --- a/src/zlib.c +++ b/src/zlib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -20,12 +20,34 @@ strinflate(char **dst, int dst_size, unsigned long size = dst_size; - uncompress((Bytef *) *dst, &size, (const Bytef *) src, src_size); - - if (size != dst_size) { + if (uncompress((Bytef *) *dst, /* any result other than Z_OK is treated as failure; the inflated length in size is no longer compared with dst_size */ + &size, (const Bytef *) src, src_size) != Z_OK) { free(*dst); return 1; } return 0; } + +int /* strdeflate: compress src_size bytes of src into a buffer malloc'd here and stored in *dst, with its length in *dst_size; returns 0 on success, 1 if malloc or compress() fails */ +strdeflate(char **dst, int *dst_size, + const char * restrict src, int src_size) +{ + *dst_size = compressBound(src_size); /* zlib's upper bound on the compressed size, used as the allocation size */ + *dst = malloc(*dst_size); + + if (*dst == NULL) + return 1; + + unsigned long size = *dst_size; + + if (compress((Bytef *) *dst, &size, + (const Bytef *) src, src_size) != Z_OK) { + free(*dst); /* NOTE(review): *dst still points at the freed buffer and *dst_size still holds the bound after this path; confirm callers neither use nor free them on failure */ + return 1; + } + + *dst_size = size; /* compress() stored the actual compressed length in size */ + + return 0; +} diff --git a/src/zlib.h b/src/zlib.h index 1563c6c..6c9f36a 100644 --- a/src/zlib.h +++ b/src/zlib.h @@ -1,8 +1,11 @@ /* - * Copyright (c) 2020, yzrh + * Copyright (c) 2020-2021, yzrh * * SPDX-License-Identifier: Apache-2.0 */ int strinflate(char **dst, int dst_size, const char * restrict src, int src_size); + +int strdeflate(char **dst, int *dst_size, + const char * restrict src, int src_size);