Compare commits

...

60 commits

Author SHA1 Message Date
2fa2b760ae Fix HN text parsing.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-15 15:34:46 +00:00
dd5854678c Fix JBIG2 allocation.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-06 12:02:43 +00:00
123d62141c Add document information dictionary to output.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 19:15:01 +00:00
283446dba5 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 17:32:13 +00:00
13cb0a1b8d Fix invalid token parsing.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 11:21:54 +00:00
a7ecc15614 Replace catalog object only if root object does not exist.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:50:25 +00:00
56ffe14d5a Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:29:07 +00:00
c2afbb3cbc Handle invalid PDF object.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:19:06 +00:00
8cd8a8fbba Replace catalog object if found.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:07:57 +00:00
8276423eb8 Prioritise incomplete object during deduplication.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 13:51:13 +00:00
7ac0971a17 Handle invalid result from PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-03 15:39:53 +00:00
e0fe937e1a Fix KDH decryption.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-03 12:12:42 +00:00
4a02b8bfc7 Fix inconsistent whitespace detection in PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-03 00:13:56 +00:00
7d9d658461 Handle duplicated image in HN.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-02 15:38:45 +00:00
000405693e Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 21:26:44 +00:00
d6fa934b5f Handle incomplete PDF object in parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 20:51:09 +00:00
1a1fee1034 Handle duplicated object in CAJ.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 19:31:33 +00:00
cde014cffb Improve PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 18:58:43 +00:00
9019a18449 Split md5 function.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 11:11:56 +00:00
a18de8f2ef Rename JBIG decoder.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 10:09:08 +00:00
70e1e7ea97 Fix JBIG decoder data type.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 00:42:20 +00:00
bffb8ce8a4 Fix JBIG decoder.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 21:17:28 +00:00
3ac51d66b9 Fix JBIG table length.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 18:52:06 +00:00
0bbf8e65dd Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 11:28:03 +00:00
220a81c2ad Fix HN image compositing.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 10:48:29 +00:00
1d899d934d Fix PDF object check.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 20:16:53 +00:00
226f16ddf4 Handle HN page with figure only.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 15:04:32 +00:00
9646ee61c3 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 02:04:43 +00:00
5466a441df Fix type casting when processing data.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 02:00:12 +00:00
1ce3f89574 Handle combination of text and image in page content.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 21:10:03 +00:00
5a1afb0056 Link against libc for iconv, find openjpeg header with pkgconf.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 17:30:36 +00:00
060bc00a0d Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 06:30:59 +00:00
97931e1470 Fix PDF object check.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 05:23:04 +00:00
cd0af5ba3c Fix buffer overflow when object size is less than 8 bytes.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 04:05:34 +00:00
988a751c15 Handle missing root object which is parent of others.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 03:00:11 +00:00
8083b30530 Add JPEG 2000 support.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 00:40:14 +00:00
abce2fd2e4 Add preliminary support for HN figure placement.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-28 19:29:46 +00:00
224a09a015 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-26 00:13:18 +00:00
c2ad6549fb Handle headless HN and page with no image.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-25 23:18:17 +00:00
d2826fa075 Simplify JBIG decoder.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-25 05:15:56 +00:00
288b65a1fd Handle different JPEG colour component.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-25 01:26:05 +00:00
9c1f1d0b75 Fix HN conversion and add JBIG2 support.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-24 23:29:56 +00:00
ac3b1dda63 Fix memory leak and data type.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-22 19:48:48 +00:00
63728e1340 Add error message for JBIG.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-20 00:23:46 +00:00
3550095959 Update Makefile
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-19 23:54:06 +00:00
86b6487fff Remove -march=native in Makefile and remove char **envp for POSIX compliance. 2021-09-02 17:04:05 +00:00
409acceffa Fix memory leak. 2021-01-29 21:50:20 +00:00
7270c1771f Update CHANGE. 2021-01-13 17:30:11 +00:00
7a5dd05425 Add wrapper for PDF operation. 2021-01-13 17:07:45 +00:00
057a7acc51 Fix transformation matrix. 2021-01-12 03:13:07 +00:00
f685e91d35 Fix HN dictionary generation. 2021-01-12 02:26:39 +00:00
2aab394684 Fix root object dictionary generation. 2021-01-11 22:57:59 +00:00
1994f122cc Decode JBIG and JPEG during HN conversion. 2021-01-03 03:01:28 +00:00
b20c6ad3ed Handle binary data in dictionary. 2020-12-31 20:38:02 +00:00
3bd7ea7520 Improve portability. 2020-12-31 18:45:02 +00:00
1f62c53da6 Produce PDF directly from KDH. 2020-12-30 21:12:52 +00:00
98691d4203 Add HN text extraction. 2020-12-30 03:09:00 +00:00
8d6fbb43c9 Update README. 2020-12-29 02:33:03 +00:00
5c5ddc926b Update HN data structure. 2020-12-29 02:10:17 +00:00
bcb8ef9cd9 Change e-mail address. 2020-09-08 00:58:40 +00:00
41 changed files with 2561 additions and 477 deletions

View file

@ -1,3 +1,54 @@
0.3.0 (2023-XX-XX)
==================
* Support HN text overlay.
* Support HN page with text.
* Handle inaccurate page count in CAJ and KDH.
0.2.5 (2023-01-05)
==================
* Improve PDF parser.
* Handle duplicated object in CAJ.
* Handle duplicated image in HN.
* Handle incomplete PDF object in CAJ and KDH.
* Handle invalid PDF object token in CAJ and KDH.
* Fix JBIG decoder.
0.2.4 (2022-12-31)
==================
* Fix HN image compositing.
* Fix PDF object check.
0.2.3 (2022-12-30)
==================
* Support HN figure placement.
0.2.2 (2022-12-29)
==================
* Support JPEG 2000 for HN.
* Handle missing but referenced root object.
* Handle HN with more than one image per page.
* Fix buffer overflow.
0.2.1 (2022-12-26)
==================
* Handle different JPEG colour component.
* Handle headless HN and page with no image.
0.2.0 (2022-12-22)
==================
* KDH conversion now produces a valid PDF.
* Handle binary data in dictionary.
* Add preliminary support for HN.
* Fix root object dictionary generation when root object has more than two children.
* Fix memory leak and data type.
0.1.0 (2020-04-08)
==================

View file

@ -6,12 +6,18 @@ Melon: Converter that produces PDF from CNKI proprietary formats
Development
-----------
Currently, PDF, CAJ, and KDH can be converted. Please report
Currently, CAJ, KDH, and HN can be converted. Please report
any failures with a sample that can reproduce the behaviour.
KDH is essentially an invalid PDF file xor'ed with a predetermined key.
You may want to convert the decrypted KDH to valid PDF, although some
PDF readers can display the invalid PDF.
Dependency
----------
1. libcrypto (OpenSSL)
2. zlib
3. jbig2dec
4. libjpeg-turbo
5. openjpeg
6. pkgconf
Usage
=====
@ -29,12 +35,12 @@ Options
Specify output file
-b, --buffer
Set buffer size (default 512k)
Set input buffer size (default 512k)
-v, --verbose
Print more information (twice for even more)
Print more information (twice for even more, three times for HN image processing information as well)
Thanks
======
This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf)
This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf)

View file

@ -1,28 +0,0 @@
#
# Copyright (c) 2020, yzrh <yzrh@tuta.io>
#
# SPDX-License-Identifier: Apache-2.0
#
# Source list is taken from the shell: every *.c file in this directory.
src != ls *.c
# One object file per source file (.c suffix replaced by .o).
obj = ${src:.c=.o}
# Installation root; the binary goes to ${PREFIX}/bin.
PREFIX = /usr/local
# -march=native ties the produced binary to the CPU of the build machine.
CFLAGS = -O3 -march=native -pipe -Wall
# Only libcrypto is linked.
LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed
# Default target: link all objects into the `melon' executable.
all: ${obj}
${CC} ${LDFLAGS} -o melon $^
# Remove the executable and all object files.
clean:
rm -f melon ${obj}
# Create ${PREFIX}/bin if needed and copy the executable there.
install:
install -d ${PREFIX}/bin
install melon ${PREFIX}/bin/
# Remove the installed executable.
deinstall:
rm -f ${PREFIX}/bin/melon
# None of the targets above names a real file.
.PHONY: all clean install deinstall

View file

@ -1,19 +1,33 @@
#
# Copyright (c) 2020, yzrh <yzrh@tuta.io>
# Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
#
# SPDX-License-Identifier: Apache-2.0
#
src != ls *.c
src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c md5.c \
cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c cnki_pdf.c \
cnki_zlib.c cnki_jbig.c cnki_jbig_dec.c cnki_jbig2.c cnki.c \
pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c
inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h md5.h \
cnki.h pdf_cnki.h cnki_jbig.h cnki_jbig_dec.h pdf.h
obj = ${src:.c=.o}
PREFIX = /usr/local
CFLAGS = -O3 -march=native -pipe -flto=thin -Wall
LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed
CFLAGS = -O2 -pipe -flto -Wall -Wextra
LDFLAGS = -Wl,-O2 -lcrypto -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed
all: ${obj}
${CC} ${LDFLAGS} -o melon $>
CFLAGS += -I/usr/local/include
LDFLAGS += -L/usr/local/lib
OPENJPEG_CFLAGS != pkgconf --cflags libopenjp2
CFLAGS += ${OPENJPEG_CFLAGS}
CFLAGS += -DLIBICONV_PLUG
all: ${obj} ${inc}
${CC} ${LDFLAGS} -o melon ${obj}
clean:
rm -f melon ${obj}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -33,7 +33,7 @@ cnki_create(cnki_t **param)
memset((*param)->file_stat, 0, sizeof(file_stat_t));
(*param)->object_outline = NULL;
(*param)->object_nh = NULL;
(*param)->object_hn = NULL;
return 0;
}
@ -44,10 +44,24 @@ cnki_destroy(cnki_t **param)
if (*param != NULL) {
if ((*param)->file_stat != NULL)
free((*param)->file_stat);
if ((*param)->object_outline != NULL)
free((*param)->object_outline);
if ((*param)->object_nh != NULL)
free((*param)->object_nh);
object_outline_t *ptr_outline;
while ((ptr_outline = (*param)->object_outline) != NULL) {
(*param)->object_outline = (*param)->object_outline->next;
free(ptr_outline);
}
object_hn_t *ptr_hn;
while ((ptr_hn = (*param)->object_hn) != NULL) {
(*param)->object_hn = (*param)->object_hn->next;
free(ptr_hn->text);
if (ptr_hn->image_data != NULL)
for (int i = 0; i < ptr_hn->image_length; i++)
free(ptr_hn->image_data[i].image);
free(ptr_hn->image_data);
free(ptr_hn);
}
free(*param);
}
}
@ -59,32 +73,42 @@ cnki_info(cnki_t **param)
return 1;
if ((*param)->stat > 1)
printf("Reading file header at %x\n", ADDRESS_HEAD);
printf("Reading file header at 0x%x\n", ADDRESS_HEAD);
int addr[2];
unsigned char str[2];
fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET);
fread((*param)->file_stat->type, 4, 1, (*param)->fp_i);
if ((*param)->stat > 0)
printf("File type is '%s'\n", (*param)->file_stat->type);
fread(str, 2, 1, (*param)->fp_i);
if (strcmp((*param)->file_stat->type, "%PDF") == 0) {
if ((*param)->stat > 0) {
if ((unsigned char) (*param)->file_stat->type[0] > 0x7f)
printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]);
else
printf("File type is '%s'\n", (*param)->file_stat->type);
}
if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
return 0;
} else if (strcmp((*param)->file_stat->type, "CAJ") == 0) {
} else if (strncmp((*param)->file_stat->type, "CAJ", 3) == 0) {
addr[0] = ADDRESS_CAJ_PAGE;
addr[1] = ADDRESS_CAJ_OUTLINE;
} else if (strcmp((*param)->file_stat->type, "HN") == 0) {
} else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
addr[0] = ADDRESS_HN_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if (strcmp((*param)->file_stat->type, "KDH ") == 0) {
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
addr[0] = ADDRESS_C8_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
return 0;
} else {
return 1;
}
if ((*param)->stat > 1)
printf("Reading page count at %x\n", addr[0]);
printf("Reading page count at 0x%x\n", addr[0]);
fseek((*param)->fp_i, addr[0], SEEK_SET);
fread(&(*param)->file_stat->page, 4, 1, (*param)->fp_i);
@ -93,8 +117,16 @@ cnki_info(cnki_t **param)
printf("Advised %d page(s)\n",
(*param)->file_stat->page);
if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) {
fseek((*param)->fp_i, 0xd8, SEEK_SET);
return 0;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
fseek((*param)->fp_i, 0x50, SEEK_SET);
return 0;
}
if ((*param)->stat > 1)
printf("Reading outline count at %x\n", addr[1]);
printf("Reading outline count at 0x%x\n", addr[1]);
fseek((*param)->fp_i, addr[1], SEEK_SET);
fread(&(*param)->file_stat->outline, 4, 1, (*param)->fp_i);
@ -106,7 +138,7 @@ cnki_info(cnki_t **param)
if ((*param)->file_stat->outline > 0) {
if ((*param)->stat > 1) {
printf("Loading outline(s)\n");
printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n",
printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n",
"title",
"hierarchy",
"page",

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -16,6 +16,8 @@
#define ADDRESS_HN_PAGE 0x0090
#define ADDRESS_HN_OUTLINE 0x0158
#define ADDRESS_C8_PAGE 0x0008
#define ADDRESS_KDH_BODY 0x00fe
#define KEY_KDH "FZHMEI"
@ -44,26 +46,36 @@ typedef struct _object_outline_tree_t {
struct _object_outline_tree_t *right;
} object_outline_tree_t;
typedef enum _nh_code {
CCITTFAX,
typedef enum _hn_code {
JBIG, /* Inverted */
DCT_0,
DCT_1,
DCT_1, /* Inverted */
JBIG2,
JPX
} nh_code;
} hn_code;
typedef struct _object_nh_t {
int32_t address; /* Starting at end of object_outline_t */
typedef struct _hn_image_t {
int32_t format; /* hn_code */
int32_t address;
int32_t size;
int16_t page[2];
int32_t zero[2];
char *text;
int32_t image_format; /* nh_code */
int32_t image_address;
int32_t image_size;
uint16_t x;
uint16_t y;
uint16_t w;
uint16_t h;
char *image;
struct _object_nh_t *next;
} object_nh_t;
} hn_image_t;
typedef struct _object_hn_t {
int32_t address; /* Starting at end of object_outline_t */
int32_t text_size;
int16_t image_length;
int16_t page;
int32_t unknown; /* TODO: what is it? */
int32_t address_next;
char *text;
struct _hn_image_t *image_data;
struct _object_hn_t *next;
} object_hn_t;
typedef struct _cnki_t {
int stat;
@ -72,15 +84,27 @@ typedef struct _cnki_t {
FILE *fp_o;
file_stat_t *file_stat;
object_outline_t *object_outline;
object_nh_t *object_nh;
object_hn_t *object_hn;
} cnki_t;
/* cnki_pdf.c */
int cnki_pdf(cnki_t **param);
int cnki_pdf_hn(cnki_t **param);
/* cnki_outline_tree.c */
int cnki_outline_tree(object_outline_tree_t **outline_tree,
object_outline_t **outline, int *ids);
/* cnki_xml.c */
int cnki_xml(char **xml, FILE **fp);
/* cnki_zlib.c */
int cnki_zlib(char **dst, int *dst_size,
const char * restrict src, int src_size);
/* cnki_jbig.c */
int cnki_jbig(char **bitmap, int *bitmap_size,
int *bitmap_width, int *bitmap_height,
const char * restrict jbig, int jbig_size);
/* cnki_jbig2.c */
int cnki_jbig2(char **bitmap, int *bitmap_size,
int *bitmap_width, int *bitmap_height,
const char * restrict jbig, int jbig_size);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -18,7 +18,7 @@ cnki_caj(cnki_t **param)
printf("Begin 'CAJ' conversion\n");
if ((*param)->stat > 1)
printf("Reading document body address at %x\n", ADDRESS_CAJ_BODY);
printf("Reading document body address at 0x%x\n", ADDRESS_CAJ_BODY);
int addr;
@ -29,7 +29,7 @@ cnki_caj(cnki_t **param)
fseek((*param)->fp_i, addr, SEEK_SET);
if ((*param)->stat > 0)
printf("Advised document body address is %x\n", addr);
printf("Advised document body address is 0x%x\n", addr);
cnki_pdf(param);

155
src/cnki_hn.c Normal file
View file

@ -0,0 +1,155 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include "cnki.h"
/*
 * Load the page records of an 'HN' document and pass them to cnki_pdf_hn().
 *
 * Starting at the current position of (*param)->fp_i, one record (address,
 * text size, image count, page, unknown, next address) is read for each of
 * the file_stat->page advised pages and appended to a singly linked list
 * rooted at (*param)->object_hn.  For every record the text block at
 * `address' is then loaded (when text_size > 0), followed by `image_length'
 * image headers (format, address, size) read from the current file position
 * and finally the image payloads those headers point at.
 *
 * Returns 0 after cnki_pdf_hn() has been called, or 1 when *param is NULL,
 * the advised page count is not positive, or an allocation fails.  On
 * failure the partially built list stays attached to *param; every pointer
 * in it is either NULL or owns an allocation, so it can be released with
 * free().
 */
int
cnki_hn(cnki_t **param)
{
	if (*param == NULL)
		return 1;

	if ((*param)->stat > 0)
		printf("Begin 'HN' conversion\n");

	if ((*param)->file_stat->page > 0)
		(*param)->object_hn = malloc(sizeof(object_hn_t));
	else
		return 1;

	if ((*param)->object_hn == NULL)
		return 1;

	if ((*param)->stat > 1) {
		printf("Loading page(s)\n");
		printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n",
		    "address",
		    "text",
		    "length",
		    "page",
		    "unknown",
		    "next",
		    "code",
		    "address",
		    "image");
	}

	/* Pass 1: read the fixed-size page records and build the list */
	object_hn_t *ptr = (*param)->object_hn;
	for (int i = 0; i < (*param)->file_stat->page; i++) {
		fread(&ptr->address, 4, 1, (*param)->fp_i);
		fread(&ptr->text_size, 4, 1, (*param)->fp_i);
		fread(&ptr->image_length, 2, 1, (*param)->fp_i);
		fread(&ptr->page, 2, 1, (*param)->fp_i);
		fread(&ptr->unknown, 4, 1, (*param)->fp_i);
		fread(&ptr->address_next, 4, 1, (*param)->fp_i);

		ptr->text = NULL;
		ptr->image_data = NULL;
		ptr->next = NULL;

		if (i < (*param)->file_stat->page - 1) {
			ptr->next = malloc(sizeof(object_hn_t));

			if (ptr->next == NULL)
				return 1;
		}

		ptr = ptr->next;
	}

	/* Pass 2: load text and images of every page */
	ptr = (*param)->object_hn;
	while (ptr != NULL) {
		if (ptr->text_size > 0) {
			ptr->text = malloc(ptr->text_size);

			if (ptr->text == NULL)
				return 1;

			fseek((*param)->fp_i, ptr->address, SEEK_SET);
			fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
		}

		if ((*param)->stat > 1)
			printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x",
			    ptr->address,
			    ptr->text_size,
			    ptr->image_length,
			    ptr->page,
			    ptr->unknown,
			    ptr->address_next);

		if (ptr->image_length > 0) {
			ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));

			if (ptr->image_data == NULL)
				return 1;

			for (int i = 0; i < ptr->image_length; i++) {
				fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
				fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
				fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);

				ptr->image_data[i].x = 0;
				ptr->image_data[i].y = 0;
				ptr->image_data[i].w = 0;
				ptr->image_data[i].h = 0;

				/*
				 * Must be NULL before any early return below:
				 * cleanup frees every image pointer of the
				 * page, including those never allocated.
				 */
				ptr->image_data[i].image = NULL;

				/* The next header follows this image's payload */
				fseek((*param)->fp_i,
				    ptr->image_data[i].address + ptr->image_data[i].size,
				    SEEK_SET);
			}

			for (int i = 0; i < ptr->image_length; i++) {
				ptr->image_data[i].image = malloc(ptr->image_data[i].size);

				if (ptr->image_data[i].image == NULL)
					return 1;

				fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET);
				fread(ptr->image_data[i].image,
				    ptr->image_data[i].size, 1,
				    (*param)->fp_i);

				if ((*param)->stat > 1) {
					if (i == 0) {
						/* Completes the page row started above */
						printf("\t%4d\t%08x\t%8d\n",
						    ptr->image_data[i].format,
						    ptr->image_data[i].address,
						    ptr->image_data[i].size);
					} else {
						printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n",
						    "",
						    "",
						    "",
						    "",
						    "",
						    "",
						    ptr->image_data[i].format,
						    ptr->image_data[i].address,
						    ptr->image_data[i].size);
					}
				}
			}
		} else if ((*param)->stat > 1) {
			/* No image: pad the page row and terminate it */
			printf("\t%4s\t%8s\t%8s\n",
			    "",
			    "",
			    "");
		}

		ptr = ptr->next;
	}

	if ((*param)->stat > 0)
		printf("Loaded %d page(s)\n", (*param)->file_stat->page);

	/* NOTE(review): the result of cnki_pdf_hn() is not inspected */
	cnki_pdf_hn(param);

	if ((*param)->stat > 0)
		printf("Conversion ended\n");

	return 0;
}

43
src/cnki_jbig.c Normal file
View file

@ -0,0 +1,43 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include <string.h>
#include "cnki_jbig.h"
#include "cnki_jbig_dec.h"
/*
 * Decode a CNKI JBIG image into a newly allocated bitmap.
 *
 * The payload starts with a 40-byte dib_t header; the coded data handed to
 * strdec_jbig() starts at byte 48.
 *
 * On success returns 0 and stores the malloc()ed bitmap in *bitmap (owned by
 * the caller), its byte size in *bitmap_size and the header's dimensions in
 * *bitmap_width / *bitmap_height.  Returns 1 when the payload is too short
 * to hold the header, the header describes an unusable image, allocation
 * fails or the decoder reports failure; *bitmap is NULL after an allocation
 * or decoder failure.
 */
int
cnki_jbig(char **bitmap, int *bitmap_size,
    int *bitmap_width, int *bitmap_height,
    const char * restrict jbig, int jbig_size)
{
	/* Both the header copy and the `jbig + 48' below need 48 bytes */
	if (jbig == NULL || jbig_size < 48)
		return 1;

	dib_t dib;
	memcpy(&dib, jbig, 40);

	/*
	 * strdec_jbig() clears and fills height * ((width + 7) / 8) bytes, so
	 * a depth below 1 would make the buffer allocated here too small.
	 */
	if (dib.width <= 0 || dib.height <= 0 || dib.depth < 1)
		return 1;

	int width_padded = (dib.width * dib.depth + 7) / 8;

	*bitmap_size = dib.height * width_padded;
	*bitmap = malloc(*bitmap_size);

	if (*bitmap == NULL)
		return 1;

	if (strdec_jbig(bitmap, dib.width, dib.height, jbig + 48, jbig_size - 48) != 0) {
		free(*bitmap);
		*bitmap = NULL;
		return 1;
	}

	*bitmap_width = dib.width;
	*bitmap_height = dib.height;

	return 0;
}

41
src/cnki_jbig.h Normal file
View file

@ -0,0 +1,41 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdint.h>
/*
 * Values for dib_t.compression.
 *
 * NOTE(review): the names look like the compression constants of the
 * Windows bitmap info header -- confirm before relying on the numeric
 * values.  BI_RGB..BI_ALPHABITFIELDS are 0..6; the CMYK values are
 * assigned explicitly.
 */
typedef enum _dib_compression_code {
BI_RGB,
BI_RLE8,
BI_RLE4,
BI_BITFIELDS,
BI_JPEG,
BI_PNG,
BI_ALPHABITFIELDS,
BI_CMYK = 11,
BI_CMYKRLE8 = 12,
BI_CMYKRLE4 = 13
} dib_compression_code;
/*
 * Image header found at the start of a JBIG / JBIG2 image payload.
 *
 * cnki_jbig() and cnki_jbig2() fill this structure by copying the first 40
 * bytes of the payload into it, so field order and widths must not change.
 * Those functions only use width, height and depth.
 */
typedef struct _dib_t {
uint32_t dib_size; /* Always 40 */
int32_t width; /* Used as the bitmap width by the decoders */
int32_t height; /* Used as the bitmap height by the decoders */
uint16_t plane; /* Always 1 */
uint16_t depth; /* Multiplied with width to size the output row */
uint32_t compression; /* dib_compression_code */
uint32_t size;
int32_t resolution_h;
int32_t resolution_v;
uint32_t colour;
uint32_t colour_used;
} dib_t;
/*
 * One colour entry with 16-bit blue, green and red components plus a
 * filler field.
 *
 * NOTE(review): not referenced by the JBIG code visible alongside this
 * header; presumably describes palette data following dib_t -- confirm.
 */
typedef struct _colour_table {
uint16_t blue;
uint16_t green;
uint16_t red;
uint16_t fill; /* Always 0 */
} colour_table;

43
src/cnki_jbig2.c Normal file
View file

@ -0,0 +1,43 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include <string.h>
#include "cnki_jbig.h"
#include "jbig2.h"
/*
 * Decode a CNKI JBIG2 image into a newly allocated bitmap.
 *
 * The payload starts with a 40-byte dib_t header; the coded data handed to
 * strdec_jbig2() starts at byte 48.
 *
 * On success returns 0 and stores the malloc()ed bitmap in *bitmap (owned by
 * the caller), its byte size in *bitmap_size and the header's dimensions in
 * *bitmap_width / *bitmap_height.  Returns 1 when the payload is too short
 * to hold the header or the bitmap cannot be allocated.
 */
int
cnki_jbig2(char **bitmap, int *bitmap_size,
    int *bitmap_width, int *bitmap_height,
    const char * restrict jbig, int jbig_size)
{
	/* Both the header copy and the `jbig + 48' below need 48 bytes */
	if (jbig == NULL || jbig_size < 48)
		return 1;

	dib_t dib;
	memcpy(&dib, jbig, 40);

	int width_padded = (dib.width * dib.depth + 7) / 8;

	*bitmap_size = dib.height * width_padded;
	*bitmap = malloc(*bitmap_size);

	if (*bitmap == NULL)
		return 1;

	/*
	 * NOTE(review): the result of strdec_jbig2() is not inspected, as
	 * before; its return convention should be confirmed before it is
	 * propagated.
	 */
	strdec_jbig2(bitmap, jbig + 48, jbig_size - 48);

	*bitmap_width = dib.width;
	*bitmap_height = dib.height;

	return 0;
}

314
src/cnki_jbig_dec.c Normal file
View file

@ -0,0 +1,314 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/*
 * Per-state interval size: _decode() subtracts _LSZ[state] from the interval
 * register and the exchange functions compare against / reload it.
 * Indexed by the state number stored in _st[]; 0x71 (113) states.
 *
 * NOTE(review): presumably the LSZ column of the JBIG (ITU-T T.82)
 * probability estimation table -- verify against the specification.
 */
static const uint16_t _LSZ[0x71] = {
0x5a1d,
0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036,
0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2,
0x207c, 0x17b9, 0x1182, 0x0cef, 0x09a1, 0x072f, 0x055c, 0x0406,
0x0303, 0x0240, 0x01b1, 0x0144, 0x00f5, 0x00b7, 0x008a, 0x0068,
0x004e, 0x003b, 0x002c, 0x5ae1, 0x484c, 0x3a0d, 0x2ef1, 0x261f,
0x1f33, 0x19a8, 0x1518, 0x1177, 0x0e74, 0x0bfb, 0x09f8, 0x0861,
0x0706, 0x05cd, 0x04de, 0x040f, 0x0363, 0x02d4, 0x025c, 0x01f8,
0x01a4, 0x0160, 0x0125, 0x00f6, 0x00cb, 0x00ab, 0x008f, 0x5b12,
0x4d04, 0x412c, 0x37d8, 0x2fe8, 0x293c, 0x2379, 0x1edf, 0x1aa9,
0x174e, 0x1424, 0x119c, 0x0f6b, 0x0d51, 0x0bb6, 0x0a40, 0x5832,
0x4d1c, 0x438e, 0x3bdd, 0x34ee, 0x2eae, 0x299a, 0x2516, 0x5570,
0x4ca9, 0x44d9, 0x3e22, 0x3824, 0x32b4, 0x2e17, 0x56a8, 0x4f46,
0x47e5, 0x41cf, 0x3c3d, 0x375e, 0x5231, 0x4c0f, 0x4639, 0x415e,
0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb
};
/*
 * Next state after the less probable symbol was decoded: the exchange
 * functions store _NLPS[state] back into _st[cx] on that path.
 * Indexed by the current state number.
 */
static const uint8_t _NLPS[0x71] = {
1,
14, 16, 18, 20, 23, 25, 28, 30,
33, 35, 9, 10, 12, 15, 36, 38,
39, 40, 42, 43, 45, 46, 48, 49,
51, 52, 54, 56, 57, 59, 60, 62,
63, 32, 33, 37, 64, 65, 67, 68,
69, 70, 72, 73, 74, 75, 77, 78,
79, 48, 50, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 61, 61, 65,
80, 81, 82, 83, 84, 86, 87, 87,
72, 72, 74, 74, 75, 77, 77, 80,
88, 89, 90, 91, 92, 93, 86, 88,
95, 96, 97, 99, 99, 93, 95, 101,
102, 103, 104, 99, 105, 106, 107, 103,
105, 108, 109, 110, 111, 110, 112, 112
};
/*
 * Next state after the more probable symbol was decoded: the exchange
 * functions store _NMPS[state] back into _st[cx] on that path.
 * Indexed by the current state number.
 */
static const uint8_t _NMPS[0x71] = {
1,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 13, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 9, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 54, 55, 56, 57,
58, 59, 60, 61, 62, 63, 32, 65,
66, 67, 68, 69, 70, 71, 72, 73,
74, 75, 76, 77, 78, 79, 48, 81,
82, 83, 84, 85, 86, 87, 71, 89,
90, 91, 92, 93, 94, 86, 96, 97,
98, 99, 100, 93, 102, 103, 104, 99,
106, 107, 103, 109, 107, 111, 109, 111
};
/*
 * Per-state flag: when set and the less probable symbol was decoded, the
 * exchange functions replace _mps[cx] with the symbol just decoded.
 * Indexed by the current state number.
 */
static const bool _SWTCH[0x71] = {
1,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1
};
/*
 * Decoder state.  All of it is file-scope and reset by strdec_jbig() /
 * _initdec(), so only one image can be decoded at a time.
 */
/* Shifts left before _renormd() fetches the next input byte; _bytein() sets it to 8 */
static uint8_t _ct;
/* Pixel value produced by the most recent _decode() call */
static uint8_t _pix;
/* 16-bit interval register; renormalised until it is at least 0x8000 */
static uint16_t _reg_a;
/* Code register; _bytein() adds input bytes at bits 8..15, _decode() compares bits 16..31 */
static uint32_t _reg_c;
/* Per-context more probable pixel value, indexed by context */
static uint8_t _mps[0x1000];
/* Per-context state number, used as index into _LSZ/_NLPS/_NMPS/_SWTCH */
static uint8_t _st[0x1000];
/* Image width and height in pixels, as passed to strdec_jbig() */
static int _width;
static int _height;
/* Bytes per output row: (_width + 7) / 8 */
static int _width_padded;
/* Index of the next unread byte in _scd (despite the name, an input position) */
static int _ret_pos;
/* Output bitmap being filled */
static unsigned char *_ret;
/* Coded input data and its length in bytes */
static int _scd_size;
static unsigned char *_scd;
/*
 * Feed the next input byte into bits 8..15 of the code register and allow
 * eight more shifts.  Once the input is exhausted nothing is added, which
 * behaves like reading zero bytes.
 */
static void
_bytein(void)
{
	if (_ret_pos < _scd_size) {
		unsigned char next = _scd[_ret_pos];

		_ret_pos++;
		_reg_c += next << 8;
	}

	_ct = 8;
}
/*
 * Reset the decoder: clear every context's state and more probable symbol,
 * preload the code register with the first three input bytes and zero the
 * interval register.
 */
static void
_initdec(void)
{
	memset(_st, 0, sizeof(_st));
	memset(_mps, 0, sizeof(_mps));

	_reg_c = 0;
	for (int k = 0; k < 3; k++) {
		/* Make room for the byte about to be added */
		if (k > 0)
			_reg_c <<= 8;
		_bytein();
	}

	_reg_a = 0x0000;
}
/*
 * Handle the branch of _decode() taken when the interval register does not
 * exceed the upper half of the code register: choose the decoded pixel,
 * advance the context's state, then remove the interval from the code
 * register and reload the interval with the state's _LSZ value.
 *
 * cx: context index into _st[] / _mps[].
 */
static void
_exchange_lps(uint16_t cx)
{
	uint8_t st_cx = _st[cx];
	uint16_t lsz_st_cx = _LSZ[st_cx];

	if (_reg_a < lsz_st_cx) {
		_pix = _mps[cx];
		_st[cx] = _NMPS[st_cx];
	} else {
		_pix = 1 - _mps[cx];
		if (_SWTCH[st_cx])
			_mps[cx] = _pix;
		_st[cx] = _NLPS[st_cx];
	}

	/*
	 * Widen before shifting: _reg_a promotes to int, and shifting a value
	 * of 0x8000 or more left by 16 would overflow a 32-bit signed int.
	 */
	_reg_c -= (uint32_t) _reg_a << 16;
	_reg_a = lsz_st_cx;
}
/*
 * Handle the branch of _decode() taken when the interval register exceeds
 * the upper half of the code register but has dropped below 0x8000: choose
 * the decoded pixel and advance the context's state.
 *
 * cx: context index into _st[] / _mps[].
 */
static void
_exchange_mps(uint16_t cx)
{
	const uint8_t state = _st[cx];

	if (_reg_a >= _LSZ[state]) {
		_pix = _mps[cx];
		_st[cx] = _NMPS[state];
		return;
	}

	/* Interval smaller than the state's size: emit the opposite pixel */
	_pix = 1 - _mps[cx];
	if (_SWTCH[state])
		_mps[cx] = _pix;
	_st[cx] = _NLPS[state];
}
/*
 * Renormalise: shift interval and code registers left, at least once, until
 * the interval register is 0x8000 or more, fetching a new input byte
 * whenever the shift budget in _ct is used up.
 */
static void
_renormd(void)
{
	for (;;) {
		if (_ct == 0)
			_bytein();

		_reg_a <<= 1;
		_reg_c <<= 1;
		_ct--;

		if (_reg_a >= 0x8000)
			break;
	}

	/* Leave with input available for the next shift */
	if (_ct == 0)
		_bytein();
}
/*
 * Decode one pixel in context cx and leave it in _pix.
 *
 * The state's _LSZ value is first taken off the interval register; the
 * result is then compared with the upper 16 bits of the code register to
 * pick the path.
 */
static void
_decode(uint16_t cx)
{
	_reg_a -= _LSZ[_st[cx]];

	if (_reg_a <= _reg_c >> 16) {
		_exchange_lps(cx);
		_renormd();
		return;
	}

	/* Interval still large enough: no state change, no renormalisation */
	if (_reg_a >= 0x8000) {
		_pix = _mps[cx];
		return;
	}

	_exchange_mps(cx);
	_renormd();
}
/*
 * Decode row `line' pixel by pixel into _ret.
 *
 * Rows are stored bottom-up: row `line' is written at byte offset
 * _width_padded * (_height - line - 1).
 *
 * a, b and c hold one char per pixel.  a and b are only read (at i + 2 and
 * i + 3); c receives 1 for every set pixel of this row.  _procstripe()
 * rotates the buffers so that b is the previous row and a the row before it.
 */
static void
_procline(int line, char *a, char *b, char *c)
{
/* The encoder must be erroneous */
uint16_t cx = 0;
/* Seed the context with the first three pixels of the previous row */
if (line > 0) {
cx += (_ret[_width_padded * (_height - line)] & 0x20) << 2;
cx += _ret[_width_padded * (_height - line)] & 0x40;
cx += (_ret[_width_padded * (_height - line)] & 0x80) >> 2;
}
/* ...and with the first two pixels of the row before that */
if (line > 1) {
cx += (_ret[_width_padded * (_height - line + 1)] & 0x40) >> 4;
cx += (_ret[_width_padded * (_height - line + 1)] & 0x80) >> 6;
}
for (int i = 0; i < _width; i++) {
_decode(cx);
/* Slide the context window one pixel along the row */
cx >>= 1;
if (_pix == 1) {
/* Most significant bit of each byte is the leftmost pixel */
_ret[_width_padded * (_height - line - 1) + i / 8] |= _pix << (7 - (i & 0x07));
c[i] = 1;
cx |= 0x0200;
} else {
cx &= 0xfdff;
}
/* Bring in the pixel entering the window from buffer a... */
if (i + 2 < _width && a[i + 2] == 1)
cx |= 0x0004;
else
cx &= 0xfffb;
/* ...and the one entering from buffer b */
if (i + 3 < _width && b[i + 3] == 1)
cx |= 0x0080;
else
cx &= 0xff7f;
}
}
/*
 * Decode all _height rows into _ret.
 *
 * Each row starts with one decision decoded in context 0x029c: 1 means the
 * row repeats the previous one, anything else means it is decoded pixel by
 * pixel with _procline().  Three one-char-per-pixel row buffers are kept and
 * rotated after every row.
 *
 * Returns 0 on success, 1 when the dimensions are not positive or the
 * scratch buffer cannot be allocated.
 */
static int
_procstripe(void)
{
	if (_height <= 0 || _width_padded <= 0)
		return 1;

	const int row_pixels = 8 * _width_padded;

	char *rows = malloc(3 * row_pixels);
	if (rows == NULL)
		return 1;
	memset(rows, 0, 3 * row_pixels);

	char *older = rows;
	char *prev = rows + row_pixels;
	char *cur = rows + 2 * row_pixels;

	for (int y = 0; y < _height; y++) {
		/* Rows are stored bottom-up in the output bitmap */
		unsigned char *dst = _ret + _width_padded * (_height - y - 1);

		_decode(0x029c);

		if (_pix != 1) {
			/* line atypical */
			memset(cur, 0, row_pixels);
			_procline(y, older, prev, cur);
		} else {
			/* Repeat the previous row; the first row stays blank */
			if (y > 0)
				memcpy(dst, dst + _width_padded, _width_padded);
			memcpy(cur, prev, row_pixels);
		}

		char *spare = older;
		older = prev;
		prev = cur;
		cur = spare;
	}

	free(rows);

	return 0;
}
/*
 * Decode CNKI JBIG coded data into a caller-provided 1-bit bitmap.
 *
 * bitmap:    *bitmap must point to at least height * ((width + 7) / 8)
 *            writable bytes; the area is cleared and then filled, rows
 *            stored bottom-up.
 * width:     image width in pixels, must be positive.
 * height:    image height in pixels, must be positive.
 * jbig:      coded data.
 * jbig_size: length of the coded data in bytes; reads past the end behave
 *            like zero bytes.
 *
 * Returns 0 on success, 1 on invalid arguments or allocation failure.
 * Uses file-scope state, so it is not reentrant.
 */
int
strdec_jbig(char **bitmap, int width, int height,
    const char * restrict jbig, int jbig_size)
{
	if (bitmap == NULL || *bitmap == NULL)
		return 1;

	if (jbig == NULL && jbig_size > 0)
		return 1;

	/* Reject before clearing: a negative size would wrap to a huge length */
	if (width <= 0 || height <= 0)
		return 1;

	_width = width;
	_height = height;
	/* Same as (width + 7) / 8 for positive width, without overflow */
	_width_padded = width / 8 + (width % 8 != 0);

	memset(*bitmap, 0, (size_t) _height * _width_padded);

	_ret_pos = 0;
	_ret = (unsigned char *) *bitmap;

	_scd_size = jbig_size;
	_scd = (unsigned char *) jbig;

	_initdec();

	return _procstripe();
}

8
src/cnki_jbig_dec.h Normal file
View file

@ -0,0 +1,8 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Decode CNKI JBIG coded data (jbig, jbig_size bytes) into the buffer
 * *bitmap, which the caller must have allocated with room for
 * height * ((width + 7) / 8) bytes.  The buffer is cleared first and rows
 * are stored bottom-up.  Returns 0 on success and 1 when the dimensions are
 * rejected or a scratch allocation fails.
 */
int strdec_jbig(char **bitmap, int width, int height,
const char * restrict jbig, int jbig_size);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -15,35 +15,54 @@ cnki_kdh(cnki_t **param)
if ((*param)->stat > 0)
printf("Begin 'KDH' decryption\n");
long cur = ADDRESS_KDH_BODY;
long end;
fseek((*param)->fp_i, 0, SEEK_END);
long size = ftell((*param)->fp_i);
fseek((*param)->fp_i, ADDRESS_KDH_BODY, SEEK_SET);
end = ftell((*param)->fp_i);
fseek((*param)->fp_i, cur, SEEK_SET);
const char key[] = KEY_KDH;
const int key_len = KEY_KDH_LENGTH;
long key_cur = 0;
int buf_size;
char buf[(*param)->size_buf];
FILE *tmp = tmpfile();
if (tmp == NULL)
return 1;
for (;;) {
fread(buf, (*param)->size_buf, 1, (*param)->fp_i);
if (cur + (*param)->size_buf < end)
buf_size = (*param)->size_buf;
else
buf_size = end - cur;
for (int i = 0; i < (*param)->size_buf; i++) {
buf[i] ^= key[key_cur % key_len];
key_cur++;
}
fread(buf, buf_size, 1, (*param)->fp_i);
fwrite(buf, (*param)->size_buf, 1, (*param)->fp_o);
for (int i = 0; i < buf_size; i++)
buf[i] ^= key[key_cur++ % key_len];
if (ftell((*param)->fp_i) == size)
fwrite(buf, buf_size, 1, tmp);
if ((cur = ftell((*param)->fp_i)) >= end)
break;
}
if ((*param)->stat > 0)
printf("Decryption ended total %ld byte(s) written\n",
ftell((*param)->fp_o));
printf("Decrypted %ld byte(s)\n", ftell(tmp));
fclose((*param)->fp_i);
fseek(tmp, 0, SEEK_SET);
(*param)->fp_i = tmp;
cnki_pdf(param);
if ((*param)->stat > 0)
printf("Conversion ended\n");
return 0;
}

View file

@ -1,110 +0,0 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include "cnki.h"
/*
 * Load the page records of an 'HN' document (unfinished).
 *
 * Reads one record (address, size, page, zero) per advised page from the
 * current position of (*param)->fp_i into a linked list rooted at
 * (*param)->object_nh, then loads each page's text and its single image.
 * Nothing is written to the output; the function returns 1 on every path.
 */
int
cnki_nh(cnki_t **param)
{
if (*param == NULL)
return 1;
if ((*param)->stat > 0)
printf("Begin 'HN' conversion\n");
/* The list head is only allocated when at least one page is advised */
if ((*param)->file_stat->page > 0)
(*param)->object_nh = malloc(sizeof(object_nh_t));
else
return 1;
if ((*param)->object_nh == NULL)
return 1;
if ((*param)->stat > 1) {
printf("Loading page(s)\n");
printf("\t%8s\t%8s\t%13s\t%6s\t%4s\t%8s\t%8s\n",
"address",
"text",
"page",
"zero",
"code",
"address",
"image");
}
/* Pass 1: read the fixed-size page records and build the list */
object_nh_t *ptr = (*param)->object_nh;
for (int i = 0; i < (*param)->file_stat->page; i++) {
fread(&ptr->address, 4, 1, (*param)->fp_i);
fread(&ptr->size, 4, 1, (*param)->fp_i);
fread(&ptr->page, 4, 1, (*param)->fp_i);
fread(&ptr->zero, 8, 1, (*param)->fp_i);
ptr->text = NULL;
ptr->image_format = -1;
ptr->image_address = 0;
ptr->image_size = 0;
ptr->image = NULL;
ptr->next = NULL;
/* Allocate the following node for every page but the last */
if (i < (*param)->file_stat->page - 1) {
ptr->next = malloc(sizeof(object_nh_t));
if (ptr->next == NULL)
return 1;
}
ptr = ptr->next;
}
/* Pass 2: load text and image of every page */
ptr = (*param)->object_nh;
while (ptr != NULL) {
ptr->text = malloc(ptr->size);
if (ptr->text == NULL)
return 1;
fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->size, 1, (*param)->fp_i);
/* The image header is read from right after the text */
fread(&ptr->image_format, 4, 1, (*param)->fp_i);
fread(&ptr->image_address, 4, 1, (*param)->fp_i);
fread(&ptr->image_size, 4, 1, (*param)->fp_i);
ptr->image = malloc(ptr->image_size);
if (ptr->image == NULL)
return 1;
fseek((*param)->fp_i, ptr->image_address, SEEK_SET);
fread(ptr->image, ptr->image_size, 1, (*param)->fp_i);
if ((*param)->stat > 1)
printf("\t%08x\t%8d\t{%d, %8d}\t{%d, %d}\t%4d\t%08x\t%8d\n",
ptr->address,
ptr->size,
ptr->page[0],
ptr->page[1],
ptr->zero[0],
ptr->zero[1],
ptr->image_format,
ptr->image_address,
ptr->image_size);
ptr = ptr->next;
}
if ((*param)->stat > 1)
printf("Loaded %d page(s)\n", (*param)->file_stat->page);
/* TODO: Study signed int __fastcall CAJDoc::OpenNHCAJFile(int a1, int a2) */
if ((*param)->stat > 0)
printf("Conversion ended\n");
/* TODO: Finish me please :) */
/* Failure is reported even after a complete load: no output is produced */
return 1;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/

File diff suppressed because it is too large Load diff

View file

@ -1,14 +0,0 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdio.h>
/*
 * Placeholder for XML extraction: nothing is read from *fp and *xml is left
 * untouched.  Always returns 1.
 */
int
cnki_xml(char **xml, FILE **fp)
{
	/* Neither argument is used yet */
	(void) xml;
	(void) fp;

	/* TODO: Extract XML and embed into `/Metadata' */
	return 1;
}

30
src/cnki_zlib.c Normal file
View file

@ -0,0 +1,30 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdint.h>
#include <string.h>
#include "zlib.h"
/*
 * Inflate a CNKI compressed block.
 *
 * Layout as read here: a 4-byte inflated size at offset 12 followed by the
 * deflate stream at offset 16.  When the 12 bytes at offset 8 spell
 * "COMPRESSTEXT", both offsets are shifted by a further 8 bytes.
 *
 * dst:      receives the inflated data via strinflate().
 * dst_size: receives the inflated size stored in the block.
 * src:      compressed block.
 * src_size: length of src in bytes.
 *
 * Returns 0 on success, 1 when src is too short to hold the header or
 * strinflate() fails.
 *
 * NOTE(review): the size field is copied byte-for-byte into an int32_t,
 * which presumably assumes a host with the same byte order as the file.
 */
int
cnki_zlib(char **dst, int *dst_size,
    const char * restrict src, int src_size)
{
	uint8_t padding = 0;
	int32_t size;

	if (src == NULL || src_size < 16)
		return 1;

	/* The marker occupies bytes 8..19; only look when they exist */
	if (src_size >= 20 && memcmp(src + 8, "COMPRESSTEXT", 12) == 0)
		padding = 8;

	/* The size field and stream start must lie inside the block */
	if (src_size < 16 + padding)
		return 1;

	memcpy(&size, src + 12 + padding, 4);

	*dst_size = size;

	if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0)
		return 1;

	return 0;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -14,8 +14,8 @@ int cnki_info(cnki_t **param);
/* cnki_caj.c */
int cnki_caj(cnki_t **param);
/* cnki_nh.c */
int cnki_nh(cnki_t **param);
/* cnki_hn.c */
int cnki_hn(cnki_t **param);
/* cnki_kdh.c */
int cnki_kdh(cnki_t **param);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -9,7 +9,6 @@
#include <iconv.h>
/* So, why would anyone use something other than UTF-8? */
int
strconv(char **dst,
const char * restrict dst_code,
@ -51,8 +50,7 @@ strconv(char **dst,
free(src_start);
return 1;
} else {
/* Not including NULL */
*size -= dst_size + 2;
*size -= dst_size;
*dst = malloc(*size);

View file

@ -1,11 +1,10 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
int
strconv(char **dst,
int strconv(char **dst,
const char * restrict dst_code,
const char * restrict src,
const char * restrict src_code,

36
src/jbig2.c Normal file
View file

@ -0,0 +1,36 @@
/*
* Copyright (c) 2022-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <jbig2.h>
/*
 * Decode an embedded JBIG2 stream and copy the resulting page into
 * `*bitmap', one row after another, using (width + 7) / 8 bytes per
 * row (the decoder's own row stride is skipped over).
 *
 * `*bitmap' is written to, never allocated or bounds-checked here.
 * NOTE(review): callers presumably size it as height * ((width + 7) / 8)
 * from the page metadata -- confirm.
 *
 * Returns 0 on success, 1 if an argument is missing, the decoder
 * context cannot be created, or no page could be decoded.
 */
int
strdec_jbig2(char **bitmap,
	const char * restrict jbig2, int jbig2_size)
{
	if (bitmap == NULL || *bitmap == NULL || jbig2 == NULL || jbig2_size <= 0)
		return 1;

	Jbig2Ctx *ctx = jbig2_ctx_new(NULL, JBIG2_OPTIONS_EMBEDDED, NULL, NULL, NULL);
	if (ctx == NULL)
		return 1;

	/*
	 * Decoder status codes are not acted upon: whether a usable page
	 * exists is decided by jbig2_page_out() below.
	 */
	(void) jbig2_data_in(ctx, (unsigned char *) jbig2, jbig2_size);
	(void) jbig2_complete_page(ctx);

	Jbig2Image *image = jbig2_page_out(ctx);
	if (image == NULL) {
		jbig2_ctx_free(ctx);
		return 1;
	}

	size_t width_padded = ((size_t) image->width + 7) / 8;
	const unsigned char *data = image->data;

	for (size_t i = 0; i < image->height; i++) {
		memcpy(*bitmap + i * width_padded, data, width_padded);
		data += image->stride;
	}

	jbig2_release_page(ctx, image);
	jbig2_ctx_free(ctx);

	return 0;
}

7
src/jbig2.h Normal file
View file

@ -0,0 +1,7 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Decode `jbig2_size' bytes of embedded JBIG2 data and copy the page,
 * row by row with (width + 7) / 8 bytes per row, into the buffer that
 * `*bitmap' already points to. Returns 0 on success.
 */
int strdec_jbig2(char **bitmap, const char * restrict jbig2, int jbig2_size);

115
src/jp2.c Normal file
View file

@ -0,0 +1,115 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <string.h>
#include <openjpeg.h>
/*
 * Cursor over an in-memory buffer, handed as user data to the
 * OpenJPEG stream callbacks defined in this file.
 */
typedef struct _stream_user_data {
/* Current offset into `data', in bytes */
OPJ_SIZE_T pos;
/* Number of bytes available in `data' */
OPJ_SIZE_T size;
/* Start of the buffer being read */
const unsigned char *data;
} stream_user_data;
/*
 * OpenJPEG read callback: copy up to `p_nb_bytes' bytes from the
 * in-memory source into `p_buffer' and advance the cursor.
 *
 * Returns the number of bytes copied, or (OPJ_SIZE_T) -1 once the
 * cursor has reached the end of the buffer.
 */
static OPJ_SIZE_T
_opj_stream_read(void *p_buffer, OPJ_SIZE_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *source = p_user_data;
	OPJ_SIZE_T count;

	if (source->pos >= source->size)
		return (OPJ_SIZE_T) - 1;

	/* Shorten the request when it runs past the end of the data */
	count = (source->pos + p_nb_bytes > source->size)
		? source->size - source->pos
		: p_nb_bytes;

	memcpy(p_buffer, source->data + source->pos, count);
	source->pos += count;

	return count;
}
/*
 * OpenJPEG skip callback: advance the cursor by up to `p_nb_bytes'.
 *
 * Returns the number of bytes actually skipped, which is what the
 * library subtracts from the amount still to skip, or (OPJ_OFF_T) -1
 * when the request is negative or the cursor is already at the end.
 * Reporting -1 rather than 0 at the end keeps the library from
 * retrying a skip that can never make progress.
 */
static OPJ_OFF_T
_opj_stream_skip(OPJ_OFF_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = (stream_user_data *) p_user_data;
	OPJ_SIZE_T left;

	if (p_nb_bytes < 0)
		return (OPJ_OFF_T) - 1;

	if (p_nb_bytes == 0)
		return 0;

	if (d->pos >= d->size)
		return (OPJ_OFF_T) - 1;

	/* Never move past the end of the buffer */
	left = d->size - d->pos;
	if (p_nb_bytes > (OPJ_OFF_T) left)
		p_nb_bytes = (OPJ_OFF_T) left;

	d->pos += (OPJ_SIZE_T) p_nb_bytes;

	return p_nb_bytes;
}
/*
 * OpenJPEG seek callback: move the cursor to the absolute offset
 * `p_nb_bytes'.
 *
 * Returns OPJ_TRUE when the offset lies within [0, size], OPJ_FALSE
 * otherwise; the cursor is left untouched on failure. Negative
 * offsets are refused explicitly because storing one in the unsigned
 * cursor would turn it into a huge position.
 */
static OPJ_BOOL
_opj_stream_seek(OPJ_OFF_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = (stream_user_data *) p_user_data;

	if (p_nb_bytes < 0 || p_nb_bytes > (OPJ_OFF_T) d->size)
		return OPJ_FALSE;

	d->pos = (OPJ_SIZE_T) p_nb_bytes;

	return OPJ_TRUE;
}
/*
 * Read the header of a JPEG 2000 image held in memory and report its
 * dimensions.
 *
 * A buffer starting with the bytes 0xff 0x4f is decoded as a raw J2K
 * codestream, anything else as a JP2 file. The width and height are
 * computed as x1 - x0 and y1 - y0 of the decoded image header.
 *
 * Returns 0 on success with `*jp2_width' and `*jp2_height' set, 1 if
 * an argument is missing, the buffer is shorter than two bytes, or
 * the decoder cannot be set up or fails to read the header.
 */
int
strinfo_jp2_dim(int *jp2_width, int *jp2_height,
	const char * restrict data, int data_size)
{
	int ret = 1;

	opj_codec_t *codec = NULL;
	opj_stream_t *stream = NULL;
	opj_image_t *image = NULL;
	opj_dparameters_t param;

	stream_user_data d;

	if (jp2_width == NULL || jp2_height == NULL ||
		data == NULL || data_size < 2)
		return 1;

	opj_set_default_decoder_parameters(&param);

	if ((unsigned char) data[0] == 0xff && (unsigned char) data[1] == 0x4f)
		codec = opj_create_decompress(OPJ_CODEC_J2K);
	else
		codec = opj_create_decompress(OPJ_CODEC_JP2);

	if (codec == NULL)
		return 1;

	if (!opj_setup_decoder(codec, &param))
		goto cleanup;

	stream = opj_stream_default_create(OPJ_TRUE);
	if (stream == NULL)
		goto cleanup;

	d.pos = 0;
	d.size = data_size;
	d.data = (unsigned char *) data;

	opj_stream_set_read_function(stream, _opj_stream_read);
	opj_stream_set_skip_function(stream, _opj_stream_skip);
	opj_stream_set_seek_function(stream, _opj_stream_seek);
	/* No free callback: `d' lives on this stack frame */
	opj_stream_set_user_data(stream, &d, NULL);
	opj_stream_set_user_data_length(stream, data_size);

	if (!opj_read_header(stream, codec, &image) || image == NULL)
		goto cleanup;

	*jp2_width = image->x1 - image->x0;
	*jp2_height = image->y1 - image->y0;

	ret = 0;

cleanup:
	if (codec != NULL)
		opj_destroy_codec(codec);
	if (stream != NULL)
		opj_stream_destroy(stream);
	if (image != NULL)
		opj_image_destroy(image);

	return ret;
}

8
src/jp2.h Normal file
View file

@ -0,0 +1,8 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Read the header of the JPEG 2000 image stored in the `data_size'
 * bytes at `data' and store its width and height in `*jp2_width' and
 * `*jp2_height'. Returns 0 on success, 1 on failure.
 */
int strinfo_jp2_dim(int *jp2_width, int *jp2_height,
const char * restrict data, int data_size);

37
src/jpeg.c Normal file
View file

@ -0,0 +1,37 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdio.h>
#include <jpeglib.h>
int
strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
const char * restrict data, int data_size)
{
struct jpeg_decompress_struct cinfo;
struct jpeg_error_mgr jerr;
cinfo.err = jpeg_std_error(&jerr);
jpeg_create_decompress(&cinfo);
jpeg_mem_src(&cinfo, (unsigned char *) data, data_size);
jpeg_read_header(&cinfo, TRUE);
jpeg_calc_output_dimensions(&cinfo);
*jpeg_width = cinfo.output_width;
*jpeg_height = cinfo.output_height;
*jpeg_components = cinfo.output_components;
jpeg_destroy((struct jpeg_common_struct *) &cinfo);
jpeg_destroy_decompress(&cinfo);
return 0;
}

8
src/jpeg.h Normal file
View file

@ -0,0 +1,8 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Read the header of the JPEG image stored in the `data_size' bytes
 * at `data' and store its output width, height and component count
 * in the three output parameters. Returns 0 on success.
 */
int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
const char * restrict data, int data_size);

24
src/md5.c Normal file
View file

@ -0,0 +1,24 @@
/*
* Copyright (c) 2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include <openssl/md5.h>
/*
 * Compute the MD5 digest of `src_size' bytes at `src'.
 *
 * `*dst_size' is always set to MD5_DIGEST_LENGTH and `*dst' to a
 * freshly malloc()ed buffer of that size holding the digest.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated (in
 * which case `*dst' is NULL).
 */
int
strmd5(unsigned char **dst, int *dst_size,
	const unsigned char * restrict src, int src_size)
{
	unsigned char *digest;

	*dst_size = MD5_DIGEST_LENGTH;

	digest = malloc(*dst_size);
	*dst = digest;

	if (digest != NULL) {
		MD5(src, src_size, digest);
		return 0;
	}

	return 1;
}

9
src/md5.h Normal file
View file

@ -0,0 +1,9 @@
/*
* Copyright (c) 2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Store the MD5 digest of the `src_size' bytes at `src' in a newly
 * allocated buffer `*dst' and its length in `*dst_size'.
 * Returns 0 on success, 1 if the allocation fails.
 */
int
strmd5(unsigned char **dst, int *dst_size,
const unsigned char * restrict src, int src_size);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -14,11 +14,8 @@
#include "version.h"
int
main(int argc, char **argv, char **envp)
main(int argc, char **argv)
{
printf("Melon " VERSION "." RELEASE "." PATCH EXTRA "\n");
printf("Copyright (c) 2020, yzrh <yzrh@tuta.io>\n\n");
cnki_t *param = NULL;
if (cnki_create(&param) != 0) {
@ -83,27 +80,32 @@ main(int argc, char **argv, char **envp)
return EXIT_FAILURE;
}
if (param->stat > 0)
printf("Melon " VERSION "." RELEASE "." PATCH EXTRA "\n"
"Copyright (c) 2020-2022, yzrh <yzrh@noema.org>\n\n");
cnki_info(&param);
if (strcmp(param->file_stat->type, "%PDF") == 0) {
if (strncmp(param->file_stat->type, "%PDF", 4) == 0) {
if (cnki_pdf(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
return EXIT_FAILURE;
}
} else if (strcmp(param->file_stat->type, "CAJ") == 0) {
} else if (strncmp(param->file_stat->type, "CAJ", 3) == 0) {
if (cnki_caj(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
return EXIT_FAILURE;
}
} else if (strcmp(param->file_stat->type, "HN") == 0) {
if (cnki_nh(&param) != 0) {
} else if (strncmp(param->file_stat->type, "HN", 2) == 0 ||
(unsigned char) param->file_stat->type[0] == 0xc8) {
if (cnki_hn(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
return EXIT_FAILURE;
}
} else if (strcmp(param->file_stat->type, "KDH ") == 0) {
} else if (strncmp(param->file_stat->type, "KDH ", 4) == 0) {
if (cnki_kdh(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -67,7 +67,8 @@ int
pdf_obj_add(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream)
const char * restrict stream,
int stream_size)
{
if (*pdf != NULL || id <= 0 ||
(object != NULL && dictionary != NULL))
@ -84,24 +85,24 @@ pdf_obj_add(pdf_object_t **pdf, int id,
(*pdf)->id = id;
if (dictionary != NULL) {
(*pdf)->dictionary_size = strlen(dictionary) + 1;
(*pdf)->dictionary_size = strlen(dictionary);
(*pdf)->dictionary = malloc((*pdf)->dictionary_size);
if ((*pdf)->dictionary == NULL)
return 1;
strncpy((*pdf)->dictionary, dictionary, (*pdf)->dictionary_size);
memcpy((*pdf)->dictionary, dictionary, (*pdf)->dictionary_size);
(*pdf)->object_size = 0;
(*pdf)->object = NULL;
} else if (object != NULL) {
(*pdf)->object_size = strlen(object) + 1;
(*pdf)->object_size = strlen(object);
(*pdf)->object = malloc((*pdf)->object_size);
if ((*pdf)->object == NULL)
return 1;
strncpy((*pdf)->object, object, (*pdf)->object_size);
memcpy((*pdf)->object, object, (*pdf)->object_size);
(*pdf)->dictionary_size = 0;
(*pdf)->dictionary = NULL;
@ -112,14 +113,15 @@ pdf_obj_add(pdf_object_t **pdf, int id,
(*pdf)->dictionary = NULL;
}
if (stream != NULL) {
(*pdf)->stream_size = sizeof(stream);
if (stream != NULL && stream_size > 0) {
(*pdf)->stream_size = stream_size + 1;
(*pdf)->stream = malloc((*pdf)->stream_size);
if ((*pdf)->stream == NULL)
return 1;
memcpy((*pdf)->stream, stream, (*pdf)->stream_size);
memcpy((*pdf)->stream, stream, stream_size);
(*pdf)->stream[(*pdf)->stream_size - 1] = '\n';
} else {
(*pdf)->stream_size = 0;
(*pdf)->stream = NULL;
@ -153,7 +155,8 @@ int
pdf_obj_prepend(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream)
const char * restrict stream,
int stream_size)
{
if (*pdf == NULL)
return 1;
@ -163,7 +166,8 @@ pdf_obj_prepend(pdf_object_t **pdf, int id,
pdf_object_t *ptr = NULL;
if (pdf_obj_add(&ptr, id, object, dictionary, stream) != 0) {
if (pdf_obj_add(&ptr, id, object, dictionary,
stream, stream_size) != 0) {
free(ptr);
return 1;
}
@ -178,7 +182,8 @@ int
pdf_obj_append(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream)
const char * restrict stream,
int stream_size)
{
if (*pdf == NULL)
return 1;
@ -190,12 +195,67 @@ pdf_obj_append(pdf_object_t **pdf, int id,
while (ptr->next != NULL)
ptr = ptr->next;
if (pdf_obj_add(&ptr->next, id, object, dictionary, stream) != 0)
if (pdf_obj_add(&ptr->next, id, object, dictionary,
stream, stream_size) != 0)
return 1;
return 0;
}
/*
 * Replace the content of the object with the given id.
 *
 * At most one of `object' and `dictionary' may be given; whichever is
 * given replaces the stored bytes of the same kind (stored without a
 * terminating NUL). An empty string releases the stored buffer and
 * resets its size to 0 instead of asking realloc() for zero bytes,
 * whose result is not portable and could leave a dangling pointer.
 *
 * When `stream' is non-NULL and `stream_size' is positive the stream
 * is replaced by a copy followed by a single '\n'; otherwise the
 * existing stream is left untouched.
 *
 * Returns 0 on success, 1 if the object cannot be found, both
 * `object' and `dictionary' are given, or memory runs out. On an
 * allocation failure the buffers already held by the object stay
 * valid.
 */
int
pdf_obj_replace(pdf_object_t **pdf, int id,
	const char * restrict object,
	const char * restrict dictionary,
	const char * restrict stream,
	int stream_size)
{
	pdf_object_t *ptr;
	char *ret;
	size_t len;

	if (pdf_get_obj(pdf, id, &ptr) != 0)
		return 1;

	if (object != NULL && dictionary != NULL)
		return 1;

	if (dictionary != NULL) {
		len = strlen(dictionary);

		if (len == 0) {
			free(ptr->dictionary);
			ptr->dictionary = NULL;
			ptr->dictionary_size = 0;
		} else {
			ret = realloc(ptr->dictionary, len);
			if (ret == NULL)
				return 1;

			ptr->dictionary_size = len;
			ptr->dictionary = ret;

			memcpy(ptr->dictionary, dictionary, len);
		}
	} else if (object != NULL) {
		len = strlen(object);

		if (len == 0) {
			free(ptr->object);
			ptr->object = NULL;
			ptr->object_size = 0;
		} else {
			ret = realloc(ptr->object, len);
			if (ret == NULL)
				return 1;

			ptr->object_size = len;
			ptr->object = ret;

			memcpy(ptr->object, object, len);
		}
	}

	if (stream != NULL && stream_size > 0) {
		ret = realloc(ptr->stream, (size_t) stream_size + 1);
		if (ret == NULL)
			return 1;

		ptr->stream_size = stream_size + 1;
		ptr->stream = ret;

		memcpy(ptr->stream, stream, stream_size);
		/* Stored streams always end with a newline */
		ptr->stream[ptr->stream_size - 1] = '\n';
	}

	return 0;
}
int
pdf_obj_sort(pdf_object_t **pdf)
{

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -21,23 +21,29 @@ typedef struct _pdf_object_t {
/* pdf.c */
/* TODO: Rewrite object dictionary */
/* TODO: Compact object id */
/* TODO: `mutool clean -gggsz' */
int pdf_obj_create(pdf_object_t **pdf);
void pdf_obj_destroy(pdf_object_t **pdf);
int pdf_obj_add(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream);
const char * restrict stream,
int stream_size);
int pdf_obj_del(pdf_object_t **pdf, int id);
int pdf_obj_prepend(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream);
const char * restrict stream,
int stream_size);
int pdf_obj_append(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream);
const char * restrict stream,
int stream_size);
int pdf_obj_replace(pdf_object_t **pdf, int id,
const char * restrict object,
const char * restrict dictionary,
const char * restrict stream,
int stream_size);
int pdf_obj_sort(pdf_object_t **pdf);
/* pdf_parser.c */
@ -56,6 +62,7 @@ int pdf_get_size(pdf_object_t **pdf);
int pdf_get_free_id(pdf_object_t **pdf);
int pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count);
int pdf_get_catalog_id(pdf_object_t **pdf);
int pdf_get_xref_id(pdf_object_t **pdf);
int pdf_get_parent_id(pdf_object_t **pdf, int **id);
int pdf_get_kid_id(pdf_object_t **pdf, int id, int **kid);
int pdf_get_kid_count(pdf_object_t **pdf, int id);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -42,7 +42,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
strcat(dictionary, "<<\n");
size = 512;
size = 514;
str = NULL;
if (strconv(&str, "UTF-16BE",
@ -50,7 +50,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
&size) == 0) {
strcat(dictionary, "/Title <feff");
for (int i = 0; i < size; i++) {
for (int i = 0; i < size - 2; i++) {
snprintf(buf, 64, "%02x", (unsigned char) str[i]);
strcat(dictionary, buf);
}
@ -89,11 +89,11 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
}
/* Page starts from 0 */
snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>\n",
snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>",
atoi(ptr->item->page) - 1);
strcat(dictionary, buf);
pdf_obj_append(pdf, ptr->id, NULL, dictionary, NULL);
pdf_obj_append(pdf, ptr->id, NULL, dictionary, NULL, 0);
if (ptr->left == NULL)
(*stat)[1] = ptr->id;
@ -106,6 +106,26 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
return 0;
}
/*
 * Release every node reachable from `*outline_tree'.
 *
 * Right subtrees are released recursively, the chain of left links
 * iteratively. A node on the left chain is freed through the `up'
 * link of its left child, so that link has to point back at it
 * (NOTE(review): confirm against the code that builds the tree).
 *
 * `*outline_tree' itself is not reset. Always returns 0.
 */
static int
_outline_free(object_outline_tree_t **outline_tree)
{
	object_outline_tree_t *node = *outline_tree;

	while (1) {
		if (node->right != NULL)
			_outline_free(&node->right);

		if (node->left == NULL)
			break;

		/* Step down first, then drop the node we just left */
		node = node->left;
		free(node->up);
	}

	/* Last node of the left chain */
	free(node);

	return 0;
}
int
pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids)
{
@ -119,16 +139,15 @@ pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids)
int *ret;
_outline(pdf, &outline_tree->left, outline_tree->id, &ret);
free(outline_tree);
_outline_free(&outline_tree);
snprintf(buf, 128,
"<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>\n",
"<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>",
ret[0], ret[1], ret[2]);
free(ret);
pdf_obj_append(pdf, (*ids)[0], NULL, buf, NULL);
pdf_obj_append(pdf, (*ids)[0], NULL, buf, NULL, 0);
return 0;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/

View file

@ -1,9 +1,15 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifdef __linux__
#define _GNU_SOURCE
#endif /* __linux__ */
#include <stdlib.h>
#include <string.h>
@ -83,7 +89,7 @@ pdf_get_free_id(pdf_object_t **pdf)
int id = 0;
for (int i = 1; i < 99999999; i++) {
for (int i = 1; i < 100000000; i++) {
ptr = (*pdf)->next;
while (ptr != NULL) {
if (ptr->id == i) {
@ -117,7 +123,7 @@ pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count)
int id = 0;
pdf_object_t *ptr;
for (int i = 1; i < 99999999; i++) {
for (int i = 1; i < 100000000; i++) {
ptr = (*pdf)->next;
while (ptr != NULL) {
if (ptr->id == i) {
@ -130,7 +136,7 @@ pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count)
if (i != id) {
(*ids)[pos] = i;
if (pos == count)
if (pos == count - 1)
return 0;
pos++;
@ -152,7 +158,8 @@ pdf_get_catalog_id(pdf_object_t **pdf)
while (ptr != NULL) {
if (ptr->dictionary != NULL &&
strstr(ptr->dictionary, "/Catalog") != NULL)
memmem(ptr->dictionary, ptr->dictionary_size,
"/Catalog", 8) != NULL)
catalog_id = ptr->id;
ptr = ptr->next;
@ -161,6 +168,28 @@ pdf_get_catalog_id(pdf_object_t **pdf)
return catalog_id;
}
/*
 * Find the object whose dictionary contains the bytes "/XRef".
 *
 * The list is walked from the node after the head; when several
 * dictionaries match, the id of the last one wins.
 *
 * Returns that id, 0 when no dictionary matches, or 1 when `*pdf'
 * is NULL.
 */
int
pdf_get_xref_id(pdf_object_t **pdf)
{
	int found = 0;
	pdf_object_t *cur;

	if (*pdf == NULL)
		return 1;

	for (cur = (*pdf)->next; cur != NULL; cur = cur->next) {
		if (cur->dictionary == NULL)
			continue;

		if (memmem(cur->dictionary, cur->dictionary_size,
			"/XRef", 5) != NULL)
			found = cur->id;
	}

	return found;
}
int
pdf_get_parent_id(pdf_object_t **pdf, int **id)
{
@ -187,10 +216,11 @@ pdf_get_parent_id(pdf_object_t **pdf, int **id)
while (ptr != NULL) {
if (ptr->dictionary != NULL &&
(head = strstr(ptr->dictionary, "/Parent ")) != NULL &&
(head = memmem(ptr->dictionary, ptr->dictionary_size,
"/Parent ", 8)) != NULL &&
(tail = strchr(head + 8, ' ')) != NULL) {
memset(str, 0, 8);
strncpy(str, head + 8, (tail - head) - 8);
memcpy(str, head + 8, (tail - head) - 8);
str_val = atoi(str);
if (!_id_in(str_val, *id)) {
@ -237,7 +267,8 @@ pdf_get_kid_id(pdf_object_t **pdf, int id, int **kid)
}
if (ptr->dictionary != NULL &&
strstr(ptr->dictionary, str) != NULL) {
memmem(ptr->dictionary, ptr->dictionary_size,
str, strlen(str)) != NULL) {
ret = realloc(*kid, ++kid_size * sizeof(int));
if (ret == NULL)
@ -276,13 +307,15 @@ pdf_get_kid_count(pdf_object_t **pdf, int id)
while (ptr != NULL) {
if (ptr->dictionary != NULL &&
strstr(ptr->dictionary, id_str) != NULL &&
(pos = strstr(ptr->dictionary, "/Count ")) != NULL) {
memmem(ptr->dictionary, ptr->dictionary_size,
id_str, strlen(id_str)) != NULL &&
(pos = memmem(ptr->dictionary, ptr->dictionary_size,
"/Count ", 7)) != NULL) {
for (int i = 8; i >= 0; i--) {
if (i + 7 <= ptr->dictionary_size - (pos - ptr->dictionary) &&
pos[i + 7] >= '0' && pos[i + 7] <= '9') {
memset(str, 0, 8);
strncpy(str, pos + 7, i + 1);
memcpy(str, pos + 7, i + 1);
str_val = atoi(str);
count += str_val;
break;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -19,26 +19,35 @@ static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
const char whitespace[6] = {
'\r',
'\n',
'\f',
'\t',
'\0',
' '
0x00,
0x09,
0x0a,
0x0c,
0x0d,
0x20
};
char tmp[s1 + 1];
memcpy(tmp, p1, s1);
char *ret = NULL;
char *ret;
char str[s1 + 1];
memcpy(str, p1, s1);
size_t tmp_size = 0;
char *tmp;
for (int i = 0; i < 6; i++) {
tmp[s1] = whitespace[i];
if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
return ret;
str[s1] = whitespace[i];
if ((tmp = memmem(p0, s0, str, s1 + 1)) == NULL)
continue;
if (tmp_size == 0 || (size_t) (tmp - (char *) p0) < tmp_size) {
tmp_size = tmp - (char *) p0;
ret = tmp;
}
}
return NULL;
return ret;
}
static int
@ -57,23 +66,45 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
end = ftell(*fp);
fseek(*fp, cur, SEEK_SET);
int head = 0;
int tail = 0;
long head = 0;
long tail = 0;
char *pos;
char *tmp;
for (;;) {
fread(buf, size_buf, 1, *fp);
if (cur + size_buf < end) {
fread(buf, size_buf, 1, *fp);
} else {
fread(buf, end - cur, 1, *fp);
memset(buf + end - cur, 0, size_buf - end + cur);
}
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
head = cur + (pos - buf) + 7;
if (head == 0) {
/* Hack needed for invalid object */
pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6);
tmp = memmem(buf, size_buf, " 0 obj", 6);
while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b)
tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6);
if (pos != NULL && tmp != NULL) {
if (pos - buf < tmp - buf)
head = cur + (pos - buf) + 7;
else
head = cur + (tmp - buf) + 6;
} else if (pos != NULL) {
head = cur + (pos - buf) + 7;
} else if (tmp != NULL) {
head = cur + (tmp - buf) + 6;
}
}
if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) {
/* We need to check if it is the object stored in stream */
while (memcmp(pos + 7,
"\r\nendstream", 11) == 0 &&
(tmp = _memmem_whitespace(pos + 6,
size_buf - (pos - buf) - 6,
(tmp = _memmem_whitespace(pos + 7,
size_buf - (pos - buf) - 7,
"endobj", 6)) != NULL)
pos = tmp;
@ -102,13 +133,17 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->address = head;
ptr->size = tail - head;
fseek(*fp, tail + 6, SEEK_SET);
fseek(*fp, tail + 7, SEEK_SET);
head = tail = 0;
} else if (head > 0 && tail > 0) {
if (cur + size_buf < end)
fseek(*fp, head, SEEK_SET);
tail = 0;
} else {
fseek(*fp, -6, SEEK_CUR);
fseek(*fp, -7, SEEK_CUR);
}
if ((cur = ftell(*fp)) + 6 >= end)
if ((cur = ftell(*fp)) + 7 >= end)
break;
}
@ -126,6 +161,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
pdf_object_t *ptr = (*pdf)->next;
char str[8];
char *buf;
char *head;
char *tail;
@ -137,28 +173,86 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (buf == NULL)
return 1;
memset(buf, 0, ptr->size);
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
fseek(*fp, ptr->address - 12, SEEK_SET);
fread(buf, 8, 1, *fp);
/* Handle incomplete object */
head = buf;
while ((tmp = _memmem_whitespace(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 7;
/* Hack needed for invalid object */
while ((tmp = memmem(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 6;
if (head - buf > 0) {
ptr->address += head - buf;
ptr->size -= head - buf;
tmp = realloc(buf, ptr->size);
if (tmp == NULL)
return 1;
buf = tmp;
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
}
/* Hack needed for invalid object */
fseek(*fp, ptr->address - 14, SEEK_SET);
fread(str, 8, 1, *fp);
if (str[7] < '0' || str[7] > '9') {
fseek(*fp, ptr->address - 15, SEEK_SET);
fread(str, 8, 1, *fp);
}
for (int i = 7; i >= 0; i--) {
if (str[i] < '0' || str[i] > '9') {
if (i < 7)
ptr->id = atoi(str + i + 1);
else
ptr->id = 0;
for (int i = 0; i < 8; i++) {
if (buf[i] >= '0' && buf[i] <= '9') {
ptr->id = atoi(buf + i);
break;
}
}
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL &&
(tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) {
/* A dictionary object may have nested dictionary */
while ((tmp = _memmem_whitespace(tail + 2,
ptr->size - (tail - buf) - 2,
">>", 2)) != NULL)
tail = tmp;
((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL ||
/* Hack needed for invalid object */
(tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) {
if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) {
tail = memmem(buf, ptr->size, ">>", 2);
while (ptr->size - (tail - buf) > 2 &&
(tmp = memmem(tail + 2,
ptr->size - (tail - buf) - 2,
">>", 2)) != NULL &&
memmem(tail + 2,
(tmp - tail) - 2,
"stream\r\n", 8) == NULL)
tail = tmp;
} else {
/*
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
while (ptr->size - (tail - buf) > 3 &&
(tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
memmem(tail + 3,
(tmp - tail) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
}
ptr->dictionary_size = tail - head + 2;
ptr->dictionary = malloc(ptr->dictionary_size + 1);
@ -166,8 +260,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (ptr->dictionary == NULL)
return 1;
memset(ptr->dictionary, 0, ptr->dictionary_size + 1);
memcpy(ptr->dictionary, head, ptr->dictionary_size);
memset(ptr->dictionary + ptr->dictionary_size, 0, 1);
if ((head = memmem(tail,
ptr->size - (tail - buf),
@ -180,11 +274,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
* contains another object that
* contains another stream
*/
while (_memmem_whitespace(tail,
ptr->size - (tail - buf),
while (_memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
"endobj", 6) != NULL &&
(tmp = _memmem_whitespace(tail + 9,
ptr->size - (tail - buf) - 9,
(tmp = _memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
"endstream", 9)) != NULL)
tail = tmp;
@ -196,19 +290,13 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
memcpy(ptr->stream, head + 8, ptr->stream_size);
}
free(buf);
} else {
ptr->object_size = ptr->size;
ptr->object = malloc(ptr->object_size + 1);
if (ptr->object == NULL)
return 1;
memset(ptr->object, 0, ptr->object_size + 1);
memcpy(ptr->object, buf, ptr->object_size);
ptr->object = buf;
}
free(buf);
ptr = ptr->next;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -8,14 +8,32 @@
#include <string.h>
#include <time.h>
#include <openssl/md5.h>
#include "version.h"
#include "md5.h"
#include "pdf.h"
/*
 * Append a document information dictionary to the object list.
 *
 * The dictionary carries a /Producer entry built from the version
 * macros and a /CreationDate entry holding the current time in UTC,
 * formatted as YYYYMMDDHHMMSS with a fixed +00'00' offset. It is
 * handed to pdf_obj_append() with id 0.
 *
 * Returns whatever pdf_obj_append() returns.
 */
static int
_info_obj(pdf_object_t **pdf)
{
	char stamp[64];
	char dictionary[128];

	time_t now = time(NULL);

	strftime(stamp, 64, "%Y%m%d%H%M%S", gmtime(&now));

	/* Fixed prefix, timestamp, fixed suffix */
	snprintf(dictionary, 128, "%s%s%s",
		"<<\n"
		"/Producer (Melon " VERSION "." RELEASE "." PATCH EXTRA ")\n"
		"/CreationDate (D:",
		stamp,
		"+00'00')\n>>");

	return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0);
}
int
pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
{
if (*pdf == NULL || *fp == NULL)
if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0)
return 1;
long cur;
@ -26,12 +44,15 @@ pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
fprintf(*fp, "%d 0 obj\n", ptr->id);
if (ptr->dictionary != NULL)
fputs(ptr->dictionary, *fp);
else if (ptr->object != NULL)
fputs(ptr->object, *fp);
else if (ptr->stream == NULL)
if (ptr->dictionary != NULL) {
fwrite(ptr->dictionary, ptr->dictionary_size, 1, *fp);
fputs("\n", *fp);
} else if (ptr->object != NULL) {
fwrite(ptr->object, ptr->object_size, 1, *fp);
fputs("\n", *fp);
} else if (ptr->stream == NULL) {
fputs("null\n", *fp);
}
if (ptr->stream != NULL) {
fputs("stream\r\n", *fp);
@ -135,37 +156,34 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
int buf_size;
char buf[64];
#ifdef __ILP32__
buf_size = snprintf(buf, 64, "%x%x", timestamp, size);
#else
buf_size = snprintf(buf, 64, "%lx%x", timestamp, size);
#endif
unsigned char str[64];
memcpy(str, buf, 64);
int fid_size;
unsigned char *fid;
unsigned char fid[MD5_DIGEST_LENGTH];
MD5(str, buf_size, fid);
if (strmd5(&fid, &fid_size, (unsigned char *) buf, buf_size) != 0)
return 1;
pdf_object_t *ptr = *pdf;
while (ptr->next != NULL)
ptr = ptr->next;
/*
* TODO: Document information dictionary
* `"/Producer (Melon)"'
* `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"'
*
* Trailer dictionary
* `"/Info %d 0 R"'
*/
fprintf(*fp,
"/Size %d\n/Root %d 0 R\n",
"/Size %d\n/Root %d 0 R\n/Info %d 0 R\n",
ptr->id + 1,
pdf_get_catalog_id(pdf));
pdf_get_catalog_id(pdf),
ptr->id);
fputs("/ID [", *fp);
for (int i = 0; i < 2; i++) {
fputs("<", *fp);
for (int j = 0; j < MD5_DIGEST_LENGTH; j++)
for (int j = 0; j < fid_size; j++)
fprintf(*fp, "%02x", fid[j]);
fputs(">", *fp);
@ -184,5 +202,7 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
fputs("%%EOF\n", *fp);
free(fid);
return 0;
}

View file

@ -1,10 +1,10 @@
/*
* Copyright (c) 2020, yzrh <yzrh@tuta.io>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#define VERSION "0"
#define RELEASE "1"
#define RELEASE "3"
#define PATCH "0"
#define EXTRA ""

53
src/zlib.c Normal file
View file

@ -0,0 +1,53 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
/*
 * Inflate a zlib stream into a newly allocated buffer.
 *
 * `dst_size' is the expected size of the inflated data; the buffer
 * stored in `*dst' is allocated with exactly that many bytes. The
 * number of bytes actually produced is not reported.
 *
 * Returns 0 on success. Returns 1 if an argument is invalid, the
 * allocation fails, or uncompress() does not return Z_OK; `*dst' is
 * then NULL, so the caller never holds a pointer to freed memory.
 */
int
strinflate(char **dst, int dst_size,
	const char * restrict src, int src_size)
{
	unsigned long size;

	*dst = NULL;

	/* Negative sizes would wrap to huge unsigned values below */
	if (src == NULL || dst_size < 0 || src_size < 0)
		return 1;

	*dst = malloc(dst_size);
	if (*dst == NULL)
		return 1;

	size = dst_size;

	if (uncompress((Bytef *) *dst,
		&size, (const Bytef *) src, src_size) != Z_OK) {
		free(*dst);
		*dst = NULL;
		return 1;
	}

	return 0;
}
strdeflate(char **dst, int *dst_size,
const char * restrict src, int src_size)
{
*dst_size = compressBound(src_size);
*dst = malloc(*dst_size);
if (*dst == NULL)
return 1;
unsigned long size = *dst_size;
if (compress((Bytef *) *dst, &size,
(const Bytef *) src, src_size) != Z_OK) {
free(*dst);
return 1;
}
*dst_size = size;
return 0;
}

11
src/zlib.h Normal file
View file

@ -0,0 +1,11 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Inflate the zlib stream of `src_size' bytes at `src' into a newly
 * allocated buffer of `dst_size' bytes stored in `*dst'.
 * Returns 0 on success, 1 on failure.
 */
int strinflate(char **dst, int dst_size,
const char * restrict src, int src_size);
/*
 * Deflate the `src_size' bytes at `src' into a newly allocated buffer
 * stored in `*dst'; `*dst_size' receives the compressed length.
 * Returns 0 on success, 1 on failure.
 */
int strdeflate(char **dst, int *dst_size,
const char * restrict src, int src_size);