Compare commits

...

41 commits

Author SHA1 Message Date
2fa2b760ae Fix HN text parsing.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-15 15:34:46 +00:00
dd5854678c Fix JBIG2 allocation.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-06 12:02:43 +00:00
123d62141c Add document information dictionary to output.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 19:15:01 +00:00
283446dba5 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 17:32:13 +00:00
13cb0a1b8d Fix invalid token parsing.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-05 11:21:54 +00:00
a7ecc15614 Replace catalog object only if root object does not exist.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:50:25 +00:00
56ffe14d5a Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:29:07 +00:00
c2afbb3cbc Handle invalid PDF object.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:19:06 +00:00
8cd8a8fbba Replace catalog object if found.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 17:07:57 +00:00
8276423eb8 Prioritise incomplete object during deduplication.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-04 13:51:13 +00:00
7ac0971a17 Handle invalid result from PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-03 15:39:53 +00:00
e0fe937e1a Fix KDH decryption.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-03 12:12:42 +00:00
4a02b8bfc7 Fix inconsistent whitespace detection in PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-03 00:13:56 +00:00
7d9d658461 Handle duplicated image in HN.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-02 15:38:45 +00:00
000405693e Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 21:26:44 +00:00
d6fa934b5f Handle incomplete PDF object in parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 20:51:09 +00:00
1a1fee1034 Handle duplicated object in CAJ.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 19:31:33 +00:00
cde014cffb Improve PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 18:58:43 +00:00
9019a18449 Split md5 function.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 11:11:56 +00:00
a18de8f2ef Rename JBIG decoder.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 10:09:08 +00:00
70e1e7ea97 Fix JBIG decoder data type.
Signed-off-by: yzrh <yzrh@noema.org>
2023-01-01 00:42:20 +00:00
bffb8ce8a4 Fix JBIG decoder.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 21:17:28 +00:00
3ac51d66b9 Fix JBIG table length.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 18:52:06 +00:00
0bbf8e65dd Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 11:28:03 +00:00
220a81c2ad Fix HN image compositing.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-31 10:48:29 +00:00
1d899d934d Fix PDF object check.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 20:16:53 +00:00
226f16ddf4 Handle HN page with figure only.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 15:04:32 +00:00
9646ee61c3 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 02:04:43 +00:00
5466a441df Fix type casting when processing data.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-30 02:00:12 +00:00
1ce3f89574 Handle combination of text and image in page content.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 21:10:03 +00:00
5a1afb0056 Link against libc for iconv, find openjpeg header with pkgconf.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 17:30:36 +00:00
060bc00a0d Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 06:30:59 +00:00
97931e1470 Fix PDF object check.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 05:23:04 +00:00
cd0af5ba3c Fix buffer overflow when object size is less than 8 bytes.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 04:05:34 +00:00
988a751c15 Handle missing root object which is parent of others.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 03:00:11 +00:00
8083b30530 Add JPEG 2000 support.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-29 00:40:14 +00:00
abce2fd2e4 Add preliminary support for HN figure placement.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-28 19:29:46 +00:00
224a09a015 Update CHANGE.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-26 00:13:18 +00:00
c2ad6549fb Handle headless HN and page with no image.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-25 23:18:17 +00:00
d2826fa075 Simplify JBIG decoder.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-25 05:15:56 +00:00
288b65a1fd Handle different JPEG colour component.
Signed-off-by: yzrh <yzrh@noema.org>
2022-12-25 01:26:05 +00:00
25 changed files with 1075 additions and 461 deletions

View file

@ -1,7 +1,44 @@
0.3.0 (2022-XX-XX)
0.3.0 (2023-XX-XX)
==================
* Support HN text overlay.
* Support HN page with text.
* Handle inaccurate page count in CAJ and KDH.
0.2.5 (2023-01-05)
==================
* Improve PDF parser.
* Handle duplicated object in CAJ.
* Handle duplicated image in HN.
* Handle incomplete PDF object in CAJ and KDH.
* Handle invalid PDF object token in CAJ and KDH.
* Fix JBIG decoder.
0.2.4 (2022-12-31)
==================
* Fix HN image compositing.
* Fix PDF object check.
0.2.3 (2022-12-30)
==================
* Support HN figure placement.
0.2.2 (2022-12-29)
==================
* Support JPEG 2000 for HN.
* Handle missing but referenced root object.
* Handle HN with more than one image per page.
* Fix buffer overflow.
0.2.1 (2022-12-26)
==================
* Handle different JPEG colour component.
* Handle headless HN and page with no image.
0.2.0 (2022-12-22)
==================

View file

@ -9,16 +9,15 @@ Development
Currently, CAJ, KDH, and HN can be converted. Please report
any failures with a sample that can reproduce the behaviour.
HN support does not support JPEG 2000 yet.
Dependency
----------
1. OpenSSL
2. libiconv
3. zlib
4. jbig2dec
5. libjpeg-turbo
1. libcrypto (OpenSSL)
2. zlib
3. jbig2dec
4. libjpeg-turbo
5. openjpeg
6. pkgconf
Usage
=====
@ -36,12 +35,12 @@ Options
Specify output file
-b, --buffer
Set buffer size (default 512k)
Set input buffer size (default 512k)
-v, --verbose
Print more information (twice for even more, three times for HN image decoding information as well)
Print more information (twice for even more, three times for HN image processing information as well)
Thanks
======
This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf)
This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf)

View file

@ -4,23 +4,28 @@
# SPDX-License-Identifier: Apache-2.0
#
src = melon.c iconv.c zlib.c jbig.c jbig2.c jpeg.c \
cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c \
cnki_pdf.c cnki_zlib.c cnki_jbig.c cnki_jbig2.c cnki.c \
src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c md5.c \
cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c cnki_pdf.c \
cnki_zlib.c cnki_jbig.c cnki_jbig_dec.c cnki_jbig2.c cnki.c \
pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c
inc = extern.h version.h iconv.h zlib.h jbig.h jbig2.h jpeg.h \
cnki.h pdf_cnki.h cnki_jbig.h pdf.h
inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h md5.h \
cnki.h pdf_cnki.h cnki_jbig.h cnki_jbig_dec.h pdf.h
obj = ${src:.c=.o}
PREFIX = /usr/local
CFLAGS = -O2 -pipe -flto -Wall -Wextra
LDFLAGS = -Wl,-O2 -lcrypto -liconv -lz -ljbig2dec -ljpeg -Wl,--as-needed
LDFLAGS = -Wl,-O2 -lcrypto -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed
CFLAGS += -I/usr/local/include
LDFLAGS += -L/usr/local/lib
OPENJPEG_CFLAGS != pkgconf --cflags libopenjp2
CFLAGS += ${OPENJPEG_CFLAGS}
CFLAGS += -DLIBICONV_PLUG
all: ${obj} ${inc}
${CC} ${LDFLAGS} -o melon ${obj}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -54,6 +54,11 @@ cnki_destroy(cnki_t **param)
object_hn_t *ptr_hn;
while ((ptr_hn = (*param)->object_hn) != NULL) {
(*param)->object_hn = (*param)->object_hn->next;
free(ptr_hn->text);
if (ptr_hn->image_data != NULL)
for (int i = 0; i < ptr_hn->image_length; i++)
free(ptr_hn->image_data[i].image);
free(ptr_hn->image_data);
free(ptr_hn);
}
@ -71,12 +76,19 @@ cnki_info(cnki_t **param)
printf("Reading file header at 0x%x\n", ADDRESS_HEAD);
int addr[2];
unsigned char str[2];
fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET);
fread((*param)->file_stat->type, 4, 1, (*param)->fp_i);
if ((*param)->stat > 0)
printf("File type is '%s'\n", (*param)->file_stat->type);
fread(str, 2, 1, (*param)->fp_i);
if ((*param)->stat > 0) {
if ((unsigned char) (*param)->file_stat->type[0] > 0x7f)
printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]);
else
printf("File type is '%s'\n", (*param)->file_stat->type);
}
if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
return 0;
@ -86,6 +98,9 @@ cnki_info(cnki_t **param)
} else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
addr[0] = ADDRESS_HN_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
addr[0] = ADDRESS_C8_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
return 0;
} else {
@ -102,6 +117,14 @@ cnki_info(cnki_t **param)
printf("Advised %d page(s)\n",
(*param)->file_stat->page);
if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) {
fseek((*param)->fp_i, 0xd8, SEEK_SET);
return 0;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
fseek((*param)->fp_i, 0x50, SEEK_SET);
return 0;
}
if ((*param)->stat > 1)
printf("Reading outline count at 0x%x\n", addr[1]);
@ -115,7 +138,7 @@ cnki_info(cnki_t **param)
if ((*param)->file_stat->outline > 0) {
if ((*param)->stat > 1) {
printf("Loading outline(s)\n");
printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n",
printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n",
"title",
"hierarchy",
"page",

View file

@ -16,6 +16,8 @@
#define ADDRESS_HN_PAGE 0x0090
#define ADDRESS_HN_OUTLINE 0x0158
#define ADDRESS_C8_PAGE 0x0008
#define ADDRESS_KDH_BODY 0x00fe
#define KEY_KDH "FZHMEI"
@ -56,6 +58,10 @@ typedef struct _hn_image_t {
int32_t format; /* hn_code */
int32_t address;
int32_t size;
uint16_t x;
uint16_t y;
uint16_t w;
uint16_t h;
char *image;
} hn_image_t;
@ -64,7 +70,8 @@ typedef struct _object_hn_t {
int32_t text_size;
int16_t image_length;
int16_t page;
int32_t unknown[2]; /* TODO: what is it? */
int32_t unknown; /* TODO: what is it? */
int32_t address_next;
char *text;
struct _hn_image_t *image_data;
struct _object_hn_t *next;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -27,12 +27,13 @@ cnki_hn(cnki_t **param)
if ((*param)->stat > 1) {
printf("Loading page(s)\n");
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n",
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n",
"address",
"text",
"length",
"page",
"unknown",
"next",
"code",
"address",
"image");
@ -44,7 +45,8 @@ cnki_hn(cnki_t **param)
fread(&ptr->text_size, 4, 1, (*param)->fp_i);
fread(&ptr->image_length, 2, 1, (*param)->fp_i);
fread(&ptr->page, 2, 1, (*param)->fp_i);
fread(&ptr->unknown, 8, 1, (*param)->fp_i);
fread(&ptr->unknown, 4, 1, (*param)->fp_i);
fread(&ptr->address_next, 4, 1, (*param)->fp_i);
ptr->text = NULL;
ptr->image_data = NULL;
@ -62,66 +64,80 @@ cnki_hn(cnki_t **param)
ptr = (*param)->object_hn;
while (ptr != NULL) {
ptr->text = malloc(ptr->text_size);
if (ptr->text_size > 0) {
ptr->text = malloc(ptr->text_size);
if (ptr->text == NULL)
return 1;
if (ptr->text == NULL)
return 1;
fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
}
if ((*param)->stat > 1)
printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}",
printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x",
ptr->address,
ptr->text_size,
ptr->image_length,
ptr->page,
ptr->unknown[0],
ptr->unknown[1]);
ptr->unknown,
ptr->address_next);
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
if (ptr->image_length > 0) {
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
if (ptr->image_data == NULL)
return 1;
for (int i = 0; i < ptr->image_length; i++) {
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);
}
for (int i = 0; i < ptr->image_length; i++) {
ptr->image_data[i].image = malloc(ptr->image_data[i].size);
if (ptr->image_data[i].image == NULL)
if (ptr->image_data == NULL)
return 1;
fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET);
fread(ptr->image_data[i].image,
ptr->image_data[i].size, 1,
(*param)->fp_i);
for (int i = 0; i < ptr->image_length; i++) {
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
ptr->image_data[i].x = 0;
ptr->image_data[i].y = 0;
ptr->image_data[i].w = 0;
ptr->image_data[i].h = 0;
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);
}
if ((*param)->stat > 1) {
if (i == 0) {
printf("\t%4d\t%08x\t%8d\n",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
} else {
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n",
"",
"",
"",
"",
"",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
for (int i = 0; i < ptr->image_length; i++) {
ptr->image_data[i].image = malloc(ptr->image_data[i].size);
if (ptr->image_data[i].image == NULL)
return 1;
fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET);
fread(ptr->image_data[i].image,
ptr->image_data[i].size, 1,
(*param)->fp_i);
if ((*param)->stat > 1) {
if (i == 0) {
printf("\t%4d\t%08x\t%8d\n",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
} else {
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n",
"",
"",
"",
"",
"",
"",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
}
}
}
} else if ((*param)->stat > 1) {
printf("\t%4s\t%8s\t%8s\n",
"",
"",
"");
}
ptr = ptr->next;

View file

@ -8,7 +8,7 @@
#include <string.h>
#include "cnki_jbig.h"
#include "jbig.h"
#include "cnki_jbig_dec.h"
int
cnki_jbig(char **bitmap, int *bitmap_size,

View file

@ -27,8 +27,8 @@ typedef struct _dib_t {
uint16_t depth;
uint32_t compression; /* dib_compression_code */
uint32_t size;
uint32_t resolution_h;
uint32_t resolution_v;
int32_t resolution_h;
int32_t resolution_v;
uint32_t colour;
uint32_t colour_used;
} dib_t;

View file

@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
static const uint16_t _LSZ[256] = {
static const uint16_t _LSZ[0x71] = {
0x5a1d,
0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036,
0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2,
@ -28,7 +28,7 @@ static const uint16_t _LSZ[256] = {
0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb
};
static const uint8_t _NLPS[256] = {
static const uint8_t _NLPS[0x71] = {
1,
14, 16, 18, 20, 23, 25, 28, 30,
33, 35, 9, 10, 12, 15, 36, 38,
@ -47,7 +47,7 @@ static const uint8_t _NLPS[256] = {
105, 108, 109, 110, 111, 110, 112, 112
};
static const uint8_t _NMPS[256] = {
static const uint8_t _NMPS[0x71] = {
1,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 13, 15, 16, 17,
@ -66,7 +66,7 @@ static const uint8_t _NMPS[256] = {
106, 107, 103, 109, 107, 111, 109, 111
};
static const bool _SWTCH[256] = {
static const bool _SWTCH[0x71] = {
1,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0,
@ -99,7 +99,7 @@ static int _height;
static int _width_padded;
static int _ret_pos;
static char *_ret;
static unsigned char *_ret;
static int _scd_size;
static unsigned char *_scd;
@ -108,7 +108,7 @@ static void
_bytein(void)
{
if (_ret_pos < _scd_size)
_reg_c += *(_scd + _ret_pos++) << 8;
_reg_c += _scd[_ret_pos++] << 8;
_ct = 8;
}
@ -207,7 +207,18 @@ static void
_procline(int line, char *a, char *b, char *c)
{
/* The encoder must be erroneous */
uint16_t cx = (*b & 0x01) << 2;
uint16_t cx = 0;
if (line > 0) {
cx += (_ret[_width_padded * (_height - line)] & 0x20) << 2;
cx += _ret[_width_padded * (_height - line)] & 0x40;
cx += (_ret[_width_padded * (_height - line)] & 0x80) >> 2;
}
if (line > 1) {
cx += (_ret[_width_padded * (_height - line + 1)] & 0x40) >> 4;
cx += (_ret[_width_padded * (_height - line + 1)] & 0x80) >> 6;
}
for (int i = 0; i < _width; i++) {
_decode(cx);
@ -215,19 +226,19 @@ _procline(int line, char *a, char *b, char *c)
cx >>= 1;
if (_pix == 1) {
*(_ret + _width_padded * (_height - line - 1) + i / 8) |= _pix << (7 - (i & 0x07));
*(c + i) = 1;
_ret[_width_padded * (_height - line - 1) + i / 8] |= _pix << (7 - (i & 0x07));
c[i] = 1;
cx |= 0x0200;
} else {
cx &= 0xfdff;
}
if (i + 2 < _width && *(a + i + 2) == 1)
if (i + 2 < _width && a[i + 2] == 1)
cx |= 0x0004;
else
cx &= 0xfffb;
if (i + 3 < _width && *(b + i + 3) == 1)
if (i + 3 < _width && b[i + 3] == 1)
cx |= 0x0080;
else
cx &= 0xff7f;
@ -293,7 +304,7 @@ strdec_jbig(char **bitmap, int width, int height,
memset(*bitmap, 0, _height * _width_padded);
_ret_pos = 0;
_ret = *bitmap;
_ret = (unsigned char *) *bitmap;
_scd_size = jbig_size;
_scd = (unsigned char *) jbig;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -15,16 +15,18 @@ cnki_kdh(cnki_t **param)
if ((*param)->stat > 0)
printf("Begin 'KDH' decryption\n");
long cur = ADDRESS_KDH_BODY;
long end;
fseek((*param)->fp_i, 0, SEEK_END);
long size = ftell((*param)->fp_i);
fseek((*param)->fp_i, ADDRESS_KDH_BODY, SEEK_SET);
end = ftell((*param)->fp_i);
fseek((*param)->fp_i, cur, SEEK_SET);
const char key[] = KEY_KDH;
const int key_len = KEY_KDH_LENGTH;
long key_cur = 0;
int buf_size;
char buf[(*param)->size_buf];
FILE *tmp = tmpfile();
@ -33,32 +35,32 @@ cnki_kdh(cnki_t **param)
return 1;
for (;;) {
fread(buf, (*param)->size_buf, 1, (*param)->fp_i);
if (cur + (*param)->size_buf < end)
buf_size = (*param)->size_buf;
else
buf_size = end - cur;
for (int i = 0; i < (*param)->size_buf; i++) {
buf[i] ^= key[key_cur % key_len];
key_cur++;
}
fread(buf, buf_size, 1, (*param)->fp_i);
fwrite(buf, (*param)->size_buf, 1, tmp);
for (int i = 0; i < buf_size; i++)
buf[i] ^= key[key_cur++ % key_len];
if (ftell((*param)->fp_i) == size)
fwrite(buf, buf_size, 1, tmp);
if ((cur = ftell((*param)->fp_i)) >= end)
break;
}
if ((*param)->stat > 0)
printf("Decrypted %ld byte(s)\n", ftell(tmp));
fseek(tmp, 0, SEEK_SET);
fclose((*param)->fp_i);
FILE *orig = (*param)->fp_i;
fseek(tmp, 0, SEEK_SET);
(*param)->fp_i = tmp;
cnki_pdf(param);
(*param)->fp_i = orig;
fclose(tmp);
if ((*param)->stat > 0)
printf("Conversion ended\n");

File diff suppressed because it is too large Load diff

View file

@ -13,12 +13,17 @@ int
cnki_zlib(char **dst, int *dst_size,
const char * restrict src, int src_size)
{
uint8_t padding = 0;
int32_t size;
memcpy(&size, src + 20, 4);
if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0)
padding = 8;
memcpy(&size, src + 12 + padding, 4);
*dst_size = size;
if (strinflate(dst, size, src + 24, src_size - 24) != 0)
if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0)
return 1;
return 0;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
* Copyright (c) 2022-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -31,5 +31,6 @@ strdec_jbig2(char **bitmap,
}
jbig2_release_page(ctx, image);
jbig2_ctx_free(ctx);
return 0;
}

115
src/jp2.c Normal file
View file

@ -0,0 +1,115 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <string.h>
#include <openjpeg.h>
/*
 * State for the in-memory stream handed to OpenJPEG: the read, skip
 * and seek callbacks in this file receive a pointer to this structure
 * as their user data.
 */
typedef struct _stream_user_data {
/* Current offset into data, advanced by the read and skip callbacks */
OPJ_SIZE_T pos;
/* Total number of bytes available in data */
OPJ_SIZE_T size;
/* Start of the encoded image; not modified through this pointer */
const unsigned char *data;
} stream_user_data;
/*
 * OpenJPEG read callback for the in-memory stream.
 *
 * Copies at most p_nb_bytes from the current position of the buffer
 * described by p_user_data into p_buffer and advances the position.
 *
 * Returns the number of bytes copied, or (OPJ_SIZE_T) -1 once the
 * position has reached the end of the data.
 */
static OPJ_SIZE_T
_opj_stream_read(void *p_buffer, OPJ_SIZE_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = p_user_data;

	if (d->pos >= d->size)
		return (OPJ_SIZE_T) -1;

	/*
	 * Clamp against what is left instead of computing pos + request,
	 * which could wrap around for a very large request and bypass
	 * the bound.
	 */
	OPJ_SIZE_T remaining = d->size - d->pos;
	OPJ_SIZE_T ret_size = p_nb_bytes < remaining ? p_nb_bytes : remaining;

	memcpy(p_buffer, d->data + d->pos, ret_size);
	d->pos += ret_size;

	return ret_size;
}
/*
 * OpenJPEG skip callback for the in-memory stream.
 *
 * Advances the current position by up to p_nb_bytes, stopping at the
 * end of the data.
 *
 * Returns the number of bytes actually skipped (not the resulting
 * absolute position), or (OPJ_OFF_T) -1 when the request is negative
 * or nothing can be skipped because the end has been reached.
 */
static OPJ_OFF_T
_opj_stream_skip(OPJ_OFF_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = p_user_data;

	/* The position is unsigned, a backwards skip would wrap it */
	if (p_nb_bytes < 0)
		return (OPJ_OFF_T) -1;

	if (p_nb_bytes == 0)
		return 0;

	if (d->pos >= d->size)
		return (OPJ_OFF_T) -1;

	OPJ_OFF_T remaining = (OPJ_OFF_T) (d->size - d->pos);
	OPJ_OFF_T skipped = p_nb_bytes < remaining ? p_nb_bytes : remaining;

	d->pos += (OPJ_SIZE_T) skipped;

	return skipped;
}
/*
 * OpenJPEG seek callback for the in-memory stream.
 *
 * Moves the current position to the absolute offset p_nb_bytes.
 *
 * Returns OPJ_TRUE on success, OPJ_FALSE when the offset is negative
 * or lies beyond the end of the data (seeking exactly to the end is
 * allowed).
 */
static OPJ_BOOL
_opj_stream_seek(OPJ_OFF_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = p_user_data;

	/* A negative offset would become a huge unsigned position */
	if (p_nb_bytes < 0 || p_nb_bytes > (OPJ_OFF_T) d->size)
		return OPJ_FALSE;

	d->pos = (OPJ_SIZE_T) p_nb_bytes;

	return OPJ_TRUE;
}
/*
 * Read the dimensions of a JPEG 2000 image held in memory.
 *
 * data/data_size describe the encoded image. A buffer starting with
 * the bytes 0xff 0x4f is decoded as a raw codestream (J2K), anything
 * else as a JP2 file. Only the header is read, no pixel data is
 * decoded.
 *
 * On success *jp2_width and *jp2_height are set and 0 is returned.
 * On any failure 1 is returned and the outputs are left untouched.
 */
int
strinfo_jp2_dim(int *jp2_width, int *jp2_height,
	const char * restrict data, int data_size)
{
	opj_codec_t *codec;
	opj_dparameters_t param;
	opj_stream_t *stream;
	opj_image_t *image = NULL;
	stream_user_data d;

	/* Two bytes are needed to tell the container formats apart */
	if (data == NULL || data_size < 2)
		return 1;

	opj_set_default_decoder_parameters(&param);

	if ((unsigned char) data[0] == 0xff && (unsigned char) data[1] == 0x4f)
		codec = opj_create_decompress(OPJ_CODEC_J2K);
	else
		codec = opj_create_decompress(OPJ_CODEC_JP2);

	if (codec == NULL)
		return 1;

	if (!opj_setup_decoder(codec, &param)) {
		opj_destroy_codec(codec);
		return 1;
	}

	stream = opj_stream_default_create(OPJ_TRUE);

	if (stream == NULL) {
		opj_destroy_codec(codec);
		return 1;
	}

	d.pos = 0;
	d.size = data_size;
	d.data = (const unsigned char *) data;

	opj_stream_set_read_function(stream, _opj_stream_read);
	opj_stream_set_skip_function(stream, _opj_stream_skip);
	opj_stream_set_seek_function(stream, _opj_stream_seek);
	/* No free callback: d lives on this stack frame */
	opj_stream_set_user_data(stream, &d, NULL);
	opj_stream_set_user_data_length(stream, data_size);

	int ok = opj_read_header(stream, codec, &image) && image != NULL;

	/* The image header is owned by us and outlives codec and stream */
	opj_destroy_codec(codec);
	opj_stream_destroy(stream);

	if (!ok)
		return 1;

	*jp2_width = image->x1 - image->x0;
	*jp2_height = image->y1 - image->y0;

	opj_image_destroy(image);

	return 0;
}

8
src/jp2.h Normal file
View file

@ -0,0 +1,8 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Read the width and height of the JPEG 2000 image stored in the
 * data_size bytes at data. Returns 0 and fills *jp2_width and
 * *jp2_height on success, 1 on failure.
 */
int strinfo_jp2_dim(int *jp2_width, int *jp2_height,
const char * restrict data, int data_size);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -9,7 +9,7 @@
#include <jpeglib.h>
int
strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height,
strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
const char * restrict data, int data_size)
{
struct jpeg_decompress_struct cinfo;
@ -27,6 +27,7 @@ strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height,
*jpeg_width = cinfo.output_width;
*jpeg_height = cinfo.output_height;
*jpeg_components = cinfo.output_components;
jpeg_destroy((struct jpeg_common_struct *) &cinfo);

View file

@ -1,8 +1,8 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height,
int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
const char * restrict data, int data_size);

24
src/md5.c Normal file
View file

@ -0,0 +1,24 @@
/*
* Copyright (c) 2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include <openssl/md5.h>
/*
 * Compute the MD5 digest of src_size bytes at src.
 *
 * *dst_size is always set to MD5_DIGEST_LENGTH. *dst receives a
 * buffer obtained from malloc() holding the digest; releasing it is
 * left to the caller.
 *
 * Returns 0 on success, 1 if the buffer could not be allocated (in
 * which case *dst is NULL).
 */
int
strmd5(unsigned char **dst, int *dst_size,
	const unsigned char * restrict src, int src_size)
{
	unsigned char *digest;

	*dst_size = MD5_DIGEST_LENGTH;

	digest = malloc(MD5_DIGEST_LENGTH);
	*dst = digest;

	if (digest != NULL) {
		MD5(src, src_size, digest);
		return 0;
	}

	return 1;
}

9
src/md5.h Normal file
View file

@ -0,0 +1,9 @@
/*
* Copyright (c) 2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Compute the MD5 digest of the src_size bytes at src. On success 0 is
 * returned, *dst points to a malloc()ed buffer holding the digest and
 * *dst_size is its length in bytes. Returns 1 if allocation fails.
 */
int
strmd5(unsigned char **dst, int *dst_size,
const unsigned char * restrict src, int src_size);

View file

@ -98,7 +98,8 @@ main(int argc, char **argv)
strerror(errno));
return EXIT_FAILURE;
}
} else if (strncmp(param->file_stat->type, "HN", 2) == 0) {
} else if (strncmp(param->file_stat->type, "HN", 2) == 0 ||
(unsigned char) param->file_stat->type[0] == 0xc8) {
if (cnki_hn(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));

View file

@ -89,7 +89,7 @@ pdf_get_free_id(pdf_object_t **pdf)
int id = 0;
for (int i = 1; i < 99999999; i++) {
for (int i = 1; i < 100000000; i++) {
ptr = (*pdf)->next;
while (ptr != NULL) {
if (ptr->id == i) {
@ -123,7 +123,7 @@ pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count)
int id = 0;
pdf_object_t *ptr;
for (int i = 1; i < 99999999; i++) {
for (int i = 1; i < 100000000; i++) {
ptr = (*pdf)->next;
while (ptr != NULL) {
if (ptr->id == i) {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -19,26 +19,35 @@ static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
const char whitespace[6] = {
'\r',
'\n',
'\f',
'\t',
'\0',
' '
0x00,
0x09,
0x0a,
0x0c,
0x0d,
0x20
};
char tmp[s1 + 1];
memcpy(tmp, p1, s1);
char *ret = NULL;
char *ret;
char str[s1 + 1];
memcpy(str, p1, s1);
size_t tmp_size = 0;
char *tmp;
for (int i = 0; i < 6; i++) {
tmp[s1] = whitespace[i];
if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
return ret;
str[s1] = whitespace[i];
if ((tmp = memmem(p0, s0, str, s1 + 1)) == NULL)
continue;
if (tmp_size == 0 || (size_t) (tmp - (char *) p0) < tmp_size) {
tmp_size = tmp - (char *) p0;
ret = tmp;
}
}
return NULL;
return ret;
}
static int
@ -57,23 +66,45 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
end = ftell(*fp);
fseek(*fp, cur, SEEK_SET);
int head = 0;
int tail = 0;
long head = 0;
long tail = 0;
char *pos;
char *tmp;
for (;;) {
fread(buf, size_buf, 1, *fp);
if (cur + size_buf < end) {
fread(buf, size_buf, 1, *fp);
} else {
fread(buf, end - cur, 1, *fp);
memset(buf + end - cur, 0, size_buf - end + cur);
}
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
head = cur + (pos - buf) + 7;
if (head == 0) {
/* Hack needed for invalid object */
pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6);
tmp = memmem(buf, size_buf, " 0 obj", 6);
while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b)
tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6);
if (pos != NULL && tmp != NULL) {
if (pos - buf < tmp - buf)
head = cur + (pos - buf) + 7;
else
head = cur + (tmp - buf) + 6;
} else if (pos != NULL) {
head = cur + (pos - buf) + 7;
} else if (tmp != NULL) {
head = cur + (tmp - buf) + 6;
}
}
if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) {
/* We need to check if it is the object stored in stream */
while (memcmp(pos + 7,
"\r\nendstream", 11) == 0 &&
(tmp = _memmem_whitespace(pos + 6,
size_buf - (pos - buf) - 6,
(tmp = _memmem_whitespace(pos + 7,
size_buf - (pos - buf) - 7,
"endobj", 6)) != NULL)
pos = tmp;
@ -102,13 +133,17 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->address = head;
ptr->size = tail - head;
fseek(*fp, tail + 6, SEEK_SET);
fseek(*fp, tail + 7, SEEK_SET);
head = tail = 0;
} else if (head > 0 && tail > 0) {
if (cur + size_buf < end)
fseek(*fp, head, SEEK_SET);
tail = 0;
} else {
fseek(*fp, -6, SEEK_CUR);
fseek(*fp, -7, SEEK_CUR);
}
if ((cur = ftell(*fp)) + 6 >= end)
if ((cur = ftell(*fp)) + 7 >= end)
break;
}
@ -126,6 +161,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
pdf_object_t *ptr = (*pdf)->next;
char str[8];
char *buf;
char *head;
char *tail;
@ -137,34 +173,86 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (buf == NULL)
return 1;
memset(buf, 0, ptr->size);
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
fseek(*fp, ptr->address - 12, SEEK_SET);
fread(buf, 8, 1, *fp);
/* Handle incomplete object */
head = buf;
while ((tmp = _memmem_whitespace(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 7;
/* Hack needed for invalid object */
while ((tmp = memmem(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 6;
if (head - buf > 0) {
ptr->address += head - buf;
ptr->size -= head - buf;
tmp = realloc(buf, ptr->size);
if (tmp == NULL)
return 1;
buf = tmp;
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
}
/* Hack needed for invalid object */
fseek(*fp, ptr->address - 14, SEEK_SET);
fread(str, 8, 1, *fp);
if (str[7] < '0' || str[7] > '9') {
fseek(*fp, ptr->address - 15, SEEK_SET);
fread(str, 8, 1, *fp);
}
for (int i = 7; i >= 0; i--) {
if (str[i] < '0' || str[i] > '9') {
if (i < 7)
ptr->id = atoi(str + i + 1);
else
ptr->id = 0;
for (int i = 0; i < 8; i++) {
if (buf[i] >= '0' && buf[i] <= '9') {
ptr->id = atoi(buf + i);
break;
}
}
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL &&
(tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) {
/*
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
while ((tmp = _memmem_whitespace(tail + 2,
ptr->size - (tail - buf) - 2,
">>", 2)) != NULL &&
memmem(tail + 2,
ptr->size - (tail - buf) - 2,
"stream\r\n", 8) == NULL)
tail = tmp;
((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL ||
/* Hack needed for invalid object */
(tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) {
if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) {
tail = memmem(buf, ptr->size, ">>", 2);
while (ptr->size - (tail - buf) > 2 &&
(tmp = memmem(tail + 2,
ptr->size - (tail - buf) - 2,
">>", 2)) != NULL &&
memmem(tail + 2,
(tmp - tail) - 2,
"stream\r\n", 8) == NULL)
tail = tmp;
} else {
/*
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
while (ptr->size - (tail - buf) > 3 &&
(tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
memmem(tail + 3,
(tmp - tail) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
}
ptr->dictionary_size = tail - head + 2;
ptr->dictionary = malloc(ptr->dictionary_size + 1);
@ -172,8 +260,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (ptr->dictionary == NULL)
return 1;
memset(ptr->dictionary, 0, ptr->dictionary_size + 1);
memcpy(ptr->dictionary, head, ptr->dictionary_size);
memset(ptr->dictionary + ptr->dictionary_size, 0, 1);
if ((head = memmem(tail,
ptr->size - (tail - buf),
@ -186,11 +274,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
* contains another object that
* contains another stream
*/
while (_memmem_whitespace(tail,
ptr->size - (tail - buf),
while (_memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
"endobj", 6) != NULL &&
(tmp = _memmem_whitespace(tail + 9,
ptr->size - (tail - buf) - 9,
(tmp = _memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
"endstream", 9)) != NULL)
tail = tmp;
@ -202,19 +290,13 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
memcpy(ptr->stream, head + 8, ptr->stream_size);
}
free(buf);
} else {
ptr->object_size = ptr->size;
ptr->object = malloc(ptr->object_size + 1);
if (ptr->object == NULL)
return 1;
memset(ptr->object, 0, ptr->object_size + 1);
memcpy(ptr->object, buf, ptr->object_size);
ptr->object = buf;
}
free(buf);
ptr = ptr->next;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -8,14 +8,32 @@
#include <string.h>
#include <time.h>
#include <openssl/md5.h>
#include "version.h"
#include "md5.h"
#include "pdf.h"
/*
 * Build the document information dictionary (producer string and
 * creation date in UTC) and hand it to pdf_obj_append().
 *
 * Returns the result of pdf_obj_append(), or 1 if the current time
 * cannot be formatted or the dictionary does not fit its buffer.
 */
static int
_info_obj(pdf_object_t **pdf)
{
	char date[64];
	char dictionary[128];

	time_t timestamp = time(NULL);
	struct tm *utc = gmtime(&timestamp);

	/* gmtime() may fail, and strftime() leaves date undefined on 0 */
	if (utc == NULL ||
	    strftime(date, sizeof(date), "%Y%m%d%H%M%S", utc) == 0)
		return 1;

	/*
	 * Bounded formatting: the length of the producer string depends
	 * on the version macros, so never assume it fits.
	 */
	int length = snprintf(dictionary, sizeof(dictionary),
	    "<<\n"
	    "/Producer (Melon %s)\n"
	    "/CreationDate (D:%s+00'00')\n>>",
	    VERSION "." RELEASE "." PATCH EXTRA, date);

	if (length < 0 || (size_t) length >= sizeof(dictionary))
		return 1;

	return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0);
}
int
pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
{
if (*pdf == NULL || *fp == NULL)
if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0)
return 1;
long cur;
@ -144,35 +162,28 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
buf_size = snprintf(buf, 64, "%lx%x", timestamp, size);
#endif
unsigned char str[64];
memcpy(str, buf, 64);
int fid_size;
unsigned char *fid;
unsigned char fid[MD5_DIGEST_LENGTH];
MD5(str, buf_size, fid);
if (strmd5(&fid, &fid_size, (unsigned char *) buf, buf_size) != 0)
return 1;
pdf_object_t *ptr = *pdf;
while (ptr->next != NULL)
ptr = ptr->next;
/*
* TODO: Document information dictionary
* `"/Producer (Melon)"'
* `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"'
*
* Trailer dictionary
* `"/Info %d 0 R"'
*/
fprintf(*fp,
"/Size %d\n/Root %d 0 R\n",
"/Size %d\n/Root %d 0 R\n/Info %d 0 R\n",
ptr->id + 1,
pdf_get_catalog_id(pdf));
pdf_get_catalog_id(pdf),
ptr->id);
fputs("/ID [", *fp);
for (int i = 0; i < 2; i++) {
fputs("<", *fp);
for (int j = 0; j < MD5_DIGEST_LENGTH; j++)
for (int j = 0; j < fid_size; j++)
fprintf(*fp, "%02x", fid[j]);
fputs(">", *fp);
@ -191,5 +202,7 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
fputs("%%EOF\n", *fp);
free(fid);
return 0;
}

View file

@ -1,10 +1,10 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#define VERSION "0"
#define RELEASE "2"
#define RELEASE "3"
#define PATCH "0"
#define EXTRA ""