Compare commits

..

No commits in common. "master" and "0.2.0" have entirely different histories.

25 changed files with 458 additions and 1072 deletions

View file

@ -1,44 +1,7 @@
0.3.0 (2023-XX-XX)
==================
* Support HN text overlay.
* Support HN page with text.
* Handle inaccurate page count in CAJ and KDH.
0.2.5 (2023-01-05)
==================
* Improve PDF parser.
* Handle duplicated object in CAJ.
* Handle duplicated image in HN.
* Handle incomplete PDF object in CAJ and KDH.
* Handle invalid PDF object token in CAJ and KDH.
* Fix JBIG decoder.
0.2.4 (2022-12-31)
==================
* Fix HN image compositing.
* Fix PDF object check.
0.2.3 (2022-12-30)
==================
* Support HN figure placement.
0.2.2 (2022-12-29)
0.3.0 (2022-XX-XX)
==================
* Support JPEG 2000 for HN.
* Handle missing but referenced root object.
* Handle HN with more than one image per page.
* Fix buffer overflow.
0.2.1 (2022-12-26)
==================
* Handle different JPEG colour component.
* Handle headless HN and page with no image.
0.2.0 (2022-12-22)
==================

View file

@ -9,15 +9,16 @@ Development
Currently, CAJ, KDH, and HN can be converted. Please report
any failures with a sample that can reproduce the behaviour.
HN conversion does not support JPEG 2000 yet.
Dependency
----------
1. libcrypto (OpenSSL)
2. zlib
3. jbig2dec
4. libjpeg-turbo
5. openjpeg
6. pkgconf
1. OpenSSL
2. libiconv
3. zlib
4. jbig2dec
5. libjpeg-turbo
Usage
=====
@ -35,12 +36,12 @@ Options
Specify output file
-b, --buffer
Set input buffer size (default 512k)
Set buffer size (default 512k)
-v, --verbose
Print more information (twice for even more, three times for HN image processing information as well)
Print more information (twice for even more, three times for HN image decoding information as well)
Thanks
======
This project is inspired by [https://github.com/caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf)
This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf)

View file

@ -4,28 +4,23 @@
# SPDX-License-Identifier: Apache-2.0
#
src = melon.c iconv.c zlib.c jbig2.c jpeg.c jp2.c md5.c \
cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c cnki_pdf.c \
cnki_zlib.c cnki_jbig.c cnki_jbig_dec.c cnki_jbig2.c cnki.c \
src = melon.c iconv.c zlib.c jbig.c jbig2.c jpeg.c \
cnki_caj.c cnki_hn.c cnki_kdh.c cnki_outline_tree.c \
cnki_pdf.c cnki_zlib.c cnki_jbig.c cnki_jbig2.c cnki.c \
pdf_cnki.c pdf_get.c pdf_parser.c pdf_writer.c pdf.c
inc = extern.h version.h iconv.h zlib.h jbig2.h jpeg.h jp2.h md5.h \
cnki.h pdf_cnki.h cnki_jbig.h cnki_jbig_dec.h pdf.h
inc = extern.h version.h iconv.h zlib.h jbig.h jbig2.h jpeg.h \
cnki.h pdf_cnki.h cnki_jbig.h pdf.h
obj = ${src:.c=.o}
PREFIX = /usr/local
CFLAGS = -O2 -pipe -flto -Wall -Wextra
LDFLAGS = -Wl,-O2 -lcrypto -lz -ljbig2dec -ljpeg -lopenjp2 -Wl,--as-needed
LDFLAGS = -Wl,-O2 -lcrypto -liconv -lz -ljbig2dec -ljpeg -Wl,--as-needed
CFLAGS += -I/usr/local/include
LDFLAGS += -L/usr/local/lib
OPENJPEG_CFLAGS != pkgconf --cflags libopenjp2
CFLAGS += ${OPENJPEG_CFLAGS}
CFLAGS += -DLIBICONV_PLUG
all: ${obj} ${inc}
${CC} ${LDFLAGS} -o melon ${obj}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -54,11 +54,6 @@ cnki_destroy(cnki_t **param)
object_hn_t *ptr_hn;
while ((ptr_hn = (*param)->object_hn) != NULL) {
(*param)->object_hn = (*param)->object_hn->next;
free(ptr_hn->text);
if (ptr_hn->image_data != NULL)
for (int i = 0; i < ptr_hn->image_length; i++)
free(ptr_hn->image_data[i].image);
free(ptr_hn->image_data);
free(ptr_hn);
}
@ -76,19 +71,12 @@ cnki_info(cnki_t **param)
printf("Reading file header at 0x%x\n", ADDRESS_HEAD);
int addr[2];
unsigned char str[2];
fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET);
fread((*param)->file_stat->type, 4, 1, (*param)->fp_i);
fread(str, 2, 1, (*param)->fp_i);
if ((*param)->stat > 0) {
if ((unsigned char) (*param)->file_stat->type[0] > 0x7f)
printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]);
else
printf("File type is '%s'\n", (*param)->file_stat->type);
}
if ((*param)->stat > 0)
printf("File type is '%s'\n", (*param)->file_stat->type);
if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
return 0;
@ -98,9 +86,6 @@ cnki_info(cnki_t **param)
} else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
addr[0] = ADDRESS_HN_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
addr[0] = ADDRESS_C8_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
} else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
return 0;
} else {
@ -117,14 +102,6 @@ cnki_info(cnki_t **param)
printf("Advised %d page(s)\n",
(*param)->file_stat->page);
if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) {
fseek((*param)->fp_i, 0xd8, SEEK_SET);
return 0;
} else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) {
fseek((*param)->fp_i, 0x50, SEEK_SET);
return 0;
}
if ((*param)->stat > 1)
printf("Reading outline count at 0x%x\n", addr[1]);
@ -138,7 +115,7 @@ cnki_info(cnki_t **param)
if ((*param)->file_stat->outline > 0) {
if ((*param)->stat > 1) {
printf("Loading outline(s)\n");
printf("\t%19s\t%-24s\t%12s\t%12s\t%5s\n",
printf("\t%16s\t%-24s\t%12s\t%12s\t%5s\n",
"title",
"hierarchy",
"page",

View file

@ -16,8 +16,6 @@
#define ADDRESS_HN_PAGE 0x0090
#define ADDRESS_HN_OUTLINE 0x0158
#define ADDRESS_C8_PAGE 0x0008
#define ADDRESS_KDH_BODY 0x00fe
#define KEY_KDH "FZHMEI"
@ -58,10 +56,6 @@ typedef struct _hn_image_t {
int32_t format; /* hn_code */
int32_t address;
int32_t size;
uint16_t x;
uint16_t y;
uint16_t w;
uint16_t h;
char *image;
} hn_image_t;
@ -70,8 +64,7 @@ typedef struct _object_hn_t {
int32_t text_size;
int16_t image_length;
int16_t page;
int32_t unknown; /* TODO: what is it? */
int32_t address_next;
int32_t unknown[2]; /* TODO: what is it? */
char *text;
struct _hn_image_t *image_data;
struct _object_hn_t *next;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -27,13 +27,12 @@ cnki_hn(cnki_t **param)
if ((*param)->stat > 1) {
printf("Loading page(s)\n");
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n",
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n",
"address",
"text",
"length",
"page",
"unknown",
"next",
"code",
"address",
"image");
@ -45,8 +44,7 @@ cnki_hn(cnki_t **param)
fread(&ptr->text_size, 4, 1, (*param)->fp_i);
fread(&ptr->image_length, 2, 1, (*param)->fp_i);
fread(&ptr->page, 2, 1, (*param)->fp_i);
fread(&ptr->unknown, 4, 1, (*param)->fp_i);
fread(&ptr->address_next, 4, 1, (*param)->fp_i);
fread(&ptr->unknown, 8, 1, (*param)->fp_i);
ptr->text = NULL;
ptr->image_data = NULL;
@ -64,80 +62,66 @@ cnki_hn(cnki_t **param)
ptr = (*param)->object_hn;
while (ptr != NULL) {
if (ptr->text_size > 0) {
ptr->text = malloc(ptr->text_size);
ptr->text = malloc(ptr->text_size);
if (ptr->text == NULL)
return 1;
if (ptr->text == NULL)
return 1;
fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
}
fseek((*param)->fp_i, ptr->address, SEEK_SET);
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
if ((*param)->stat > 1)
printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x",
printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}",
ptr->address,
ptr->text_size,
ptr->image_length,
ptr->page,
ptr->unknown,
ptr->address_next);
ptr->unknown[0],
ptr->unknown[1]);
if (ptr->image_length > 0) {
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
if (ptr->image_data == NULL)
if (ptr->image_data == NULL)
return 1;
for (int i = 0; i < ptr->image_length; i++) {
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);
}
for (int i = 0; i < ptr->image_length; i++) {
ptr->image_data[i].image = malloc(ptr->image_data[i].size);
if (ptr->image_data[i].image == NULL)
return 1;
for (int i = 0; i < ptr->image_length; i++) {
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
ptr->image_data[i].x = 0;
ptr->image_data[i].y = 0;
ptr->image_data[i].w = 0;
ptr->image_data[i].h = 0;
fseek((*param)->fp_i,
ptr->image_data[i].address + ptr->image_data[i].size,
SEEK_SET);
}
fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET);
fread(ptr->image_data[i].image,
ptr->image_data[i].size, 1,
(*param)->fp_i);
for (int i = 0; i < ptr->image_length; i++) {
ptr->image_data[i].image = malloc(ptr->image_data[i].size);
if (ptr->image_data[i].image == NULL)
return 1;
fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET);
fread(ptr->image_data[i].image,
ptr->image_data[i].size, 1,
(*param)->fp_i);
if ((*param)->stat > 1) {
if (i == 0) {
printf("\t%4d\t%08x\t%8d\n",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
} else {
printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n",
"",
"",
"",
"",
"",
"",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
}
if ((*param)->stat > 1) {
if (i == 0) {
printf("\t%4d\t%08x\t%8d\n",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
} else {
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n",
"",
"",
"",
"",
"",
ptr->image_data[i].format,
ptr->image_data[i].address,
ptr->image_data[i].size);
}
}
} else if ((*param)->stat > 1) {
printf("\t%4s\t%8s\t%8s\n",
"",
"",
"");
}
ptr = ptr->next;

View file

@ -8,7 +8,7 @@
#include <string.h>
#include "cnki_jbig.h"
#include "cnki_jbig_dec.h"
#include "jbig.h"
int
cnki_jbig(char **bitmap, int *bitmap_size,

View file

@ -27,8 +27,8 @@ typedef struct _dib_t {
uint16_t depth;
uint32_t compression; /* dib_compression_code */
uint32_t size;
int32_t resolution_h;
int32_t resolution_v;
uint32_t resolution_h;
uint32_t resolution_v;
uint32_t colour;
uint32_t colour_used;
} dib_t;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -15,18 +15,16 @@ cnki_kdh(cnki_t **param)
if ((*param)->stat > 0)
printf("Begin 'KDH' decryption\n");
long cur = ADDRESS_KDH_BODY;
long end;
fseek((*param)->fp_i, 0, SEEK_END);
end = ftell((*param)->fp_i);
fseek((*param)->fp_i, cur, SEEK_SET);
long size = ftell((*param)->fp_i);
fseek((*param)->fp_i, ADDRESS_KDH_BODY, SEEK_SET);
const char key[] = KEY_KDH;
const int key_len = KEY_KDH_LENGTH;
long key_cur = 0;
int buf_size;
char buf[(*param)->size_buf];
FILE *tmp = tmpfile();
@ -35,32 +33,32 @@ cnki_kdh(cnki_t **param)
return 1;
for (;;) {
if (cur + (*param)->size_buf < end)
buf_size = (*param)->size_buf;
else
buf_size = end - cur;
fread(buf, (*param)->size_buf, 1, (*param)->fp_i);
fread(buf, buf_size, 1, (*param)->fp_i);
for (int i = 0; i < (*param)->size_buf; i++) {
buf[i] ^= key[key_cur % key_len];
key_cur++;
}
for (int i = 0; i < buf_size; i++)
buf[i] ^= key[key_cur++ % key_len];
fwrite(buf, (*param)->size_buf, 1, tmp);
fwrite(buf, buf_size, 1, tmp);
if ((cur = ftell((*param)->fp_i)) >= end)
if (ftell((*param)->fp_i) == size)
break;
}
if ((*param)->stat > 0)
printf("Decrypted %ld byte(s)\n", ftell(tmp));
fclose((*param)->fp_i);
fseek(tmp, 0, SEEK_SET);
FILE *orig = (*param)->fp_i;
(*param)->fp_i = tmp;
cnki_pdf(param);
(*param)->fp_i = orig;
fclose(tmp);
if ((*param)->stat > 0)
printf("Conversion ended\n");

File diff suppressed because it is too large Load diff

View file

@ -13,17 +13,12 @@ int
cnki_zlib(char **dst, int *dst_size,
const char * restrict src, int src_size)
{
uint8_t padding = 0;
int32_t size;
if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0)
padding = 8;
memcpy(&size, src + 12 + padding, 4);
memcpy(&size, src + 20, 4);
*dst_size = size;
if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0)
if (strinflate(dst, size, src + 24, src_size - 24) != 0)
return 1;
return 0;

View file

@ -9,7 +9,7 @@
#include <stdlib.h>
#include <string.h>
static const uint16_t _LSZ[0x71] = {
static const uint16_t _LSZ[256] = {
0x5a1d,
0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036,
0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2,
@ -28,7 +28,7 @@ static const uint16_t _LSZ[0x71] = {
0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb
};
static const uint8_t _NLPS[0x71] = {
static const uint8_t _NLPS[256] = {
1,
14, 16, 18, 20, 23, 25, 28, 30,
33, 35, 9, 10, 12, 15, 36, 38,
@ -47,7 +47,7 @@ static const uint8_t _NLPS[0x71] = {
105, 108, 109, 110, 111, 110, 112, 112
};
static const uint8_t _NMPS[0x71] = {
static const uint8_t _NMPS[256] = {
1,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 13, 15, 16, 17,
@ -66,7 +66,7 @@ static const uint8_t _NMPS[0x71] = {
106, 107, 103, 109, 107, 111, 109, 111
};
static const bool _SWTCH[0x71] = {
static const bool _SWTCH[256] = {
1,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0,
@ -99,7 +99,7 @@ static int _height;
static int _width_padded;
static int _ret_pos;
static unsigned char *_ret;
static char *_ret;
static int _scd_size;
static unsigned char *_scd;
@ -108,7 +108,7 @@ static void
_bytein(void)
{
if (_ret_pos < _scd_size)
_reg_c += _scd[_ret_pos++] << 8;
_reg_c += *(_scd + _ret_pos++) << 8;
_ct = 8;
}
@ -207,18 +207,7 @@ static void
_procline(int line, char *a, char *b, char *c)
{
/* The encoder must be erroneous */
uint16_t cx = 0;
if (line > 0) {
cx += (_ret[_width_padded * (_height - line)] & 0x20) << 2;
cx += _ret[_width_padded * (_height - line)] & 0x40;
cx += (_ret[_width_padded * (_height - line)] & 0x80) >> 2;
}
if (line > 1) {
cx += (_ret[_width_padded * (_height - line + 1)] & 0x40) >> 4;
cx += (_ret[_width_padded * (_height - line + 1)] & 0x80) >> 6;
}
uint16_t cx = (*b & 0x01) << 2;
for (int i = 0; i < _width; i++) {
_decode(cx);
@ -226,19 +215,19 @@ _procline(int line, char *a, char *b, char *c)
cx >>= 1;
if (_pix == 1) {
_ret[_width_padded * (_height - line - 1) + i / 8] |= _pix << (7 - (i & 0x07));
c[i] = 1;
*(_ret + _width_padded * (_height - line - 1) + i / 8) |= _pix << (7 - (i & 0x07));
*(c + i) = 1;
cx |= 0x0200;
} else {
cx &= 0xfdff;
}
if (i + 2 < _width && a[i + 2] == 1)
if (i + 2 < _width && *(a + i + 2) == 1)
cx |= 0x0004;
else
cx &= 0xfffb;
if (i + 3 < _width && b[i + 3] == 1)
if (i + 3 < _width && *(b + i + 3) == 1)
cx |= 0x0080;
else
cx &= 0xff7f;
@ -304,7 +293,7 @@ strdec_jbig(char **bitmap, int width, int height,
memset(*bitmap, 0, _height * _width_padded);
_ret_pos = 0;
_ret = (unsigned char *) *bitmap;
_ret = *bitmap;
_scd_size = jbig_size;
_scd = (unsigned char *) jbig;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, yzrh <yzrh@noema.org>
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -31,6 +31,5 @@ strdec_jbig2(char **bitmap,
}
jbig2_release_page(ctx, image);
jbig2_ctx_free(ctx);
return 0;
}

115
src/jp2.c
View file

@ -1,115 +0,0 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <string.h>
#include <openjpeg.h>
/*
 * Read cursor over an in-memory buffer, handed to the OpenJPEG stream
 * callbacks in this file as their user data.
 */
typedef struct _stream_user_data {
/* Current offset into data; the read/skip/seek callbacks update it */
OPJ_SIZE_T pos;
/* Number of bytes available in data */
OPJ_SIZE_T size;
/* Start of the buffer being read */
const unsigned char *data;
} stream_user_data;
/*
 * OpenJPEG read callback backed by a memory buffer.
 *
 * Copies up to p_nb_bytes bytes from the cursor position into p_buffer
 * and advances the cursor by the amount copied.
 *
 * Returns the number of bytes copied, or (OPJ_SIZE_T) -1 once the cursor
 * has reached the end of the buffer.
 */
static OPJ_SIZE_T
_opj_stream_read(void *p_buffer, OPJ_SIZE_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *cursor = p_user_data;
	OPJ_SIZE_T count;

	/* Nothing left to hand out */
	if (cursor->pos >= cursor->size)
		return (OPJ_SIZE_T) -1;

	/* Trim the request to what is still available */
	if (cursor->pos + p_nb_bytes > cursor->size)
		count = cursor->size - cursor->pos;
	else
		count = p_nb_bytes;

	memcpy(p_buffer, cursor->data + cursor->pos, count);
	cursor->pos += count;

	return count;
}
/*
 * OpenJPEG skip callback backed by a memory buffer.
 *
 * Advances the cursor by p_nb_bytes, stopping at the end of the buffer.
 *
 * Returns the number of bytes actually skipped (not the new position:
 * the library subtracts the return value from the outstanding request),
 * or (OPJ_OFF_T) -1 when the request is negative or when nothing can be
 * skipped because the cursor is already at the end.
 */
static OPJ_OFF_T
_opj_stream_skip(OPJ_OFF_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = (stream_user_data *) p_user_data;

	/* A negative count would wrap around once mixed with OPJ_SIZE_T */
	if (p_nb_bytes < 0)
		return (OPJ_OFF_T) -1;

	if (p_nb_bytes == 0)
		return 0;

	OPJ_SIZE_T left = d->pos < d->size ? d->size - d->pos : 0;

	/* Report end of data instead of a zero-length skip */
	if (left == 0)
		return (OPJ_OFF_T) -1;

	OPJ_SIZE_T step = (OPJ_SIZE_T) p_nb_bytes;

	if (step > left)
		step = left;

	d->pos += step;

	return (OPJ_OFF_T) step;
}
/*
 * OpenJPEG seek callback backed by a memory buffer.
 *
 * Moves the cursor to the absolute offset p_nb_bytes.
 *
 * Returns OPJ_TRUE on success, or OPJ_FALSE when the offset is negative
 * or lies beyond the end of the buffer; the cursor is left untouched in
 * that case.
 */
static OPJ_BOOL
_opj_stream_seek(OPJ_OFF_T p_nb_bytes, void *p_user_data)
{
	stream_user_data *d = (stream_user_data *) p_user_data;

	/* A negative offset would turn into a huge unsigned position */
	if (p_nb_bytes < 0)
		return OPJ_FALSE;

	if ((OPJ_SIZE_T) p_nb_bytes > d->size)
		return OPJ_FALSE;

	d->pos = (OPJ_SIZE_T) p_nb_bytes;

	return OPJ_TRUE;
}
/*
 * Read the header of a JPEG 2000 image held in memory and report its
 * dimensions.
 *
 * jp2_width, jp2_height: receive the image width and height; written
 *                        only on success
 * data, data_size:       the encoded image; a buffer starting with
 *                        0xff 0x4f is decoded as a raw J2K codestream,
 *                        anything else as a JP2 file
 *
 * Returns 0 on success, or 1 if the input is missing or too short, a
 * decoder object cannot be created, or the header cannot be read.
 */
int
strinfo_jp2_dim(int *jp2_width, int *jp2_height,
	const char * restrict data, int data_size)
{
	opj_codec_t *codec = NULL;
	opj_stream_t *stream = NULL;
	opj_image_t *image = NULL;
	opj_dparameters_t param;
	stream_user_data d;
	int ret = 1;

	if (data == NULL || data_size < 2)
		return 1;

	opj_set_default_decoder_parameters(&param);

	if ((unsigned char) data[0] == 0xff && (unsigned char) data[1] == 0x4f)
		codec = opj_create_decompress(OPJ_CODEC_J2K);
	else
		codec = opj_create_decompress(OPJ_CODEC_JP2);

	if (codec == NULL || !opj_setup_decoder(codec, &param))
		goto out;

	stream = opj_stream_default_create(OPJ_TRUE);

	if (stream == NULL)
		goto out;

	d.pos = 0;
	d.size = data_size;
	d.data = (const unsigned char *) data;

	opj_stream_set_read_function(stream, _opj_stream_read);
	opj_stream_set_skip_function(stream, _opj_stream_skip);
	opj_stream_set_seek_function(stream, _opj_stream_seek);
	/* d lives on the stack, so no free callback is registered */
	opj_stream_set_user_data(stream, &d, NULL);
	opj_stream_set_user_data_length(stream, data_size);

	if (!opj_read_header(stream, codec, &image) || image == NULL)
		goto out;

	*jp2_width = image->x1 - image->x0;
	*jp2_height = image->y1 - image->y0;

	ret = 0;

out:
	/* Single exit path so every object created above is released once */
	if (image != NULL)
		opj_image_destroy(image);
	if (stream != NULL)
		opj_stream_destroy(stream);
	if (codec != NULL)
		opj_destroy_codec(codec);

	return ret;
}

View file

@ -1,8 +0,0 @@
/*
* Copyright (c) 2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Report the dimensions of the JPEG 2000 image stored in the first
 * data_size bytes of data through jp2_width and jp2_height.
 *
 * Returns 0 on success and 1 on failure.
 */
int strinfo_jp2_dim(int *jp2_width, int *jp2_height,
const char * restrict data, int data_size);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -9,7 +9,7 @@
#include <jpeglib.h>
int
strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height,
const char * restrict data, int data_size)
{
struct jpeg_decompress_struct cinfo;
@ -27,7 +27,6 @@ strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
*jpeg_width = cinfo.output_width;
*jpeg_height = cinfo.output_height;
*jpeg_components = cinfo.output_components;
jpeg_destroy((struct jpeg_common_struct *) &cinfo);

View file

@ -1,8 +1,8 @@
/*
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height, int *jpeg_components,
int strinfo_jpeg_dim(int *jpeg_width, int *jpeg_height,
const char * restrict data, int data_size);

View file

@ -1,24 +0,0 @@
/*
* Copyright (c) 2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdlib.h>
#include <openssl/md5.h>
/*
 * Compute the MD5 digest of src_size bytes at src.
 *
 * On success *dst points to a newly allocated buffer holding the digest
 * and *dst_size is its length (MD5_DIGEST_LENGTH); the caller frees *dst.
 *
 * Returns 0 on success, or 1 if the buffer cannot be allocated, in which
 * case *dst is NULL.
 */
int
strmd5(unsigned char **dst, int *dst_size,
	const unsigned char * restrict src, int src_size)
{
	unsigned char *digest;

	*dst_size = MD5_DIGEST_LENGTH;

	digest = malloc(MD5_DIGEST_LENGTH);
	*dst = digest;

	if (digest != NULL) {
		MD5(src, src_size, digest);
		return 0;
	}

	return 1;
}

View file

@ -1,9 +0,0 @@
/*
* Copyright (c) 2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
 * Compute the MD5 digest of src_size bytes at src.
 *
 * Stores a newly allocated digest buffer in *dst and its length in
 * *dst_size. Returns 0 on success and 1 if the allocation fails.
 */
int
strmd5(unsigned char **dst, int *dst_size,
const unsigned char * restrict src, int src_size);

View file

@ -98,8 +98,7 @@ main(int argc, char **argv)
strerror(errno));
return EXIT_FAILURE;
}
} else if (strncmp(param->file_stat->type, "HN", 2) == 0 ||
(unsigned char) param->file_stat->type[0] == 0xc8) {
} else if (strncmp(param->file_stat->type, "HN", 2) == 0) {
if (cnki_hn(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));

View file

@ -89,7 +89,7 @@ pdf_get_free_id(pdf_object_t **pdf)
int id = 0;
for (int i = 1; i < 100000000; i++) {
for (int i = 1; i < 99999999; i++) {
ptr = (*pdf)->next;
while (ptr != NULL) {
if (ptr->id == i) {
@ -123,7 +123,7 @@ pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count)
int id = 0;
pdf_object_t *ptr;
for (int i = 1; i < 100000000; i++) {
for (int i = 1; i < 99999999; i++) {
ptr = (*pdf)->next;
while (ptr != NULL) {
if (ptr->id == i) {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -19,35 +19,26 @@ static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
const char whitespace[6] = {
0x00,
0x09,
0x0a,
0x0c,
0x0d,
0x20
'\r',
'\n',
'\f',
'\t',
'\0',
' '
};
char *ret = NULL;
char tmp[s1 + 1];
memcpy(tmp, p1, s1);
char str[s1 + 1];
memcpy(str, p1, s1);
size_t tmp_size = 0;
char *tmp;
char *ret;
for (int i = 0; i < 6; i++) {
str[s1] = whitespace[i];
if ((tmp = memmem(p0, s0, str, s1 + 1)) == NULL)
continue;
if (tmp_size == 0 || (size_t) (tmp - (char *) p0) < tmp_size) {
tmp_size = tmp - (char *) p0;
ret = tmp;
}
tmp[s1] = whitespace[i];
if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
return ret;
}
return ret;
return NULL;
}
static int
@ -66,45 +57,23 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
end = ftell(*fp);
fseek(*fp, cur, SEEK_SET);
long head = 0;
long tail = 0;
int head = 0;
int tail = 0;
char *pos;
char *tmp;
for (;;) {
if (cur + size_buf < end) {
fread(buf, size_buf, 1, *fp);
} else {
fread(buf, end - cur, 1, *fp);
memset(buf + end - cur, 0, size_buf - end + cur);
}
fread(buf, size_buf, 1, *fp);
if (head == 0) {
/* Hack needed for invalid object */
pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6);
tmp = memmem(buf, size_buf, " 0 obj", 6);
while (tmp != NULL && tmp[6] != 0x3c && tmp[6] != 0x5b)
tmp = memmem(tmp + 6, size_buf - (tmp - buf) - 6, " 0 obj", 6);
if (pos != NULL && tmp != NULL) {
if (pos - buf < tmp - buf)
head = cur + (pos - buf) + 7;
else
head = cur + (tmp - buf) + 6;
} else if (pos != NULL) {
head = cur + (pos - buf) + 7;
} else if (tmp != NULL) {
head = cur + (tmp - buf) + 6;
}
}
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
head = cur + (pos - buf) + 7;
if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) {
/* We need to check if it is the object stored in stream */
while (memcmp(pos + 7,
"\r\nendstream", 11) == 0 &&
(tmp = _memmem_whitespace(pos + 7,
size_buf - (pos - buf) - 7,
(tmp = _memmem_whitespace(pos + 6,
size_buf - (pos - buf) - 6,
"endobj", 6)) != NULL)
pos = tmp;
@ -133,17 +102,13 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->address = head;
ptr->size = tail - head;
fseek(*fp, tail + 7, SEEK_SET);
fseek(*fp, tail + 6, SEEK_SET);
head = tail = 0;
} else if (head > 0 && tail > 0) {
if (cur + size_buf < end)
fseek(*fp, head, SEEK_SET);
tail = 0;
} else {
fseek(*fp, -7, SEEK_CUR);
fseek(*fp, -6, SEEK_CUR);
}
if ((cur = ftell(*fp)) + 7 >= end)
if ((cur = ftell(*fp)) + 6 >= end)
break;
}
@ -161,7 +126,6 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
pdf_object_t *ptr = (*pdf)->next;
char str[8];
char *buf;
char *head;
char *tail;
@ -173,86 +137,34 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (buf == NULL)
return 1;
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
memset(buf, 0, ptr->size);
/* Handle incomplete object */
head = buf;
while ((tmp = _memmem_whitespace(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 7;
/* Hack needed for invalid object */
while ((tmp = memmem(head,
ptr->size - (head - buf),
" 0 obj", 6)) != NULL)
head = tmp + 6;
if (head - buf > 0) {
ptr->address += head - buf;
ptr->size -= head - buf;
tmp = realloc(buf, ptr->size);
if (tmp == NULL)
return 1;
buf = tmp;
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
}
/* Hack needed for invalid object */
fseek(*fp, ptr->address - 14, SEEK_SET);
fread(str, 8, 1, *fp);
if (str[7] < '0' || str[7] > '9') {
fseek(*fp, ptr->address - 15, SEEK_SET);
fread(str, 8, 1, *fp);
}
for (int i = 7; i >= 0; i--) {
if (str[i] < '0' || str[i] > '9') {
if (i < 7)
ptr->id = atoi(str + i + 1);
else
ptr->id = 0;
fseek(*fp, ptr->address - 12, SEEK_SET);
fread(buf, 8, 1, *fp);
for (int i = 0; i < 8; i++) {
if (buf[i] >= '0' && buf[i] <= '9') {
ptr->id = atoi(buf + i);
break;
}
}
if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL &&
((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL ||
/* Hack needed for invalid object */
(tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) {
if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) {
tail = memmem(buf, ptr->size, ">>", 2);
fseek(*fp, ptr->address, SEEK_SET);
fread(buf, ptr->size, 1, *fp);
while (ptr->size - (tail - buf) > 2 &&
(tmp = memmem(tail + 2,
ptr->size - (tail - buf) - 2,
">>", 2)) != NULL &&
memmem(tail + 2,
(tmp - tail) - 2,
"stream\r\n", 8) == NULL)
tail = tmp;
} else {
/*
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
while (ptr->size - (tail - buf) > 3 &&
(tmp = _memmem_whitespace(tail + 3,
ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
memmem(tail + 3,
(tmp - tail) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
}
if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL &&
(tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) {
/*
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
while ((tmp = _memmem_whitespace(tail + 2,
ptr->size - (tail - buf) - 2,
">>", 2)) != NULL &&
memmem(tail + 2,
ptr->size - (tail - buf) - 2,
"stream\r\n", 8) == NULL)
tail = tmp;
ptr->dictionary_size = tail - head + 2;
ptr->dictionary = malloc(ptr->dictionary_size + 1);
@ -260,8 +172,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
if (ptr->dictionary == NULL)
return 1;
memset(ptr->dictionary, 0, ptr->dictionary_size + 1);
memcpy(ptr->dictionary, head, ptr->dictionary_size);
memset(ptr->dictionary + ptr->dictionary_size, 0, 1);
if ((head = memmem(tail,
ptr->size - (tail - buf),
@ -274,11 +186,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
* contains another object that
* contains another stream
*/
while (_memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
while (_memmem_whitespace(tail,
ptr->size - (tail - buf),
"endobj", 6) != NULL &&
(tmp = _memmem_whitespace(tail + 10,
ptr->size - (tail - buf) - 10,
(tmp = _memmem_whitespace(tail + 9,
ptr->size - (tail - buf) - 9,
"endstream", 9)) != NULL)
tail = tmp;
@ -290,13 +202,19 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
memcpy(ptr->stream, head + 8, ptr->stream_size);
}
free(buf);
} else {
ptr->object_size = ptr->size;
ptr->object = buf;
ptr->object = malloc(ptr->object_size + 1);
if (ptr->object == NULL)
return 1;
memset(ptr->object, 0, ptr->object_size + 1);
memcpy(ptr->object, buf, ptr->object_size);
}
free(buf);
ptr = ptr->next;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -8,32 +8,14 @@
#include <string.h>
#include <time.h>
#include "version.h"
#include "md5.h"
#include <openssl/md5.h>
#include "pdf.h"
static int
_info_obj(pdf_object_t **pdf)
{
char dictionary[128] = "<<\n"
"/Producer (Melon " VERSION "." RELEASE "." PATCH EXTRA ")\n"
"/CreationDate (D:";
char buf[64];
time_t timestamp = time(NULL);
strftime(buf, 64, "%Y%m%d%H%M%S", gmtime(&timestamp));
strcat(dictionary, buf);
strcat(dictionary, "+00'00')\n>>");
return pdf_obj_append(pdf, 0, NULL, dictionary, NULL, 0);
}
int
pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
{
if (*pdf == NULL || *fp == NULL || _info_obj(pdf) != 0)
if (*pdf == NULL || *fp == NULL)
return 1;
long cur;
@ -162,28 +144,35 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
buf_size = snprintf(buf, 64, "%lx%x", timestamp, size);
#endif
int fid_size;
unsigned char *fid;
unsigned char str[64];
memcpy(str, buf, 64);
if (strmd5(&fid, &fid_size, (unsigned char *) buf, buf_size) != 0)
return 1;
unsigned char fid[MD5_DIGEST_LENGTH];
MD5(str, buf_size, fid);
pdf_object_t *ptr = *pdf;
while (ptr->next != NULL)
ptr = ptr->next;
/*
* TODO: Document information dictionary
* `"/Producer (Melon)"'
* `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"'
*
* Trailer dictionary
* `"/Info %d 0 R"'
*/
fprintf(*fp,
"/Size %d\n/Root %d 0 R\n/Info %d 0 R\n",
"/Size %d\n/Root %d 0 R\n",
ptr->id + 1,
pdf_get_catalog_id(pdf),
ptr->id);
pdf_get_catalog_id(pdf));
fputs("/ID [", *fp);
for (int i = 0; i < 2; i++) {
fputs("<", *fp);
for (int j = 0; j < fid_size; j++)
for (int j = 0; j < MD5_DIGEST_LENGTH; j++)
fprintf(*fp, "%02x", fid[j]);
fputs(">", *fp);
@ -202,7 +191,5 @@ pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref)
fputs("%%EOF\n", *fp);
free(fid);
return 0;
}

View file

@ -1,10 +1,10 @@
/*
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
* Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#define VERSION "0"
#define RELEASE "3"
#define RELEASE "2"
#define PATCH "0"
#define EXTRA ""