Add HN text extraction.
This commit is contained in:
parent
8d6fbb43c9
commit
98691d4203
14 changed files with 325 additions and 41 deletions
|
@ -10,7 +10,7 @@ obj = ${src:.c=.o}
|
||||||
PREFIX = /usr/local
|
PREFIX = /usr/local
|
||||||
|
|
||||||
CFLAGS = -O3 -march=native -pipe -Wall
|
CFLAGS = -O3 -march=native -pipe -Wall
|
||||||
LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed
|
LDFLAGS = -Wl,-O3 -lcrypto -lz -Wl,--as-needed
|
||||||
|
|
||||||
all: ${obj}
|
all: ${obj}
|
||||||
${CC} ${LDFLAGS} -o melon $^
|
${CC} ${LDFLAGS} -o melon $^
|
||||||
|
|
|
@ -10,7 +10,7 @@ obj = ${src:.c=.o}
|
||||||
PREFIX = /usr/local
|
PREFIX = /usr/local
|
||||||
|
|
||||||
CFLAGS = -O3 -march=native -pipe -flto=thin -Wall
|
CFLAGS = -O3 -march=native -pipe -flto=thin -Wall
|
||||||
LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed
|
LDFLAGS = -Wl,-O3 -lcrypto -lz -Wl,--as-needed
|
||||||
|
|
||||||
all: ${obj}
|
all: ${obj}
|
||||||
${CC} ${LDFLAGS} -o melon $>
|
${CC} ${LDFLAGS} -o melon $>
|
||||||
|
|
|
@ -69,15 +69,15 @@ cnki_info(cnki_t **param)
|
||||||
if ((*param)->stat > 0)
|
if ((*param)->stat > 0)
|
||||||
printf("File type is '%s'\n", (*param)->file_stat->type);
|
printf("File type is '%s'\n", (*param)->file_stat->type);
|
||||||
|
|
||||||
if (strcmp((*param)->file_stat->type, "%PDF") == 0) {
|
if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
} else if (strcmp((*param)->file_stat->type, "CAJ") == 0) {
|
} else if (strncmp((*param)->file_stat->type, "CAJ", 3) == 0) {
|
||||||
addr[0] = ADDRESS_CAJ_PAGE;
|
addr[0] = ADDRESS_CAJ_PAGE;
|
||||||
addr[1] = ADDRESS_CAJ_OUTLINE;
|
addr[1] = ADDRESS_CAJ_OUTLINE;
|
||||||
} else if (strcmp((*param)->file_stat->type, "HN") == 0) {
|
} else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
|
||||||
addr[0] = ADDRESS_HN_PAGE;
|
addr[0] = ADDRESS_HN_PAGE;
|
||||||
addr[1] = ADDRESS_HN_OUTLINE;
|
addr[1] = ADDRESS_HN_OUTLINE;
|
||||||
} else if (strcmp((*param)->file_stat->type, "KDH ") == 0) {
|
} else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
} else {
|
} else {
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -64,7 +64,7 @@ typedef struct _object_hn_t {
|
||||||
int32_t text_size;
|
int32_t text_size;
|
||||||
int16_t image_length;
|
int16_t image_length;
|
||||||
int16_t page;
|
int16_t page;
|
||||||
int32_t zero[2];
|
int32_t unknown[2]; /* TODO: what is it? */
|
||||||
char *text;
|
char *text;
|
||||||
struct _hn_image_t *image_data;
|
struct _hn_image_t *image_data;
|
||||||
struct _object_hn_t *next;
|
struct _object_hn_t *next;
|
||||||
|
@ -88,6 +88,8 @@ int cnki_outline_tree(object_outline_tree_t **outline_tree,
|
||||||
object_outline_t **outline, int *ids);
|
object_outline_t **outline, int *ids);
|
||||||
|
|
||||||
/* cnki_zlib.c */
|
/* cnki_zlib.c */
|
||||||
|
int cnki_zlib(char **dst, int *dst_size,
|
||||||
|
const char * restrict src, int src_size);
|
||||||
|
|
||||||
/* cnki_xml.c */
|
/* cnki_xml.c */
|
||||||
int cnki_xml(char **xml, FILE **fp);
|
int cnki_xml(char **xml, FILE **fp);
|
||||||
|
|
253
src/cnki_hn.c
253
src/cnki_hn.c
|
@ -5,8 +5,10 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
#include "cnki.h"
|
#include "cnki.h"
|
||||||
|
#include "iconv.h"
|
||||||
#include "pdf.h"
|
#include "pdf.h"
|
||||||
#include "pdf_cnki.h"
|
#include "pdf_cnki.h"
|
||||||
|
|
||||||
|
@ -29,13 +31,12 @@ cnki_hn(cnki_t **param)
|
||||||
|
|
||||||
if ((*param)->stat > 1) {
|
if ((*param)->stat > 1) {
|
||||||
printf("Loading page(s)\n");
|
printf("Loading page(s)\n");
|
||||||
printf("\t%8s\t%8s\t%6s\t%4s\t%6s\t%6s\t%4s\t%8s\t%8s\n",
|
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n",
|
||||||
"address",
|
"address",
|
||||||
"text",
|
"text",
|
||||||
"length",
|
"length",
|
||||||
"page",
|
"page",
|
||||||
"zero",
|
"unknown",
|
||||||
"#",
|
|
||||||
"code",
|
"code",
|
||||||
"address",
|
"address",
|
||||||
"image");
|
"image");
|
||||||
|
@ -47,7 +48,7 @@ cnki_hn(cnki_t **param)
|
||||||
fread(&ptr->text_size, 4, 1, (*param)->fp_i);
|
fread(&ptr->text_size, 4, 1, (*param)->fp_i);
|
||||||
fread(&ptr->image_length, 2, 1, (*param)->fp_i);
|
fread(&ptr->image_length, 2, 1, (*param)->fp_i);
|
||||||
fread(&ptr->page, 2, 1, (*param)->fp_i);
|
fread(&ptr->page, 2, 1, (*param)->fp_i);
|
||||||
fread(&ptr->zero, 8, 1, (*param)->fp_i);
|
fread(&ptr->unknown, 8, 1, (*param)->fp_i);
|
||||||
|
|
||||||
ptr->text = NULL;
|
ptr->text = NULL;
|
||||||
ptr->image_data = NULL;
|
ptr->image_data = NULL;
|
||||||
|
@ -74,13 +75,13 @@ cnki_hn(cnki_t **param)
|
||||||
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
|
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
|
||||||
|
|
||||||
if ((*param)->stat > 1)
|
if ((*param)->stat > 1)
|
||||||
printf("\t%08x\t%8d\t%6d\t%4d\t{%d, %d}",
|
printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}",
|
||||||
ptr->address,
|
ptr->address,
|
||||||
ptr->text_size,
|
ptr->text_size,
|
||||||
ptr->image_length,
|
ptr->image_length,
|
||||||
ptr->page,
|
ptr->page,
|
||||||
ptr->zero[0],
|
ptr->unknown[0],
|
||||||
ptr->zero[1]);
|
ptr->unknown[1]);
|
||||||
|
|
||||||
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
|
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
|
||||||
|
|
||||||
|
@ -91,6 +92,9 @@ cnki_hn(cnki_t **param)
|
||||||
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
|
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
|
||||||
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
|
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
|
||||||
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
|
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
|
||||||
|
fseek((*param)->fp_i,
|
||||||
|
ptr->image_data[i].address + ptr->image_data[i].size,
|
||||||
|
SEEK_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < ptr->image_length; i++) {
|
for (int i = 0; i < ptr->image_length; i++) {
|
||||||
|
@ -104,12 +108,24 @@ cnki_hn(cnki_t **param)
|
||||||
ptr->image_data[i].size, 1,
|
ptr->image_data[i].size, 1,
|
||||||
(*param)->fp_i);
|
(*param)->fp_i);
|
||||||
|
|
||||||
if ((*param)->stat > 1)
|
if ((*param)->stat > 1) {
|
||||||
printf("\t%6d\t%4d\t%08x\t%8d\n",
|
if (i == 0) {
|
||||||
i,
|
printf("\t%4d\t%08x\t%8d\n",
|
||||||
ptr->image_data[i].format,
|
ptr->image_data[i].format,
|
||||||
ptr->image_data[i].address,
|
ptr->image_data[i].address,
|
||||||
ptr->image_data[i].size);
|
ptr->image_data[i].size);
|
||||||
|
} else {
|
||||||
|
printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
ptr->image_data[i].format,
|
||||||
|
ptr->image_data[i].address,
|
||||||
|
ptr->image_data[i].size);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ptr = ptr->next;
|
ptr = ptr->next;
|
||||||
|
@ -119,16 +135,227 @@ cnki_hn(cnki_t **param)
|
||||||
printf("Loaded %d page(s)\n", (*param)->file_stat->page);
|
printf("Loaded %d page(s)\n", (*param)->file_stat->page);
|
||||||
|
|
||||||
if ((*param)->stat > 1)
|
if ((*param)->stat > 1)
|
||||||
printf("Creating PDF object(s)\n");
|
printf("Generating PDF object(s)\n");
|
||||||
|
|
||||||
pdf_object_t *pdf = NULL;
|
pdf_object_t *pdf = NULL;
|
||||||
|
|
||||||
if (pdf_obj_create(&pdf) != 0)
|
if (pdf_obj_create(&pdf) != 0)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
int buf_size;
|
||||||
|
char *buf;
|
||||||
|
|
||||||
|
int str_size;
|
||||||
|
char *str;
|
||||||
|
|
||||||
|
int conv_size;
|
||||||
|
char *conv_dst;
|
||||||
|
char conv_src[2];
|
||||||
|
char conv_hex[3];
|
||||||
|
|
||||||
|
ptr = (*param)->object_hn;
|
||||||
|
while (ptr != NULL) {
|
||||||
|
if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) {
|
||||||
|
cnki_zlib(&buf, &buf_size, ptr->text, ptr->text_size);
|
||||||
|
|
||||||
|
str_size = buf_size / 8 + 7;
|
||||||
|
str = malloc(str_size);
|
||||||
|
|
||||||
|
if (str == NULL)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
memset(str, 0, str_size);
|
||||||
|
|
||||||
|
strcat(str, "<feff");
|
||||||
|
|
||||||
|
for (int i = 0; i < buf_size; i += 16) {
|
||||||
|
conv_src[0] = buf[i + 7];
|
||||||
|
conv_src[1] = buf[i + 6];
|
||||||
|
|
||||||
|
conv_size = 512;
|
||||||
|
|
||||||
|
if (strconv(&conv_dst, "UTF-16BE",
|
||||||
|
conv_src, "GB18030", &conv_size) == 0) {
|
||||||
|
for (int j = 0; j < conv_size - 2; j++) {
|
||||||
|
snprintf(conv_hex, 3,
|
||||||
|
"%02x", (unsigned char) conv_dst[j]);
|
||||||
|
strcat(str, conv_hex);
|
||||||
|
}
|
||||||
|
free(conv_dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(buf);
|
||||||
|
|
||||||
|
strcat(str, ">");
|
||||||
|
} else {
|
||||||
|
str_size = ptr->text_size;
|
||||||
|
str = malloc(str_size);
|
||||||
|
|
||||||
|
if (str == NULL)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
memset(str, 0, str_size);
|
||||||
|
|
||||||
|
strcat(str, "<feff");
|
||||||
|
|
||||||
|
for (int i = 0; i < ptr->text_size; i += 4) {
|
||||||
|
conv_src[0] = ptr->text[i + 3];
|
||||||
|
conv_src[1] = ptr->text[i + 2];
|
||||||
|
|
||||||
|
conv_size = 512;
|
||||||
|
|
||||||
|
if (strconv(&conv_dst, "UTF-16BE",
|
||||||
|
conv_src, "GB18030", &conv_size) == 0) {
|
||||||
|
for (int j = 0; j < conv_size - 2; j++) {
|
||||||
|
snprintf(conv_hex, 3,
|
||||||
|
"%02x", (unsigned char) conv_dst[j]);
|
||||||
|
strcat(str, conv_hex);
|
||||||
|
}
|
||||||
|
free(conv_dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
strcat(str, ">");
|
||||||
|
}
|
||||||
|
|
||||||
|
pdf_obj_append(&pdf, 0, str, NULL, NULL);
|
||||||
|
|
||||||
|
free(str);
|
||||||
|
|
||||||
|
ptr = ptr->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((*param)->stat > 1) {
|
||||||
|
printf("\t%8s\t%12s\t%12s\t%12s\n",
|
||||||
|
"id",
|
||||||
|
"object",
|
||||||
|
"dictionary",
|
||||||
|
"stream");
|
||||||
|
|
||||||
|
pdf_object_t *ptr = pdf->next;
|
||||||
|
while (ptr != NULL) {
|
||||||
|
printf("\t%8d\t%12d\t%12d\t%12d\n",
|
||||||
|
ptr->id,
|
||||||
|
ptr->object_size,
|
||||||
|
ptr->dictionary_size,
|
||||||
|
ptr->stream_size);
|
||||||
|
ptr = ptr->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ((*param)->stat > 0)
|
if ((*param)->stat > 0)
|
||||||
printf("Conversion ended\n");
|
printf("Generated %d object(s)\n",
|
||||||
|
pdf_get_count(&pdf));
|
||||||
|
|
||||||
|
int *ids = NULL;
|
||||||
|
|
||||||
|
if ((*param)->file_stat->outline > 0) {
|
||||||
|
if ((*param)->stat > 1)
|
||||||
|
printf("Generating outline object(s)\n\t%8s\n", "id");
|
||||||
|
|
||||||
|
pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1);
|
||||||
|
int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids);
|
||||||
|
|
||||||
|
if ((*param)->stat > 1)
|
||||||
|
for (int i = 0; i < (*param)->file_stat->outline + 1; i++)
|
||||||
|
printf("\t%8d\n", ids[i]);
|
||||||
|
|
||||||
|
if ((*param)->stat > 0) {
|
||||||
|
if (outline != 0)
|
||||||
|
printf("No outline information\n");
|
||||||
|
else
|
||||||
|
printf("Generated %d outline object(s)\n",
|
||||||
|
(*param)->file_stat->outline + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((*param)->stat > 1)
|
||||||
|
printf("Writing header\n");
|
||||||
|
|
||||||
|
long cur = 0;
|
||||||
|
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
cur = ftell((*param)->fp_o);
|
||||||
|
|
||||||
|
if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) {
|
||||||
|
fprintf(stderr, "Header not written\n");
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Header %ld byte(s) written\n",
|
||||||
|
ftell((*param)->fp_o) - cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((*param)->stat > 1)
|
||||||
|
printf("Writing object(s)\n");
|
||||||
|
|
||||||
|
pdf_dump_obj(&pdf, &(*param)->fp_o);
|
||||||
|
|
||||||
|
if ((*param)->stat > 1) {
|
||||||
|
printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n",
|
||||||
|
"address",
|
||||||
|
"size",
|
||||||
|
"id",
|
||||||
|
"object",
|
||||||
|
"dictionary",
|
||||||
|
"stream");
|
||||||
|
|
||||||
|
pdf_object_t *ptr = pdf->next;
|
||||||
|
while (ptr != NULL) {
|
||||||
|
printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n",
|
||||||
|
ptr->address,
|
||||||
|
ptr->size,
|
||||||
|
ptr->id,
|
||||||
|
ptr->object_size,
|
||||||
|
ptr->dictionary_size,
|
||||||
|
ptr->stream_size);
|
||||||
|
ptr = ptr->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("%d object(s) %ld byte(s) written\n",
|
||||||
|
pdf_get_count(&pdf),
|
||||||
|
ftell((*param)->fp_o));
|
||||||
|
|
||||||
|
long xref = ftell((*param)->fp_o);
|
||||||
|
|
||||||
|
if ((*param)->stat > 1)
|
||||||
|
printf("Writing cross-reference table\n");
|
||||||
|
|
||||||
|
if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) {
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Cross-reference table not written\n");
|
||||||
|
} else {
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Cross-reference table %ld byte(s) written\n",
|
||||||
|
ftell((*param)->fp_o) - xref);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((*param)->stat > 1)
|
||||||
|
printf("Writing trailer\n");
|
||||||
|
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
cur = ftell((*param)->fp_o);
|
||||||
|
|
||||||
|
if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) {
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Trailer not written\n");
|
||||||
|
} else {
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Trailer %ld byte(s) written\n",
|
||||||
|
ftell((*param)->fp_o) - cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Total %ld byte(s) written\n",
|
||||||
|
ftell((*param)->fp_o));
|
||||||
|
|
||||||
|
pdf_obj_destroy(&pdf);
|
||||||
|
|
||||||
|
if ((*param)->stat > 0)
|
||||||
|
printf("Conversion ended (partial)\n");
|
||||||
|
|
||||||
/* TODO: Finish me please :) */
|
/* TODO: Finish me please :) */
|
||||||
return 1;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,4 +4,22 @@
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <zlib.h>
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "zlib.h"
|
||||||
|
|
||||||
|
/*
 * Inflate a compressed HN text record.
 *
 * Layout of `src' as consumed here: a native-endian 32-bit integer at
 * byte offset 20 gives the declared size of the inflated data, and the
 * zlib stream occupies everything from byte offset 24 up to `src_size'.
 *
 * dst       receives a malloc'd buffer with the inflated data; the
 *           caller owns it and must free() it.  Set to NULL on failure.
 * dst_size  receives the inflated size in bytes, or 0 on failure.
 * src       the raw record, header included.
 * src_size  total number of bytes available at `src'.
 *
 * Returns 0 on success and 1 on failure (record too short, declared
 * size not positive, or inflation failed).
 */
int
cnki_zlib(char **dst, int *dst_size,
	const char * restrict src, int src_size)
{
	enum {
		SIZE_OFFSET = 20,	/* declared inflated size */
		DATA_OFFSET = 24	/* first byte of the zlib stream */
	};

	/* Leave the outputs in a safe state for callers that ignore the result */
	*dst = NULL;
	*dst_size = 0;

	/* The header must be complete and followed by at least one byte */
	if (src == NULL || src_size <= DATA_OFFSET)
		return 1;

	int32_t size;
	memcpy(&size, src + SIZE_OFFSET, sizeof(size));

	if (size <= 0)
		return 1;

	/*
	 * The compressed length is whatever remains of the record after
	 * the header; it is unrelated to the inflated size.
	 */
	if (strinflate(dst, size,
		src + DATA_OFFSET, src_size - DATA_OFFSET) != 0) {
		*dst = NULL;
		return 1;
	}

	*dst_size = size;

	return 0;
}
|
||||||
|
|
|
@ -9,7 +9,6 @@
|
||||||
|
|
||||||
#include <iconv.h>
|
#include <iconv.h>
|
||||||
|
|
||||||
/* So, why would anyone use something other than UTF-8? */
|
|
||||||
int
|
int
|
||||||
strconv(char **dst,
|
strconv(char **dst,
|
||||||
const char * restrict dst_code,
|
const char * restrict dst_code,
|
||||||
|
@ -51,8 +50,7 @@ strconv(char **dst,
|
||||||
free(src_start);
|
free(src_start);
|
||||||
return 1;
|
return 1;
|
||||||
} else {
|
} else {
|
||||||
/* Not including NULL */
|
*size -= dst_size;
|
||||||
*size -= dst_size + 2;
|
|
||||||
|
|
||||||
*dst = malloc(*size);
|
*dst = malloc(*size);
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,7 @@
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int
|
int strconv(char **dst,
|
||||||
strconv(char **dst,
|
|
||||||
const char * restrict dst_code,
|
const char * restrict dst_code,
|
||||||
const char * restrict src,
|
const char * restrict src,
|
||||||
const char * restrict src_code,
|
const char * restrict src_code,
|
||||||
|
|
|
@ -86,25 +86,25 @@ main(int argc, char **argv, char **envp)
|
||||||
|
|
||||||
cnki_info(¶m);
|
cnki_info(¶m);
|
||||||
|
|
||||||
if (strcmp(param->file_stat->type, "%PDF") == 0) {
|
if (strncmp(param->file_stat->type, "%PDF", 4) == 0) {
|
||||||
if (cnki_pdf(¶m) != 0) {
|
if (cnki_pdf(¶m) != 0) {
|
||||||
fprintf(stderr, "%s: %s\n", argv[0],
|
fprintf(stderr, "%s: %s\n", argv[0],
|
||||||
strerror(errno));
|
strerror(errno));
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
} else if (strcmp(param->file_stat->type, "CAJ") == 0) {
|
} else if (strncmp(param->file_stat->type, "CAJ", 3) == 0) {
|
||||||
if (cnki_caj(¶m) != 0) {
|
if (cnki_caj(¶m) != 0) {
|
||||||
fprintf(stderr, "%s: %s\n", argv[0],
|
fprintf(stderr, "%s: %s\n", argv[0],
|
||||||
strerror(errno));
|
strerror(errno));
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
} else if (strcmp(param->file_stat->type, "HN") == 0) {
|
} else if (strncmp(param->file_stat->type, "HN", 2) == 0) {
|
||||||
if (cnki_hn(¶m) != 0) {
|
if (cnki_hn(¶m) != 0) {
|
||||||
fprintf(stderr, "%s: %s\n", argv[0],
|
fprintf(stderr, "%s: %s\n", argv[0],
|
||||||
strerror(errno));
|
strerror(errno));
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
} else if (strcmp(param->file_stat->type, "KDH ") == 0) {
|
} else if (strncmp(param->file_stat->type, "KDH ", 4) == 0) {
|
||||||
if (cnki_kdh(¶m) != 0) {
|
if (cnki_kdh(¶m) != 0) {
|
||||||
fprintf(stderr, "%s: %s\n", argv[0],
|
fprintf(stderr, "%s: %s\n", argv[0],
|
||||||
strerror(errno));
|
strerror(errno));
|
||||||
|
|
|
@ -21,8 +21,6 @@ typedef struct _pdf_object_t {
|
||||||
|
|
||||||
/* pdf.c */
|
/* pdf.c */
|
||||||
/* TODO: Rewrite object dictionary */
|
/* TODO: Rewrite object dictionary */
|
||||||
/* TODO: Compact object id */
|
|
||||||
/* TODO: `mutool clean -gggsz' */
|
|
||||||
int pdf_obj_create(pdf_object_t **pdf);
|
int pdf_obj_create(pdf_object_t **pdf);
|
||||||
void pdf_obj_destroy(pdf_object_t **pdf);
|
void pdf_obj_destroy(pdf_object_t **pdf);
|
||||||
int pdf_obj_add(pdf_object_t **pdf, int id,
|
int pdf_obj_add(pdf_object_t **pdf, int id,
|
||||||
|
|
|
@ -50,7 +50,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
|
||||||
&size) == 0) {
|
&size) == 0) {
|
||||||
strcat(dictionary, "/Title <feff");
|
strcat(dictionary, "/Title <feff");
|
||||||
|
|
||||||
for (int i = 0; i < size; i++) {
|
for (int i = 0; i < size - 2; i++) {
|
||||||
snprintf(buf, 64, "%02x", (unsigned char) str[i]);
|
snprintf(buf, 64, "%02x", (unsigned char) str[i]);
|
||||||
strcat(dictionary, buf);
|
strcat(dictionary, buf);
|
||||||
}
|
}
|
||||||
|
@ -89,7 +89,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Page starts from 0 */
|
/* Page starts from 0 */
|
||||||
snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>\n",
|
snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>",
|
||||||
atoi(ptr->item->page) - 1);
|
atoi(ptr->item->page) - 1);
|
||||||
strcat(dictionary, buf);
|
strcat(dictionary, buf);
|
||||||
|
|
||||||
|
@ -123,7 +123,7 @@ pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids)
|
||||||
free(outline_tree);
|
free(outline_tree);
|
||||||
|
|
||||||
snprintf(buf, 128,
|
snprintf(buf, 128,
|
||||||
"<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>\n",
|
"<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>",
|
||||||
ret[0], ret[1], ret[2]);
|
ret[0], ret[1], ret[2]);
|
||||||
|
|
||||||
free(ret);
|
free(ret);
|
||||||
|
|
|
@ -26,12 +26,15 @@ pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
|
||||||
|
|
||||||
fprintf(*fp, "%d 0 obj\n", ptr->id);
|
fprintf(*fp, "%d 0 obj\n", ptr->id);
|
||||||
|
|
||||||
if (ptr->dictionary != NULL)
|
if (ptr->dictionary != NULL) {
|
||||||
fputs(ptr->dictionary, *fp);
|
fputs(ptr->dictionary, *fp);
|
||||||
else if (ptr->object != NULL)
|
fputs("\n", *fp);
|
||||||
|
} else if (ptr->object != NULL) {
|
||||||
fputs(ptr->object, *fp);
|
fputs(ptr->object, *fp);
|
||||||
else if (ptr->stream == NULL)
|
fputs("\n", *fp);
|
||||||
|
} else if (ptr->stream == NULL) {
|
||||||
fputs("null\n", *fp);
|
fputs("null\n", *fp);
|
||||||
|
}
|
||||||
|
|
||||||
if (ptr->stream != NULL) {
|
if (ptr->stream != NULL) {
|
||||||
fputs("stream\r\n", *fp);
|
fputs("stream\r\n", *fp);
|
||||||
|
|
31
src/zlib.c
Normal file
31
src/zlib.c
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2020, yzrh <yzrh@noema.org>
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <zlib.h>
|
||||||
|
|
||||||
|
int
|
||||||
|
strinflate(char **dst, int dst_size,
|
||||||
|
const char * restrict src, int src_size)
|
||||||
|
{
|
||||||
|
*dst = malloc(dst_size);
|
||||||
|
|
||||||
|
if (*dst == NULL)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
unsigned long size = dst_size;
|
||||||
|
|
||||||
|
uncompress((Bytef *) *dst, &size, (const Bytef *) src, src_size);
|
||||||
|
|
||||||
|
if (size != dst_size) {
|
||||||
|
free(*dst);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
8
src/zlib.h
Normal file
8
src/zlib.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2020, yzrh <yzrh@noema.org>
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
int strinflate(char **dst, int dst_size,
|
||||||
|
const char * restrict src, int src_size);
|
Loading…
Reference in a new issue