Fix HN text parsing.

Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
yzrh 2023-01-14 23:52:28 +00:00
parent dd5854678c
commit 2fa2b760ae

View file

@@ -850,45 +850,7 @@ cnki_pdf_hn(cnki_t **param)
for (int i = 0, j = 0; i < ptr->text_size - 1;) { for (int i = 0, j = 0; i < ptr->text_size - 1;) {
switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) {
case 0x8001: case 0x8001:
if (ptr->address_next > ptr->address) if (ptr->address_next <= ptr->address) {
strcat(dictionary, "T*\n");
case 0x8070:
if (ptr->address_next > ptr->address) {
i += 4;
for (;;) {
if (i + 3 >= ptr->text_size ||
(unsigned char) ptr->text[i + 1] == 0x80)
break;
conv_src[0] = ptr->text[i + 3];
conv_src[1] = ptr->text[i + 2];
//snprintf(buf, 64, "%f %f Td\n");
//strcat(dictionary, buf);
conv_size = 6;
if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
if (conv_size - 2 > 0) {
strcat(dictionary, "<feff");
for (int k = 0; k < conv_size - 2; k++) {
snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[k]);
strcat(dictionary, conv_hex);
}
strcat(dictionary, "> Tj\n");
}
free(conv_dst);
}
i += 4;
}
break;
}
if (i + 7 >= ptr->text_size) { if (i + 7 >= ptr->text_size) {
i += 2; i += 2;
break; break;
@@ -897,7 +859,7 @@ cnki_pdf_hn(cnki_t **param)
conv_src[0] = ptr->text[i + 7]; conv_src[0] = ptr->text[i + 7];
conv_src[1] = ptr->text[i + 6]; conv_src[1] = ptr->text[i + 6];
//snprintf(buf, 64, "%f %f Td\n"); //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n")
//strcat(dictionary, buf); //strcat(dictionary, buf);
conv_size = 6; conv_size = 6;
@@ -905,7 +867,7 @@ cnki_pdf_hn(cnki_t **param)
if (strconv(&conv_dst, "UTF-16BE", if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) { conv_src, "GB18030", &conv_size) == 0) {
if (conv_size - 2 > 0) { if (conv_size - 2 > 0) {
strcat(dictionary, "<feff"); strcat(dictionary, "<");
for (int k = 0; k < conv_size - 2; k++) { for (int k = 0; k < conv_size - 2; k++) {
snprintf(conv_hex, 3, snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[k]); "%02x", (unsigned char) conv_dst[k]);
@@ -917,6 +879,46 @@ cnki_pdf_hn(cnki_t **param)
} }
i += 8; i += 8;
break;
}
strcat(dictionary, "T*\n");
case 0x8070:
i += 4;
if (ptr->address_next <= ptr->address)
break;
for (;;) {
if (i + 3 >= ptr->text_size ||
(unsigned char) ptr->text[i + 1] == 0x80)
break;
conv_src[0] = ptr->text[i + 3];
conv_src[1] = ptr->text[i + 2];
//snprintf(buf, 64, "1 0 0 1 %d %d Tm\n")
//strcat(dictionary, buf);
conv_size = 6;
if (strconv(&conv_dst, "UTF-16BE",
conv_src, "GB18030", &conv_size) == 0) {
if (conv_size - 2 > 0) {
strcat(dictionary, "<");
for (int k = 0; k < conv_size - 2; k++) {
snprintf(conv_hex, 3,
"%02x", (unsigned char) conv_dst[k]);
strcat(dictionary, conv_hex);
}
strcat(dictionary, "> Tj\n");
}
free(conv_dst);
}
i += 4;
}
break; break;
case 0x800a: case 0x800a:
if (i + 27 >= ptr->text_size || j >= ptr->image_length) { if (i + 27 >= ptr->text_size || j >= ptr->image_length) {