From 2fa2b760aef552982250dad346bd255be08cd9bb Mon Sep 17 00:00:00 2001 From: yzrh Date: Sat, 14 Jan 2023 23:52:28 +0000 Subject: [PATCH] Fix HN text parsing. Signed-off-by: yzrh --- src/cnki_pdf.c | 96 ++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 0c1ebb0..d96ea49 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -850,73 +850,75 @@ cnki_pdf_hn(cnki_t **param) for (int i = 0, j = 0; i < ptr->text_size - 1;) { switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) { case 0x8001: - if (ptr->address_next > ptr->address) - strcat(dictionary, "T*\n"); - case 0x8070: - if (ptr->address_next > ptr->address) { - i += 4; + if (ptr->address_next <= ptr->address) { + if (i + 7 >= ptr->text_size) { + i += 2; + break; + } - for (;;) { - if (i + 3 >= ptr->text_size || - (unsigned char) ptr->text[i + 1] == 0x80) - break; + conv_src[0] = ptr->text[i + 7]; + conv_src[1] = ptr->text[i + 6]; - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_size = 6; - conv_size = 6; - - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " Tj\n"); + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + if (conv_size - 2 > 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); } - free(conv_dst); + strcat(dictionary, "> Tj\n"); } - - i += 4; + free(conv_dst); } + i += 8; break; } - if (i + 7 >= ptr->text_size) { - i += 2; + strcat(dictionary, "T*\n"); + case 0x8070: + i += 4; + + if (ptr->address_next <= ptr->address) break; - } - conv_src[0] = ptr->text[i + 7]; - conv_src[1] = ptr->text[i + 6]; + for (;;) { + if (i + 3 >= ptr->text_size || + (unsigned char) ptr->text[i + 1] == 0x80) + break; - //snprintf(buf, 64, "%f %f Td\n"); - //strcat(dictionary, buf); + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; - conv_size = 6; + //snprintf(buf, 64, "1 0 0 1 %d %d Tm\n") + //strcat(dictionary, buf); - if (strconv(&conv_dst, "UTF-16BE", - conv_src, "GB18030", &conv_size) == 0) { - if (conv_size - 2 > 0) { - strcat(dictionary, " 0) { + strcat(dictionary, "<"); + for (int k = 0; k < conv_size - 2; k++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[k]); + strcat(dictionary, conv_hex); + } + strcat(dictionary, "> Tj\n"); } - strcat(dictionary, "> Tj\n"); + free(conv_dst); } - free(conv_dst); + + i += 4; } - i += 8; break; case 0x800a: if (i + 27 >= ptr->text_size || j >= ptr->image_length) {