Fix HN text parsing.
Signed-off-by: yzrh <yzrh@noema.org>
This commit is contained in:
parent
dd5854678c
commit
2fa2b760ae
1 changed file with 49 additions and 47 deletions
|
@@ -850,73 +850,75 @@ cnki_pdf_hn(cnki_t **param)
|
||||||
for (int i = 0, j = 0; i < ptr->text_size - 1;) {
|
for (int i = 0, j = 0; i < ptr->text_size - 1;) {
|
||||||
switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) {
|
switch (((unsigned char) ptr->text[i + 1] << 8) + (unsigned char) ptr->text[i]) {
|
||||||
case 0x8001:
|
case 0x8001:
|
||||||
if (ptr->address_next > ptr->address)
|
if (ptr->address_next <= ptr->address) {
|
||||||
strcat(dictionary, "T*\n");
|
if (i + 7 >= ptr->text_size) {
|
||||||
case 0x8070:
|
i += 2;
|
||||||
if (ptr->address_next > ptr->address) {
|
break;
|
||||||
i += 4;
|
}
|
||||||
|
|
||||||
for (;;) {
|
conv_src[0] = ptr->text[i + 7];
|
||||||
if (i + 3 >= ptr->text_size ||
|
conv_src[1] = ptr->text[i + 6];
|
||||||
(unsigned char) ptr->text[i + 1] == 0x80)
|
|
||||||
break;
|
|
||||||
|
|
||||||
conv_src[0] = ptr->text[i + 3];
|
//snprintf(buf, 64, "1 0 0 1 %d %d Tm\n")
|
||||||
conv_src[1] = ptr->text[i + 2];
|
//strcat(dictionary, buf);
|
||||||
|
|
||||||
//snprintf(buf, 64, "%f %f Td\n");
|
conv_size = 6;
|
||||||
//strcat(dictionary, buf);
|
|
||||||
|
|
||||||
conv_size = 6;
|
if (strconv(&conv_dst, "UTF-16BE",
|
||||||
|
conv_src, "GB18030", &conv_size) == 0) {
|
||||||
if (strconv(&conv_dst, "UTF-16BE",
|
if (conv_size - 2 > 0) {
|
||||||
conv_src, "GB18030", &conv_size) == 0) {
|
strcat(dictionary, "<");
|
||||||
if (conv_size - 2 > 0) {
|
for (int k = 0; k < conv_size - 2; k++) {
|
||||||
strcat(dictionary, "<feff");
|
snprintf(conv_hex, 3,
|
||||||
for (int k = 0; k < conv_size - 2; k++) {
|
"%02x", (unsigned char) conv_dst[k]);
|
||||||
snprintf(conv_hex, 3,
|
strcat(dictionary, conv_hex);
|
||||||
"%02x", (unsigned char) conv_dst[k]);
|
|
||||||
strcat(dictionary, conv_hex);
|
|
||||||
}
|
|
||||||
strcat(dictionary, "> Tj\n");
|
|
||||||
}
|
}
|
||||||
free(conv_dst);
|
strcat(dictionary, "> Tj\n");
|
||||||
}
|
}
|
||||||
|
free(conv_dst);
|
||||||
i += 4;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
i += 8;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i + 7 >= ptr->text_size) {
|
strcat(dictionary, "T*\n");
|
||||||
i += 2;
|
case 0x8070:
|
||||||
|
i += 4;
|
||||||
|
|
||||||
|
if (ptr->address_next <= ptr->address)
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
|
|
||||||
conv_src[0] = ptr->text[i + 7];
|
for (;;) {
|
||||||
conv_src[1] = ptr->text[i + 6];
|
if (i + 3 >= ptr->text_size ||
|
||||||
|
(unsigned char) ptr->text[i + 1] == 0x80)
|
||||||
|
break;
|
||||||
|
|
||||||
//snprintf(buf, 64, "%f %f Td\n");
|
conv_src[0] = ptr->text[i + 3];
|
||||||
//strcat(dictionary, buf);
|
conv_src[1] = ptr->text[i + 2];
|
||||||
|
|
||||||
conv_size = 6;
|
//snprintf(buf, 64, "1 0 0 1 %d %d Tm\n")
|
||||||
|
//strcat(dictionary, buf);
|
||||||
|
|
||||||
if (strconv(&conv_dst, "UTF-16BE",
|
conv_size = 6;
|
||||||
conv_src, "GB18030", &conv_size) == 0) {
|
|
||||||
if (conv_size - 2 > 0) {
|
if (strconv(&conv_dst, "UTF-16BE",
|
||||||
strcat(dictionary, "<feff");
|
conv_src, "GB18030", &conv_size) == 0) {
|
||||||
for (int k = 0; k < conv_size - 2; k++) {
|
if (conv_size - 2 > 0) {
|
||||||
snprintf(conv_hex, 3,
|
strcat(dictionary, "<");
|
||||||
"%02x", (unsigned char) conv_dst[k]);
|
for (int k = 0; k < conv_size - 2; k++) {
|
||||||
strcat(dictionary, conv_hex);
|
snprintf(conv_hex, 3,
|
||||||
|
"%02x", (unsigned char) conv_dst[k]);
|
||||||
|
strcat(dictionary, conv_hex);
|
||||||
|
}
|
||||||
|
strcat(dictionary, "> Tj\n");
|
||||||
}
|
}
|
||||||
strcat(dictionary, "> Tj\n");
|
free(conv_dst);
|
||||||
}
|
}
|
||||||
free(conv_dst);
|
|
||||||
|
i += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
i += 8;
|
|
||||||
break;
|
break;
|
||||||
case 0x800a:
|
case 0x800a:
|
||||||
if (i + 27 >= ptr->text_size || j >= ptr->image_length) {
|
if (i + 27 >= ptr->text_size || j >= ptr->image_length) {
|
||||||
|
|
Loading…
Reference in a new issue