Fix inconsistent whitespace detection in PDF parser.

Signed-off-by: yzrh <yzrh@noema.org>
2023-01-02 23:40:54 +00:00 · 2023-01-02 23:40:54 +00:00 · 4a02b8bfc7
commit 4a02b8bfc7
parent 7d9d658461
2 changed files with 23 additions and 13 deletions
--- a/CHANGE.md
+++ b/CHANGE.md
@@ -8,6 +8,7 @@

 * Improve PDF parser.
 * Handle duplicated object in CAJ.
+* Handle duplicated image in HN.
 * Fix JBIG decoder.

 0.2.4 (2022-12-31)
--- a/src/pdf_parser.c
+++ b/src/pdf_parser.c
@@ -19,26 +19,35 @@ static void *
 _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
 {
 	const char whitespace[6] = {
-		' ',
-		'\r',
-		'\n',
-		'\f',
-		'\t',
-		'\0'
+		0x00,
+		0x09,
+		0x0a,
+		0x0c,
+		0x0d,
+		0x20
 	};

-	char tmp[s1 + 1];
-	memcpy(tmp, p1, s1);
+	char *ret = NULL;

-	char *ret;
+	char str[s1 + 1];
+	memcpy(str, p1, s1);
+
+	size_t tmp_size = 0;
+	char *tmp;

 	for (int i = 0; i < 6; i++) {
-		tmp[s1] = whitespace[i];
-		if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
-			return ret;
+		str[s1] = whitespace[i];
+
+		if ((tmp = memmem(p0, s0, str, s1 + 1)) == NULL)
+			continue;
+
+		if (tmp_size == 0 || (size_t) (tmp - (char *) p0) < tmp_size) {
+			tmp_size = tmp - (char *) p0;
+			ret = tmp;
+		}
 	}

-	return NULL;
+	return ret;
 }

 static int