[vlc-commits] [Git][videolan/vlc][master] 2 commits: unicode: add test case for overlong surrogates
Hugo Beauzée-Luyssen (@chouquette)
gitlab at videolan.org
Thu Sep 23 14:27:03 UTC 2021
Hugo Beauzée-Luyssen pushed to branch master at VideoLAN / VLC
Commits:
f50b2c12 by Rémi Denis-Courmont at 2021-09-23T14:05:46+00:00
unicode: add test case for overlong surrogates
- - - - -
8dfba3ae by Rémi Denis-Courmont at 2021-09-23T14:05:46+00:00
unicode: optimise vlc_towc()
- - - - -
2 changed files:
- src/test/utf8.c
- src/text/unicode.c
Changes:
=====================================
src/test/utf8.c
=====================================
@@ -153,6 +153,9 @@ int main (void)
test_towc("\xED\xA0\x80", -1, 0xD800);
test_towc("\xED\xBF\xBF", -1, 0xDFFF);
test_towc("\xEE\x80\x80", 3, 0xE000);
+ /* Overlong surrogates */
+ test_towc("\xF0\x8D\x88\x80", -1, 0xD800);
+ test_towc("\xF0\x8D\xBF\xBF", -1, 0xDFFF);
/* Spurious continuation byte */
test_towc("\x80", -1, 0);
test_towc("\xBF", -1, 0);
=====================================
src/text/unicode.c
=====================================
@@ -112,76 +112,67 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
- uint8_t *ptr = (uint8_t *)str, c;
- uint32_t cp;
-
assert (str != NULL);
- c = *ptr;
- if (unlikely(c > 0xF4))
+ unsigned char c0 = str[0];
+
+ if (likely((c0 & 0x80) == 0)) { // 7-bit ASCII character -> short cut
+ *pwc = c0;
+ return c0 != '\0';
+ }
+
+ if (unlikely((c0 & 0x40) == 0))
+ return -1; // continuation byte -> error
+
+ unsigned char c1 = str[1];
+ uint32_t cp = c1 & 0x3F;
+
+ if (unlikely((c1 >> 6) != 2)) // missing continuation byte
return -1;
- int charlen = clz((unsigned char)(c ^ 0xFF));
- switch (charlen)
- {
- case 0: // 7-bit ASCII character -> short cut
- *pwc = c;
- return c != '\0';
+ if (likely((c0 & 0x20) == 0)) { // two-byte sequence
+ *pwc = cp = ((c0 & 0x1F) << 6) | cp;
- case 1: // continuation byte -> error
- return -1;
+ if (unlikely(cp < 0x80))
+ return -1; // ASCII overlong
+ return 2;
+ }
- case 2:
- if (unlikely(c < 0xC2)) // ASCII overlong
- return -1;
- cp = (c & 0x1F) << 6;
- break;
+ unsigned char c2 = str[2];
- case 3:
- cp = (c & 0x0F) << 12;
- break;
+ cp = (cp << 6) | (c2 & 0x3F);
- case 4:
- cp = (c & 0x07) << 18;
- break;
+ if (unlikely((c2 >> 6) != 2)) // missing second continuation byte
+ return -1;
- default:
- vlc_assert_unreachable ();
+ if (likely((c0 & 0x10) == 0)) { // three-byte sequence
+ *pwc = cp = ((c0 & 0xF) << 12) | cp;
+
+ if (unlikely(cp < 0x800)) // overlong
+ return -1;
+ if (unlikely(cp >= 0xD800 && cp < 0xE000)) // surrogate
+ return -1;
+ return 3;
}
- /* Unrolled continuation bytes decoding */
- switch (charlen)
- {
- case 4:
- c = *++ptr;
- if (unlikely((c & 0xC0) != 0x80)) // not a continuation byte
- return -1;
- cp |= (c & 0x3F) << 12;
-
- if (unlikely(cp >= 0x110000)) // beyond Unicode range
- return -1;
- /* fall through */
- case 3:
- c = *++ptr;
- if (unlikely((c & 0xC0) != 0x80)) // not a continuation byte
- return -1;
- cp |= (c & 0x3F) << 6;
-
- if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
- return -1;
- if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
- return -1;
- /* fall through */
- case 2:
- c = *++ptr;
- if (unlikely((c & 0xC0) != 0x80)) // not a continuation byte
- return -1;
- cp |= (c & 0x3F);
- break;
+ if (likely((c0 & 0x08) == 0)) { // four-byte sequence
+ unsigned char c3 = str[3];
+
+ cp = (cp << 6) | (c3 & 0x3F);
+
+ if (unlikely((c3 >> 6) != 2)) // missing third continuation byte
+ return -1;
+
+ *pwc = cp = ((c0 & 0xF) << 18) | cp;
+
+ if (unlikely(cp < 0x10000)) // overlong (or surrogate)
+ return -1;
+ if (unlikely(cp >= 0x110000)) // out of Unicode range
+ return -1;
+ return 4;
}
- *pwc = cp;
- return charlen;
+ return -1;
}
/**
View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/499b7ef7aab93826765cd0dc4bdd66bebe3b6bab...8dfba3ae96ffca962c16c2e8918250f53febe333
--
View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/499b7ef7aab93826765cd0dc4bdd66bebe3b6bab...8dfba3ae96ffca962c16c2e8918250f53febe333
You're receiving this email because of your account on code.videolan.org.
More information about the vlc-commits mailing list