[vlc-commits] [Git][videolan/vlc][master] 2 commits: unicode: add test case for overlong surrogates

Hugo Beauzée-Luyssen (@chouquette) gitlab at videolan.org
Thu Sep 23 14:27:03 UTC 2021



Hugo Beauzée-Luyssen pushed to branch master at VideoLAN / VLC


Commits:
f50b2c12 by Rémi Denis-Courmont at 2021-09-23T14:05:46+00:00
unicode: add test case for overlong surrogates

- - - - -
8dfba3ae by Rémi Denis-Courmont at 2021-09-23T14:05:46+00:00
unicode: optimise vlc_towc()

- - - - -


2 changed files:

- src/test/utf8.c
- src/text/unicode.c


Changes:

=====================================
src/test/utf8.c
=====================================
@@ -153,6 +153,9 @@ int main (void)
     test_towc("\xED\xA0\x80", -1, 0xD800);
     test_towc("\xED\xBF\xBF", -1, 0xDFFF);
     test_towc("\xEE\x80\x80", 3, 0xE000);
+    /* Overlong surrogates */
+    test_towc("\xF0\x8D\xA0\x80", -1, 0xD800);
+    test_towc("\xF0\x8D\xBF\xBF", -1, 0xDFFF);
     /* Spurious continuation byte */
     test_towc("\x80", -1, 0);
     test_towc("\xBF", -1, 0);


=====================================
src/text/unicode.c
=====================================
@@ -112,76 +112,67 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
 
 size_t vlc_towc (const char *str, uint32_t *restrict pwc)
 {
-    uint8_t *ptr = (uint8_t *)str, c;
-    uint32_t cp;
-
     assert (str != NULL);
 
-    c = *ptr;
-    if (unlikely(c > 0xF4))
+    unsigned char c0 = str[0];
+
+    if (likely((c0 & 0x80) == 0)) { // 7-bit ASCII character -> short cut
+        *pwc = c0;
+         return c0 != '\0';
+    }
+
+    if (unlikely((c0 & 0x40) == 0))
+        return -1; // continuation byte -> error
+
+    unsigned char c1 = str[1];
+    uint32_t cp = c1 & 0x3F;
+
+    if (unlikely((c1 >> 6) != 2)) // missing continuation byte
         return -1;
 
-    int charlen = clz((unsigned char)(c ^ 0xFF));
-    switch (charlen)
-    {
-        case 0: // 7-bit ASCII character -> short cut
-            *pwc = c;
-            return c != '\0';
+    if (likely((c0 & 0x20) == 0)) { // two-byte sequence
+        *pwc = cp = ((c0 & 0x1F) << 6) | cp;
 
-        case 1: // continuation byte -> error
-            return -1;
+        if (unlikely(cp < 0x80))
+            return -1; // ASCII overlong
+        return 2;
+    }
 
-        case 2:
-            if (unlikely(c < 0xC2)) // ASCII overlong
-                return -1;
-            cp = (c & 0x1F) << 6;
-            break;
+    unsigned char c2 = str[2];
 
-        case 3:
-            cp = (c & 0x0F) << 12;
-            break;
+    cp = (cp << 6) | (c2 & 0x3F);
 
-        case 4:
-            cp = (c & 0x07) << 18;
-            break;
+    if (unlikely((c2 >> 6) != 2)) // missing second continuation byte
+        return -1;
 
-        default:
-            vlc_assert_unreachable ();
+    if (likely((c0 & 0x10) == 0)) { // three-byte sequence
+        *pwc = cp = ((c0 & 0xF) << 12) | cp;
+
+        if (unlikely(cp < 0x800)) // overlong
+            return -1;
+        if (unlikely(cp >= 0xD800 && cp < 0xE000)) // surrogate
+            return -1;
+        return 3;
     }
 
-    /* Unrolled continuation bytes decoding */
-    switch (charlen)
-    {
-        case 4:
-            c = *++ptr;
-            if (unlikely((c & 0xC0) != 0x80)) // not a continuation byte
-                return -1;
-            cp |= (c & 0x3F) << 12;
-
-            if (unlikely(cp >= 0x110000)) // beyond Unicode range
-                return -1;
-            /* fall through */
-        case 3:
-            c = *++ptr;
-            if (unlikely((c & 0xC0) != 0x80)) // not a continuation byte
-                return -1;
-            cp |= (c & 0x3F) << 6;
-
-            if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
-                return -1;
-            if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
-                return -1;
-            /* fall through */
-        case 2:
-            c = *++ptr;
-            if (unlikely((c & 0xC0) != 0x80)) // not a continuation byte
-                return -1;
-            cp |= (c & 0x3F);
-            break;
+    if (likely((c0 & 0x08) == 0)) { // four-byte sequence
+        unsigned char c3 = str[3];
+
+        cp = (cp << 6) | (c3 & 0x3F);
+
+        if (unlikely((c3 >> 6) != 2)) // missing third continuation byte
+            return -1;
+
+        *pwc = cp = ((c0 & 0xF) << 18) | cp;
+
+        if (unlikely(cp < 0x10000)) // overlong (or surrogate)
+            return -1;
+        if (unlikely(cp >= 0x110000)) // out of Unicode range
+            return -1;
+        return 4;
     }
 
-    *pwc = cp;
-    return charlen;
+    return -1;
 }
 
 /**



View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/499b7ef7aab93826765cd0dc4bdd66bebe3b6bab...8dfba3ae96ffca962c16c2e8918250f53febe333

-- 
View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/499b7ef7aab93826765cd0dc4bdd66bebe3b6bab...8dfba3ae96ffca962c16c2e8918250f53febe333
You're receiving this email because of your account on code.videolan.org.




More information about the vlc-commits mailing list