[vlc-commits] commit: Refactor EnsureUTF8 and IsUTF8 ( Rémi Denis-Courmont )
git at videolan.org
git at videolan.org
Fri Oct 8 20:44:23 CEST 2010
vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Fri Oct 8 20:30:11 2010 +0300| [9ce1a13fb8fa77a38e0dfdfe3387829e1df3f085] | committer: Rémi Denis-Courmont
Refactor EnsureUTF8 and IsUTF8
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=9ce1a13fb8fa77a38e0dfdfe3387829e1df3f085
---
src/text/unicode.c | 129 ++++++++++++++++++++++++++++++----------------------
1 files changed, 75 insertions(+), 54 deletions(-)
diff --git a/src/text/unicode.c b/src/text/unicode.c
index 3030d47..bed5f1f 100644
--- a/src/text/unicode.c
+++ b/src/text/unicode.c
@@ -2,7 +2,7 @@
* unicode.c: Unicode <-> locale functions
*****************************************************************************
* Copyright (C) 2005-2006 the VideoLAN team
- * Copyright © 2005-2008 Rémi Denis-Courmont
+ * Copyright © 2005-2010 Rémi Denis-Courmont
*
* Authors: Rémi Denis-Courmont <rem # videolan.org>
*
@@ -273,73 +273,74 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
}
-static char *CheckUTF8( char *str, char rep )
+/**
+ * Decodes the first character of a UTF-8 byte sequence into a code point.
+ *
+ * @param str a UTF-8 byte sequence (must not be NULL)
+ * @param pwc location where the decoded code point is stored; it is written
+ * only when the return value is not (size_t)-1
+ * @return 0 if str points to an empty string, i.e. the first byte is NUL
+ * (*pwc is then set to 0);
+ * number of bytes that the first character occupies (from 1 to 4) otherwise;
+ * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence (the
+ * function returns -1, which converts to (size_t)-1 in the size_t return type).
+ */
+static size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
uint8_t *ptr = (uint8_t *)str;
assert (str != NULL);
- for (;;)
- {
- uint8_t c = ptr[0];
+ uint8_t c = ptr[0];
- if (c == '\0')
- break;
-
- if (c > 0xF4)
- goto error;
-
- int charlen = clz8 (c ^ 0xFF);
- switch (charlen)
- {
- case 0: // 7-bit ASCII character -> OK
- ptr++;
- continue;
+ if (unlikely(c == '\0'))
+ {
+ *pwc = 0;
+ return 0;
+ }
- case 1: // continuation byte -> error
- goto error;
- }
+ if (unlikely(c > 0xF4)) // lead bytes above 0xF4 are rejected outright
+ return -1;
- assert (charlen >= 2 && charlen <= 4);
+ int charlen = clz8 (c ^ 0xFF); // presumably the number of leading 1 bits in c (clz8 is defined elsewhere) -- confirm
+ switch (charlen)
+ {
+ case 0: // 7-bit ASCII character -> OK
+ *pwc = c;
+ return 1;
- uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
- for (int i = 1; i < charlen; i++)
- {
- assert (cp < (1 << 26));
- c = ptr[i];
+ case 1: // continuation byte -> error
+ return -1;
+ }
- if ((c >> 6) != 2) // not a continuation byte
- goto error;
+ assert (charlen >= 2 && charlen <= 4);
- cp = (cp << 6) | (ptr[i] & 0x3f);
- }
+ uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen)); // mask off the length-marker bits, keeping the lead byte's payload
+ for (int i = 1; i < charlen; i++)
+ {
+ assert (cp < (1 << 26));
+ c = ptr[i];
- switch (charlen)
- {
- case 4:
- if (cp > 0x10FFFF) // beyond Unicode
- goto error;
- case 3:
- if (cp >= 0xD800 && cp < 0xC000) // UTF-16 surrogate
- goto error;
- case 2:
- if (cp < 128) // ASCII overlong
- goto error;
- if (cp < (1u << (5 * charlen - 3))) // overlong
- goto error;
- }
- ptr += charlen;
- continue;
+ if (unlikely((c >> 6) != 2)) // not a continuation byte
+ return -1;
- error:
- if (rep == 0)
- return NULL;
- *ptr++ = rep;
- str = NULL;
+ cp = (cp << 6) | (ptr[i] & 0x3f); // append the low 6 bits of the continuation byte
}
- return str;
+ switch (charlen)
+ {
+ case 4:
+ if (unlikely(cp > 0x10FFFF)) // beyond Unicode
+ return -1;
+ case 3: // also reached by fall-through from case 4
+ if (unlikely(cp >= 0xD800 && cp < 0xC000)) // UTF-16 surrogate; NOTE(review): 0xD800 > 0xC000, so this condition can never be true -- upper bound presumably meant 0xE000
+ return -1;
+ case 2: // also reached by fall-through from cases 4 and 3
+ if (unlikely(cp < 128)) // ASCII overlong
+ return -1;
+ if (unlikely(cp < (1u << (5 * charlen - 3)))) // overlong; NOTE(review): bound is 0x1000 for 3 bytes and 0x20000 for 4 bytes, above the UTF-8 minima 0x800 / 0x10000 -- looks like valid characters get rejected, confirm
+ return -1;
+ }
+ *pwc = cp;
+ return charlen;
}
+
/**
* Replaces invalid/overlong UTF-8 sequences with question marks.
* Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
@@ -349,7 +350,19 @@ static char *CheckUTF8( char *str, char rep )
*/
char *EnsureUTF8( char *str )
{
- return CheckUTF8( str, '?' );
+ char *ret = str; // original pointer, returned unless a byte gets replaced
+ size_t n;
+ uint32_t cp; // output slot required by vlc_towc(); its value is not used here
+
+ while ((n = vlc_towc (str, &cp)) != 0) // vlc_towc() yields 0 at the terminating NUL
+ if (likely(n != (size_t)-1))
+ str += n; // valid character: step over all of its bytes
+ else
+ {
+ *str++ = '?'; // invalid sequence: overwrite one byte, then retry from the next byte
+ ret = NULL; // NULL return signals that the string was modified
+ }
+ return ret;
}
@@ -362,7 +375,15 @@ char *EnsureUTF8( char *str )
*/
const char *IsUTF8( const char *str )
{
- return CheckUTF8( (char *)str, 0 );
+ size_t n;
+ uint32_t cp; // output slot required by vlc_towc(); its value is not used here
+
+ while ((n = vlc_towc (str, &cp)) != 0) // vlc_towc() yields 0 at the terminating NUL
+ if (likely(n != (size_t)-1))
+ str += n; // valid character: step over all of its bytes
+ else
+ return NULL; // first invalid sequence: stop, input is left untouched
+ return str; // NOTE(review): str has been advanced, so this points at the terminating NUL, whereas the removed CheckUTF8() call returned the start of the string -- confirm callers only test for NULL
}
/**
More information about the vlc-commits
mailing list