[vlc-commits] Refactor EnsureUTF8 and IsUTF8
Rémi Denis-Courmont
git at videolan.org
Sun May 8 20:00:21 CEST 2011
vlc/vlc-1.1 | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Fri Oct 8 20:30:11 2010 +0300| [81eadc82a1300a0a8b99ee66598d8c7562fe810b] | committer: Rémi Denis-Courmont
Refactor EnsureUTF8 and IsUTF8
(cherry picked from commit 9ce1a13fb8fa77a38e0dfdfe3387829e1df3f085)
Conflicts:
src/text/unicode.c
> http://git.videolan.org/gitweb.cgi/vlc/vlc-1.1.git/?a=commit;h=81eadc82a1300a0a8b99ee66598d8c7562fe810b
---
src/text/unicode.c | 92 +++++++++++++---------------------------------------
1 files changed, 23 insertions(+), 69 deletions(-)
diff --git a/src/text/unicode.c b/src/text/unicode.c
index cef4df6..c9075a8 100644
--- a/src/text/unicode.c
+++ b/src/text/unicode.c
@@ -2,7 +2,7 @@
* unicode.c: Unicode <-> locale functions
*****************************************************************************
* Copyright (C) 2005-2006 the VideoLAN team
- * Copyright © 2005-2008 Rémi Denis-Courmont
+ * Copyright © 2005-2010 Rémi Denis-Courmont
*
* Authors: Rémi Denis-Courmont <rem # videolan.org>
*
@@ -365,72 +365,6 @@ size_t vlc_towc (const char *str, uint32_t *restrict pwc)
return charlen;
}
-static char *CheckUTF8( char *str, char rep )
-{
- uint8_t *ptr = (uint8_t *)str;
- assert (str != NULL);
-
- for (;;)
- {
- uint8_t c = ptr[0];
-
- if (c == '\0')
- break;
-
- if (c > 0xF4)
- goto error;
-
- int charlen = clz8 (c ^ 0xFF);
- switch (charlen)
- {
- case 0: // 7-bit ASCII character -> OK
- ptr++;
- continue;
-
- case 1: // continuation byte -> error
- goto error;
- }
-
- assert (charlen >= 2 && charlen <= 4);
-
- uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
- for (int i = 1; i < charlen; i++)
- {
- assert (cp < (1 << 26));
- c = ptr[i];
-
- if ((c >> 6) != 2) // not a continuation byte
- goto error;
-
- cp = (cp << 6) | (ptr[i] & 0x3f);
- }
-
- switch (charlen)
- {
- case 4:
- if (cp > 0x10FFFF) // beyond Unicode
- goto error;
- case 3:
- if (cp >= 0xD800 && cp < 0xC000) // UTF-16 surrogate
- goto error;
- case 2:
- if (cp < 128) // ASCII overlong
- goto error;
- if (cp < (1u << (5 * charlen - 4))) // overlong
- goto error;
- }
- ptr += charlen;
- continue;
-
- error:
- if (rep == 0)
- return NULL;
- *ptr++ = rep;
- str = NULL;
- }
-
- return str;
-}
/**
* Replaces invalid/overlong UTF-8 sequences with question marks.
@@ -441,7 +375,19 @@ static char *CheckUTF8( char *str, char rep )
*/
char *EnsureUTF8( char *str )
{
- return CheckUTF8( str, '?' );
+ char *ret = str;
+ size_t n;
+ uint32_t cp;
+
+ while ((n = vlc_towc (str, &cp)) != 0)
+ if (likely(n != (size_t)-1))
+ str += n;
+ else
+ {
+ *str++ = '?';
+ ret = NULL;
+ }
+ return ret;
}
@@ -454,7 +400,15 @@ char *EnsureUTF8( char *str )
*/
const char *IsUTF8( const char *str )
{
- return CheckUTF8( (char *)str, 0 );
+ size_t n;
+ uint32_t cp;
+
+ while ((n = vlc_towc (str, &cp)) != 0)
+ if (likely(n != (size_t)-1))
+ str += n;
+ else
+ return NULL;
+ return str;
}
/**
More information about the vlc-commits mailing list