[vlc-commits] Import vlc_towc() function from VLC 1.2
Rémi Denis-Courmont
git at videolan.org
Sun May 8 19:56:49 CEST 2011
vlc/vlc-1.1 | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Sun May 8 20:56:01 2011 +0300| [7b017ca7cb0c95f4314506c4bc2cf52243786a2f] | committer: Rémi Denis-Courmont
Import vlc_towc() function from VLC 1.2
> http://git.videolan.org/gitweb.cgi/vlc/vlc-1.1.git/?a=commit;h=7b017ca7cb0c95f4314506c4bc2cf52243786a2f
---
src/libvlc.h | 2 +
src/text/unicode.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 84 insertions(+), 0 deletions(-)
diff --git a/src/libvlc.h b/src/libvlc.h
index 236ae0c..7547a6c 100644
--- a/src/libvlc.h
+++ b/src/libvlc.h
@@ -38,6 +38,8 @@ extern const size_t libvlc_actions_count;
extern int vlc_InitActions (libvlc_int_t *);
extern void vlc_DeinitActions (libvlc_int_t *);
+size_t vlc_towc (const char *str, uint32_t *restrict pwc);
+
/*
* OS-specific initialization
*/
diff --git a/src/text/unicode.c b/src/text/unicode.c
index e3a29dd..cef4df6 100644
--- a/src/text/unicode.c
+++ b/src/text/unicode.c
@@ -283,6 +283,88 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
}
+/**
+ * Decodes the first character of a UTF-8 byte sequence into a code point.
+ *
+ * Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above
+ * U+10FFFF are rejected as invalid.
+ *
+ * @param str a UTF-8 byte sequence (must not be NULL). It is expected to be
+ *            NUL-terminated: decoding stops at the first byte that is not a
+ *            continuation byte.
+ * @param pwc location where the decoded code point is stored; it is only
+ *            written when the sequence is valid (including the empty string,
+ *            for which 0 is stored).
+ * @return 0 if str points to an empty string, i.e. the first byte is NUL;
+ * number of bytes that the first character occupies (from 1 to 4) otherwise;
+ * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
+ */
+size_t vlc_towc (const char *str, uint32_t *restrict pwc)
+{
+    const uint8_t *ptr = (const uint8_t *)str;
+    uint8_t c;
+    uint32_t cp;
+
+    assert (str != NULL);
+
+    c = *ptr;
+    if (unlikely(c > 0xF4)) // 0xF5..0xFF never occur in valid UTF-8
+        return -1;
+
+    /* Inverting the lead byte turns its leading 1 bits into leading 0 bits;
+     * clz8 is presumably an 8-bit count-leading-zeros, so charlen is the
+     * number of leading 1 bits, i.e. the sequence length for lead bytes. */
+    int charlen = clz8 (c ^ 0xFF);
+    switch (charlen)
+    {
+        case 0: // 7-bit ASCII character -> short cut
+            *pwc = c;
+            return c != '\0';
+
+        case 1: // continuation byte -> error
+            return -1;
+
+        case 2:
+            if (unlikely(c < 0xC2)) // ASCII overlong
+                return -1;
+            cp = (c & 0x1F) << 6;
+            break;
+
+        case 3:
+            cp = (c & 0x0F) << 12;
+            break;
+
+        case 4:
+            /* 3 payload bits sit above the 3 * 6 continuation bits */
+            cp = (c & 0x07) << 18;
+            break;
+
+        default:
+            /* Unreachable: c <= 0xF4 implies charlen <= 4. Fail cleanly
+             * rather than use cp uninitialized if assertions are off. */
+            assert (0);
+            return -1;
+    }
+
+    /* Unrolled continuation bytes decoding */
+    switch (charlen)
+    {
+        case 4:
+            c = *++ptr;
+            if (unlikely((c >> 6) != 2)) // not a continuation byte
+                return -1;
+            cp |= (c & 0x3f) << 12;
+
+            /* The remaining 12 low bits cannot lower the value, so the
+             * upper bound can already be checked here. */
+            if (unlikely(cp >= 0x110000)) // beyond Unicode range
+                return -1;
+            /* fall through */
+        case 3:
+            c = *++ptr;
+            if (unlikely((c >> 6) != 2)) // not a continuation byte
+                return -1;
+            cp |= (c & 0x3f) << 6;
+
+            if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
+                return -1;
+            /* Minimum code point is 0x800 for 3 bytes, 0x10000 for 4 */
+            if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
+                return -1;
+            /* fall through */
+        case 2:
+            c = *++ptr;
+            if (unlikely((c >> 6) != 2)) // not a continuation byte
+                return -1;
+            cp |= (c & 0x3f);
+            break;
+    }
+
+    *pwc = cp;
+    return charlen;
+}
+
+
static char *CheckUTF8( char *str, char rep )
{
uint8_t *ptr = (uint8_t *)str;
More information about the vlc-commits
mailing list