[vlc-commits] url: handle IRIs properly in vlc_UrlParse() (fixes #17515)

Rémi Denis-Courmont git at videolan.org
Wed Oct 19 14:56:12 CEST 2016


vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Wed Oct 19 15:50:57 2016 +0300| [e48ec7558912edd65b3303c07dc7470d65761171] | committer: Rémi Denis-Courmont

url: handle IRIs properly in vlc_UrlParse() (fixes #17515)

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=e48ec7558912edd65b3303c07dc7470d65761171
---

 include/vlc_url.h |  8 +++++---
 src/text/url.c    | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/include/vlc_url.h b/include/vlc_url.h
index 405fc4a..5a20c27 100644
--- a/include/vlc_url.h
+++ b/include/vlc_url.h
@@ -156,7 +156,7 @@ struct vlc_url_t
 };
 
 /**
- * Splits an URL into parts.
+ * Parses an URI or IRI.
  *
  * Extracts the following parts from an URI string:
  *  - scheme (i.e. protocol),
@@ -167,8 +167,10 @@ struct vlc_url_t
  *  - path (including the filename preceded by any and all directories)
  *  - request parameters (excluding the leading question mark '?').
  *
- * If the host name uses IDN, it is decoded to ASCII, as appropriate for DNS
- * resolution. If the host is an IPv6 address literal, brackets are stripped.
+ * The function accepts URIs, as well as UTF-8-encoded IRIs. For IRIs, the hier
+ * part (specifically, the host name) is assumed to be an IDN and is decoded to
+ * ASCII accordingly, so it can be used for DNS resolution. If the host is an
+ * IPv6 address literal, brackets are stripped.
  *
  * Any missing part is set to nul. For historical reasons, the target structure
  * is always initialized, even if parsing the URI string fails.
diff --git a/src/text/url.c b/src/text/url.c
index 821b2b0..6116893 100644
--- a/src/text/url.c
+++ b/src/text/url.c
@@ -24,6 +24,7 @@
 #endif
 
 #include <errno.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -95,6 +96,8 @@ static bool isurihex(int c)
         || ((unsigned char)(c - 'a') < 6);
 }
 
+static const char urihex[] = "0123456789ABCDEF";
+
 static char *encode_URI_bytes (const char *str, size_t *restrict lenp)
 {
     char *buf = malloc (3 * *lenp + 1);
@@ -104,7 +107,6 @@ static char *encode_URI_bytes (const char *str, size_t *restrict lenp)
     char *out = buf;
     for (size_t i = 0; i < *lenp; i++)
     {
-        static const char hex[] = "0123456789ABCDEF";
         unsigned char c = str[i];
 
         if (isurisafe (c))
@@ -114,8 +116,8 @@ static char *encode_URI_bytes (const char *str, size_t *restrict lenp)
         else
         {
             *(out++) = '%';
-            *(out++) = hex[c >> 4];
-            *(out++) = hex[c & 0xf];
+            *(out++) = urihex[c >> 4];
+            *(out++) = urihex[c & 0xf];
         }
     }
 
@@ -323,6 +325,49 @@ out:
 
 static char *vlc_idna_to_ascii (const char *);
 
+/* RFC3987 §3.1: turn an IRI into a URI by %-encoding every byte >= 128. */
+static char *vlc_iri2uri(const char *iri)
+{
+    size_t a = 0, u = 0; /* a: bytes < 128 (copied), u: bytes >= 128 (encoded) */
+
+    for (size_t i = 0; iri[i] != '\0'; i++) /* first pass: size the output */
+    {
+        unsigned char c = iri[i];
+
+        if (c < 128)
+            a++;
+        else
+            u++;
+    }
+
+    if (unlikely((a + u) > (SIZE_MAX / 4))) /* keeps a + 3 * u + 1 from wrapping */
+    {
+        errno = ENOMEM;
+        return NULL;
+    }
+
+    char *uri = malloc(a + 3 * u + 1), *p; /* "%XX" per encoded byte, plus NUL */
+    if (unlikely(uri == NULL))
+        return NULL;
+
+    for (p = uri; *iri != '\0'; iri++) /* second pass: emit the URI */
+    {
+        unsigned char c = *iri;
+
+        if (c < 128)
+            *(p++) = c;
+        else
+        {
+            *(p++) = '%';
+            *(p++) = urihex[c >> 4]; /* high nibble */
+            *(p++) = urihex[c & 0xf]; /* low nibble */
+        }
+    }
+
+    *p = '\0';
+    return uri; /* malloc()ed buffer; NULL was returned above on failure */
+}
+
 static bool vlc_uri_component_validate(const char *str, const char *extras)
 {
     assert(str != NULL);
@@ -372,7 +417,7 @@ int vlc_UrlParse(vlc_url_t *restrict url, const char *str)
         return -1;
     }
 
-    char *buf = strdup (str);
+    char *buf = vlc_iri2uri(str);
     if (unlikely(buf == NULL))
         return -1;
     url->psz_buffer = buf;
@@ -464,7 +509,7 @@ int vlc_UrlParse(vlc_url_t *restrict url, const char *str)
             if (next != NULL)
                 *(next++) = '\0';
 
-            url->psz_host = vlc_idna_to_ascii (cur);
+            url->psz_host = vlc_idna_to_ascii(vlc_uri_decode(cur));
         }
 
         if (url->psz_host == NULL)



More information about the vlc-commits mailing list