From e48ec7558912edd65b3303c07dc7470d65761171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 19 Oct 2016 15:50:57 +0300 Subject: [PATCH] url: handle IRIs properly in vlc_UrlParse() (fixes #17515) --- include/vlc_url.h | 8 ++++--- src/text/url.c | 55 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/include/vlc_url.h b/include/vlc_url.h index 405fc4aac0..5a20c27edc 100644 --- a/include/vlc_url.h +++ b/include/vlc_url.h @@ -156,7 +156,7 @@ struct vlc_url_t }; /** - * Splits an URL into parts. + * Parses an URI or IRI. * * Extracts the following parts from an URI string: * - scheme (i.e. protocol), @@ -167,8 +167,10 @@ struct vlc_url_t * - path (including the filename preceded by any and all directories) * - request parameters (excluding the leading question mark '?'). * - * If the host name uses IDN, it is decoded to ASCII, as appropriate for DNS - * resolution. If the host is an IPv6 address literal, brackets are stripped. + * The function accepts URIs, as well as UTF-8-encoded IRIs. For IRIs, the hier + * part (specifically, the host name) is assumed to be an IDN and is decoded to + * ASCII accordingly, so it can be used for DNS resolution. If the host is an + * IPv6 address literal, brackets are stripped. * * Any missing part is set to nul. For historical reasons, the target structure * is always initialized, even if parsing the URI string fails. 
diff --git a/src/text/url.c b/src/text/url.c index 821b2b0571..6116893c33 100644 --- a/src/text/url.c +++ b/src/text/url.c @@ -24,6 +24,7 @@ #endif #include +#include #include #include #include @@ -95,6 +96,8 @@ static bool isurihex(int c) || ((unsigned char)(c - 'a') < 6); } +static const char urihex[] = "0123456789ABCDEF"; + static char *encode_URI_bytes (const char *str, size_t *restrict lenp) { char *buf = malloc (3 * *lenp + 1); @@ -104,7 +107,6 @@ static char *encode_URI_bytes (const char *str, size_t *restrict lenp) char *out = buf; for (size_t i = 0; i < *lenp; i++) { - static const char hex[] = "0123456789ABCDEF"; unsigned char c = str[i]; if (isurisafe (c)) @@ -114,8 +116,8 @@ static char *encode_URI_bytes (const char *str, size_t *restrict lenp) else { *(out++) = '%'; - *(out++) = hex[c >> 4]; - *(out++) = hex[c & 0xf]; + *(out++) = urihex[c >> 4]; + *(out++) = urihex[c & 0xf]; } } @@ -323,6 +325,49 @@ out: static char *vlc_idna_to_ascii (const char *); +/* RFC3987 §3.1: percent-encode every non-ASCII byte; NULL on error */ +static char *vlc_iri2uri(const char *iri) +{ + size_t a = 0, u = 0; + + for (size_t i = 0; iri[i] != '\0'; i++) + { + unsigned char c = iri[i]; + + if (c < 128) + a++; + else + u++; + } + + if (unlikely((a + u) > (SIZE_MAX / 4))) + { + errno = ENOMEM; + return NULL; + } + + char *uri = malloc(a + 3 * u + 1), *p; + if (unlikely(uri == NULL)) + return NULL; + + for (p = uri; *iri != '\0'; iri++) + { + unsigned char c = *iri; + + if (c < 128) + *(p++) = c; + else + { + *(p++) = '%'; + *(p++) = urihex[c >> 4]; + *(p++) = urihex[c & 0xf]; + } + } + + *p = '\0'; + return uri; +} + static bool vlc_uri_component_validate(const char *str, const char *extras) { assert(str != NULL); @@ -372,7 +417,7 @@ int vlc_UrlParse(vlc_url_t *restrict url, const char *str) return -1; } - char *buf = strdup (str); + char *buf = vlc_iri2uri(str); if (unlikely(buf == NULL)) return -1; url->psz_buffer = buf; @@ -464,7 +509,7 @@ int vlc_UrlParse(vlc_url_t *restrict url, const char *str) if (next != NULL) 
*(next++) = '\0'; - url->psz_host = vlc_idna_to_ascii (cur); + url->psz_host = vlc_idna_to_ascii(vlc_uri_decode(cur)); } if (url->psz_host == NULL)