cue: tolerate NBSP as whitespace

Apparently such .cue files exist. They fail both probing and parsing. To
make it worse, the sample at hand was encoded as Latin1.

One part of this is replacing bstr_lstrip() with a version that supports
NBSP. One could argue that bstr_lstrip() should always do this, but I
don't want to overdo it. There are many more Unicode abominations which
one could say it is supposed to handle, so it will stay ASCII instead
of going down this rabbit hole. I'm just assuming this cue sheet was
generated by some stupid software that inexplicably liked NBSPs (which
is how we justify a one-off fix). The new lstrip_whitespace() doesn't
look particularly efficient, but it doesn't have to be.

The second part is dealing with the fact that the charset is not
necessarily UTF-8. We don't want to do conversion before probing thinks
it knows it's a cue sheet (would probably make it more fragile all
around), so just make it work with Latin1 by assuming invalid code
points are Latin1. This fallback is part of why lstrip_whitespace() is
sort of roundabout.

(You could still rewrite it as a much more efficient state machine,
instead of using a slow and validating UTF-8 parser that is called per
codepoint. Starting to overthink this.)

Multimedia is terrible. Legacy charsets are terrible. Everything is
terrible.

Fixes: #7429
This commit is contained in:
wm4 2020-02-03 19:13:44 +01:00
parent 13624b5c7a
commit cbee577d0a
1 changed files with 31 additions and 5 deletions

View File

@ -62,20 +62,46 @@ static const struct {
{ -1 },
};
// Code points that count as leading whitespace in a cue sheet line: the ASCII
// whitespace characters plus 0xA0 (NBSP).
static const uint8_t spaces[] = {' ', '\f', '\n', '\r', '\t', '\v', 0xA0};

// Strip leading whitespace (any code point listed in spaces[]) from data and
// return the remainder. The input is consumed one code point at a time; the
// first code point that is not in spaces[] ends the scan.
static struct bstr lstrip_whitespace(struct bstr data)
{
    while (data.len) {
        bstr after = data;
        int cp = bstr_decode_utf8(data, &after);
        if (cp < 0) {
            // Decoding failed: fall back to treating the first byte as the
            // code point and step over exactly that one byte. This tolerates
            // Latin1 input, so probing (which does not convert charsets)
            // still works.
            cp = data.start[0];
            after.start++;
            after.len--;
        }

        // Linear search through the whitespace table.
        size_t idx = 0;
        while (idx < MP_ARRAY_SIZE(spaces) && spaces[idx] != cp)
            idx++;

        // Ran off the end of the table: cp is not whitespace, stop stripping.
        if (idx == MP_ARRAY_SIZE(spaces))
            break;

        data = after;
    }
    return data;
}
static enum cue_command read_cmd(struct bstr *data, struct bstr *out_params)
{
struct bstr line = bstr_strip_linebreaks(bstr_getline(*data, data));
line = bstr_lstrip(line);
line = lstrip_whitespace(line);
if (line.len == 0)
return CUE_EMPTY;
for (int n = 0; cue_command_strings[n].command != -1; n++) {
struct bstr name = bstr0(cue_command_strings[n].text);
if (bstr_case_startswith(line, name)) {
struct bstr rest = bstr_cut(line, name.len);
if (rest.len && !strchr(WHITESPACE, rest.start[0]))
struct bstr par = lstrip_whitespace(rest);
if (rest.len && par.len == rest.len)
continue;
if (out_params)
*out_params = bstr_lstrip(rest);
*out_params = par;
return cue_command_strings[n].command;
}
}
@ -94,7 +120,7 @@ static bool eat_char(struct bstr *data, char ch)
static char *read_quoted(void *talloc_ctx, struct bstr *data)
{
*data = bstr_lstrip(*data);
*data = lstrip_whitespace(*data);
if (!eat_char(data, '"'))
return NULL;
int end = bstrchr(*data, '"');
@ -118,7 +144,7 @@ static struct bstr strip_quotes(struct bstr data)
// Return -1 on failure.
static int read_int(struct bstr *data, bool two_digit)
{
*data = bstr_lstrip(*data);
*data = lstrip_whitespace(*data);
if (data->len && data->start[0] == '-')
return -1;
struct bstr s = *data;