demux_mkv: improve robustness against broken files

Fixes test7.mkv from the Matroska test file collection, as well as some
real broken files I've found in the wild. (Unfortunately, true recovery
requires resetting the decoders and playback state with a manual seek,
but it's still better than just exiting.)

If there are broken EBML elements, try harder to skip them correctly.
Do this by searching for the next cluster element. The cluster element
intentionally has a long ID, so it's a suitable element for
resynchronizing (mkvmerge does something similar).

We know that data is corrupt if the ID or length fields of an element
are malformed. Additionally, if skipping an unknown element goes past
the end of the file, we assume it's corrupt and undo the seek. Do this
because it often happens that corrupt data is interpreted as correct
EBML elements. Since these elements will have a ridiculous values in
their length fields due to the large value range that is possible
(0-2^56-2), they will go past the end of the file. So instead of
skipping them (which would result in playback termination), try to
find the next cluster instead. (We still skip unknown elements that
are within the file, as this is needed for correct operation. Also, we
first execute the seek, because we don't really know where the file
ends. Doing it this way is better for unseekable streams too, because
it will still work in the non-error case.)

This is done as special case in the packet reading function only. On
the other hand, that's the only part of the file that's read after
initialization is done.
This commit is contained in:
wm4 2013-03-28 00:01:17 +01:00
parent 3533ee3ae4
commit ac1c5e6e18
3 changed files with 73 additions and 8 deletions

View File

@ -26,6 +26,7 @@
#include <ctype.h>
#include <inttypes.h>
#include <stdbool.h>
#include <assert.h>
#include <libavutil/common.h>
#include <libavutil/lzo.h>
@ -2228,10 +2229,12 @@ static int demux_mkv_fill_buffer(demuxer_t *demuxer, demux_stream_t *ds)
case EBML_ID_INVALID:
free(block);
return 0;
ebml_resync_cluster(s);
goto find_next_cluster;
default:
ebml_read_skip(s, &l);
if (ebml_read_skip_or_resync_cluster(s, &l) != 0)
goto find_next_cluster;
break;
}
mkv_d->blockgroup_size -= l + il;
@ -2290,18 +2293,24 @@ static int demux_mkv_fill_buffer(demuxer_t *demuxer, demux_stream_t *ds)
break;
case EBML_ID_INVALID:
return 0;
ebml_resync_cluster(s);
goto find_next_cluster;
default:
ebml_read_skip(s, &l);
default: ;
if (ebml_read_skip_or_resync_cluster(s, &l) != 0)
goto find_next_cluster;
break;
}
mkv_d->cluster_size -= l + il;
}
}
while (ebml_read_id(s, &il) != MATROSKA_ID_CLUSTER) {
ebml_read_skip(s, NULL);
find_next_cluster:
for (;;) {
uint32_t id = ebml_read_id(s, &il);
if (id == MATROSKA_ID_CLUSTER)
break;
ebml_read_skip_or_resync_cluster(s, NULL);
if (s->eof)
return 0;
}

View File

@ -246,7 +246,7 @@ char *ebml_read_utf8(stream_t *s, uint64_t *length)
}
/*
* Skip the next element.
* Skip the current element.
*/
int ebml_read_skip(stream_t *s, uint64_t *length)
{
@ -264,6 +264,60 @@ int ebml_read_skip(stream_t *s, uint64_t *length)
return 0;
}
/*
* Skip to (probable) next cluster (MATROSKA_ID_CLUSTER) element start position.
*/
int ebml_resync_cluster(stream_t *s)
{
int64_t pos = stream_tell(s);
uint32_t last_4_bytes = 0;
mp_msg(MSGT_DEMUX, MSGL_ERR, "[mkv] Corrupt file detected. "
"Trying to resync starting from position %"PRId64"...\n", pos);
while (!s->eof) {
// Assumes MATROSKA_ID_CLUSTER is 4 bytes, with no 0 bytes.
if (last_4_bytes == MATROSKA_ID_CLUSTER) {
mp_msg(MSGT_DEMUX, MSGL_ERR,
"[mkv] Cluster found at %"PRId64".\n", pos - 4);
stream_seek(s, pos - 4);
return 0;
}
last_4_bytes = (last_4_bytes << 8) | stream_read_char(s);
pos++;
}
return -1;
}
/*
* Skip the current element, or on error, call ebml_resync_cluster().
*/
int ebml_read_skip_or_resync_cluster(stream_t *s, uint64_t *length)
{
uint64_t len;
int l;
len = ebml_read_length(s, &l);
if (len == EBML_UINT_INVALID)
goto resync;
if (length)
*length = len + l;
int64_t pos = stream_tell(s);
stream_skip(s, len);
// When reading corrupted elements, len will often be a random high number,
// and stream_skip() will set EOF.
if (s->eof) {
stream_seek(s, pos);
goto resync;
}
return 0;
resync:
return ebml_resync_cluster(s) < 0 ? -1 : 1;
}
/*
* Read the next element, but only the header. The contents
* are supposed to be sub-elements which can be read separately.

View File

@ -99,6 +99,8 @@ double ebml_read_float (stream_t *s, uint64_t *length);
char *ebml_read_ascii (stream_t *s, uint64_t *length);
char *ebml_read_utf8 (stream_t *s, uint64_t *length);
int ebml_read_skip (stream_t *s, uint64_t *length);
int ebml_read_skip_or_resync_cluster(stream_t *s, uint64_t *length);
int ebml_resync_cluster(stream_t *s);
uint32_t ebml_read_master (stream_t *s, uint64_t *length);
int ebml_read_element(struct stream *s, struct ebml_parse_ctx *ctx,