sub: sub-filter-regex and jsre: support ass-to-plaintext

Using --sub-filter-regex-plain (default: no)

The ass-to-plaintext functionality already existed in sd_ass.c, but
it's internal and uses a private buffer type, so a trivial utility
wrapper was added with a standard char*/bstr interface.

The plaintext can be multi-line, and the multi-line regexp flag is now
always set, but only affects plaintext (the ASS source is one line).
This commit is contained in:
Avi Halachmi (:avih) 2021-07-23 20:31:15 +03:00 committed by avih
parent 7c264950c0
commit 41650203c3
7 changed files with 31 additions and 3 deletions

View File

@ -2804,7 +2804,7 @@ Subtitles
List items are matched in order. If a regular expression matches, the
process is stopped, and the subtitle line is discarded. The text matched
against is, currently, always the ``Text`` field of ASS events (if the
against is, by default, the ``Text`` field of ASS events (if the
subtitle format is different, it is always converted). This may include
formatting tags. Matching is case-insensitive, but how this is done depends
on the libc, and most likely works in ASCII only. It does not work on
@ -2831,6 +2831,12 @@ Subtitles
Shares/affected-by all ``--sub-filter-regex-*`` control options (see below),
and also experimental. Requires only JavaScript support.
``--sub-filter-regex-plain=<yes|no>``
Whether to first convert the ASS "Text" field to plain-text (default: no).
This strips ASS tags and applies ASS directives, e.g. ``\N`` becomes a new-line.
If the result is multi-line, the regexp anchors ``^`` and ``$`` match at
each line, but any match still discards all lines.
``--sub-filter-regex-warn=<yes|no>``
Log dropped lines with warning log level, instead of verbose (default: no).
Helpful for testing.

View File

@ -218,6 +218,7 @@ const struct m_sub_options mp_sub_filter_opts = {
{"sub-filter-sdh", OPT_FLAG(sub_filter_SDH)},
{"sub-filter-sdh-harder", OPT_FLAG(sub_filter_SDH_harder)},
{"sub-filter-regex-enable", OPT_FLAG(rf_enable)},
{"sub-filter-regex-plain", OPT_FLAG(rf_plain)},
{"sub-filter-regex", OPT_STRINGLIST(rf_items)},
{"sub-filter-jsre", OPT_STRINGLIST(jsre_items)},
{"sub-filter-regex-warn", OPT_FLAG(rf_warn)},

View File

@ -114,6 +114,7 @@ struct mp_sub_filter_opts {
int sub_filter_SDH;
int sub_filter_SDH_harder;
int rf_enable;
int rf_plain;
char **rf_items;
char **jsre_items;
int rf_warn;

View File

@ -87,7 +87,7 @@ static bool jsre_init(struct sd_filter *ft)
for (int n = 0; ft->opts->jsre_items && ft->opts->jsre_items[n]; n++) {
char *item = ft->opts->jsre_items[n];
int err = p_regcomp(p->J, p->num_regexes, item, JS_REGEXP_I);
int err = p_regcomp(p->J, p->num_regexes, item, JS_REGEXP_I | JS_REGEXP_M);
if (err) {
MP_ERR(ft, "jsre: %s -- '%s'\n", get_err(p->J), item);
js_pop(p->J, 1);
@ -111,6 +111,9 @@ static struct demux_packet *jsre_filter(struct sd_filter *ft,
char *text = bstrto0(NULL, sd_ass_pkt_text(ft, pkt, p->offset));
bool drop = false;
if (ft->opts->rf_plain)
sd_ass_to_plaintext(text, strlen(text), text);
for (int n = 0; n < p->num_regexes; n++) {
int found, err = p_regexec(p->J, n, text, &found);
if (err == 0 && found) {

View File

@ -30,7 +30,7 @@ static bool rf_init(struct sd_filter *ft)
MP_TARRAY_GROW(p, p->regexes, p->num_regexes);
regex_t *preg = &p->regexes[p->num_regexes];
int err = regcomp(preg, item, REG_ICASE | REG_EXTENDED | REG_NOSUB);
int err = regcomp(preg, item, REG_ICASE | REG_EXTENDED | REG_NOSUB | REG_NEWLINE);
if (err) {
char errbuf[512];
regerror(err, preg, errbuf, sizeof(errbuf));
@ -63,6 +63,9 @@ static struct demux_packet *rf_filter(struct sd_filter *ft,
char *text = bstrto0(NULL, sd_ass_pkt_text(ft, pkt, p->offset));
bool drop = false;
if (ft->opts->rf_plain)
sd_ass_to_plaintext(text, strlen(text), text);
for (int n = 0; n < p->num_regexes; n++) {
int err = regexec(&p->regexes[n], text, 0, NULL, 0);
if (err == 0) {

View File

@ -101,4 +101,9 @@ int sd_ass_fmt_offset(const char *event_format);
// on malformed event: warns and returns (bstr){NULL,0}
bstr sd_ass_pkt_text(struct sd_filter *ft, struct demux_packet *pkt, int offset);
// Convert \0-terminated ASS "Text" content `in` to plaintext, stored at `out`,
// which has room for out_siz bytes. The conversion may be done in-place:
// out == in is allowed.
// Returns a bstr where result.start is `out` and result.len is
// MIN(out_siz, strlen(in)) or smaller.
// out[result.len] is set to \0 only if there's room (result.len < out_siz);
// otherwise this function does not terminate the output.
bstr sd_ass_to_plaintext(char *out, size_t out_siz, const char *in);
#endif

View File

@ -973,3 +973,12 @@ bstr sd_ass_pkt_text(struct sd_filter *ft, struct demux_packet *pkt, int offset)
}
return txt;
}
bstr sd_ass_to_plaintext(char *out, size_t out_siz, const char *in)
{
struct buf b = {out, out_siz, 0};
ass_to_plaintext(&b, in);
if (b.len < out_siz)
out[b.len] = 0;
return (bstr){out, b.len};
}