misc: add language-matching utilities

2021-05-26 17:46:56 -05:00 · 2021-05-26 17:46:56 -05:00 · 8c8d97c26c
parent 1669c4698d
commit 8c8d97c26c
4 changed files with 393 additions and 0 deletions
--- a/meson.build
+++ b/meson.build
@ -131,6 +131,7 @@ sources = files(
    'misc/charset_conv.c',
    'misc/dispatch.c',
    'misc/json.c',
+    'misc/language.c',
    'misc/natural_sort.c',
    'misc/node.c',
    'misc/random.c',
--- a/misc/language.c
+++ b/misc/language.c
@ -0,0 +1,362 @@
+/*
+ * Language code utility functions
+ *
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "language.h"
+
+#include "common/common.h"
+#include "osdep/strnlen.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+// Lookup table used by canonicalize() to replace a language code with the canonical
+// 3-letter code that comparisons are performed on.
+//
+// The table is searched with bsearch(), so the entries MUST stay sorted by "match"
+// (plain byte order of the lowercase codes; a code sorts before any longer code
+// it is a prefix of, e.g. "bo" before "bod").
+static const struct lang {
+    char match[4];      // 2- or 3-letter code to look up, NUL-terminated
+    char canonical[4];  // 3-letter code it is replaced with, NUL-terminated
+} langmap[] = {
+    {"aa", "aar"},
+    {"ab", "abk"},
+    {"ae", "ave"},
+    {"af", "afr"},
+    {"ak", "aka"},
+    {"am", "amh"},
+    {"an", "arg"},
+    {"ar", "ara"},
+    {"as", "asm"},
+    {"av", "ava"},
+    {"ay", "aym"},
+    {"az", "aze"},
+    {"ba", "bak"},
+    {"be", "bel"},
+    {"bg", "bul"},
+    {"bh", "bih"},
+    {"bi", "bis"},
+    {"bm", "bam"},
+    {"bn", "ben"},
+    {"bo", "tib"},
+    {"bod", "tib"},
+    {"br", "bre"},
+    {"bs", "bos"},
+    {"ca", "cat"},
+    {"ce", "che"},
+    {"ces", "cze"},
+    {"ch", "cha"},
+    {"co", "cos"},
+    {"cr", "cre"},
+    {"cs", "cze"},
+    {"cu", "chu"},
+    {"cv", "chv"},
+    {"cy", "wel"},
+    {"cym", "wel"},
+    {"da", "dan"},
+    {"de", "ger"},
+    {"deu", "ger"},
+    {"dv", "div"},
+    {"dz", "dzo"},
+    {"ee", "ewe"},
+    {"el", "gre"},
+    {"ell", "gre"},
+    {"en", "eng"},
+    {"eo", "epo"},
+    {"es", "spa"},
+    {"et", "est"},
+    {"eu", "baq"},
+    {"eus", "baq"},
+    {"fa", "per"},
+    {"fas", "per"},
+    {"ff", "ful"},
+    {"fi", "fin"},
+    {"fj", "fij"},
+    {"fo", "fao"},
+    {"fr", "fre"},
+    {"fra", "fre"},
+    {"fy", "fry"},
+    {"ga", "gle"},
+    {"gd", "gla"},
+    {"gl", "glg"},
+    {"gn", "grn"},
+    {"gu", "guj"},
+    {"gv", "glv"},
+    {"ha", "hau"},
+    {"he", "heb"},
+    {"hi", "hin"},
+    {"ho", "hmo"},
+    {"hr", "hrv"},
+    {"ht", "hat"},
+    {"hu", "hun"},
+    {"hy", "arm"},
+    {"hye", "arm"},
+    {"hz", "her"},
+    {"ia", "ina"},
+    {"id", "ind"},
+    {"ie", "ile"},
+    {"ig", "ibo"},
+    {"ii", "iii"},
+    {"ik", "ipk"},
+    {"io", "ido"},
+    {"is", "ice"},
+    {"isl", "ice"},
+    {"it", "ita"},
+    {"iu", "iku"},
+    {"ja", "jpn"},
+    {"jv", "jav"},
+    {"ka", "geo"},
+    {"kat", "geo"},
+    {"kg", "kon"},
+    {"ki", "kik"},
+    {"kj", "kua"},
+    {"kk", "kaz"},
+    {"kl", "kal"},
+    {"km", "khm"},
+    {"kn", "kan"},
+    {"ko", "kor"},
+    {"kr", "kau"},
+    {"ks", "kas"},
+    {"ku", "kur"},
+    {"kv", "kom"},
+    {"kw", "cor"},
+    {"ky", "kir"},
+    {"la", "lat"},
+    {"lb", "ltz"},
+    {"lg", "lug"},
+    {"li", "lim"},
+    {"ln", "lin"},
+    {"lo", "lao"},
+    {"lt", "lit"},
+    {"lu", "lub"},
+    {"lv", "lav"},
+    {"mg", "mlg"},
+    {"mh", "mah"},
+    {"mi", "mao"},
+    {"mk", "mac"},
+    {"mkd", "mac"},
+    {"ml", "mal"},
+    {"mn", "mon"},
+    {"mr", "mar"},
+    {"mri", "mao"},
+    {"ms", "may"},
+    {"msa", "may"},
+    {"mt", "mlt"},
+    {"my", "bur"},
+    {"mya", "bur"},
+    {"na", "nau"},
+    {"nb", "nob"},
+    {"nd", "nde"},
+    {"ne", "nep"},
+    {"ng", "ndo"},
+    {"nl", "dut"},
+    {"nld", "dut"},
+    {"nn", "nno"},
+    {"no", "nor"},
+    {"nr", "nbl"},
+    {"nv", "nav"},
+    {"ny", "nya"},
+    {"oc", "oci"},
+    {"oj", "oji"},
+    {"om", "orm"},
+    {"or", "ori"},
+    {"os", "oss"},
+    {"pa", "pan"},
+    {"pi", "pli"},
+    {"pl", "pol"},
+    {"ps", "pus"},
+    {"pt", "por"},
+    {"qu", "que"},
+    {"rm", "roh"},
+    {"rn", "run"},
+    {"ro", "rum"},
+    {"ron", "rum"},
+    {"ru", "rus"},
+    {"rw", "kin"},
+    {"sa", "san"},
+    {"sc", "srd"},
+    {"sd", "snd"},
+    {"se", "sme"},
+    {"sg", "sag"},
+    {"si", "sin"},
+    {"sk", "slo"},
+    {"sl", "slv"},
+    {"slk", "slo"},
+    {"sm", "smo"},
+    {"sn", "sna"},
+    {"so", "som"},
+    {"sq", "alb"},
+    {"sqi", "alb"},
+    {"sr", "srp"},
+    {"ss", "ssw"},
+    {"st", "sot"},
+    {"su", "sun"},
+    {"sv", "swe"},
+    {"sw", "swa"},
+    {"ta", "tam"},
+    {"te", "tel"},
+    {"tg", "tgk"},
+    {"th", "tha"},
+    {"ti", "tir"},
+    {"tk", "tuk"},
+    {"tl", "tgl"},
+    {"tn", "tsn"},
+    {"to", "ton"},
+    {"tr", "tur"},
+    {"ts", "tso"},
+    {"tt", "tat"},
+    {"tw", "twi"},
+    {"ty", "tah"},
+    {"ug", "uig"},
+    {"uk", "ukr"},
+    {"ur", "urd"},
+    {"uz", "uzb"},
+    {"ve", "ven"},
+    {"vi", "vie"},
+    {"vo", "vol"},
+    {"wa", "wln"},
+    {"wo", "wol"},
+    {"xh", "xho"},
+    {"yi", "yid"},
+    {"yo", "yor"},
+    {"za", "zha"},
+    {"zh", "chi"},
+    {"zho", "chi"},
+    {"zu", "zul"},
+};
+
+// Search key handed to bsearch(): a language subtag that is delimited by length
+// rather than by a NUL terminator (it usually points into a longer tag string).
+// Field order matters: canonicalize() fills this with a positional initializer.
+struct langsearch {
+    const char *str;  // first character of the subtag
+    size_t size;      // number of characters belonging to the subtag
+};
+
+// bsearch() comparator ordering a search key against one langmap entry.
+//
+//   s: the search key, a struct langsearch (length-delimited subtag)
+//   k: the table element, a struct lang (NUL-terminated match code)
+//
+// The comparison is case-insensitive. Returns a negative value, zero or a positive
+// value if the search string sorts before, equal to or after the entry's match code.
+static int lang_compare(const void *s, const void *k)
+{
+    const struct langsearch *search = s;
+    const struct lang *key = k;
+
+    int ret = strncasecmp(search->str, key->match, search->size);
+    // All search->size characters were equal but the table entry continues: the
+    // search string is a proper prefix of the entry (e.g. "bo" vs "bod") and thus
+    // sorts BEFORE it. Reporting "after" here would steer bsearch() away from the
+    // shorter entry, which precedes the longer one in the table.
+    if (!ret && search->size < sizeof(key->match) && key->match[search->size])
+        return -1;
+    return ret;
+}
+
+// Replace a language subtag by its canonical 3-letter code if langmap lists it.
+//
+//   lang: in/out; points to the first character of the subtag
+//   size: in/out; length of the subtag in characters
+//
+// On a successful lookup *lang is redirected to the entry's canonical code (static
+// storage inside langmap) and *size is set to that code's length. If the subtag is
+// not listed, both values are left untouched.
+static void canonicalize(const char **lang, size_t *size)
+{
+    // A subtag longer than the match field cannot be in the table
+    if (*size > sizeof(langmap[0].match))
+        return;
+
+    struct langsearch search = {*lang, *size};
+    // langmap is const, so keep the result pointer const as well
+    const struct lang *l = bsearch(&search, langmap, MP_ARRAY_SIZE(langmap), sizeof(langmap[0]),
+                                   &lang_compare);
+
+    if (l) {
+        *lang = l->canonical;
+        *size = strnlen(l->canonical, sizeof(l->canonical));
+    }
+}
+
+// Case-insensitive equality test for two length-delimited subtags.
+// l1/s1 and l2/s2 are each a start pointer and a character count; the subtags
+// are equal only if they have the same length and the same letters.
+static bool tag_matches(const char *l1, size_t s1, const char *l2, size_t s2)
+{
+    if (s1 != s2)
+        return false;
+
+    return strncasecmp(l1, l2, s1) == 0;
+}
+
+// Score how well the language tag l2 matches the user-specified tag l1.
+//
+// Both tags are split into subtags at '-' or '_' and compared case-insensitively.
+// The first subtag of each is canonicalized through langmap (so e.g. "en" and
+// "eng" are treated alike); if these do not match, the result is 0.
+// Every further subtag of l1 that also occurs among the further subtags of l2
+// raises the score, with subtags earlier in l1 weighing more than later ones.
+// Subtags following an "x" subtag are private-use and are only compared against
+// private-use subtags of the other tag.
+//
+// Returns 0 if there is no match (including NULL or empty input), otherwise a
+// score from 1 up to LANGUAGE_SCORE_MAX.
+int mp_match_lang_single(const char *l1, const char *l2)
+{
+    // We never consider null or empty strings to match
+    if (!l1 || !l2 || !*l1 || !*l2)
+        return 0;
+
+    // The first subtag should always be a language; canonicalize to 3-letter ISO 639-2B (arbitrarily chosen)
+    size_t s1 = strcspn(l1, "-_");
+    size_t s2 = strcspn(l2, "-_");
+
+    const char *l1c = l1;
+    const char *l2c = l2;
+    size_t s1c = s1;
+    size_t s2c = s2;
+
+    canonicalize(&l1c, &s1c);
+    canonicalize(&l2c, &s2c);
+
+    // If the first subtags don't match, we have no match at all
+    if (!tag_matches(l1c, s1c, l2c, s2c))
+        return 0;
+
+    // Attempt to match each subtag in each string against each in the other
+    int score = 1;      // the language itself matched
+    bool x1 = false;    // set once l1 has entered its private-use ("x") section
+    int count = 0;      // index of the current non-"x" l1 subtag, used for weighting
+    for (;;) {
+        // Step over the previous subtag and any separators following it
+        l1 += s1;
+
+        while (*l1 == '-' || *l1 == '_')
+            l1++;
+
+        if (!*l1)
+            break;
+
+        s1 = strcspn(l1, "-_");
+        if (tag_matches(l1, s1, "x", 1)) {
+            // The "x" marker itself is not scored and does not consume a weight slot
+            x1 = true;
+            continue;
+        }
+
+        // Remember where l2's subtags start so the scan can restart for the next l1 subtag
+        const char *l2o = l2;
+        size_t s2o = s2;
+        bool x2 = false;
+        for (;;) {
+            l2 += s2;
+
+            while (*l2 == '-' || *l2 == '_')
+                l2++;
+
+            if (!*l2)
+                break;
+
+            s2 = strcspn(l2, "-_");
+            if (tag_matches(l2, s2, "x", 1)) {
+                x2 = true;
+                // A regular l1 subtag cannot match anything in l2's private-use section
+                if (!x1)
+                    break;
+                continue;
+            }
+
+            // Private-use subtags only match against other private-use subtags
+            if (x1 && !x2)
+                continue;
+
+            // Compare the current subtags themselves. (The canonicalized first subtags
+            // are already known to be equal at this point, so they must not be used here.)
+            if (tag_matches(l1, s1, l2, s2)) {
+                // Matches for subtags earlier in the user's string take priority over later ones,
+                // for up to LANGUAGE_SCORE_BITS subtags
+                int shift = (LANGUAGE_SCORE_BITS - count - 1);
+                if (shift < 0)
+                    shift = 0;
+                score += (1 << shift);
+
+                if (score >= LANGUAGE_SCORE_MAX)
+                    return LANGUAGE_SCORE_MAX;
+            }
+        }
+
+        l2 = l2o;
+        s2 = s2o;
+
+        count++;
+    }
+
+    return score;
+}
--- a/misc/language.h
+++ b/misc/language.h
@ -0,0 +1,29 @@
+/*
+ * Language code utility functions
+ *
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MP_LANGUAGE_H
+#define MP_LANGUAGE_H
+
+// Number of l1 subtags (following the language subtag) that receive a distinct,
+// decreasing weight in the score; matches for later subtags all get the lowest weight.
+#define LANGUAGE_SCORE_BITS 16
+// Largest score mp_match_lang_single() returns; higher totals are clamped to it.
+#define LANGUAGE_SCORE_MAX (1 << LANGUAGE_SCORE_BITS)
+
+// Where applicable, l1 is the user-specified code and l2 is the code being checked against it
+// Returns 0 if either string is NULL or empty, or if the leading language subtags
+// do not match; otherwise a positive score of at most LANGUAGE_SCORE_MAX.
+int mp_match_lang_single(const char *l1, const char *l2);
+
+#endif /* MP_LANGUAGE_H */
--- a/wscript_build.py
+++ b/wscript_build.py
@ -355,6 +355,7 @@ def build(ctx):
        ( "misc/dispatch.c" ),
        ( "misc/jni.c",                          "android" ),
        ( "misc/json.c" ),
+        ( "misc/language.c" ),
        ( "misc/natural_sort.c" ),
        ( "misc/node.c" ),
        ( "misc/rendezvous.c" ),