misc: add language-matching utilities

This commit is contained in:
rcombs 2021-05-26 17:46:56 -05:00 committed by sfan5
parent 1669c4698d
commit 8c8d97c26c
4 changed files with 393 additions and 0 deletions

View File

@ -131,6 +131,7 @@ sources = files(
'misc/charset_conv.c',
'misc/dispatch.c',
'misc/json.c',
'misc/language.c',
'misc/natural_sort.c',
'misc/node.c',
'misc/random.c',

362
misc/language.c Normal file
View File

@ -0,0 +1,362 @@
/*
* Language code utility functions
*
* This file is part of mpv.
*
* mpv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#include "language.h"
#include "common/common.h"
#include "osdep/strnlen.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
// Lookup table used by canonicalize() to rewrite a language subtag into the
// single spelling used for comparisons. Keys are the two-letter codes plus the
// alternate three-letter spellings of languages that have two three-letter codes.
//
// The table is searched with bsearch(), so entries MUST stay sorted by `match`
// in ascending byte order (a key that is a prefix of another sorts first,
// e.g. "de" before "deu").
static const struct lang {
// Code to look up; at most 3 characters plus the terminating NUL
char match[4];
// Replacement code; at most 3 characters plus the terminating NUL
char canonical[4];
} langmap[] = {
{"aa", "aar"},
{"ab", "abk"},
{"ae", "ave"},
{"af", "afr"},
{"ak", "aka"},
{"am", "amh"},
{"an", "arg"},
{"ar", "ara"},
{"as", "asm"},
{"av", "ava"},
{"ay", "aym"},
{"az", "aze"},
{"ba", "bak"},
{"be", "bel"},
{"bg", "bul"},
{"bh", "bih"},
{"bi", "bis"},
{"bm", "bam"},
{"bn", "ben"},
{"bo", "tib"},
{"bod", "tib"},
{"br", "bre"},
{"bs", "bos"},
{"ca", "cat"},
{"ce", "che"},
{"ces", "cze"},
{"ch", "cha"},
{"co", "cos"},
{"cr", "cre"},
{"cs", "cze"},
{"cu", "chu"},
{"cv", "chv"},
{"cy", "wel"},
{"cym", "wel"},
{"da", "dan"},
{"de", "ger"},
{"deu", "ger"},
{"dv", "div"},
{"dz", "dzo"},
{"ee", "ewe"},
{"el", "gre"},
{"ell", "gre"},
{"en", "eng"},
{"eo", "epo"},
{"es", "spa"},
{"et", "est"},
{"eu", "baq"},
{"eus", "baq"},
{"fa", "per"},
{"fas", "per"},
{"ff", "ful"},
{"fi", "fin"},
{"fj", "fij"},
{"fo", "fao"},
{"fr", "fre"},
{"fra", "fre"},
{"fy", "fry"},
{"ga", "gle"},
{"gd", "gla"},
{"gl", "glg"},
{"gn", "grn"},
{"gu", "guj"},
{"gv", "glv"},
{"ha", "hau"},
{"he", "heb"},
{"hi", "hin"},
{"ho", "hmo"},
{"hr", "hrv"},
{"ht", "hat"},
{"hu", "hun"},
{"hy", "arm"},
{"hye", "arm"},
{"hz", "her"},
{"ia", "ina"},
{"id", "ind"},
{"ie", "ile"},
{"ig", "ibo"},
{"ii", "iii"},
{"ik", "ipk"},
{"io", "ido"},
{"is", "ice"},
{"isl", "ice"},
{"it", "ita"},
{"iu", "iku"},
{"ja", "jpn"},
{"jv", "jav"},
{"ka", "geo"},
{"kat", "geo"},
{"kg", "kon"},
{"ki", "kik"},
{"kj", "kua"},
{"kk", "kaz"},
{"kl", "kal"},
{"km", "khm"},
{"kn", "kan"},
{"ko", "kor"},
{"kr", "kau"},
{"ks", "kas"},
{"ku", "kur"},
{"kv", "kom"},
{"kw", "cor"},
{"ky", "kir"},
{"la", "lat"},
{"lb", "ltz"},
{"lg", "lug"},
{"li", "lim"},
{"ln", "lin"},
{"lo", "lao"},
{"lt", "lit"},
{"lu", "lub"},
{"lv", "lav"},
{"mg", "mlg"},
{"mh", "mah"},
{"mi", "mao"},
{"mk", "mac"},
{"mkd", "mac"},
{"ml", "mal"},
{"mn", "mon"},
{"mr", "mar"},
{"mri", "mao"},
{"ms", "may"},
{"msa", "may"},
{"mt", "mlt"},
{"my", "bur"},
{"mya", "bur"},
{"na", "nau"},
{"nb", "nob"},
{"nd", "nde"},
{"ne", "nep"},
{"ng", "ndo"},
{"nl", "dut"},
{"nld", "dut"},
{"nn", "nno"},
{"no", "nor"},
{"nr", "nbl"},
{"nv", "nav"},
{"ny", "nya"},
{"oc", "oci"},
{"oj", "oji"},
{"om", "orm"},
{"or", "ori"},
{"os", "oss"},
{"pa", "pan"},
{"pi", "pli"},
{"pl", "pol"},
{"ps", "pus"},
{"pt", "por"},
{"qu", "que"},
{"rm", "roh"},
{"rn", "run"},
{"ro", "rum"},
{"ron", "rum"},
{"ru", "rus"},
{"rw", "kin"},
{"sa", "san"},
{"sc", "srd"},
{"sd", "snd"},
{"se", "sme"},
{"sg", "sag"},
{"si", "sin"},
{"sk", "slo"},
{"sl", "slv"},
{"slk", "slo"},
{"sm", "smo"},
{"sn", "sna"},
{"so", "som"},
{"sq", "alb"},
{"sqi", "alb"},
{"sr", "srp"},
{"ss", "ssw"},
{"st", "sot"},
{"su", "sun"},
{"sv", "swe"},
{"sw", "swa"},
{"ta", "tam"},
{"te", "tel"},
{"tg", "tgk"},
{"th", "tha"},
{"ti", "tir"},
{"tk", "tuk"},
{"tl", "tgl"},
{"tn", "tsn"},
{"to", "ton"},
{"tr", "tur"},
{"ts", "tso"},
{"tt", "tat"},
{"tw", "twi"},
{"ty", "tah"},
{"ug", "uig"},
{"uk", "ukr"},
{"ur", "urd"},
{"uz", "uzb"},
{"ve", "ven"},
{"vi", "vie"},
{"vo", "vol"},
{"wa", "wln"},
{"wo", "wol"},
{"xh", "xho"},
{"yi", "yid"},
{"yo", "yor"},
{"za", "zha"},
{"zh", "chi"},
{"zho", "chi"},
{"zu", "zul"},
};
// Search key passed to bsearch() by canonicalize(): a length-delimited
// language subtag (the text is not required to be NUL-terminated at `size`).
struct langsearch {
// First byte of the subtag
const char *str;
// Number of bytes of `str` that belong to the subtag
size_t size;
};
/*
 * bsearch() comparison callback for langmap.
 *
 * s: the search key, a `struct langsearch` (length-delimited subtag)
 * k: the table element, a `struct lang`
 *
 * Returns <0, 0 or >0 when the search subtag sorts before, equal to or after
 * the element's `match` string, compared case-insensitively.
 */
static int lang_compare(const void *s, const void *k)
{
    const struct langsearch *search = s;
    const struct lang *key = k;

    // Only the first search->size bytes of the search string are significant.
    int ret = strncasecmp(search->str, key->match, search->size);

    // All significant bytes were equal but the table key has more characters:
    // the search string is a strict prefix of the key and therefore sorts
    // BEFORE it (e.g. "de" < "deu"). Reporting "greater" here would steer the
    // binary search away from the shorter entry.
    if (!ret && search->size < sizeof(key->match) && key->match[search->size])
        return -1;

    return ret;
}
static void canonicalize(const char **lang, size_t *size)
{
if (*size > sizeof(langmap[0].match))
return;
struct langsearch search = {*lang, *size};
struct lang *l = bsearch(&search, langmap, MP_ARRAY_SIZE(langmap), sizeof(langmap[0]),
&lang_compare);
if (l) {
*lang = l->canonical;
*size = strnlen(l->canonical, sizeof(l->canonical));
}
}
/*
 * Case-insensitive equality of two length-delimited subtags.
 *
 * Returns true only when both subtags have the same length and their first
 * s1 bytes are equal ignoring case.
 */
static bool tag_matches(const char *l1, size_t s1, const char *l2, size_t s2)
{
    if (s1 != s2)
        return false;
    return strncasecmp(l1, l2, s1) == 0;
}
/*
 * Score how well the language tag l2 satisfies the user-specified tag l1.
 *
 * Both tags are split into subtags at '-' or '_'. The first subtag of each is
 * canonicalized through langmap and compared; if those differ the result is 0.
 * Otherwise the score starts at 1 and every remaining subtag of l1 that also
 * occurs among the remaining subtags of l2 adds a bonus. Subtags earlier in
 * l1 carry a larger bonus than later ones. Subtags following an "x" subtag
 * (private use) are only compared against subtags following an "x" in the
 * other tag.
 *
 * Returns 0 if either string is NULL or empty or the languages differ,
 * otherwise a value in [1, LANGUAGE_SCORE_MAX].
 */
int mp_match_lang_single(const char *l1, const char *l2)
{
    // We never consider null or empty strings to match
    if (!l1 || !l2 || !*l1 || !*l2)
        return 0;

    // The first subtag should always be a language; canonicalize to 3-letter ISO 639-2B (arbitrarily chosen)
    size_t s1 = strcspn(l1, "-_");
    size_t s2 = strcspn(l2, "-_");

    const char *l1c = l1;
    const char *l2c = l2;
    size_t s1c = s1;
    size_t s2c = s2;

    canonicalize(&l1c, &s1c);
    canonicalize(&l2c, &s2c);

    // If the first subtags don't match, we have no match at all
    if (!tag_matches(l1c, s1c, l2c, s2c))
        return 0;

    // Attempt to match each subtag in each string against each in the other
    int score = 1;
    bool x1 = false;    // an "x" subtag has been seen in l1
    int count = 0;      // number of non-"x" l1 subtags processed so far
    for (;;) {
        // Advance l1 past the previous subtag and any separators
        l1 += s1;

        while (*l1 == '-' || *l1 == '_')
            l1++;

        if (!*l1)
            break;

        s1 = strcspn(l1, "-_");
        if (tag_matches(l1, s1, "x", 1)) {
            x1 = true;
            continue;
        }

        // Remember where l2's first subtag is so that the scan can restart
        // from there for the next l1 subtag
        const char *l2o = l2;
        size_t s2o = s2;
        bool x2 = false;    // an "x" subtag has been seen in l2 during this scan
        for (;;) {
            l2 += s2;

            while (*l2 == '-' || *l2 == '_')
                l2++;

            if (!*l2)
                break;

            s2 = strcspn(l2, "-_");
            if (tag_matches(l2, s2, "x", 1)) {
                x2 = true;
                // A non-private l1 subtag never matches l2's private-use part
                if (!x1)
                    break;
                continue;
            }

            // Private-use subtags only match against other private-use subtags
            if (x1 && !x2)
                continue;

            // Compare the CURRENT subtags. The canonicalized language subtags
            // (l1c/l2c) are already known to be equal at this point, so
            // testing them here would count every pair as a match.
            if (tag_matches(l1, s1, l2, s2)) {
                // Matches for subtags earlier in the user's string take priority over later ones,
                // for up to LANGUAGE_SCORE_BITS subtags
                int shift = (LANGUAGE_SCORE_BITS - count - 1);
                if (shift < 0)
                    shift = 0;
                score += (1 << shift);

                if (score >= LANGUAGE_SCORE_MAX)
                    return LANGUAGE_SCORE_MAX;
            }
        }

        l2 = l2o;
        s2 = s2o;

        count++;
    }

    return score;
}

29
misc/language.h Normal file
View File

@ -0,0 +1,29 @@
/*
* Language code utility functions
*
* This file is part of mpv.
*
* mpv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef MP_LANGUAGE_H
#define MP_LANGUAGE_H
// Number of subtag positions that receive their own priority bit in the score
#define LANGUAGE_SCORE_BITS 16
// Largest value mp_match_lang_single() returns; higher totals are clamped to it
#define LANGUAGE_SCORE_MAX (1 << LANGUAGE_SCORE_BITS)
// Where applicable, l1 is the user-specified code and l2 is the code being checked against it
// Returns 0 if either string is NULL or empty, or if the leading (language)
// subtags differ; otherwise a positive score of at most LANGUAGE_SCORE_MAX.
int mp_match_lang_single(const char *l1, const char *l2);
#endif /* MP_LANGUAGE_H */

View File

@ -355,6 +355,7 @@ def build(ctx):
( "misc/dispatch.c" ),
( "misc/jni.c", "android" ),
( "misc/json.c" ),
( "misc/language.c" ),
( "misc/natural_sort.c" ),
( "misc/node.c" ),
( "misc/rendezvous.c" ),