"""Kludge flavorisation example.

Rewrites the endings of a tagged sample sentence with a few hand-written
rules and then replaces "ј" digraphs with single Cyrillic letters.

pymorphy2 is imported lazily inside ``main`` so that the pure helpers below
can be imported (and tested) without the analyzer being installed.
"""
import argparse

# Ending appended to a singular adjective, keyed by the gender grammeme.
NOM_SING_CONVERT = {"femn": "я", "neut": "е", "masc": "й"}
ACC_SING_CONVERT = {"femn": "ю", "neut": "е", "masc": "й"}

# Applied strictly in this order: the digraphs must be consumed before the
# final catch-all rule turns every remaining "ј" into "й".
_JOTA_REWRITES = (
    ('ју', "ю"),
    ('ја', "я"),
    ('јо', "ё"),
    ('ији', "ии"),
    ('ј', "й"),
)


def _flavorise_adjective(word, tag):
    """Apply the adjective ending rules for one chosen parse *tag*."""
    if tag.case == "nomn":
        if "sing" in tag:
            # A parse without a known gender has no ending to add.
            return word + NOM_SING_CONVERT.get(tag.gender, "")
        if "plur" in tag:
            return word[:-2] + "ие"
    if tag.case == "accs":
        if tag.animacy == "anim" and tag.gender == "masc":
            return word
        if tag.number == "sing":
            return word + ACC_SING_CONVERT.get(tag.gender, "")
        if tag.number == "plur":
            return word[:-2] + "ие"
    return word


def flavorise(word, golden_pos_tag, isv_morph):
    """Return *word* with its ending rewritten according to its morphology.

    Args:
        word: Surface form to rewrite.
        golden_pos_tag: Part-of-speech grammeme supplied by the caller
            (the rules only act on "VERB" and "ADJF"). Only parses whose
            tag contains it are considered.
        isv_morph: Analyzer exposing ``parse(word)``; every result needs a
            ``tag`` that supports ``in`` and has ``case``, ``gender``,
            ``number`` and ``animacy`` attributes.

    Returns:
        The rewritten word, or *word* unchanged when no rule applies or the
        analyzer offers no parse compatible with *golden_pos_tag*.
    """
    variants = [v for v in isv_morph.parse(word) if golden_pos_tag in v.tag]
    if not variants:
        # Without this guard all() below is vacuously true (unknown verbs
        # would be mangled) and variants[0] raises IndexError.
        return word

    if golden_pos_tag == "VERB":
        # Rewrite only when every candidate parse agrees on the form.
        if all("infn" in v.tag for v in variants):
            return word[:-1] + "ь"
        if all("3per" in v.tag and "sing" in v.tag for v in variants):
            return word + "т"
        return word

    if golden_pos_tag == "ADJF":
        # No better way to choose between parses: take the first one.
        return _flavorise_adjective(word, variants[0].tag)

    return word


def jota_translate(word):
    """Replace "ј" combinations in *word* with single Cyrillic letters."""
    for old, new in _JOTA_REWRITES:
        word = word.replace(old, new)
    return word


def main():
    """Flavorise the built-in sample sentence and print it on one line."""
    import pymorphy2  # third-party; needed only when run as a script

    parser = argparse.ArgumentParser(
        description='Kludge Flavorisation Example')
    parser.add_argument('path')
    args = parser.parse_args()

    isv_morph = pymorphy2.MorphAnalyzer(args.path)

    text = 'Такоже то може быти помочно за развиту флаворизацију Теоретично тој текст в развитој русској флаворизацији буде изгледати тако'.split()
    tags = 'ADVB NPRO VERB VERB ADJF PREP ADJF NOUN ADVB NPRO NOUN PREP ADJF ADJF NOUN VERB VERB ADVB'.split()
    if len(text) != len(tags):
        # An explicit check survives ``python -O``, unlike ``assert``.
        raise ValueError("sample text and tag sequence differ in length")
    for word, tag in zip(text, tags):
        raw_flavorized = flavorise(word, tag, isv_morph)
        print(jota_translate(raw_flavorized), end=" ")


if __name__ == "__main__":
    main()
"""Kludge spellcheck example.

Feeds words to pymorphy2 analyzers configured with character substitutions
and prints the spellings the analyzers report back.

pymorphy2 is imported lazily inside ``main`` so that ``dodavaj_bukvy`` can be
imported (and tested) without the analyzer being installed.
"""
import argparse

VERB_PREFIXES = [
    'do', 'iz', 'izpo', 'nad', 'na', 'ne', 'ob', 'odpo', 'od', 'o', 'prědpo',
    'pod', 'po', 'prě', 'pre', 'pri', 'pro', 'råzpro', 'razpro', 'råz', 'raz',
    'sȯ', 's', 'u', 'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy', 'za',
]

SIMPLE_DIACR_SUBS = {
    'e': 'ě', 'c': 'č', 'z': 'ž', 's': 'š',
}
# NOTE: pymorphy2 cannot work with several changes, i.e. {'e': 'ě', 'e': 'ę'}
ETM_DIACR_SUBS = {
    'a': 'å', 'u': 'ų', 'č': 'ć', 'e': 'ę',
    # 'dž': 'đ'  # does not work
}


def dodavaj_bukvy(word, etm_morph):
    """Return the spelling(s) the analyzer reports for *word*.

    Args:
        word: Token to look up.
        etm_morph: Analyzer exposing ``parse(word)``; every result needs a
            ``word`` attribute.

    Returns:
        The distinct ``word`` values of all parses joined with "/", in the
        order the analyzer returned them (a single spelling is returned
        as-is). When there are no parses, *word* followed by "/?".
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; joining a
    # set would give a different order from run to run (hash randomisation).
    candidates = list(dict.fromkeys(p.word for p in etm_morph.parse(word)))
    if not candidates:
        return word + "/?"
    return "/".join(candidates)


def main():
    """Run the spellcheck demo on the built-in sample sentences."""
    import pymorphy2  # third-party; needed only when run as a script

    parser = argparse.ArgumentParser(
        description='Kludge Spellcheck Example')
    parser.add_argument('path')
    args = parser.parse_args()
    path = args.path

    default_units = [
        [
            pymorphy2.units.DictionaryAnalyzer()
        ],
        pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
        [
            pymorphy2.units.UnknownPrefixAnalyzer(),
            pymorphy2.units.KnownSuffixAnalyzer()
        ]
    ]

    std_morph = pymorphy2.MorphAnalyzer(
        path+"out_isv_lat",
        units=default_units,
        char_substitutes=SIMPLE_DIACR_SUBS
    )

    etm_morph = pymorphy2.MorphAnalyzer(
        path+"out_isv_etm",
        units=default_units,
        char_substitutes=ETM_DIACR_SUBS
    )

    text = "ja funguju i razuměju avtododavanje etymologičnyh bukv"

    text_smpl = "hcu preporuciti gledi pese troicky most v gradu celjabinsku zeđam foto za zenu"
    text_stnd = "hču prěporučiti gledi pěše troicky most v gradu čeljabinsku žeđam foto za ženu"
    text_full = "hćų prěporųčiti ględi pěše troicky most v grådu čeljabinsku žeđam foto za ženu"

    # grad: gråd = town // grad = hail

    print()
    for word in text.split(" "):
        print(dodavaj_bukvy(word, etm_morph), end=" ")
    print()
    print()

    fixed_text = " ".join(dodavaj_bukvy(word, std_morph) for word in text_smpl.split(" "))
    print(fixed_text)
    print()
    print(text_stnd)
    print("------")
    fixed_text = " ".join(dodavaj_bukvy(word, etm_morph) for word in text_stnd.split(" "))
    print(fixed_text)
    print()
    print(text_full)
    print()


if __name__ == "__main__":
    main()
"""Kludge statistics example.

Counts the normal forms (lemmas) that a pymorphy2 analyzer reports for the
words of a built-in sample text, in two ways: first parse only, and all
parses weighted equally.

pymorphy2 is imported lazily inside ``main`` so that the counting helpers can
be imported (and tested) without the analyzer being installed.
"""
import argparse
from collections import Counter

VERB_PREFIXES = [
    'do', 'iz', 'izpo', 'nad', 'na', 'ne', 'ob', 'odpo', 'od', 'o', 'prědpo',
    'pod', 'po', 'prě', 'pre', 'pri', 'pro', 'råzpro', 'razpro', 'råz', 'raz',
    'sȯ', 's', 'u', 'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy', 'za',
]


def tokenize(text):
    """Drop "." and "," from *text* and split it on whitespace.

    Splitting on any whitespace run (rather than on a single space) avoids
    empty tokens when the text contains repeated spaces.
    """
    return text.replace(".", "").replace(",", "").split()


def count_top_lemmas(words, morph):
    """Count, per word, the normal form of its first parse.

    A word with no parses is counted under its own surface form.

    Args:
        words: Iterable of tokens.
        morph: Analyzer exposing ``parse(word)``; every result needs a
            ``normal_form`` attribute.

    Returns:
        A ``Counter`` mapping normal form to number of occurrences.
    """
    cnt = Counter()
    for word in words:
        parses = morph.parse(word)
        # The first parse is taken as the most probable form; that choice
        # rests on the analyzer's own (debatable) heuristics.
        cnt[parses[0].normal_form if parses else word] += 1
    return cnt


def count_weighted_lemmas(words, morph):
    """Count normal forms, sharing each word's weight across its parses.

    Every word contributes a total weight of 1, split equally between all
    parses the analyzer returns; words with no parses contribute nothing.

    Args:
        words: Iterable of tokens.
        morph: Analyzer exposing ``parse(word)``; every result needs a
            ``normal_form`` attribute.

    Returns:
        A ``Counter`` mapping normal form to accumulated (float) weight.
    """
    cnt = Counter()
    for word in words:
        forms = [v.normal_form for v in morph.parse(word)]
        if not forms:
            continue
        share = 1 / len(forms)
        for form in forms:
            cnt[form] += share
    return cnt


def main():
    """Print both lemma statistics for the built-in sample text."""
    import pymorphy2  # third-party; needed only when run as a script

    parser = argparse.ArgumentParser(
        description='Kludge Statistics Example')
    parser.add_argument('path')
    args = parser.parse_args()
    path = args.path

    default_units = [
        [
            pymorphy2.units.DictionaryAnalyzer()
        ],
        pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
        [
            pymorphy2.units.UnknownPrefixAnalyzer(),
            pymorphy2.units.KnownSuffixAnalyzer()
        ]
    ]

    etm_morph = pymorphy2.MorphAnalyzer(
        path+"out_isv_etm",
        units=default_units,
        char_substitutes={}
    )

    text = "Naša misija jest govoriti najvyše råzumlivo, zato dělamo eksperimenty, čęsto pytajemo ljudi i diskutujemo o tom kako ulěpšati naše govorenje. Zato takože čęsto napominamo ljudi, kaki dělajųt pogrěšky, aby govorili drugo. To sųt vsegda sověty a tvoje govorenje to nakraj jest tvoj izbor. My prosto staramo sę byti možlivo najvyše råzumlivi"

    # Tokenise once; both statistics run over the same word list.
    words = tokenize(text)
    print(count_top_lemmas(words, etm_morph))
    print(count_weighted_lemmas(words, etm_morph))


if __name__ == "__main__":
    main()