-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 4 changed files with 187 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pymorphy2 | ||
import argparse | ||
|
||
|
||
# Endings appended to a singular adjective, keyed by the gender grammeme
# reported by the morphological analyzer.
NOM_SING_CONVERT = {"femn": "я", "neut": "е", "masc": "й"}
ACC_SING_CONVERT = {"femn": "ю", "neut": "е", "masc": "й"}


def flavorise(word, golden_pos_tag, isv_morph):
    """Rewrite the ending of *word* using simple morphology-driven rules.

    Args:
        word: the surface form to rewrite.
        golden_pos_tag: the part-of-speech grammeme the word is known to have
            (only "VERB" and "ADJF" trigger any rewriting).
        isv_morph: an analyzer object whose ``parse(word)`` returns parses
            exposing a ``tag`` that supports ``in`` checks and the ``case``,
            ``gender``, ``number`` and ``animacy`` attributes.

    Returns:
        The rewritten word, or *word* unchanged when no rule applies or the
        analyzer yields no parse compatible with *golden_pos_tag*.
    """
    variants = [v for v in isv_morph.parse(word) if golden_pos_tag in v.tag]
    if not variants:
        # Without this guard ``all(...)`` below is vacuously true (any unknown
        # verb would be mangled) and ``variants[0]`` would raise IndexError.
        return word

    if golden_pos_tag == "VERB":
        # Every candidate parse is an infinitive: swap the last letter for "ь".
        if all("infn" in v.tag for v in variants):
            return word[:-1] + "ь"
        # Every candidate parse is 3rd person singular: append "т".
        if all("3per" in v.tag and "sing" in v.tag for v in variants):
            return word + "т"

    if golden_pos_tag == "ADJF":
        tag = variants[0].tag  # no better way to choose among the parses
        if tag.case == "nomn":
            if "sing" in tag:
                # Unknown/missing gender: leave the word as it is.
                return word + NOM_SING_CONVERT.get(tag.gender, "")
            if "plur" in tag:
                return word[:-2] + "ие"
        if tag.case == "accs":
            # Animate masculine accusative is returned untouched.
            if tag.animacy == "anim" and tag.gender == "masc":
                return word
            if tag.number == "sing":
                return word + ACC_SING_CONVERT.get(tag.gender, "")
            if tag.number == "plur":
                return word[:-2] + "ие"

    return word
|
||
def jota_translate(word):
    """Replace Cyrillic "ј" spellings with iotated letters and "й".

    The rules are applied in order, digraphs first, so that "ју"/"ја"/"јо"
    become "ю"/"я"/"ё" and "ији" becomes "ии" before any remaining "ј" is
    turned into "й". Each rule is also applied in its capitalised and
    upper-case form, so sentence-initial words such as "Ја" are handled too.

    Args:
        word: the string to convert.

    Returns:
        The converted string; input without "ј"/"Ј" is returned unchanged.
    """
    rules = (
        ("ју", "ю"),
        ("ја", "я"),
        ("јо", "ё"),
        ("ији", "ии"),
        ("ј", "й"),  # must stay last: it would otherwise break the digraphs
    )
    for source, target in rules:
        for recase in (str.lower, str.capitalize, str.upper):
            word = word.replace(recase(source), recase(target))
    return word
|
||
if __name__ == "__main__":
    # Command line: one positional argument, handed to MorphAnalyzer as-is.
    cli = argparse.ArgumentParser(description='Kludge Flavorisation Example')
    cli.add_argument('path')
    args = cli.parse_args()

    isv_morph = pymorphy2.MorphAnalyzer(args.path)

    # Demo sentence and its hand-assigned part-of-speech tags, one per word.
    text = 'Такоже то може быти помочно за развиту флаворизацију Теоретично тој текст в развитој русској флаворизацији буде изгледати тако'.split()
    tags = 'ADVB NPRO VERB VERB ADJF PREP ADJF NOUN ADVB NPRO NOUN PREP ADJF ADJF NOUN VERB VERB ADVB'.split()
    assert len(text) == len(tags)

    # Rewrite endings first, then convert the "ј" spellings; print the words
    # on a single line separated by spaces.
    for token, pos in zip(text, tags):
        print(jota_translate(flavorise(token, pos, isv_morph)), end=" ")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import pymorphy2 | ||
import argparse | ||
|
||
# Prefixes handed to pymorphy2's KnownPrefixAnalyzer.
VERB_PREFIXES = [
    'do', 'iz', 'izpo',
    'nad', 'na', 'ne',
    'ob', 'odpo', 'od', 'o',
    'prědpo', 'pod', 'po', 'prě', 'pre', 'pri', 'pro',
    'råzpro', 'razpro', 'råz', 'raz',
    'sȯ', 's', 'u',
    'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy',
    'za',
]

# Plain letter -> letter with diacritic, used as `char_substitutes`.
SIMPLE_DIACR_SUBS = {
    'e': 'ě',
    'c': 'č',
    'z': 'ž',
    's': 'š',
}

# NOTE: pymorphy2 cannot work with several substitutions for the same
# character, e.g. both 'e' -> 'ě' and 'e' -> 'ę'.
ETM_DIACR_SUBS = {
    'a': 'å',
    'u': 'ų',
    'č': 'ć',
    'e': 'ę',
    # 'dž': 'đ'  # does not work
}
|
||
# Analyzer units passed to pymorphy2.MorphAnalyzer(units=...): the dictionary
# lookup, then the known-prefix analyzer built from VERB_PREFIXES, then the
# unknown-prefix and known-suffix analyzers grouped in one list.
DEFAULT_UNITS = [
    [pymorphy2.units.DictionaryAnalyzer()],
    pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
    [
        pymorphy2.units.UnknownPrefixAnalyzer(),
        pymorphy2.units.KnownSuffixAnalyzer(),
    ],
]
|
||
def dodavaj_bukvy(word, etm_morph):
    """Return the spelling(s) the analyzer proposes for *word*.

    Args:
        word: the word to look up.
        etm_morph: an analyzer whose ``parse(word)`` returns parses exposing
            a ``word`` attribute.

    Returns:
        The single proposed spelling; or all distinct proposals joined with
        "/" in the order the analyzer returned them; or ``word + "/?"`` when
        the analyzer returns no parse at all.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is stable between runs (iterating a set of strings is not).
    candidates = list(dict.fromkeys(f.word for f in etm_morph.parse(word)))
    if not candidates:
        return word + "/?"
    return "/".join(candidates)
|
||
if __name__ == "__main__":
    # Command line: one positional argument, used as a string prefix for the
    # two dictionary locations below (it is concatenated, not path-joined).
    cli = argparse.ArgumentParser(description='Kludge Spellcheck Example')
    cli.add_argument('path')
    args = cli.parse_args()
    path = args.path

    # Analyzer that restores the standard diacritics.
    std_morph = pymorphy2.MorphAnalyzer(
        path + "out_isv_lat",
        units=DEFAULT_UNITS,
        char_substitutes=SIMPLE_DIACR_SUBS,
    )

    # Analyzer that adds the etymological letters.
    etm_morph = pymorphy2.MorphAnalyzer(
        path + "out_isv_etm",
        units=DEFAULT_UNITS,
        char_substitutes=ETM_DIACR_SUBS,
    )

    text = "ja funguju i razuměju avtododavanje etymologičnyh bukv"

    # The same sentence in three spellings: no diacritics, standard, full.
    text_smpl = "hcu preporuciti gledi pese troicky most v gradu celjabinsku zeđam foto za zenu"
    text_stnd = "hču prěporučiti gledi pěše troicky most v gradu čeljabinsku žeđam foto za ženu"
    text_full = "hćų prěporųčiti ględi pěše troicky most v grådu čeljabinsku žeđam foto za ženu"

    # grad: gråd = town // grad = hail

    # 1) Etymological letters for the short sample, words separated by spaces.
    print()
    for token in text.split(" "):
        corrected = dodavaj_bukvy(token, etm_morph)
        print(corrected, end=" ")
    print()
    print()

    # 2) Simplified spelling -> standard spelling, followed by the reference.
    fixed_text = " ".join(
        dodavaj_bukvy(token, std_morph) for token in text_smpl.split(" ")
    )
    print(fixed_text, "", text_stnd, "------", sep="\n")

    # 3) Standard spelling -> etymological spelling, followed by the reference.
    fixed_text = " ".join(
        dodavaj_bukvy(token, etm_morph) for token in text_stnd.split(" ")
    )
    print(fixed_text, "", text_full, "", sep="\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import pymorphy2 | ||
import argparse | ||
from collections import Counter | ||
|
||
# Prefixes handed to pymorphy2's KnownPrefixAnalyzer.
VERB_PREFIXES = [
    'do', 'iz', 'izpo',
    'nad', 'na', 'ne',
    'ob', 'odpo', 'od', 'o',
    'prědpo', 'pod', 'po', 'prě', 'pre', 'pri', 'pro',
    'råzpro', 'razpro', 'råz', 'raz',
    'sȯ', 's', 'u',
    'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy',
    'za',
]
|
||
# Analyzer units for pymorphy2.MorphAnalyzer(units=...), in order: dictionary
# lookup, known-prefix analysis (using VERB_PREFIXES), and finally a group
# holding the unknown-prefix and known-suffix analyzers.
DEFAULT_UNITS = [
    [pymorphy2.units.DictionaryAnalyzer()],
    pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
    [
        pymorphy2.units.UnknownPrefixAnalyzer(),
        pymorphy2.units.KnownSuffixAnalyzer(),
    ],
]
|
||
if __name__ == "__main__":
    # Command line: one positional argument, used as a string prefix for the
    # dictionary location (it is concatenated, not path-joined).
    cli = argparse.ArgumentParser(description='Kludge Statistics Example')
    cli.add_argument('path')
    args = cli.parse_args()
    path = args.path

    etm_morph = pymorphy2.MorphAnalyzer(
        path + "out_isv_etm",
        units=DEFAULT_UNITS,
        char_substitutes={},
    )

    text = "Naša misija jest govoriti najvyše råzumlivo, zato dělamo eksperimenty, čęsto pytajemo ljudi i diskutujemo o tom kako ulěpšati naše govorenje. Zato takože čęsto napominamo ljudi, kaki dělajųt pogrěšky, aby govorili drugo. To sųt vsegda sověty a tvoje govorenje to nakraj jest tvoj izbor. My prosto staramo sę byti možlivo najvyše råzumlivi"

    # Drop full stops and commas, then split on single spaces.
    tokens = text.replace(".", "").replace(",", "").split(" ")

    # Count 1: every token contributes one count to the normal form of its
    # first parse (the most probable form according to debatable heuristics);
    # tokens without any parse are counted as themselves.
    cnt = Counter()
    for token in tokens:
        lemmas = [parse.normal_form for parse in etm_morph.parse(token)]
        form = lemmas[0] if lemmas else token
        cnt[form] += 1
    print(cnt)

    # Count 2: every token spreads a total weight of 1 evenly over the normal
    # forms of all its parses; tokens without any parse contribute nothing.
    cnt = Counter()
    for token in tokens:
        lemmas = [parse.normal_form for parse in etm_morph.parse(token)]
        if not lemmas:
            continue
        share = 1 / len(lemmas)
        for lemma in lemmas:
            cnt[lemma] += share
    print(cnt)
|