Skip to content

Commit

Permalink
vysze gotovo
Browse files Browse the repository at this point in the history
  • Loading branch information
bt2901 committed May 3, 2021
1 parent 10012d6 commit 4538045
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
* `https://github.com/kmike/pymorphy2/blob/master/pymorphy2/lang/uk/_prefixes.py`

**Проблем**
Фаjл `suffixes.json` изгледаjе неправилно, але проблем најбоље вероjетно jест в минулом етапу.
Фаjл `suffixes.json` изгледаjе неправилно, але проблем најбоље вероjетно jест в минулом етапу: в составных словах (фраземах), кторе имаjут выше jедного слова.
~~(jа подзирам формы `najbolje`)~~.

~~(jа подзирам формы `se`)~~.
Expand Down
50 changes: 50 additions & 0 deletions example1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pymorphy2
import argparse


# Ending appended by flavorise() to a singular adjective in the nominative
# case, keyed by the gender grammeme of the chosen parse.
NOM_SING_CONVERT = {"femn": "я", "neut": "е", "masc": "й"}
# Same lookup for the accusative case; only the feminine ending differs.
ACC_SING_CONVERT = {"femn": "ю", "neut": "е", "masc": "й"}

def flavorise(word, golden_pos_tag, isv_morph):
    """Rewrite the ending of *word* using a few hand-written rules.

    Only parses whose tag contains ``golden_pos_tag`` are considered.

    Args:
        word: Surface form to rewrite.
        golden_pos_tag: Part-of-speech grammeme the word is known to carry;
            only "VERB" and "ADJF" trigger any rewriting.
        isv_morph: Analyzer exposing ``parse(word)``. Each result must have
            a ``tag`` that supports ``in`` and, for adjectives, the
            ``case`` / ``number`` / ``gender`` / ``animacy`` attributes.

    Returns:
        The rewritten word, or ``word`` unchanged when no rule applies or
        when the analyzer yields no parse carrying ``golden_pos_tag``.

    Raises:
        KeyError: If a singular adjective parse has a gender that is not a
            key of the ending tables.
    """
    variants = [v for v in isv_morph.parse(word) if golden_pos_tag in v.tag]
    if not variants:
        # Without this guard all() below is vacuously true, so every unknown
        # verb would be treated as an infinitive and lose its last letter,
        # and variants[0] in the adjective branch would raise IndexError.
        return word

    if golden_pos_tag == "VERB":
        # Every candidate parse is an infinitive: swap the final letter.
        if all("infn" in v.tag for v in variants):
            return word[:-1] + "ь"
        # Every candidate parse is 3rd person singular: append the ending.
        if all(("3per" in v.tag and "sing" in v.tag) for v in variants):
            return word + "т"

    if golden_pos_tag == "ADJF":
        tag = variants[0].tag  # no better way to choose among the parses
        if tag.case == "nomn":
            if "sing" in tag:
                return word + NOM_SING_CONVERT[tag.gender]
            if "plur" in tag:
                return word[:-2] + "ие"
        if tag.case == "accs":
            # Animate masculine accusative is left untouched.
            if tag.animacy == "anim" and tag.gender == "masc":
                return word
            if tag.number == "sing":
                return word + ACC_SING_CONVERT[tag.gender]
            if tag.number == "plur":
                return word[:-2] + "ие"

    return word

def jota_translate(word):
    """Replace every Cyrillic 'ј' sequence in *word* with a Russian letter.

    The rewrites are applied one after another over the whole string, so
    the two-letter combinations are consumed before the bare 'ј' fallback.
    """
    rewrites = (
        ('ју', "ю"),
        ('ја', "я"),
        ('јо', "ё"),
        ('ији', "ии"),
        ('ј', "й"),
    )
    converted = word
    for iotated, plain in rewrites:
        converted = converted.replace(iotated, plain)
    return converted

if __name__ == "__main__":
    # One positional argument, handed straight to MorphAnalyzer.
    parser = argparse.ArgumentParser(description='Kludge Flavorisation Example')
    parser.add_argument('path')
    args = parser.parse_args()
    isv_morph = pymorphy2.MorphAnalyzer(args.path)

    # Demo sentence and its hand-assigned part-of-speech tags, one per word.
    text = 'Такоже то може быти помочно за развиту флаворизацију Теоретично тој текст в развитој русској флаворизацији буде изгледати тако'.split()
    tags = 'ADVB NPRO VERB VERB ADJF PREP ADJF NOUN ADVB NPRO NOUN PREP ADJF ADJF NOUN VERB VERB ADVB'.split()
    assert len(text) == len(tags)

    # Rewrite endings first, then convert the 'ј' sequences; words are
    # printed on one line, each followed by a space.
    for token, pos in zip(text, tags):
        print(jota_translate(flavorise(token, pos, isv_morph)), end=" ")

83 changes: 83 additions & 0 deletions example2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pymorphy2
import argparse

# Prefixes handed to KnownPrefixAnalyzer(known_prefixes=...) in DEFAULT_UNITS.
VERB_PREFIXES = [
'do', 'iz', 'izpo', 'nad', 'na', 'ne', 'ob', 'odpo', 'od', 'o', 'prědpo',
'pod', 'po', 'prě', 'pre', 'pri', 'pro', 'råzpro', 'razpro', 'råz', 'raz',
'sȯ', 's', 'u', 'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy', 'za',
]

# Plain letter -> diacritic letter; passed as char_substitutes to the
# analyzer built from "out_isv_lat".
SIMPLE_DIACR_SUBS = {
'e': 'ě', 'c': 'č', 'z': 'ž', 's': 'š',
}
# NOTE: pymorphy2 cannot handle several substitutes for one character,
# i.e. {'e': 'ě', 'e': 'ę'}, so 'e' maps to a different letter in each table.
# Passed as char_substitutes to the analyzer built from "out_isv_etm".
ETM_DIACR_SUBS = {
'a': 'å', 'u': 'ų', 'č': 'ć', 'e': 'ę',
# 'dž': 'đ' # does not work
}

# Analyzer units passed as units=... to both MorphAnalyzer instances.
# NOTE(review): the nested lists presumably follow pymorphy2's unit-grouping
# convention -- confirm against the pymorphy2 documentation.
DEFAULT_UNITS = [
[
pymorphy2.units.DictionaryAnalyzer()
],
pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
[
pymorphy2.units.UnknownPrefixAnalyzer(),
pymorphy2.units.KnownSuffixAnalyzer()
]
]

def dodavaj_bukvy(word, etm_morph):
    """Return the spelling(s) the analyzer proposes for *word*.

    Args:
        word: Token to look up.
        etm_morph: Analyzer exposing ``parse(word)``; each result must have
            a ``word`` attribute holding the proposed spelling.

    Returns:
        The single spelling when all parses agree; ``word + "/?"`` when
        there is no parse at all; otherwise the distinct spellings joined
        by "/", in the order the analyzer first returned them.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # joined result is the same on every run (iterating a set of strings
    # is not, because of hash randomisation).
    spellings = list(dict.fromkeys(f.word for f in etm_morph.parse(word)))
    if not spellings:
        return word + "/?"
    return "/".join(spellings)

if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser(
        description='Kludge Spellcheck Example')
    parser.add_argument('path')
    args = parser.parse_args()
    path = args.path

    # os.path.join inserts the separator when the argument lacks a trailing
    # one; plain string concatenation silently produced "<dir>out_isv_lat".
    std_morph = pymorphy2.MorphAnalyzer(
        os.path.join(path, "out_isv_lat"),
        units=DEFAULT_UNITS,
        char_substitutes=SIMPLE_DIACR_SUBS
    )

    etm_morph = pymorphy2.MorphAnalyzer(
        os.path.join(path, "out_isv_etm"),
        units=DEFAULT_UNITS,
        char_substitutes=ETM_DIACR_SUBS
    )

    text = "ja funguju i razuměju avtododavanje etymologičnyh bukv"

    # The same sentence at three levels of diacritic detail: the first two
    # are inputs, the last two are printed as reference lines.
    text_smpl = "hcu preporuciti gledi pese troicky most v gradu celjabinsku zeđam foto za zenu"
    text_stnd = "hču prěporučiti gledi pěše troicky most v gradu čeljabinsku žeđam foto za ženu"
    text_full = "hćų prěporųčiti ględi pěše troicky most v grådu čeljabinsku žeđam foto za ženu"

    # grad: gråd = town // grad = hail

    print()
    for word in text.split(" "):
        print(dodavaj_bukvy(word, etm_morph), end=" ")
    print()
    print()

    # text_smpl run through std_morph, printed above the text_stnd reference.
    fixed_text = " ".join(dodavaj_bukvy(word, std_morph) for word in text_smpl.split(" "))
    print(fixed_text)
    print()
    print(text_stnd)
    print("------")
    # text_stnd run through etm_morph, printed above the text_full reference.
    fixed_text = " ".join(dodavaj_bukvy(word, etm_morph) for word in text_stnd.split(" "))
    print(fixed_text)
    print()
    print(text_full)
    print()
53 changes: 53 additions & 0 deletions example3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pymorphy2
import argparse
from collections import Counter

# Prefixes handed to KnownPrefixAnalyzer(known_prefixes=...) in DEFAULT_UNITS.
VERB_PREFIXES = [
'do', 'iz', 'izpo', 'nad', 'na', 'ne', 'ob', 'odpo', 'od', 'o', 'prědpo',
'pod', 'po', 'prě', 'pre', 'pri', 'pro', 'råzpro', 'razpro', 'råz', 'raz',
'sȯ', 's', 'u', 'vȯ', 'vo', 'v', 'vȯz', 'voz', 'vy', 'za',
]

# Analyzer units passed as units=... to the MorphAnalyzer built in the
# script section.
# NOTE(review): the nested lists presumably follow pymorphy2's unit-grouping
# convention -- confirm against the pymorphy2 documentation.
DEFAULT_UNITS = [
[
pymorphy2.units.DictionaryAnalyzer()
],
pymorphy2.units.KnownPrefixAnalyzer(known_prefixes=VERB_PREFIXES),
[
pymorphy2.units.UnknownPrefixAnalyzer(),
pymorphy2.units.KnownSuffixAnalyzer()
]
]

if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser(
        description='Kludge Statistics Example')
    parser.add_argument('path')
    args = parser.parse_args()
    path = args.path

    # os.path.join inserts the separator when the argument lacks a trailing
    # one; plain string concatenation silently produced "<dir>out_isv_etm".
    etm_morph = pymorphy2.MorphAnalyzer(
        os.path.join(path, "out_isv_etm"),
        units=DEFAULT_UNITS,
        char_substitutes={}
    )

    text = "Naša misija jest govoriti najvyše råzumlivo, zato dělamo eksperimenty, čęsto pytajemo ljudi i diskutujemo o tom kako ulěpšati naše govorenje. Zato takože čęsto napominamo ljudi, kaki dělajųt pogrěšky, aby govorili drugo. To sųt vsegda sověty a tvoje govorenje to nakraj jest tvoj izbor. My prosto staramo sę byti možlivo najvyše råzumlivi"

    # Tokenise and parse once; both tallies below read the same lemma lists.
    words = text.replace(".", "").replace(",", "").split(" ")
    lemmas_per_word = [
        [v.normal_form for v in etm_morph.parse(word)] for word in words
    ]

    # Tally 1: one count per token, using the first lemma returned (the most
    # probable form according to questionable heuristics), or the token
    # itself when the analyzer returns nothing.
    cnt = Counter(
        forms[0] if forms else word
        for word, forms in zip(words, lemmas_per_word)
    )
    print(cnt)

    # Tally 2: each token's single count is split evenly across all of its
    # lemmas. Tokens with no parse contribute nothing here.
    cnt = Counter()
    for forms in lemmas_per_word:
        for form in forms:
            cnt[form] += 1/len(forms)
    print(cnt)

0 comments on commit 4538045

Please sign in to comment.