-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Transliterate Greek according to ELOT 743. Untested #4
base: main
Are you sure you want to change the base?
Changes from all commits
718917e
65f3a64
b3561ba
2024756
a5d8fbe
0815eec
9c45414
9954e62
b02d138
da43311
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,231 @@ | ||
# Single Greek letters -> Latin. Only lowercase is listed; capitals are
# derived from the casing of the Greek source by _elot_apply_case().
_ELOT_LETTERS = {
    'α': 'a',
    'β': 'v',
    'γ': 'g',
    'δ': 'd',
    'ε': 'e',
    'ζ': 'z',
    'η': 'i',
    'θ': 'th',
    'ι': 'i',
    'κ': 'k',
    'λ': 'l',
    'μ': 'm',
    'ν': 'n',
    'ξ': 'x',
    'ο': 'o',
    'π': 'p',
    'ρ': 'r',
    'σ': 's',
    'τ': 't',
    'υ': 'y',
    'φ': 'f',
    'χ': 'ch',
    'ψ': 'ps',
    'ω': 'o',
}

# Digraphs whose transliteration does not depend on context.
_ELOT_DIGRAPHS = {
    'γγ': 'ng',
    'γξ': 'nx',
    'γχ': 'nch',
    'ου': 'ou',
}

# VOWEL + "υ" pairs: the value is the Latin vowel; the "υ" becomes "v" or
# "f" depending on the letter that follows.
_ELOT_UPSILON_PAIRS = {
    'αυ': 'a',
    'ευ': 'e',
    'ηυ': 'i',
}

# Letters (voiced consonants and vowels) before which VOWEL + "υ" is
# written with "v". Anything else -- a voiceless consonant, the end of the
# word, punctuation -- yields "f".
_ELOT_VOICED = frozenset('βγδζλμνραεηιουω')


def _elot_apply_case(latin, greek, preceding, following):
    """Give lowercase *latin* the capitalisation of its Greek source *greek*.

    *preceding* and *following* are the characters around *greek* in the
    text (empty strings at the edges). They are only consulted when a single
    capital letter expands to several Latin letters, e.g. "Θ" -> "Th"/"TH".
    """
    if not greek[0].isupper():
        return latin
    if len(greek) > 1:
        # Two Greek letters: "ΟΥ" -> "OU", "Ου" -> "Ou".
        return latin.upper() if greek[1].isupper() else latin.capitalize()
    if len(latin) == 1:
        return latin.upper()
    # One capital letter producing a Latin digraph: follow the case of the
    # neighbouring letter so "Θα" -> "Tha" while "ΘΑ" -> "THA".
    neighbour = following if following.isalpha() else preceding
    return latin.capitalize() if neighbour.islower() else latin.upper()


def greek_elot_transliteration(string):
    """Transliterate Greek text to Latin characters following ELOT 743.

    The text is first passed through ``remove_accentuation(string, 1)`` and
    final sigma is folded into "σ". It is then scanned left to right, and
    the first matching rule at each position wins:

    * "γγ", "γξ", "γχ", "ου" -> "ng", "nx", "nch", "ou";
    * "μπ" -> "b" when it is not preceded by a letter (word start),
      otherwise "mp";
    * "αυ", "ευ", "ηυ" -> "av"/"ev"/"iv" before a voiced consonant or a
      vowel, otherwise "af"/"ef"/"if" (including at the end of a word);
    * any other Greek letter -> its single-letter mapping.

    Characters without a mapping (spaces, punctuation, digits, Latin
    letters) are copied unchanged, so the original spacing is preserved and
    no trailing space is added.

    NOTE(review): because accents and diaereses are stripped before the
    scan, "άυ"/"αϋ"-style sequences are treated like the plain digraphs.

    Args:
        string: The Greek text to transliterate.

    Returns:
        The transliterated text as a ``str``.
    """
    from remove_accentuation import remove_accentuation

    text = remove_accentuation(string, 1).replace("ς", "σ")

    pieces = []
    pos = 0
    length = len(text)
    while pos < length:
        preceding = text[pos - 1] if pos else ""
        source = text[pos:pos + 2]
        # Lowercase letter by letter: str.lower() on the whole pair would
        # apply the final-sigma rule and turn a trailing "Σ" into "ς".
        key = "".join(char.lower() for char in source)

        if key in _ELOT_DIGRAPHS:
            latin = _ELOT_DIGRAPHS[key]
        elif key == "μπ":
            latin = "mp" if preceding.isalpha() else "b"
        elif key in _ELOT_UPSILON_PAIRS:
            after = text[pos + 2:pos + 3].lower()
            suffix = "v" if after in _ELOT_VOICED else "f"
            latin = _ELOT_UPSILON_PAIRS[key] + suffix
        else:
            source = text[pos]
            latin = _ELOT_LETTERS.get(source.lower())

        end = pos + len(source)
        if latin is None:
            # Not a Greek letter we know about: pass it through untouched.
            pieces.append(source)
        else:
            following = text[end:end + 1]
            pieces.append(_elot_apply_case(latin, source, preceding, following))
        pos = end

    return "".join(pieces)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
def remove_accentuation(string: str): | ||
def remove_accentuation(string: str, modifier=0): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It should be `modifier=True`, not `0`. I would also like to rename `modifier` to something more descriptive, such as `keep_dieresis=True`. |
||
accents = { | ||
"ά": "α", | ||
"έ": "ε", | ||
|
@@ -17,14 +17,19 @@ def remove_accentuation(string: str): | |
"Ώ": "Ω", | ||
} | ||
dieresis = {"ι": "ϊ", "υ": "ϋ"} | ||
dieresis_reverse = {"ϊ": "ι", "ϋ": "υ"} | ||
new_string = "" | ||
prev_char = 0 | ||
for c in string: | ||
char = c | ||
if c in accents.keys(): | ||
char = accents[c] | ||
if c in dieresis.keys() and prev_char in ("ά", "ό", "έ"): | ||
char = dieresis[c] | ||
if modifier == 0: | ||
if c in dieresis.keys() and prev_char in ("ά", "ό", "έ"): | ||
char = dieresis[c] | ||
if modifier == 1: # Remove dieresis | ||
if c in dieresis_reverse.keys(): | ||
char = dieresis_reverse[c] | ||
Comment on lines
+27
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if modifier == 0 can become |
||
prev_char = c | ||
new_string += char | ||
return new_string |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest we delete all commented code since it's not used!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm... Reminder to myself not to use GitHub on the phone again. It had me fooled, thinking I only had one notification haha
Sure! I'll keep it saved in a file or something on my own computer for future use.