Skip to content

Commit

Permalink
scripts: more stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
bnnm committed Dec 1, 2024
1 parent a9a15e6 commit 1632b6a
Show file tree
Hide file tree
Showing 3 changed files with 298 additions and 79 deletions.
45 changes: 30 additions & 15 deletions doc/NAMES.md
Original file line number Diff line number Diff line change
Expand Up @@ -330,22 +330,29 @@ words.py -c 2 -de -mc 70 -sf -ho -jb -o words2_all-jb.txt
# generally slow and too many false positives if your wwnames/ww.txt names are >1MB,
# but very useful with the right txt. Remove less useful names and keep it <300kb for best results.
# words based on existing prefixes/suffixes, some value between 2-4 is a good compromise
words.py -de -o words_fa-pn.txt -fap 3
words.py -de -o words_fa-sn.txt -fas 3
words.py -de -o words_fa-pn-fs-sc.txt -fap 3 -fs -sc
words.py -de -o words_fa-sn-fs-sc.txt -fas 3 -fs -sc
words.py -de -o words_fa-pn-fs-jb.txt -fap 3 -fs -jb
words.py -de -o words_fa-sn-fs-jb.txt -fas 3 -fs -jb
# combos
words.py -c 2 -mc 80 -de -o words2_out.txt
# more combos
words.py -c 2 -zd -mc 70 -jb -de -o words2_all-jb.txt
words.py -c 2 -zd -mc 70 -js -de -o words2_all-js.txt
words.py -c 2 -zd -mc 70 -sc -de -o words2_all-sc.txt
words.py -c 2 -zd -mc 70 -sc -jb -de -o words2_all-sc-jb.txt
# a bit slow but creates words in a way that gets good results
words.py -de -o words_fa-sf.txt -zd -fa -sf
words.py -c 2 -mc 70 -jb -de -o words2_c2-jb.txt
words.py -c 2 -mc 70 -js -de -o words2_c2-js.txt
words.py -c 2 -mc 70 -sc -de -o words2_c2-sc.txt
words.py -c 2 -mc 70 -sc -jb -de -o words2_c2-sc-jb.txt
# similar variations but a lot less useful, might as well try them
words.py -de -o words_fa-jb.txt -zd -fa -jb
words.py -de -o words_fa-sc.txt -zd -fa -sc
words.py -de -o words_fa-jb-sc.txt -zd -fa -jb -sc
# creates words based on all chunks of the words, slow but may get trickier combos
words.py -de -o words_fa-sf.txt -fa -sf
words.py -de -o words_fa-jb.txt -fa -jb
words.py -de -o words_fa-sc.txt -fa -sc
words.py -de -o words_fa-sc-sf.txt -fa -sc -sf
words.py -de -o words_fa-sc-jb.txt -fa -sc -jb
## extra commands
Expand Down Expand Up @@ -373,13 +380,21 @@ words.py -de -c 3 -mc 40 -ho -sf -o words3_sf.txt
## extra stuff
# not commands but a few extra tips:
# - go to the wwiser-utils/wwnames dir, copy all names into a single file (ex. cmd: copy *.txt ww.txt) and use it as input for a few more names
# - use sample formats from wwiser-utils/words
words.py -de -zd -o words1_fmt.txt -f ../wwiser-utils/words/formats.txt
# - go to wwiser-utils/words and try sample formats.txt there
# - use existing wwnames from wwiser-utils/wwnames
words.py -de -zd -o words1_all.txt -i ../wwiser-utils/wwnames/*.txt (other flags)
# - take english-small.txt or ww.txt (extra names), add "#@section" on top; then try the "permutations" mode
# - use english-small.txt or ww.txt (extra names), add "#@section" on top then try the "permutations" mode
words.py -p -mc 40 -de -i ww.txt
words.py -p -mc 40 -de -i english-small.txt
words.py -p -mc 40 -de -i ../wwiser-utils/wwnames/dict/english-small.txt
## pypy setup
set PATH=%PATH%;%ROSS_USER%\git\dist\pypy3.9-v7.3.11-win64
set WDIR=%USERPROFILE%\git\wwiser-utils\words
pypy3.exe %WDIR%\words.py ... (flags)
```


Expand Down
253 changes: 216 additions & 37 deletions scripts/txt-cleaner.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,94 @@
# Cleans text files from strings2.exe garbage
#TODO IDEAS
#- make N-grams from ENG words > not accurate enough (games use custom stuff)
#- ignore start of N chars if not good enough > time consuming to make
#- ignore words that don't contain anything from an ENG list > may skip useful names to be used


import os, sys, itertools, re
import fnmatch

# --- tunables read by the line filters below (a falsy value disables the check) ---
# when set, read_line splits each input line on _PATTERN_SPLIT before filtering
SPLIT_LINES = True
# lines longer than this many characters are rejected
LINE_MAX = 60
# when set, purely numeric lines are rejected
REMOVE_NUMBERS = False
# lines of at most this length that contain any digit are rejected
REMOVE_NUMBER_LETTERS_MAX = 0 #5abcd abcd5
# lines shorter than this many characters are rejected
MIN_LETTERS = 4
# minimum count a 'name:count' entry of fnv3.lst needs to be loaded as acceptable
ACCEPTABLES_MIN = 50
# when set, lines matching the loaded ignorable fragments are rejected
REMOVE_IGNORABLES = True
# when set, lines without any of the letters 'aeiou' are rejected
REMOVE_NON_VOCALS = False

# external pattern lists, read by load_renames() / load_ignorables()
RENAMES_FILE = '_txt-renames.txt'
IGNORABLES_FILE = '_txt-ignorables.txt'

# punctuation treated as a sign of garbage; _PATTERN_SPLIT is the same character
# class plus space and '-', and is used to split lines into items
_PATTERN_WRONG = re.compile(r'[\t.<>,;.:{}\[\]()\'"$&/=!\\/#@+\^`´¨?|~*%]')
_PATTERN_SPLIT = re.compile(r'[\t.<>,;.:{}\[\]()\'"$&/=!\\/#@+\^`´¨?|~*% -]')
# lowercase words that are accepted right away, before the other filters run
_WORD_ALLOWED = ['xiii', 'xviii','zzz']
# substrings that reject short lines (length < 7) when found in the line
# NOTE(review): 'ldlD' contains an uppercase letter but is searched in line_lw;
# if line_lw is the lowercased line (its definition is not visible here) this
# entry can never match - confirm
_BAD_GROUPS = ['uu', 'fwfw','ldlD', 'vwu', 'zzz', 'abcde']
# line endings tested with line_lw.endswith() in is_line_ok
_ENDS_WITH = ['bc']

# raw entries of RENAMES_FILE / IGNORABLES_FILE, plus the lowercased fragments
# extracted from them by wildcard position:
#   'abc*' -> *_START, '*abc*' -> *_MIDDLE, '*abc' -> *_END
_RENAMES = []
_IGNORABLES = []
_IGNORABLES_START = []
_IGNORABLES_MIDDLE = []
_IGNORABLES_END = []

_RENAMES_START = []
_RENAMES_MIDDLE = []
_RENAMES_END = []

# fragments loaded from fnv3.lst in main(): entries starting with '^' go to
# _ACCEPTABLES_START (without the '^'), all others to _ACCEPTABLES_MIDDLE
_ACCEPTABLES_START = []
_ACCEPTABLES_MIDDLE = []

# letters whose presence in a line makes the alternating-repeat check ('xAxBxCx')
# require one more repetition and a longer line before rejecting
REPEATS_EXTENDED = ['i', 't', 'l']

# items already written to the ok file, used to skip duplicates
DONE = set()
# runtime flags read by the line readers
split = False
remove_numbers = False


def get_external_lines(filename):
    """Return the meaningful entries of a simple line-based list file.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    starting with '#' are skipped. Anything after an inline '#' is dropped
    and the remainder is stripped again.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of the cleaned entries in file order, or an empty list if the
        file does not exist.
    """
    entries = []
    try:
        with open(filename, 'r') as handle:
            for raw in handle:
                text = raw.strip()
                # nothing useful on blank or full-comment lines
                if text == '' or text[0] == '#':
                    continue
                # keep only the part before a trailing '# comment'
                entries.append(text.partition('#')[0].strip())
    except FileNotFoundError:
        # the list file is optional
        return entries
    return entries


def load_renames():
    """Read RENAMES_FILE and sort its wildcard entries into the rename lists.

    Every entry is appended to _RENAMES. Entries are then classified by where
    their '*' sits, and the lowercased text without the '*' is stored:
      - '*text*' goes to _RENAMES_MIDDLE
      - 'text*'  goes to _RENAMES_START
      - '*text'  goes to _RENAMES_END
    Entries without a leading or trailing '*' stay in _RENAMES only.
    """
    _RENAMES.extend(get_external_lines(RENAMES_FILE))

    for pattern in _RENAMES:
        wild_head = pattern.startswith('*')
        wild_tail = pattern.endswith('*')
        if not (wild_head or wild_tail):
            continue

        if wild_head and wild_tail:
            _RENAMES_MIDDLE.append(pattern[1:-1].lower())
        elif wild_tail:
            _RENAMES_START.append(pattern[:-1].lower())
        else:
            _RENAMES_END.append(pattern[1:].lower())

def load_ignorables():
    """Read IGNORABLES_FILE and sort its wildcard entries into the ignorable lists.

    Every entry is appended to _IGNORABLES. Entries are then classified by
    where their '*' sits, and the lowercased text without the '*' is stored:
      - '*text*' goes to _IGNORABLES_MIDDLE
      - 'text*'  goes to _IGNORABLES_START
      - '*text'  goes to _IGNORABLES_END
    Entries without a leading or trailing '*' stay in _IGNORABLES only.
    Wildcard-only entries ('*', '**') are not indexed.
    """
    _IGNORABLES.extend(get_external_lines(IGNORABLES_FILE))

    for pattern in _IGNORABLES:
        if pattern.startswith('*') and pattern.endswith('*'):
            target, fragment = _IGNORABLES_MIDDLE, pattern[1:-1]
        elif pattern.endswith('*'):
            target, fragment = _IGNORABLES_START, pattern[:-1]
        elif pattern.startswith('*'):
            target, fragment = _IGNORABLES_END, pattern[1:]
        else:
            continue

        # A bare '*' or '**' leaves an empty fragment. '' is a substring of
        # every string, so indexing it would make the substring check on
        # _IGNORABLES_MIDDLE match (and therefore reject) every line.
        if not fragment:
            continue

        target.append(fragment.lower())


load_renames()
load_ignorables()

#----

def get_match_max(line, regex):
count = 0
Expand All @@ -22,7 +101,7 @@ def is_match_max(line, count, max):
return True

return False


def is_line_ok(line):
line = line.strip()
Expand All @@ -37,11 +116,28 @@ def is_line_ok(line):
if line_lw in _WORD_ALLOWED:
return True

if MIN_LETTERS and line_len < MIN_LETTERS:
return False

#if '\x00' in line:
# return False

# skip wonky mini words
if line_len <= 4 and _PATTERN_WRONG.search(line):
return False

# skip mini words with several
if REMOVE_NON_VOCALS and not any(char in 'aeiou' for char in line_lw):
return False

if LINE_MAX and line_len > LINE_MAX:
return False

if REMOVE_NUMBER_LETTERS_MAX and line_len <= REMOVE_NUMBER_LETTERS_MAX:
if any(char in '1234567890' for char in line_lw):
return False


# skip mini words with several letters
max_match = get_match_max(line, _PATTERN_WRONG)
if (line_len >= 4 and line_len <= 5) and max_match == 2:
return False
Expand All @@ -50,52 +146,113 @@ def is_line_ok(line):
#if (line_len > 50) and max_match >= 10:
# return False

if line_len < 12:
# check for words like
for key, group in itertools.groupby(line):
group_len = len(list(group))
if key.lower() in ['0', '1', 'x', ' ']: #allow 000, 111, xxx
continue
if group_len > 2:
return False
#if line_len < 12:
# for key, group in itertools.groupby(line):
# group_len = len(list(group))
# if key.lower() in ['0', '1', 'x', ' ']: #allow 000, 111, xxx
# continue
# if group_len > 2:
# return False

if REMOVE_NUMBERS and line.isnumeric():
return False

if REMOVE_IGNORABLES:
#for ignorable in _IGNORABLES:
# if fnmatch.fnmatch(line_lw, ignorable):
# return False
if any(line_lw.startswith(sub) for sub in _IGNORABLES_START):
return False
if any(line_lw.endswith(sub) for sub in _IGNORABLES_END):
return False
if any(sub in line_lw for sub in _IGNORABLES_MIDDLE):
return False

if line_len < 7:
for group in _BAD_GROUPS:
if group in line_lw:
return False
# odd 'xAxBxCx' repeats
if '_' not in line_lw and '_0x' not in line_lw:
# - not lowercase to avoid stuff like HallucinogenicInitial many i
# - don't skip valid words like 'Abilities', 'Parallels'
#TODO: skip "takayama", "deleteme"
is_extended = any(sub in line_lw for sub in REPEATS_EXTENDED)
if line_len >= 6 and not is_extended:
# not 'i' since 'Abilities'
for i in range(6, line_len):
if line[i-0] != line[i-1]:
if line[i-0] == line[i-2] == line[i-4] and line[i-0] not in REPEATS_EXTENDED:
return False
if line_len >= 8 and is_extended:
for i in range(6, line_len):
if line[i-0] != line[i-1]:
if line[i-0] == line[i-2] == line[i-4] == line[i-6]:
return False
#else:
# for i in range(6, line_len):
# if line_lw[i-0] != line_lw[i-1]:
# if line_lw[i-0] == line_lw[i-2] == line_lw[i-4]:
# return False

for ew in _ENDS_WITH:
if line_lw.endswith(ew):
if _ACCEPTABLES_START:
is_acceptable = False
for acceptable in _ACCEPTABLES_START:
if line_lw.startswith(acceptable):
is_acceptable = True
break
if not is_acceptable:
return False

return True
if _ACCEPTABLES_MIDDLE:
is_acceptable = False
for acceptable in _ACCEPTABLES_MIDDLE:
if acceptable in line_lw and not line_lw.startswith(acceptable):
is_acceptable = True
break
if not is_acceptable:
return False

return True


def read_line(line, outfile_ok, outfile_ko, outfile_dp):
line = line.strip("\n")
def read_line_main(line, outfile_ok, outfile_ko, outfile_dp):
if not line:
return

if True:
line_lw = line.lower()
for _RENAME in _RENAMES_START:
if line_lw.startswith(_RENAME):
line = line [len(_RENAME):]
line_lw = line.lower()
break
for _RENAME in _RENAMES_END:
if line_lw.endswith(_RENAME):
line = line [:-len(_RENAME)]
line_lw = line.lower()
break

res = is_line_ok(line)
if res is None:
outfile_dp.write(line + '\n')
elif res:
if split:
items = _PATTERN_SPLIT.split(line)
for item in items:
if item in DONE:
continue
if remove_numbers and item.isnumeric():
continue
DONE.add(item)
outfile_ok.write(item + '\n')
elif not res:
outfile_ko.write(line + '\n')
else:
outfile_ok.write(line + '\n')


def read_line(line, outfile_ok, outfile_ko, outfile_dp):
line = line.strip("\n")
line = line.replace('\x00', ' ')
if not line:
return

if SPLIT_LINES:
items = _PATTERN_SPLIT.split(line)
if len(items) == 1:
read_line_main(line, outfile_ok, outfile_ko, outfile_dp)
else:
if remove_numbers and line.isnumeric():
return
outfile_ok.write(line + '\n')
for item in items:
read_line_main(item, outfile_ok, outfile_ko, outfile_dp)
else:
outfile_ko.write(line + '\n')
read_line_main(line, outfile_ok, outfile_ko, outfile_dp)


def read_file(in_name, out_name_ok, out_name_ko, out_name_dp):
Expand Down Expand Up @@ -123,8 +280,30 @@ def main():
print("missing filename")
return

try:
#if True:
with open('fnv3.lst', 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if ':' in line:
line, number = line.split(':')
if int(number) < ACCEPTABLES_MIN:
continue
if line.startswith('^'):
_ACCEPTABLES_START.append(line[1:]) # + '*'
else:
_ACCEPTABLES_MIDDLE.append(line) #'*' + line + '*'
print("loaded trigrams")
except:
# not found
#print("ignored trigrams")
pass


for i in range(1, len(sys.argv)):
in_name = sys.argv[i]
if 'split' in in_name:
SPLIT_LINES = True

base, _ = os.path.splitext(in_name)
out_name_ok = base + "_ok.txt"
Expand Down
Loading

0 comments on commit 1632b6a

Please sign in to comment.