scripts: more stuff

bnnm · Dec 1, 2024 · 1632b6a · 1632b6a
1 parent a9a15e6
commit 1632b6a
Show file tree

Hide file tree

Showing 3 changed files with 298 additions and 79 deletions.
diff --git a/doc/NAMES.md b/doc/NAMES.md
@@ -330,22 +330,29 @@ words.py -c 2 -de -mc 70 -sf -ho -jb -o words2_all-jb.txt
 # generally slow and too many false positives if your wwnames/ww.txt names are >1MB,
 # but very useful with the right txt. Remove less useful names and keep it <300kb for best results.
 
+# words based on existing prefixes/suffixes, some value between 2-4 is a good compromise
+words.py -de -o words_fa-pn.txt -fap 3
+words.py -de -o words_fa-sn.txt -fas 3
+words.py -de -o words_fa-pn-fs-sc.txt -fap 3 -fs -sc
+words.py -de -o words_fa-sn-fs-sc.txt -fas 3 -fs -sc
+words.py -de -o words_fa-pn-fs-jb.txt -fap 3 -fs -jb
+words.py -de -o words_fa-sn-fs-jb.txt -fas 3 -fs -jb
+
 # combos
 words.py -c 2 -mc 80  -de -o words2_out.txt
 
 # more combos
-words.py -c 2 -zd -mc 70 -jb -de -o words2_all-jb.txt 
-words.py -c 2 -zd -mc 70 -js -de -o words2_all-js.txt 
-words.py -c 2 -zd -mc 70 -sc -de -o words2_all-sc.txt 
-words.py -c 2 -zd -mc 70 -sc -jb -de -o words2_all-sc-jb.txt 
-
-# a bit slow but creates words in a way that gets good results
-words.py -de -o words_fa-sf.txt -zd -fa -sf
+words.py -c 2 -mc 70 -jb -de -o words2_c2-jb.txt 
+words.py -c 2 -mc 70 -js -de -o words2_c2-js.txt 
+words.py -c 2 -mc 70 -sc -de -o words2_c2-sc.txt 
+words.py -c 2 -mc 70 -sc -jb -de -o words2_c2-sc-jb.txt 
 
-# similar variations but a low less useful, might as well
-words.py -de -o words_fa-jb.txt -zd -fa -jb
-words.py -de -o words_fa-sc.txt -zd -fa -sc
-words.py -de -o words_fa-jb-sc.txt -zd -fa -jb -sc
+# creates words based on all chunks of the words, slow but may get trickier combos
+words.py -de -o words_fa-sf.txt -fa -sf
+words.py -de -o words_fa-jb.txt -fa -jb
+words.py -de -o words_fa-sc.txt -fa -sc
+words.py -de -o words_fa-sc-sf.txt -fa -sc -sf
+words.py -de -o words_fa-sc-jb.txt -fa -sc -jb
 
 
 ## extra commands
@@ -373,13 +380,21 @@ words.py -de -c 3 -mc 40 -ho -sf -o words3_sf.txt
 ## extra stuff
 # not commands but a few extra tips:
 
-# - go to the wwiser-utils/wwnames dir, copy all names in a single file (ex. cmd: copy *.txt ww.txt) and use it as for a few more names
+# - use sample formats from wwiser-utils/words
+words.py -de -zd -o words1_fmt.txt -f ../wwiser-utils/words/formats.txt
 
-# - go to wwiser-utils/words and try sample formats.txt there
+# - use existing wwnames from wwiser-utils/wwnames as input
+words.py -de -zd -o words1_all.txt -i ../wwiser-utils/wwnames/*.txt (other flags)
 
-# - take english-small.txt or ww.txt (extra names), add "#@section" on top; then try the "permutations" mode
+# - use english-small.txt or ww.txt (extra names), add "#@section" on top then try the "permutations" mode
 words.py -p -mc 40 -de -i ww.txt
-words.py -p -mc 40 -de -i english-small.txt
+words.py -p -mc 40 -de -i ../wwiser-utils/wwnames/dict/english-small.txt
+
+
+## pypy setup
+set PATH=%PATH%;%ROSS_USER%\git\dist\pypy3.9-v7.3.11-win64
+set WDIR=%USERPROFILE%\git\wwiser-utils\words
+pypy3.exe %WDIR%\words.py ... (flags)
 ```
 
 

diff --git a/scripts/txt-cleaner.py b/scripts/txt-cleaner.py
@@ -1,15 +1,94 @@
 # Cleans text files from strings2.exe garbage
+#TODO IDEAS
+#- make N-grams from ENG words > not accurate enough (games use custom stuff)
+#- ignore start of N chars if not good enough > time consuming to make
+#- ignore words that don't contain anything from an ENG list > may skip useful names to be used
+
 
 import os, sys, itertools, re
+import fnmatch
+
+SPLIT_LINES = True
+LINE_MAX = 60
+REMOVE_NUMBERS = False
+REMOVE_NUMBER_LETTERS_MAX = 0 #5abcd abcd5
+MIN_LETTERS = 4
+ACCEPTABLES_MIN = 50
+REMOVE_IGNORABLES = True
+REMOVE_NON_VOCALS = False
+
+RENAMES_FILE = '_txt-renames.txt'
+IGNORABLES_FILE = '_txt-ignorables.txt'
 
 _PATTERN_WRONG = re.compile(r'[\t.<>,;.:{}\[\]()\'"$&/=!\\/#@+\^`´¨?|~*%]')
 _PATTERN_SPLIT = re.compile(r'[\t.<>,;.:{}\[\]()\'"$&/=!\\/#@+\^`´¨?|~*% -]')
 _WORD_ALLOWED = ['xiii', 'xviii','zzz']
-_BAD_GROUPS = ['uu', 'fwfw','ldlD', 'vwu', 'zzz', 'abcde']
-_ENDS_WITH = ['bc']
+
+_RENAMES = []
+_IGNORABLES = []
+_IGNORABLES_START = []
+_IGNORABLES_MIDDLE = []
+_IGNORABLES_END = []
+
+_RENAMES_START = []
+_RENAMES_MIDDLE = []
+_RENAMES_END = []
+
+_ACCEPTABLES_START = []
+_ACCEPTABLES_MIDDLE = []
+
+REPEATS_EXTENDED = ['i', 't', 'l']
+
 DONE = set()
-split = False
-remove_numbers = False
+
+
+def get_external_lines(filename):
+    items = []
+    try:
+        with open(filename, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith('#'):
+                    continue
+                line = line.split('#')[0]
+                line = line.strip()
+                items.append(line)
+    except FileNotFoundError:
+        pass
+    return items
+
+
+def load_renames():
+    lines = get_external_lines(RENAMES_FILE)
+    _RENAMES.extend(lines)
+
+    for _RENAME in _RENAMES:
+        if _RENAME.startswith('*') and _RENAME.endswith('*'):
+            _RENAMES_MIDDLE.append(_RENAME[1:-1].lower())
+        elif _RENAME.endswith('*'):
+            _RENAMES_START.append(_RENAME[:-1].lower())
+        elif _RENAME.startswith('*'):
+            _RENAMES_END.append(_RENAME[1:].lower())
+
+def load_ignorables():
+    lines = get_external_lines(IGNORABLES_FILE)
+    _IGNORABLES.extend(lines)
+
+    for _IGNORABLE in _IGNORABLES:
+        if _IGNORABLE.startswith('*') and _IGNORABLE.endswith('*'):
+            _IGNORABLES_MIDDLE.append(_IGNORABLE[1:-1].lower())
+        elif _IGNORABLE.endswith('*'):
+            _IGNORABLES_START.append(_IGNORABLE[:-1].lower())
+        elif _IGNORABLE.startswith('*'):
+            _IGNORABLES_END.append(_IGNORABLE[1:].lower())
+
+
+load_renames()
+load_ignorables()
+
+#----
 
 def get_match_max(line, regex):
     count = 0
@@ -22,7 +101,7 @@ def is_match_max(line, count, max):
         return True
 
     return False
-    
+
 
 def is_line_ok(line):
     line = line.strip()
@@ -37,11 +116,28 @@ def is_line_ok(line):
     if line_lw in _WORD_ALLOWED:
         return True
 
+    if MIN_LETTERS and line_len < MIN_LETTERS:
+        return False
+
+    #if '\x00' in line:
+    #    return False
+
     # skip wonky mini words
     if line_len <= 4 and _PATTERN_WRONG.search(line):
         return False
 
-    # skip mini words with several
+    if REMOVE_NON_VOCALS and not any(char in 'aeiou' for char in line_lw):
+        return False
+
+    if LINE_MAX and line_len > LINE_MAX:
+        return False
+
+    if REMOVE_NUMBER_LETTERS_MAX and line_len <= REMOVE_NUMBER_LETTERS_MAX:
+        if any(char in '1234567890' for char in line_lw):
+            return False
+
+
+    # skip mini words with several wrong symbols (see _PATTERN_WRONG)
     max_match = get_match_max(line, _PATTERN_WRONG)
     if (line_len >= 4 and line_len <= 5) and max_match == 2:
         return False
@@ -50,52 +146,113 @@ def is_line_ok(line):
     #if (line_len > 50) and max_match >= 10:
     #    return False
 
-    if line_len < 12:
-        # check for words like 
-        for key, group in itertools.groupby(line):
-            group_len = len(list(group))
-            if key.lower() in ['0', '1', 'x', ' ']: #allow 000, 111, xxx
-                continue
-            if group_len > 2:
-                return False
+    #if line_len < 12:
+    #    for key, group in itertools.groupby(line):
+    #        group_len = len(list(group))
+    #        if key.lower() in ['0', '1', 'x', ' ']: #allow 000, 111, xxx
+    #            continue
+    #        if group_len > 2:
+    #            return False
+
+    if REMOVE_NUMBERS and line.isnumeric():
+        return False
+
+    if REMOVE_IGNORABLES:
+        #for ignorable in _IGNORABLES:
+        #    if fnmatch.fnmatch(line_lw, ignorable):
+        #        return False
+        if any(line_lw.startswith(sub) for sub in _IGNORABLES_START):
+            return False
+        if any(line_lw.endswith(sub) for sub in _IGNORABLES_END):
+            return False
+        if any(sub in line_lw for sub in _IGNORABLES_MIDDLE):
+            return False
 
-    if line_len < 7:
-        for group in _BAD_GROUPS:
-            if group in line_lw:
-                return False
+    # odd 'xAxBxCx' repeats
+    if '_' not in line_lw and '_0x' not in line_lw:
+        # - not lowercase to avoid stuff like HallucinogenicInitial many i
+        # - don't skip valid words like 'Abilities', 'Parallels'
+        #TODO: skip "takayama", "deleteme"
+        is_extended = any(sub in line_lw for sub in REPEATS_EXTENDED)
+        if line_len >= 6 and not is_extended:
+            # not 'i' since 'Abilities' 
+            for i in range(6, line_len):
+                if line[i-0] != line[i-1]:
+                    if line[i-0] == line[i-2] == line[i-4] and line[i-0] not in REPEATS_EXTENDED:
+                        return False
+        if line_len >= 8 and is_extended:
+            for i in range(6, line_len):
+                if line[i-0] != line[i-1]:
+                    if line[i-0] == line[i-2] == line[i-4] == line[i-6]:
+                        return False
+        #else:
+        #    for i in range(6, line_len):
+        #        if line_lw[i-0] != line_lw[i-1]:
+        #            if line_lw[i-0] == line_lw[i-2] == line_lw[i-4]:
+        #                return False
 
-    for ew in _ENDS_WITH:
-        if line_lw.endswith(ew):
+    if _ACCEPTABLES_START:
+        is_acceptable = False
+        for acceptable in _ACCEPTABLES_START:
+            if line_lw.startswith(acceptable):
+                is_acceptable = True
+                break
+        if not is_acceptable:
             return False
 
-    return True
+    if _ACCEPTABLES_MIDDLE:
+        is_acceptable = False
+        for acceptable in _ACCEPTABLES_MIDDLE:
+            if acceptable in line_lw and not line_lw.startswith(acceptable):
+                is_acceptable = True
+                break
+        if not is_acceptable:
+            return False
 
+    return True
 
 
-def read_line(line, outfile_ok, outfile_ko, outfile_dp):
-    line = line.strip("\n")
+def read_line_main(line, outfile_ok, outfile_ko, outfile_dp):
     if not line:
         return
 
+    if True:
+        line_lw = line.lower()
+        for _RENAME in _RENAMES_START:
+            if line_lw.startswith(_RENAME):
+                line    = line   [len(_RENAME):]
+                line_lw = line.lower()
+                break
+        for _RENAME in _RENAMES_END:
+            if line_lw.endswith(_RENAME):
+                line    = line   [:-len(_RENAME)]
+                line_lw = line.lower()
+                break
+
     res = is_line_ok(line)
     if res is None:
         outfile_dp.write(line + '\n')
-    elif res:
-        if split:
-            items = _PATTERN_SPLIT.split(line)
-            for item in items:
-                if item in DONE:
-                    continue
-                if remove_numbers and item.isnumeric():
-                    continue
-                DONE.add(item)
-                outfile_ok.write(item + '\n')
+    elif not res:
+        outfile_ko.write(line + '\n')
+    else:
+        outfile_ok.write(line + '\n')
+
+
+def read_line(line, outfile_ok, outfile_ko, outfile_dp):
+    line = line.strip("\n")
+    line = line.replace('\x00', ' ')
+    if not line:
+        return
+
+    if SPLIT_LINES:
+        items = _PATTERN_SPLIT.split(line)
+        if len(items) == 1:
+            read_line_main(line, outfile_ok, outfile_ko, outfile_dp)
         else:
-            if remove_numbers and line.isnumeric():
-                return
-            outfile_ok.write(line + '\n')
+            for item in items:
+                read_line_main(item, outfile_ok, outfile_ko, outfile_dp)
     else:
-        outfile_ko.write(line + '\n')
+        read_line_main(line, outfile_ok, outfile_ko, outfile_dp)
 
 
 def read_file(in_name, out_name_ok, out_name_ko, out_name_dp):
@@ -123,8 +280,30 @@ def main():
         print("missing filename")
         return
 
+    try:
+    #if True:
+        with open('fnv3.lst', 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if ':' in line:
+                    line, number = line.split(':')
+                    if int(number) < ACCEPTABLES_MIN:
+                        continue
+                if line.startswith('^'):
+                    _ACCEPTABLES_START.append(line[1:]) # + '*'
+                else:
+                    _ACCEPTABLES_MIDDLE.append(line) #'*' + line + '*'
+        print("loaded trigrams")
+    except:
+        # not found
+        #print("ignored trigrams")
+        pass
+
+
     for i in range(1, len(sys.argv)):
         in_name = sys.argv[i]
+        if 'split' in in_name:
+            SPLIT_LINES = True
 
         base, _ = os.path.splitext(in_name)
         out_name_ok = base + "_ok.txt"