script tweaks

bnnm · Feb 24, 2024 · 3f5b3ed · 3f5b3ed
1 parent 246b3ef
commit 3f5b3ed
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 33 deletions.
diff --git a/scripts/txt-cleaner.py b/scripts/txt-cleaner.py
@@ -9,6 +9,7 @@
 _ENDS_WITH = ['bc']
 DONE = set()
 split = False
+remove_numbers = False
 
 def get_match_max(line, regex):
     count = 0
@@ -85,9 +86,13 @@ def read_line(line, outfile_ok, outfile_ko, outfile_dp):
             for item in items:
                 if item in DONE:
                     continue
+                if remove_numbers and item.isnumeric():
+                    continue
                 DONE.add(item)
                 outfile_ok.write(item + '\n')
         else:
+            if remove_numbers and line.isnumeric():
+                return
             outfile_ok.write(line + '\n')
     else:
         outfile_ko.write(line + '\n')

diff --git a/sstr/sstr.c b/sstr/sstr.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include <stdint.h>
+#include <stdbool.h>
 #include <string.h>
 #include <stdlib.h>
 #include <time.h>
@@ -12,6 +13,7 @@
 // Some games have strings like (size)(id)(string), or (size)(string).
 // strings2.exe trips on those and may create things like "b(string)A",
 // while this program should handle them fine (may still output some false positives though)
+// Mainly intended for names found in engines such as Decima.
 
 // todo config bufsize
 // todo BE mode
@@ -31,6 +33,7 @@ typedef struct {
     uint32_t buf_size;
     const char* targets[MAX_TARGETS];
     uint32_t targets_count;
+    bool limited;
 } sstr_config;
 
 //*************************************************************************
@@ -44,12 +47,24 @@ static uint32_t get_u32le(const uint8_t *p) {
     return ret;
 }
 
-static int is_ascii_str(const uint8_t* buf, int str_len) {
-    for (int i = 0; i < str_len - 1; i++) {
-        uint8_t curr = buf[i];
-        if (curr < 0x20 || curr >= 0x7F) // useful only ASCII
-            return 0;
+static int is_ascii_str(const uint8_t* buf, int str_len, bool limited) {
+    if (limited) {
+        // decima hashes only
+        for (int i = 0; i < str_len - 1; i++) {
+            uint8_t curr = buf[i];
+            if (curr < 0x2d && curr != 0x20 || curr > 0x7a || curr >= 0x3b && curr <= 0x40 || curr >= 0x5b && curr <= 0x5e)
+                return 0;
+        }
+    }
+    else {
+        // printable ASCII only
+        for (int i = 0; i < str_len - 1; i++) {
+            uint8_t curr = buf[i];
+            if (curr < 0x20 || curr >= 0x7F)
+                return 0;
+        }
     }
+
     // last char can be a null
     uint8_t last = buf[str_len-1];
     if (last != 0 && last < 0x20 || last >= 0x7F)
@@ -58,25 +73,25 @@ static int is_ascii_str(const uint8_t* buf, int str_len) {
     return 1;
 }
 
-static int test_str(const uint8_t* buf, int str_len) {
-    if (is_ascii_str(buf, str_len)) {
+static int test_str(const uint8_t* buf, int str_len, bool limited) {
+    if (is_ascii_str(buf, str_len, limited)) {
         printf("%.*s\n", str_len, buf);
         return 1;
     }
     return 0;
 }
 
-static void find_string(const uint8_t* buf, uint32_t buf_size) {
+static void find_string(const uint8_t* buf, uint32_t buf_size, bool limited) {
     uint32_t pos = 0;
 
     // test (len)(str) and (len)(id)(str)
     while (pos < buf_size) {
         uint32_t str_len = get_u32le(buf + pos + 0x00);
         if (str_len > MIN_STR && str_len < MAX_STR) {
             // both are possible at the same time in some cases
-            int test1 = test_str(buf + pos + 0x04, str_len);
-            int test2 = test_str(buf + pos + 0x08, str_len);
-            
+            int test1 = test_str(buf + pos + 0x04, str_len, limited);
+            int test2 = test_str(buf + pos + 0x08, str_len, limited);
+
             if (test2) {
                 pos += 0x08 + str_len;
             }
@@ -130,6 +145,9 @@ static int parse_cfg(sstr_config* cfg, int argc, const char* argv[]) {
             case 'h':
                 print_usage(argv[0]);
                 return 0;
+            case 'l':
+                cfg->limited = true;
+                break;
             default:
                 CHECK_EXIT(1, "ERROR: unknown parameter '%s'\n", argv[i]);
                 break;
@@ -199,7 +217,7 @@ int main(int argc, const char* argv[]) {
             if (!bytes)
                 break;
 
-            find_string(buf, BUF_HEAD + bytes);
+            find_string(buf, BUF_HEAD + bytes, cfg.limited);
 
             // copy last bytes as next head (shouldn't overlap)
             memcpy(buf, buf + BUF_HEAD + bytes - BUF_HEAD, BUF_HEAD);

diff --git a/sstr/sstr.exe b/sstr/sstr.exe
diff --git a/wwnames/_wwnames-fixer.py b/wwnames/_wwnames-fixer.py
@@ -3,17 +3,18 @@
 #   - should have lines like "# 3933301714"
 # - add words.py reversed names, format "3933301714: banana"
 # - run this tool (drag and drop)
-#   - this will replace "# 3933301714" by "banana"
+#   - this will replace "# 3933301714" with "banana"
 #   - if list has "### (name)" sections, sections are sorted too
-# - output is "(name)-clean.txt"
+# - output is "(name)-clean.txt", except if (name) is wwname-*.txt, in which case it will be replaced
+# - if some name is wrong, add "#ko" at the end ("ZZSXSBanana #ko")
+#   - run this script and the wrong name will be converted back to "# (hash number)"
 
 import sys, re
 
 FULL_CLEAN = True
 CLEAN_ORDER = True
 UPDATE_ORIGINAL = True
 FNV_FORMAT = re.compile(r"^[A-Za-z_][A-Za-z0-9\_]*$")
-#HDR_FORMAT = re.compile(r"^###+*\([^\t]+\).+[\t ]*([^\t]*)[\t ]*([^\t]*)")
 HDR_FORMAT1 = re.compile(r"^###.+\(langs/(.+)\.bnk\)")
 HDR_FORMAT2 = re.compile(r"^###.+\((.+)\.bnk\)")
 
@@ -131,7 +132,10 @@ def fix_wwnames(inname):
             if items:
                 # register solved ids and ignore line
                 sid, hashname = items
-                hashed[sid] = hashname
+                if sid not in hashed:
+                    hashed[sid] = []
+                if hashname not in hashed[sid]:
+                    hashed[sid].append(hashname)
             else:
                 # register base lines as-is, except when fixing headers
                 if line.startswith('### '):
@@ -145,9 +149,9 @@ def fix_wwnames(inname):
 
                     if bankname.isdigit():
                         sid = int(bankname)
-                        hashname = hashed.get(sid)
-                        if hashname:
-                            line = line.replace('.bnk', '.bnk: %s' % hashname)
+                        hashnames = hashed.get(sid)
+                        if hashnames:
+                            line = line.replace('.bnk', '.bnk: %s' % hashnames[0])
 
                 # use case as found in first line 
                 # (so if BLAH is used in several points and changed once to Blah, other points use that too)
@@ -160,47 +164,67 @@ def fix_wwnames(inname):
                 if not line.startswith('#'):
                     hashname = line.split('#')[0]
                     sid = get_fnv(hashname)
-                    hashed[sid] = hashname
-
+                    if sid not in hashed:
+                        hashed[sid] = []
+                    if hashname not in hashed[sid]:
+                        hashed[sid].append(hashname)
 
 
+    section = False
     clines = []
     for bline in blines:
-        if bline in koed:
+        if bline.startswith('### '):
+            section = True
+
+        if bline.lower() in koed:
             sid = get_fnv(bline)
             bline = "# %s" % (sid)
 
-        if bline.startswith('#ko') and ':' in bline and FULL_CLEAN:
+        if bline and bline.startswith('#ko') and ':' in bline and FULL_CLEAN:
             _, hashname = bline.split(':')
             hashname = hashname.strip()
             sid = get_fnv(hashname)
-            bline = "# %s" % (sid)
-            koed.add(hashname)
+            koed.add(hashname.lower())
+            if section:  # '#ko' on top get ignored
+                bline = "# %s" % (sid)
+            else:
+                continue
 
         if bline.endswith('#ko') and FULL_CLEAN:
             if bline.startswith('# '):
                 fnv = bline.split(' ')[1]
                 koed.add(fnv.strip())
                 continue
+            elif bline.startswith('#'):
+                pass
             else:
-                hashname, _ = bline.split('#ko')
+                hashname, _ = bline.split('#ko', 1)
                 hashname = hashname.strip()
                 sid = get_fnv(hashname)
-                bline = "# %s" % (sid)
-                koed.add(hashname)
+                koed.add(hashname.lower())
+                if section: # '#ko' on top get ignored
+                    bline = "# %s" % (sid)
+                else:
+                    continue
 
 
         if bline.startswith('# ') and ':' not in bline:
             sid = bline[2:].strip()
             if sid in hashed:
-                hashname = hashed[sid]
-                if FULL_CLEAN:
-                    bline = "%s" % (hashname)
-                else:
-                    bline = "%s: %s" % (sid, hashname)
+                hashnames = hashed[sid]
+                for i, hashname in enumerate(hashnames):
+                    if FULL_CLEAN:
+                        bline = "%s" % (hashname)
+                    else:
+                        bline = "%s: %s" % (sid, hashname)
+                    if i > 0:
+                        bline += ' #alt'
+                    clines.append(bline)
+                continue
 
         clines.append(bline)
 
+
     clines = order_list(clines)
     clines = clean_lines(clines)
     outname = inname