Samsung
diff --git a/‎.ci/benchmark.txt
+33-32 b/‎.ci/benchmark.txt
+33-32
diff --git a/‎.github/workflows/benchmark.yml
+1-3 b/‎.github/workflows/benchmark.yml
+1-3
diff --git a/‎.github/workflows/check.yml
+2-2 b/‎.github/workflows/check.yml
+2-2
diff --git a/‎credsweeper/ml_model/ml_config.json
+21-1 b/‎credsweeper/ml_model/ml_config.json
+21-1
diff --git a/‎credsweeper/ml_model/ml_model.onnx
3.55 MB b/‎credsweeper/ml_model/ml_model.onnx
3.55 MB
diff --git a/‎credsweeper/rules/config.yaml
+18 b/‎credsweeper/rules/config.yaml
+18
diff --git a/‎experiment/main.py
+45-23 b/‎experiment/main.py
+45-23
diff --git a/‎experiment/main.sh
+44-12 b/‎experiment/main.sh
+44-12
diff --git a/‎experiment/src/data_loader.py
-2 b/‎experiment/src/data_loader.py
-2
diff --git a/‎experiment/src/entropy_test.py
-3 b/‎experiment/src/entropy_test.py
-3
diff --git a/‎experiment/src/log_callback.py
+1-1 b/‎experiment/src/log_callback.py
+1-1
@@ -440,9 +440,7 @@ jobs:
           cd experiment
           # check whether credsweeper is available as module
           python -m credsweeper --banner
-          # use only 2 epochs for the test
-          sed -i 's/max_epochs = .*/max_epochs = 2/' main.py
-          python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
+          python main.py --data ${{ github.workspace }}/CredData --jobs $(( 2 * $(nproc) )) --epochs 2
           # dbg
           git diff
           # crc32 should be changed
 
@@ -40,8 +40,8 @@ jobs:
     - name: Check ml_config.json and ml_model.onnx integrity
       if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
       run: |
-        md5sum --binary credsweeper/ml_model/ml_config.json | grep 092a588d5bebdac5136c4d01c87abf27
-        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep a707745d781517556fd58890cb2812be
+        md5sum --binary credsweeper/ml_model/ml_config.json | grep 3a4bfcd6f3ea74461b158d4ec073cc06
+        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 9725b166e07e60f94929fea986f84ae2
 
     # # # line ending
 
 
@@ -1,5 +1,5 @@
 {
-    "char_set": " \t\n0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+    "char_set": "\u001b\t\n\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
     "thresholds": {
         "lowest": 0.22917,
         "low": 0.35739,
@@ -54,6 +54,22 @@
                 "attribute": "value"
             }
         },
+        {
+            "type": "SearchInAttribute",
+            "comment": "Repeated symbol",
+            "kwargs": {
+                "pattern": ".*(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
+                "attribute": "value"
+            }
+        },
+        {
+            "type": "SearchInAttribute",
+            "comment": "SHA marker",
+            "kwargs": {
+                "pattern": ".*(?i:sha)[_-]?(224|256|384|512)",
+                "attribute": "value"
+            }
+        },
         {
             "type": "SearchInAttribute",
             "comment": "VariableNotAllowedPatternCheck",
@@ -239,6 +255,7 @@
                     ".bat",
                     ".bats",
                     ".bazel",
+                    ".build",
                     ".bundle",
                     ".bzl",
                     ".c",
@@ -295,6 +312,7 @@
                     ".jsx",
                     ".ks",
                     ".kt",
+                    ".kts",
                     ".las",
                     ".ldif",
                     ".ldml",
@@ -360,6 +378,7 @@
                     ".sql",
                     ".storyboard",
                     ".strings",
+                    ".sty",
                     ".t",
                     ".td",
                     ".tdf",
@@ -403,6 +422,7 @@
                     "Key",
                     "Nonce",
                     "Password",
+                    "SQL Password",
                     "Salt",
                     "Secret",
                     "Token",
 
@@ -144,6 +144,24 @@
   target:
     - doc
 
+- name: SQL Password
+  severity: medium
+  confidence: weak
+  type: pattern
+  values:
+    - (\\[nrt]|\b)(?i:(?P<variable>(CREATE|ALTER|SET\s{1,8}PASSWORD|INSERT(\s{1,8}IGNORE)?|UPDATE\s{1,8}[^\s;]{1,80})\s{1,8}(LOGIN|USER|ROLE|FOR|INTO|SET)\s{1,8}([^\s;]{1,80}\s{1,8}|VALUES\s*\(){1,8}(IDENTIFIED((\s{1,8}WITH\s{1,8}\S{1,80})?\s{1,8}(BY|AS))|(=|WITH)?\s*PASSWORD\b(\s*=)?)))\s*(?P<wrap>[(]\s*)?(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4})?(?P<value>(?(value_leftquote)((?!(?P=value_leftquote))(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))|(?!&(quot|apos);)(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])){3,80})(?(value_leftquote)(?P<value_rightquote>(?<!\\)(?P=value_leftquote))|(?(wrap)[)]|[\s`'\",;]))
+  filter_type:
+    - ValueAllowlistCheck
+    - ValuePatternCheck(4)
+  min_line_len: 8
+  required_substrings:
+    - password
+    - identified
+  target:
+    - doc
+    - code
+  use_ml: true
+
 - name: API
   severity: medium
   confidence: moderate
 
@@ -53,7 +53,13 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray
               f"F1:{f1:0.6f}")
 
 
-def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
+def main(cred_data_location: str,
+         jobs: int,
+         epochs: int,
+         batch_size: int,
+         patience: int,
+         doc_target: bool,
+         use_tuner: bool = False) -> str:
     print(f"Memory at start: {LogCallback.get_memory_info()}")
 
     current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -62,7 +68,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
     os.makedirs(dir_path, exist_ok=True)
 
     print(f"Train model on data from {cred_data_location}")
-    prepare_train_data(_cred_data_location, jobs)
+    prepare_train_data(cred_data_location, jobs, doc_target)
 
     # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
     cred_data_location_path = pathlib.Path(cred_data_location) / "data"
@@ -82,7 +88,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
     for i in range(3):
         # there are 2 times possible fails due ml config was updated
         try:
-            thresholds = model_config_preprocess(df_all)
+            thresholds = model_config_preprocess(df_all, doc_target)
             break
         except RuntimeError as exc:
             if "RESTART:" in str(exc):
@@ -136,12 +142,6 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
 
     print(f"Memory before search / compile: {LogCallback.get_memory_info()}")
 
-    max_epochs = 100
-    # ^^^ the line is patched in GitHub action to speed-up test train
-    batch_size = 256
-    patience = 5
-    #return
-
     log_callback = LogCallback()
     if use_tuner:
         tuner = kt.GridSearch(
@@ -158,7 +158,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
         tuner.search(
             x=[x_train_line, x_train_variable, x_train_value, x_train_features],
             y=y_train,
-            epochs=max_epochs,
+            epochs=epochs,
             batch_size=batch_size,
             callbacks=[search_early_stopping, log_callback],
             validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
@@ -189,7 +189,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
     fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
                                   y=y_train,
                                   batch_size=batch_size,
-                                  epochs=max_epochs,
+                                  epochs=epochs,
                                   verbose=2,
                                   validation_data=([x_test_line, x_test_variable, x_test_value,
                                                     x_test_features], y_test),
@@ -259,7 +259,8 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
 
 if __name__ == "__main__":
     parser = ArgumentParser()
-    parser.add_argument("--data",
+    parser.add_argument("-d",
+                        "--data",
                         nargs="?",
                         help="CredData location",
                         dest="cred_data_location",
@@ -271,25 +272,46 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
                         default=4,
                         dest="jobs",
                         metavar="POSITIVE_INT")
-    parser.add_argument("-t", "--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
+    parser.add_argument("-e",
+                        "--epochs",
+                        help="maximal epochs to train (default: 100)",
+                        default=100,
+                        dest="epochs",
+                        metavar="POSITIVE_INT")
+    parser.add_argument("-b",
+                        "--batch_size",
+                        help="batch size (default: 256)",
+                        default=256,
+                        dest="batch_size",
+                        metavar="POSITIVE_INT")
+    parser.add_argument("-p",
+                        "--patience",
+                        help="early stopping patience (default: 5)",
+                        default=5,
+                        dest="patience",
+                        metavar="POSITIVE_INT")
+    parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
+    parser.add_argument("--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
     args = parser.parse_args()
 
-    fixed_seed = 20241126  # int(datetime.now().timestamp())
-    # print(f"Random seed:{fixed_seed}")
-    if fixed_seed is not None:
-        tf.random.set_seed(fixed_seed)
-        np.random.seed(fixed_seed)
-        random.seed(fixed_seed)
-
-    _cred_data_location = args.cred_data_location
-    _jobs = int(args.jobs)
+    fixed_seed = 20250117
+    print(f"Fixed seed:{fixed_seed}")
+    tf.random.set_seed(fixed_seed)
+    np.random.seed(fixed_seed)
+    random.seed(fixed_seed)
 
     # to keep the hash in log and verify
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
 
-    _model_file_name = main(_cred_data_location, _jobs, args.use_tuner)
+    _model_file_name = main(cred_data_location=args.cred_data_location,
+                            jobs=int(args.jobs),
+                            epochs=int(args.epochs),
+                            batch_size=int(args.batch_size),
+                            patience=int(args.patience),
+                            doc_target=bool(args.doc_target),
+                            use_tuner=bool(args.use_tuner))
     # print in last line the name
     print(f"\nYou can find your model in:\n{_model_file_name}")
@@ -2,25 +2,57 @@
 
 set -ex
 
-CREDSWEEPER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." > /dev/null 2>&1 && pwd )"
-export PYTHONPATH=${CREDSWEEPER_DIR}:$PYTHONPATH
-echo $PYTHONPATH
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --banner
+START_TIME=$(date +%s)
+NOW=$(date +%Y%m%d_%H%M%S)
+echo ">>> START ${BASH_SOURCE[0]} in $(pwd) at ${NOW}"
 
-now=$(date +%Y%m%d_%H%M%S)
+# the path environment variables below must be set without a trailing /
 
-RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
-mkdir -vp ${RESULT_DIR}
+echo "CREDSWEEPER_DIR='${CREDSWEEPER_DIR}'"
+if [ -z "${CREDSWEEPER_DIR}" ] || [ ! -d "${CREDSWEEPER_DIR}" ]; then
+    echo "CREDSWEEPER_DIR environment is empty or does not exist"
+    exit 1
+fi
+
+echo "CREDDATA_DIR='${CREDDATA_DIR}'"
+if [ -z "${CREDDATA_DIR}" ] || [ ! -d "${CREDDATA_DIR}" ]; then
+    echo "CREDDATA_DIR environment is empty or does not exist"
+    exit 1
+fi
+
+if [ -z "${JOBS}" ]; then
+    JOBS=$(nproc)
+    echo "Used JOBS=${JOBS} for multiple process"
+elif ! [ "${JOBS}" -gt 0 ] 2>/dev/null; then  # also rejects non-integer values (test exits with 2)
+    echo "Unappropriated JOBS=${JOBS}"
+    exit 1
+fi
+echo "JOBS='${JOBS}'"  # report the effective value, not just $(nproc)
+
+export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH
+
+# check whether credsweeper is available as a module (prints its banner)
+"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner
+
+WORK_DIR="${CREDSWEEPER_DIR}/experiment"
+cd "${WORK_DIR}"
+RESULT_DIR="${WORK_DIR}/results"
+mkdir -vp "${RESULT_DIR}"
 
 # set env TUNER to use keras-tuner
 #TUNER=--tuner
-${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/q/DataCred/main --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log
+# set env DOC=--doc to use the doc target
+#DOC=--doc
+"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} | tee "${RESULT_DIR}/${NOW}.train.log"
 error_code=${PIPESTATUS}
 if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi
 
-cd ${CREDSWEEPER_DIR}
-report_file=${RESULT_DIR}/${now}.json
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/main/data --log info --job $(nproc) --subtext --save-json ${report_file}
+cd "${CREDSWEEPER_DIR}"
+report_file=${RESULT_DIR}/${NOW}.json
+${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS}  --subtext --save-json ${report_file}
 
-cd ~/q/DataCred/main
+cd "${CREDDATA_DIR}"
 .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt
+
+SPENT_TIME=$(date -ud "@$(( $(date +%s) - ${START_TIME} ))" +"%H:%M:%S")
+echo "<<< DONE ${BASH_SOURCE[0]} in $(pwd) at $(date) elapsed ${SPENT_TIME}"
@@ -1,9 +1,7 @@
 import contextlib
-import copy
 import json
 import os
 import pathlib
-import subprocess
 from copy import deepcopy
 from functools import cache
 from typing import Tuple, Dict, Set, Any
 
@@ -2,15 +2,12 @@
 import random
 import signal
 import statistics
-import string
 import threading
 import time
-from datetime import datetime
 from multiprocessing import Pool
 from typing import Tuple, Dict
 
 from credsweeper.common.constants import Chars
-from credsweeper.filters import ValueEntropyBase36Check
 from credsweeper.utils import Util
 
 random_data: str
 
@@ -1,7 +1,7 @@
 import datetime
 
-from keras.src.callbacks import Callback
 import psutil
+from keras.src.callbacks import Callback
 
 
 class LogCallback(Callback):