Skip to content

Commit dc985a7

Browse files
committed
sql-password
1 parent 75df2ab commit dc985a7

27 files changed

+10050
-11789
lines changed

.ci/benchmark.txt

+33-32
Large diffs are not rendered by default.

.github/workflows/benchmark.yml

+1-3
Original file line numberDiff line numberDiff line change
@@ -440,9 +440,7 @@ jobs:
440440
cd experiment
441441
# check whether credsweeper is available as module
442442
python -m credsweeper --banner
443-
# use only 2 epochs for the test
444-
sed -i 's/max_epochs = .*/max_epochs = 2/' main.py
445-
python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
443+
python main.py --data ${{ github.workspace }}/CredData --jobs $(( 2 * $(nproc) )) --epochs 2
446444
# dbg
447445
git diff
448446
# crc32 should be changed

.github/workflows/check.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ jobs:
4040
- name: Check ml_config.json and ml_model.onnx integrity
4141
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
4242
run: |
43-
md5sum --binary credsweeper/ml_model/ml_config.json | grep 092a588d5bebdac5136c4d01c87abf27
44-
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep a707745d781517556fd58890cb2812be
43+
md5sum --binary credsweeper/ml_model/ml_config.json | grep 3a4bfcd6f3ea74461b158d4ec073cc06
44+
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 9725b166e07e60f94929fea986f84ae2
4545
4646
# # # line ending
4747

credsweeper/ml_model/ml_config.json

+21-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"char_set": " \t\n0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
2+
"char_set": "\u001b\t\n\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
33
"thresholds": {
44
"lowest": 0.22917,
55
"low": 0.35739,
@@ -54,6 +54,22 @@
5454
"attribute": "value"
5555
}
5656
},
57+
{
58+
"type": "SearchInAttribute",
59+
"comment": "Repeated symbol",
60+
"kwargs": {
61+
"pattern": ".*(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
62+
"attribute": "value"
63+
}
64+
},
65+
{
66+
"type": "SearchInAttribute",
67+
"comment": "SHA marker",
68+
"kwargs": {
69+
"pattern": ".*(?i:sha)[_-]?(224|256|384|512)",
70+
"attribute": "value"
71+
}
72+
},
5773
{
5874
"type": "SearchInAttribute",
5975
"comment": "VariableNotAllowedPatternCheck",
@@ -239,6 +255,7 @@
239255
".bat",
240256
".bats",
241257
".bazel",
258+
".build",
242259
".bundle",
243260
".bzl",
244261
".c",
@@ -295,6 +312,7 @@
295312
".jsx",
296313
".ks",
297314
".kt",
315+
".kts",
298316
".las",
299317
".ldif",
300318
".ldml",
@@ -360,6 +378,7 @@
360378
".sql",
361379
".storyboard",
362380
".strings",
381+
".sty",
363382
".t",
364383
".td",
365384
".tdf",
@@ -403,6 +422,7 @@
403422
"Key",
404423
"Nonce",
405424
"Password",
425+
"SQL Password",
406426
"Salt",
407427
"Secret",
408428
"Token",

credsweeper/ml_model/ml_model.onnx

3.55 MB
Binary file not shown.

credsweeper/rules/config.yaml

+18
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,24 @@
144144
target:
145145
- doc
146146

147+
- name: SQL Password
148+
severity: medium
149+
confidence: weak
150+
type: pattern
151+
values:
152+
- (\\[nrt]|\b)(?i:(?P<variable>(CREATE|ALTER|SET\s{1,8}PASSWORD|INSERT(\s{1,8}IGNORE)?|UPDATE\s{1,8}[^\s;]{1,80})\s{1,8}(LOGIN|USER|ROLE|FOR|INTO|SET)\s{1,8}([^\s;]{1,80}\s{1,8}|VALUES\s*\(){1,8}(IDENTIFIED((\s{1,8}WITH\s{1,8}\S{1,80})?\s{1,8}(BY|AS))|(=|WITH)?\s*PASSWORD\b(\s*=)?)))\s*(?P<wrap>[(]\s*)?(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4})?(?P<value>(?(value_leftquote)((?!(?P=value_leftquote))(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))|(?!&(quot|apos);)(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])){3,80})(?(value_leftquote)(?P<value_rightquote>(?<!\\)(?P=value_leftquote))|(?(wrap)[)]|[\s`'\",;]))
153+
filter_type:
154+
- ValueAllowlistCheck
155+
- ValuePatternCheck(4)
156+
min_line_len: 8
157+
required_substrings:
158+
- password
159+
- identified
160+
target:
161+
- doc
162+
- code
163+
use_ml: true
164+
147165
- name: API
148166
severity: medium
149167
confidence: moderate

experiment/main.py

+45-23
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,13 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray
5353
f"F1:{f1:0.6f}")
5454

5555

56-
def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
56+
def main(cred_data_location: str,
57+
jobs: int,
58+
epochs: int,
59+
batch_size: int,
60+
patience: int,
61+
doc_target: bool,
62+
use_tuner: bool = False) -> str:
5763
print(f"Memory at start: {LogCallback.get_memory_info()}")
5864

5965
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -62,7 +68,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
6268
os.makedirs(dir_path, exist_ok=True)
6369

6470
print(f"Train model on data from {cred_data_location}")
65-
prepare_train_data(_cred_data_location, jobs)
71+
prepare_train_data(cred_data_location, jobs, doc_target)
6672

6773
# "detected data" is the data passed to the ML validator of credsweeper after the filters, with RuleName
6874
cred_data_location_path = pathlib.Path(cred_data_location) / "data"
@@ -82,7 +88,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
8288
for i in range(3):
8389
# up to 2 attempts may fail because the ml config was updated
8490
try:
85-
thresholds = model_config_preprocess(df_all)
91+
thresholds = model_config_preprocess(df_all, doc_target)
8692
break
8793
except RuntimeError as exc:
8894
if "RESTART:" in str(exc):
@@ -136,12 +142,6 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
136142

137143
print(f"Memory before search / compile: {LogCallback.get_memory_info()}")
138144

139-
max_epochs = 100
140-
# ^^^ the line is patched in GitHub action to speed-up test train
141-
batch_size = 256
142-
patience = 5
143-
#return
144-
145145
log_callback = LogCallback()
146146
if use_tuner:
147147
tuner = kt.GridSearch(
@@ -158,7 +158,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
158158
tuner.search(
159159
x=[x_train_line, x_train_variable, x_train_value, x_train_features],
160160
y=y_train,
161-
epochs=max_epochs,
161+
epochs=epochs,
162162
batch_size=batch_size,
163163
callbacks=[search_early_stopping, log_callback],
164164
validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
@@ -189,7 +189,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
189189
fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
190190
y=y_train,
191191
batch_size=batch_size,
192-
epochs=max_epochs,
192+
epochs=epochs,
193193
verbose=2,
194194
validation_data=([x_test_line, x_test_variable, x_test_value,
195195
x_test_features], y_test),
@@ -259,7 +259,8 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
259259

260260
if __name__ == "__main__":
261261
parser = ArgumentParser()
262-
parser.add_argument("--data",
262+
parser.add_argument("-d",
263+
"--data",
263264
nargs="?",
264265
help="CredData location",
265266
dest="cred_data_location",
@@ -271,25 +272,46 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
271272
default=4,
272273
dest="jobs",
273274
metavar="POSITIVE_INT")
274-
parser.add_argument("-t", "--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
275+
parser.add_argument("-e",
276+
"--epochs",
277+
help="maximal epochs to train (default: 100)",
278+
default=100,
279+
dest="epochs",
280+
metavar="POSITIVE_INT")
281+
parser.add_argument("-b",
282+
"--batch_size",
283+
help="batch size (default: 256)",
284+
default=256,
285+
dest="batch_size",
286+
metavar="POSITIVE_INT")
287+
parser.add_argument("-p",
288+
"--patience",
289+
help="early stopping patience (default: 5)",
290+
default=5,
291+
dest="patience",
292+
metavar="POSITIVE_INT")
293+
parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
294+
parser.add_argument("--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
275295
args = parser.parse_args()
276296

277-
fixed_seed = 20241126 # int(datetime.now().timestamp())
278-
# print(f"Random seed:{fixed_seed}")
279-
if fixed_seed is not None:
280-
tf.random.set_seed(fixed_seed)
281-
np.random.seed(fixed_seed)
282-
random.seed(fixed_seed)
283-
284-
_cred_data_location = args.cred_data_location
285-
_jobs = int(args.jobs)
297+
fixed_seed = 20250117
298+
print(f"Fixed seed:{fixed_seed}")
299+
tf.random.set_seed(fixed_seed)
300+
np.random.seed(fixed_seed)
301+
random.seed(fixed_seed)
286302

287303
# to keep the hash in log and verify
288304
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
289305
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
290306
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
291307
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
292308

293-
_model_file_name = main(_cred_data_location, _jobs, args.use_tuner)
309+
_model_file_name = main(cred_data_location=args.cred_data_location,
310+
jobs=int(args.jobs),
311+
epochs=int(args.epochs),
312+
batch_size=int(args.batch_size),
313+
patience=int(args.patience),
314+
doc_target=bool(args.doc_target),
315+
use_tuner=bool(args.use_tuner))
294316
# print the model file name on the last line
295317
print(f"\nYou can find your model in:\n{_model_file_name}")

experiment/main.sh

+44-12
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,57 @@
22

33
set -ex
44

5-
CREDSWEEPER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." > /dev/null 2>&1 && pwd )"
6-
export PYTHONPATH=${CREDSWEEPER_DIR}:$PYTHONPATH
7-
echo $PYTHONPATH
8-
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --banner
5+
START_TIME=$(date +%s)
6+
NOW=$(date +%Y%m%d_%H%M%S)
7+
echo ">>> START ${BASH_SOURCE[0]} in $(pwd) at ${NOW}"
98

10-
now=$(date +%Y%m%d_%H%M%S)
9+
# set the path environment variables without a trailing /
1110

12-
RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
13-
mkdir -vp ${RESULT_DIR}
11+
echo "CREDSWEEPER_DIR='${CREDSWEEPER_DIR}'"
12+
if [ -z "${CREDSWEEPER_DIR}" ] || [ ! -d "${CREDSWEEPER_DIR}" ]; then
13+
echo "CREDSWEEPER_DIR environment is empty or does not exist"
14+
exit 1
15+
fi
16+
17+
echo "CREDDATA_DIR='${CREDDATA_DIR}'"
18+
if [ -z "${CREDDATA_DIR}" ] || [ ! -d "${CREDDATA_DIR}" ]; then
19+
echo "CREDDATA_DIR environment is empty or does not exist"
20+
exit 1
21+
fi
22+
23+
echo "JOBS=$(nproc)"
24+
if [ -z "${JOBS}" ]; then
25+
JOBS=$(nproc)
26+
echo "Used JOBS=${JOBS} for multiple process"
27+
elif [ ! 0 -lt ${JOBS} ]; then
28+
echo "Unappropriated JOBS=${JOBS}"
29+
exit 1
30+
fi
31+
32+
export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH
33+
34+
# check which credsweeper version is used
35+
"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner
36+
37+
WORK_DIR="${CREDSWEEPER_DIR}/experiment"
38+
cd "${WORK_DIR}"
39+
RESULT_DIR="${WORK_DIR}/results"
40+
mkdir -vp "${RESULT_DIR}"
1441

1542
# set env TUNER to use keras-tuner
1643
#TUNER=--tuner
17-
${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/q/DataCred/main --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log
44+
# set env DOC to apply doc dataset
45+
#DOC=--doc
46+
"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} | tee "${RESULT_DIR}/${NOW}.train.log"
1847
error_code=${PIPESTATUS}
1948
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi
2049

21-
cd ${CREDSWEEPER_DIR}
22-
report_file=${RESULT_DIR}/${now}.json
23-
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/main/data --log info --job $(nproc) --subtext --save-json ${report_file}
50+
cd "${CREDSWEEPER_DIR}"
51+
report_file=${RESULT_DIR}/${NOW}.json
52+
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}
2453

25-
cd ~/q/DataCred/main
54+
cd "${CREDDATA_DIR}"
2655
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt
56+
57+
SPENT_TIME=$(date -ud "@$(( $(date +%s) - ${START_TIME} ))" +"%H:%M:%S")
58+
echo "<<< DONE ${BASH_SOURCE[0]} in $(pwd) at $(date) elapsed ${SPENT_TIME}"

experiment/src/data_loader.py

-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
import contextlib
2-
import copy
32
import json
43
import os
54
import pathlib
6-
import subprocess
75
from copy import deepcopy
86
from functools import cache
97
from typing import Tuple, Dict, Set, Any

experiment/src/entropy_test.py

-3
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,12 @@
22
import random
33
import signal
44
import statistics
5-
import string
65
import threading
76
import time
8-
from datetime import datetime
97
from multiprocessing import Pool
108
from typing import Tuple, Dict
119

1210
from credsweeper.common.constants import Chars
13-
from credsweeper.filters import ValueEntropyBase36Check
1411
from credsweeper.utils import Util
1512

1613
random_data: str

experiment/src/log_callback.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import datetime
22

3-
from keras.src.callbacks import Callback
43
import psutil
4+
from keras.src.callbacks import Callback
55

66

77
class LogCallback(Callback):

0 commit comments

Comments
 (0)