diff --git a/THIRD-PARTY-NOTICES b/THIRD-PARTY-NOTICES
index 723ff579..4d1f9d46 100644
--- a/THIRD-PARTY-NOTICES
+++ b/THIRD-PARTY-NOTICES
@@ -109,11 +109,6 @@ Licensor: ExplosionAI UG
Website: https://github.com/explosion/srsly
License: MIT License
-Component: tqdm
-Licensor: Casper da Costa-Luis, Google Inc., Noam Yorav-Raphael, and contributors
-Website: https://github.com/tqdm/tqdm
-License: MIT License and MPLv2.0
-
--------------------------------------------------------------------------------------------------------------------------
diff --git a/credentialdigger/cli/get_discoveries.py b/credentialdigger/cli/get_discoveries.py
index 70cd3e75..f46e066a 100644
--- a/credentialdigger/cli/get_discoveries.py
+++ b/credentialdigger/cli/get_discoveries.py
@@ -133,13 +133,12 @@ def discoveries_to_csv(discoveries):
csv_writer.writeheader()
csv_writer.writerows(discoveries)
csv_data = stringIO.getvalue()
+ return csv_data
except IndexError as error:
logger.error(error)
except Exception as exception:
logger.exception(exception)
- return csv_data
-
def export_csv(discoveries, client, save=False):
""" Export discoveries as a CSV file.
diff --git a/credentialdigger/client.py b/credentialdigger/client.py
index 50bfcb7b..c375bed8 100644
--- a/credentialdigger/client.py
+++ b/credentialdigger/client.py
@@ -8,13 +8,13 @@
import yaml
from github import Github
-from tqdm import tqdm
+from rich.progress import Progress
from .generator import ExtractorGenerator
from .models.model_manager import ModelManager
from .scanners.file_scanner import FileScanner
-from .scanners.git_scanner import GitScanner
from .scanners.git_file_scanner import GitFileScanner
+from .scanners.git_scanner import GitScanner
from .snippet_similarity import (build_embedding_model, compute_similarity,
compute_snippet_embedding)
@@ -1144,14 +1144,17 @@ def _scan(self, repo_url, scanner, models=None, force=False, debug=False,
discoveries_ids = list()
if debug:
logger.debug('Update database with these discoveries.')
- for i in tqdm(range(len(new_discoveries))):
- curr_d = new_discoveries[i]
- new_id = self.add_discovery(
- curr_d['file_name'], curr_d['commit_id'],
- curr_d['line_number'], curr_d['snippet'], repo_url,
- curr_d['rule_id'], curr_d['state'])
- if new_id != -1 and curr_d['state'] != 'false_positive':
- discoveries_ids.append(new_id)
+ with Progress() as progress:
+ inserting_task = progress.add_task('Inserting discoveries...',
+ total=len(new_discoveries))
+ for curr_d in new_discoveries:
+ new_id = self.add_discovery(
+ curr_d['file_name'], curr_d['commit_id'],
+ curr_d['line_number'], curr_d['snippet'], repo_url,
+ curr_d['rule_id'], curr_d['state'])
+ if new_id != -1 and curr_d['state'] != 'false_positive':
+ discoveries_ids.append(new_id)
+ progress.update(inserting_task, advance=1)
logger.debug(f'{len(discoveries_ids)} discoveries left for manual '
'review.')
else:
@@ -1191,11 +1194,13 @@ def _analyze_discovery(d):
if debug:
model_name = model_manager.model.__class__.__name__
logger.debug(f'Analyzing discoveries with model {model_name}')
-
false_positives = 0
- for i in tqdm(range(len(discoveries))):
- false_positives += _analyze_discovery(discoveries[i])
-
+ with Progress() as progress:
+ scanning_task = progress.add_task('Scanning discoveries...',
+ total=len(discoveries))
+ for curr_discovery in discoveries:
+ false_positives += _analyze_discovery(curr_discovery)
+ progress.update(scanning_task, advance=1)
logger.debug(f'Model {model_name} classified {false_positives} '
'discoveries.')
else:
diff --git a/credentialdigger/generator/generator.py b/credentialdigger/generator/generator.py
index 5f0fa2da..c519e31f 100644
--- a/credentialdigger/generator/generator.py
+++ b/credentialdigger/generator/generator.py
@@ -11,7 +11,7 @@
import pkg_resources
import string_utils
from git import Repo as GitRepo
-from tqdm import tqdm
+from rich.progress import Progress
from .qlearning import compute_dataset
from .training import create_snippet_model
@@ -280,10 +280,13 @@ def _pre_process(raw_data):
data_list = []
# Preprocess the dataset with naming convention, etc.
- for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
- row_data = {}
- for column in ['text', 'key', 'value']:
- row_data[column] = _pre_process(row[column])
- data_list.append(row_data)
-
+ with Progress() as progress:
+ preprocess_task = progress.add_task('Pre-processing dataset...',
+ total=data.shape[0])
+ for idx, row in data.iterrows():
+ row_data = {}
+ for column in ['text', 'key', 'value']:
+ row_data[column] = _pre_process(row[column])
+ data_list.append(row_data)
+ progress.update(preprocess_task, advance=1)
return pd.DataFrame(data=data_list)
diff --git a/credentialdigger/generator/qlearning.py b/credentialdigger/generator/qlearning.py
index 109792c9..5b2c62a4 100644
--- a/credentialdigger/generator/qlearning.py
+++ b/credentialdigger/generator/qlearning.py
@@ -2,8 +2,8 @@
import random
import numpy as np
+from rich.progress import Progress
from sklearn.metrics.pairwise import cosine_similarity
-from tqdm import tqdm
from .stylometry import compute_vector, word_unigram_tf
from .transform import (build_dummy_dict, choose_applicable_transformation,
@@ -71,26 +71,32 @@ def compute_dataset(corpus, actions_n, states_n, alpha, gamma, epochs_basis=50,
dataset = []
- # Apply Q-learning for each pattern
- for pattern_index in tqdm(range(len(all_patterns))):
- # Select a random extract and remove it from the corpus
- reference_extract = corpus.pop(random.randrange(len(corpus)))
- # Cut extracts too long
- reference_extract = reference_extract[:extract_max_length]
-
- # Increase epochs for more complex patterns
- epochs = int(epochs_basis * (1 + (pattern_index / len(all_patterns))))
- # Update epochs in args
- args['epochs'] = epochs
-
- # Compute the optimal modifications to the basic patterns
- final_transformation, modification_dict = _optimal_transformation(
- reference_extract, all_patterns[pattern_index], args)
-
- # Generate the dataset, with optimal transformations
- for i in range(epochs):
- dataset += generate_data(all_patterns[pattern_index],
- modification_dict)
+ with Progress() as progress:
+ patterns_count = len(all_patterns)
+        qlearn_task = progress.add_task('Applying Q-learning to patterns...',
+ total=patterns_count)
+ # Apply Q-learning for each pattern
+ for pattern_index in range(patterns_count):
+ # Select a random extract and remove it from the corpus
+ reference_extract = corpus.pop(random.randrange(len(corpus)))
+ # Cut extracts too long
+ reference_extract = reference_extract[:extract_max_length]
+
+ # Increase epochs for more complex patterns
+ epochs = int(epochs_basis *
+ (1 + (pattern_index / patterns_count)))
+ # Update epochs in args
+ args['epochs'] = epochs
+
+ # Compute the optimal modifications to the basic patterns
+ final_transformation, modification_dict = _optimal_transformation(
+ reference_extract, all_patterns[pattern_index], args)
+
+ # Generate the dataset, with optimal transformations
+ for i in range(epochs):
+ dataset += generate_data(all_patterns[pattern_index],
+ modification_dict)
+ progress.update(qlearn_task, advance=1)
return dataset
diff --git a/credentialdigger/snippet_similarity.py b/credentialdigger/snippet_similarity.py
index be12f686..e02ef931 100644
--- a/credentialdigger/snippet_similarity.py
+++ b/credentialdigger/snippet_similarity.py
@@ -2,13 +2,14 @@
import os
import numpy as np
+# Suppress TensorFlow warnings: this environment variable must be set before
+# the `tensorflow` package is imported
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow.keras.preprocessing.text
import tensorflow_hub as hub
import tensorflow_text
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
def build_embedding_model():
""" Build model by stacking up a preprocessing layer
diff --git a/requirements.txt b/requirements.txt
index e783187c..efea928e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,9 +15,8 @@ requests
rich
scikit-learn
srsly>=2.4.0
-tensorflow==2.5.0; python_version >= "3.8"
+tensorflow==2.5.1; python_version >= "3.8"
tensorflow==2.4.2; python_version < "3.8"
tensorflow-text==2.5.0; python_version >= "3.8"
tensorflow-text==2.4.2; python_version < "3.8"
tf-models-official
-tqdm
diff --git a/setup.py b/setup.py
index b28b29d2..d35368bb 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ def requirements():
setuptools.setup(
name='credentialdigger',
- version='4.0.5',
+ version='4.1.0',
author='SAP SE',
maintainer='Marco Rosa, Slim Trabelsi',
maintainer_email='marco.rosa@sap.com, slim.trabelsi@sap.com',
diff --git a/tests/functional_tests/test_get_discoveries_postgres.py b/tests/functional_tests/test_get_discoveries_postgres.py
new file mode 100644
index 00000000..150dddae
--- /dev/null
+++ b/tests/functional_tests/test_get_discoveries_postgres.py
@@ -0,0 +1,146 @@
+import os
+import random
+import tempfile
+import unittest
+
+import pandas as pd
+from credentialdigger.cli import cli
+from credentialdigger.client_postgres import PgClient
+from parameterized import param, parameterized
+
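+# Random hex string used as the name of the test repository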
+REPO_URL = ''.join(random.choice('0123456789ABCDEF') for i in range(16))
+
+
+class TestGetDiscoveries(unittest.TestCase):
+ dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env')
+
+ @classmethod
+ def setUpClass(cls):
+ # Add rules
+ cli.main(['', 'add_rules', '--dotenv', cls.dotenv,
+ 'tests/functional_tests/test_rules.yml'])
+
+ # Set CSV temporary export path
+ cls.tmp_path = tempfile.mkdtemp()
+ cls.csv_path = os.path.join(cls.tmp_path, 'test.csv')
+
+ # Set up Postgres client
+ client = PgClient(dbname=os.getenv('POSTGRES_DB'),
+ dbuser=os.getenv('POSTGRES_USER'),
+ dbpassword=os.getenv('POSTGRES_PASSWORD'),
+ dbhost=os.getenv('DBHOST'),
+ dbport=os.getenv('DBPORT'))
+
+ client.add_repo(REPO_URL)
+
+ # Insert fake discoveries
+ discoveries = []
+ discoveries_count = 5
+ for state in ['new', 'false_positive', 'addressing',
+ 'not_relevant', 'fixed']:
+ for i in range(discoveries_count):
+ discovery = {
+ 'file_name': 'danger' if state == 'new' else 'fake_file',
+ 'commit_id': '0xtmp_commit_id',
+ 'line_number': '1',
+ 'snippet': 'tmp_snippet',
+ 'rule_id': 1,
+ 'state': state,
+ 'timestamp': '2021-08-05T01:13',
+ }
+ discoveries.append(discovery)
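+            # Each subsequent state gets one more fake discovery (5, 6, 7, 8, 9)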
+ discoveries_count += 1
+ client.add_discoveries(discoveries, REPO_URL)
+ cls.client = client
+
+ @classmethod
+ def tearDownClass(cls):
+ """ Remove the repo and all its discoveries. """
+ cls.client.delete_repo(REPO_URL)
+ cls.client.delete_discoveries(REPO_URL)
+ os.remove(cls.csv_path)
+
+ @parameterized.expand([
+ param(state='new', count=5),
+ param(state='false_positive', count=6),
+ param(state='addressing', count=7),
+ param(state='not_relevant', count=8),
+ param(state='fixed', count=9)
+ ])
+ def test_get_discoveries(self, state, count):
+ """ Test if we retrieve the correct number of discoveries for every
+ possible state value.
+
+ Parameters
+ ----------
+ state: str
+ The state to filter discoveries on
+ count: int
+ The expected number of discoveries to be returned
+ """
+ with self.assertRaises(SystemExit) as cm:
+ cli.main(
+ [
+ '',
+ 'get_discoveries',
+ REPO_URL,
+ '--save',
+ self.csv_path,
+ '--state',
+ state,
+ '--dotenv',
+ self.dotenv
+ ]
+ )
+ self.assertEqual(cm.exception.code, count)
+
+ @parameterized.expand([
+ param(file='danger', count=5),
+ param(file='fake_file', count=30)
+ ])
+ def test_get_discoveries_per_file(self, file, count):
+ """ Test if we retrieve the correct number of discoveries based on
+ filename input.
+
+ Parameters
+ ----------
+ file: str
+ The file name to filter discoveries on
+ count: int
+ The expected number of discoveries to be returned
+ """
+ with self.assertRaises(SystemExit) as cm:
+ cli.main(
+ [
+ '',
+ 'get_discoveries',
+ REPO_URL,
+ '--save',
+ self.csv_path,
+ '--filename',
+ file,
+ '--dotenv',
+ self.dotenv
+ ]
+ )
+ self.assertEqual(cm.exception.code, count)
+
+ def test_csv_written(self):
+        """ Test if the CLI command correctly writes the CSV file. """
+ with self.assertRaises(SystemExit):
+ cli.main(
+ [
+ '',
+ 'get_discoveries',
+ REPO_URL,
+ '--save',
+ self.csv_path,
+ '--dotenv',
+ self.dotenv
+ ]
+ )
+ data_frame = pd.read_csv(self.csv_path)
+        self.assertTrue(data_frame.notna().values.all(),
+                        'CSV file contains NaN')
diff --git a/tests/functional_tests/test_get_discoveries_sqlite.py b/tests/functional_tests/test_get_discoveries_sqlite.py
new file mode 100644
index 00000000..b64e21a8
--- /dev/null
+++ b/tests/functional_tests/test_get_discoveries_sqlite.py
@@ -0,0 +1,136 @@
+import os
+import tempfile
+import unittest
+
+import pandas as pd
+from credentialdigger.cli import cli
+from credentialdigger.client_sqlite import SqliteClient
+from parameterized import param, parameterized
+
+
+class TestGetDiscoveries(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ """ Set up a database and insert fake discoveries into it for testing
+ purposes.
+ """
+ # Set up sqlite database and CSV export temporary paths
+ cls.tmp_path = tempfile.mkdtemp()
+ cls.db_path = os.path.join(cls.tmp_path, 'test_db.sqlite')
+ cls.csv_path = os.path.join(cls.tmp_path, 'test.csv')
+
+ # Set up sqlite database
+ client = SqliteClient(cls.db_path)
+ client.add_rules_from_file('tests/functional_tests/test_rules.yml')
+ client.add_repo('test_repo')
+
+ # Insert fake discoveries
+ discoveries = []
+ discoveries_count = 5
+ for state in ['new', 'false_positive', 'addressing',
+ 'not_relevant', 'fixed']:
+ for i in range(discoveries_count):
+ discovery = {
+ 'file_name': 'danger' if state == 'new' else 'fake_file',
+ 'commit_id': '0xtmp_commit_id',
+ 'line_number': '1',
+ 'snippet': 'tmp_snippet',
+ 'rule_id': 1,
+ 'state': state,
+ 'timestamp': '2021-08-05T01:13',
+ }
+ discoveries.append(discovery)
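+            # Each subsequent state gets one more fake discovery (5, 6, 7, 8, 9)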
+ discoveries_count += 1
+ client.add_discoveries(discoveries, 'test_repo')
+
+ @classmethod
+ def tearDownClass(cls):
+ """ Remove database and exported CSV after finishing tests. """
+ os.remove(cls.db_path)
+ os.remove(cls.csv_path)
+
+ @parameterized.expand([
+ param(state='new', count=5),
+ param(state='false_positive', count=6),
+ param(state='addressing', count=7),
+ param(state='not_relevant', count=8),
+ param(state='fixed', count=9)
+ ])
+ def test_get_discoveries(self, state, count):
+ """ Test if we retrieve the correct number of discoveries for every
+ possible state value.
+
+ Parameters
+ ----------
+ state: str
+ The state to filter discoveries on
+ count: int
+ The expected number of discoveries to be returned
+ """
+ with self.assertRaises(SystemExit) as cm:
+ cli.main(
+ [
+ '',
+ 'get_discoveries',
+ 'test_repo',
+ '--sqlite',
+ self.db_path,
+ '--save',
+ self.csv_path,
+ '--state',
+ state
+ ]
+ )
+ self.assertEqual(cm.exception.code, count)
+
+ @parameterized.expand([
+ param(file='danger', count=5),
+ param(file='fake_file', count=30)
+ ])
+ def test_get_discoveries_per_file(self, file, count):
+ """ Test if we retrieve the correct number of discoveries based on
+ filename input.
+
+ Parameters
+ ----------
+ file: str
+ The file name to filter discoveries on
+ count: int
+ The expected number of discoveries to be returned
+ """
+ with self.assertRaises(SystemExit) as cm:
+ cli.main(
+ [
+ '',
+ 'get_discoveries',
+ 'test_repo',
+ '--sqlite',
+ self.db_path,
+ '--save',
+ self.csv_path,
+ '--filename',
+ file
+ ]
+ )
+ self.assertEqual(cm.exception.code, count)
+
+ def test_csv_written(self):
+        """ Test if the CLI command correctly writes the CSV file. """
+ with self.assertRaises(SystemExit):
+ cli.main(
+ [
+ '',
+ 'get_discoveries',
+ 'test_repo',
+ '--sqlite',
+ self.db_path,
+ '--save',
+ self.csv_path
+ ]
+ )
+ data_frame = pd.read_csv(self.csv_path)
+        self.assertTrue(data_frame.notna().values.all(),
+                        'CSV file contains NaN')
diff --git a/ui/Dockerfile b/ui/Dockerfile
index 3db8ad68..0ecde17b 100644
--- a/ui/Dockerfile
+++ b/ui/Dockerfile
@@ -5,6 +5,8 @@ RUN apt-get update && apt-get install -y libhyperscan5 libpq-dev gunicorn3
# Don't verify ssl for github enterprise
RUN git config --global http.sslverify false
+# Disable automatic CRLF conversion (needed when the host is Docker for Windows)
+RUN git config --global core.autocrlf false
# Install Credential Digger
RUN pip install credentialdigger
@@ -19,4 +21,4 @@ ARG SSL_private_key
COPY . /credential-digger-ui
WORKDIR /credential-digger-ui/
RUN chmod +x run.sh
-CMD [ "./run.sh" ]
\ No newline at end of file
+CMD [ "./run.sh" ]
diff --git a/ui/backend/client_ui.py b/ui/backend/client_ui.py
index 5051be8b..c2a39ea9 100644
--- a/ui/backend/client_ui.py
+++ b/ui/backend/client_ui.py
@@ -40,18 +40,39 @@ def get_discoveries_count(self, query, params):
def get_all_discoveries_count(self):
""" Get the repositories together with their total number of
- discoveries and the number of "new" ones.
+        discoveries and their per-state counts.
Returns
-------
list
- A list of tuples containing (repo_url, total disc, new disc)
+            A list of tuples containing (repo_url, total, true_positive,
+            false_positive, addressing, not_relevant, fixed)
"""
query = '''SELECT repo_url,
- COUNT(*) as total,
- sum(case when STATE='new' then 1 else 0 end) as tp
- FROM discoveries
- GROUP BY repo_url;'''
+ COUNT(*) AS total,
+ sum(CASE
+ WHEN STATE='new' THEN 1
+ ELSE 0
+ END) AS true_positive,
+ sum(CASE
+ WHEN STATE='false_positive' THEN 1
+ ELSE 0
+ END) AS false_positive,
+ sum(CASE
+ WHEN STATE='addressing' THEN 1
+ ELSE 0
+ END) AS addressing,
+ sum(CASE
+ WHEN STATE='not_relevant' THEN 1
+ ELSE 0
+ END) AS not_relevant,
+ sum(CASE
+ WHEN STATE='fixed' THEN 1
+ ELSE 0
+ END) AS fixed
+ FROM discoveries
+ GROUP BY repo_url;'''
cursor = self.db.cursor()
cursor.execute(query)
result = cursor.fetchall()
diff --git a/ui/res/js/exportForm.js b/ui/res/js/exportForm.js
new file mode 100644
index 00000000..73d5bc4b
--- /dev/null
+++ b/ui/res/js/exportForm.js
@@ -0,0 +1,37 @@
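+/*
+  Set every export checkbox to the opposite of the "select all" checkbox (#cbAll),
+  skipping entries whose counter shows "(0)"
+*/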
+function changeAll() {
+ let checkBoxes = document.getElementsByName("check");
+ for (let i = 0; i < checkBoxes.length; i++) {
+ const cb = checkBoxes[i];
+        const a_field_id = cb.nextSibling.nextSibling.id;
+        const a_field_value = document.getElementById(a_field_id).innerHTML;
+        if (a_field_value !== "(0)") cb.checked = !$("#cbAll").prop("checked");
+ }
+}
+
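+/*
+  Uncheck the "select all" checkbox as soon as any individual checkbox is checked
+*/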
+function alternateCheckBoxes() {
+ let checkBoxes = document.getElementsByName("check");
+ for (let i = 0; i < checkBoxes.length; i++) {
+ let cb = checkBoxes[i];
+ if (cb.checked) {
+ $("#cbAll").prop("checked", false);
+ return;
+ }
+ }
+}
+
+/*
+ Make sure at least one checkbox is checked
+*/
+function validateCheckBoxes() {
+ let checkBoxes = document.getElementsByName("check");
+ for (let i = 0; i < checkBoxes.length; i++) {
+ let cb = checkBoxes[i];
+ if (cb.checked) {
+ document.getElementById("error_msg").hidden = true;
+ return true;
+ }
+ }
+    const checkbox_All = $("#cbAll").prop("checked");
+ document.getElementById("error_msg").hidden = checkbox_All;
+ return checkbox_All;
+}
diff --git a/ui/res/js/repos.js b/ui/res/js/repos.js
index dbb50a86..e51eeb28 100755
--- a/ui/res/js/repos.js
+++ b/ui/res/js/repos.js
@@ -11,6 +11,7 @@ document.addEventListener("DOMContentLoaded", function () {
initDeleteRepo();
initModals();
initAlternanteScanRescan();
+ initExportCSV();
});
/**
@@ -127,6 +128,26 @@ function initReposDataTable() {
item.scan_active ? `Scanning...` : `Rescan`
}
+