diff --git a/THIRD-PARTY-NOTICES b/THIRD-PARTY-NOTICES
index 723ff579..4d1f9d46 100644
--- a/THIRD-PARTY-NOTICES
+++ b/THIRD-PARTY-NOTICES
@@ -109,11 +109,6 @@
 Licensor: ExplosionAI UG
 Website: https://github.com/explosion/srsly
 License: MIT License
 
-Component: tqdm
-Licensor: Casper da Costa-Luis, Google Inc., Noam Yorav-Raphael, and contributors
-Website: https://github.com/tqdm/tqdm
-License: MIT License and MPLv2.0
-
 -------------------------------------------------------------------------------------------------------------------------
diff --git a/credentialdigger/cli/get_discoveries.py b/credentialdigger/cli/get_discoveries.py
index 70cd3e75..f46e066a 100644
--- a/credentialdigger/cli/get_discoveries.py
+++ b/credentialdigger/cli/get_discoveries.py
@@ -133,13 +133,12 @@ def discoveries_to_csv(discoveries):
         csv_writer.writeheader()
         csv_writer.writerows(discoveries)
         csv_data = stringIO.getvalue()
+        return csv_data
     except IndexError as error:
         logger.error(error)
     except Exception as exception:
         logger.exception(exception)
-    return csv_data
-
 
 
 def export_csv(discoveries, client, save=False):
     """ Export discoveries as a CSV file.
diff --git a/credentialdigger/client.py b/credentialdigger/client.py
index 50bfcb7b..c375bed8 100644
--- a/credentialdigger/client.py
+++ b/credentialdigger/client.py
@@ -8,13 +8,13 @@
 import yaml
 from github import Github
-from tqdm import tqdm
+from rich.progress import Progress
 
 from .generator import ExtractorGenerator
 from .models.model_manager import ModelManager
 from .scanners.file_scanner import FileScanner
-from .scanners.git_scanner import GitScanner
 from .scanners.git_file_scanner import GitFileScanner
+from .scanners.git_scanner import GitScanner
 from .snippet_similarity import (build_embedding_model, compute_similarity,
                                  compute_snippet_embedding)
@@ -1144,14 +1144,17 @@ def _scan(self, repo_url, scanner, models=None, force=False, debug=False,
         discoveries_ids = list()
         if debug:
             logger.debug('Update database with these discoveries.')
-            for i in tqdm(range(len(new_discoveries))):
-                curr_d = new_discoveries[i]
-                new_id = self.add_discovery(
-                    curr_d['file_name'], curr_d['commit_id'],
-                    curr_d['line_number'], curr_d['snippet'], repo_url,
-                    curr_d['rule_id'], curr_d['state'])
-                if new_id != -1 and curr_d['state'] != 'false_positive':
-                    discoveries_ids.append(new_id)
+            with Progress() as progress:
+                inserting_task = progress.add_task('Inserting discoveries...',
+                                                   total=len(new_discoveries))
+                for curr_d in new_discoveries:
+                    new_id = self.add_discovery(
+                        curr_d['file_name'], curr_d['commit_id'],
+                        curr_d['line_number'], curr_d['snippet'], repo_url,
+                        curr_d['rule_id'], curr_d['state'])
+                    if new_id != -1 and curr_d['state'] != 'false_positive':
+                        discoveries_ids.append(new_id)
+                    progress.update(inserting_task, advance=1)
             logger.debug(f'{len(discoveries_ids)} discoveries left for manual '
                          'review.')
         else:
@@ -1191,11 +1194,13 @@ def _analyze_discovery(d):
 
         if debug:
             model_name = model_manager.model.__class__.__name__
             logger.debug(f'Analyzing discoveries with model {model_name}')
-            false_positives = 0
-            for i in tqdm(range(len(discoveries))):
-                false_positives += _analyze_discovery(discoveries[i])
-
+            with Progress() as progress:
+                scanning_task = progress.add_task('Scanning discoveries...',
+                                                  total=len(discoveries))
+                for curr_discovery in discoveries:
+                    false_positives += _analyze_discovery(curr_discovery)
+                    progress.update(scanning_task, advance=1)
             logger.debug(f'Model {model_name} classified {false_positives} '
                          'discoveries.')
         else:
diff --git a/credentialdigger/generator/generator.py b/credentialdigger/generator/generator.py
index 5f0fa2da..c519e31f 100644
--- a/credentialdigger/generator/generator.py
+++ b/credentialdigger/generator/generator.py
@@ -11,7 +11,7 @@
 import pkg_resources
 import string_utils
 from git import Repo as GitRepo
-from tqdm import tqdm
+from rich.progress import Progress
 
 from .qlearning import compute_dataset
 from .training import create_snippet_model
@@ -280,10 +280,13 @@ def _pre_process(raw_data):
         data_list = []
         # Preprocess the dataset with naming convention, etc.
-        for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
-            row_data = {}
-            for column in ['text', 'key', 'value']:
-                row_data[column] = _pre_process(row[column])
-            data_list.append(row_data)
-
+        with Progress() as progress:
+            preprocess_task = progress.add_task('Pre-processing dataset...',
+                                                total=data.shape[0])
+            for idx, row in data.iterrows():
+                row_data = {}
+                for column in ['text', 'key', 'value']:
+                    row_data[column] = _pre_process(row[column])
+                data_list.append(row_data)
+                progress.update(preprocess_task, advance=1)
 
         return pd.DataFrame(data=data_list)
diff --git a/credentialdigger/generator/qlearning.py b/credentialdigger/generator/qlearning.py
index 109792c9..5b2c62a4 100644
--- a/credentialdigger/generator/qlearning.py
+++ b/credentialdigger/generator/qlearning.py
@@ -2,8 +2,8 @@
 import random
 
 import numpy as np
+from rich.progress import Progress
 from sklearn.metrics.pairwise import cosine_similarity
-from tqdm import tqdm
 
 from .stylometry import compute_vector, word_unigram_tf
 from .transform import (build_dummy_dict, choose_applicable_transformation,
@@ -71,26 +71,32 @@ def compute_dataset(corpus, actions_n, states_n, alpha, gamma, epochs_basis=50,
 
     dataset = []
 
-    # Apply Q-learning for each pattern
-    for pattern_index in tqdm(range(len(all_patterns))):
-        # Select a random extract and remove it from the corpus
-        reference_extract = corpus.pop(random.randrange(len(corpus)))
-        # Cut extracts too long
-        reference_extract = reference_extract[:extract_max_length]
-
-        # Increase epochs for more complex patterns
-        epochs = int(epochs_basis * (1 + (pattern_index / len(all_patterns))))
-        # Update epochs in args
-        args['epochs'] = epochs
-
-        # Compute the optimal modifications to the basic patterns
-        final_transformation, modification_dict = _optimal_transformation(
-            reference_extract, all_patterns[pattern_index], args)
-
-        # Generate the dataset, with optimal transformations
-        for i in range(epochs):
-            dataset += generate_data(all_patterns[pattern_index],
-                                     modification_dict)
+    with Progress() as progress:
+        patterns_count = len(all_patterns)
+        qlearn_task = progress.add_task('Apply Q-learning to patterns...',
+                                        total=patterns_count)
+        # Apply Q-learning for each pattern
+        for pattern_index in range(patterns_count):
+            # Select a random extract and remove it from the corpus
+            reference_extract = corpus.pop(random.randrange(len(corpus)))
+            # Cut extracts too long
+            reference_extract = reference_extract[:extract_max_length]
+
+            # Increase epochs for more complex patterns
+            epochs = int(epochs_basis *
+                         (1 + (pattern_index / patterns_count)))
+            # Update epochs in args
+            args['epochs'] = epochs
+
+            # Compute the optimal modifications to the basic patterns
+            final_transformation, modification_dict = _optimal_transformation(
+                reference_extract, all_patterns[pattern_index], args)
+
+            # Generate the dataset, with optimal transformations
+            for i in range(epochs):
+                dataset += generate_data(all_patterns[pattern_index],
+                                         modification_dict)
+            progress.update(qlearn_task, advance=1)
 
     return dataset
diff --git a/credentialdigger/snippet_similarity.py b/credentialdigger/snippet_similarity.py
index be12f686..e02ef931 100644
--- a/credentialdigger/snippet_similarity.py
+++ b/credentialdigger/snippet_similarity.py
@@ -2,13 +2,14 @@
 import os
 
 import numpy as np
+# In order not to raise tensorflow warnings, we need to set this environment
+# variable before importing the `tensorflow` package
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import tensorflow as tf
 import tensorflow.keras.preprocessing.text
 import tensorflow_hub as hub
 import tensorflow_text
 
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
 
 def build_embedding_model():
     """ Build model by stacking up a preprocessing layer
diff --git a/requirements.txt b/requirements.txt
index e783187c..efea928e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,9 +15,8 @@ requests
 rich
 scikit-learn
 srsly>=2.4.0
-tensorflow==2.5.0; python_version >= "3.8"
+tensorflow==2.5.1; python_version >= "3.8"
 tensorflow==2.4.2; python_version < "3.8"
 tensorflow-text==2.5.0; python_version >= "3.8"
 tensorflow-text==2.4.2; python_version < "3.8"
-tf-models-official
-tqdm
+tf-models-official
\ No newline at end of file
diff --git a/setup.py b/setup.py
index b28b29d2..d35368bb 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ def requirements():
 
 setuptools.setup(
     name='credentialdigger',
-    version='4.0.5',
+    version='4.1.0',
     author='SAP SE',
     maintainer='Marco Rosa, Slim Trabelsi',
     maintainer_email='marco.rosa@sap.com, slim.trabelsi@sap.com',
diff --git a/tests/functional_tests/test_get_discoveries_postgres.py b/tests/functional_tests/test_get_discoveries_postgres.py
new file mode 100644
index 00000000..150dddae
--- /dev/null
+++ b/tests/functional_tests/test_get_discoveries_postgres.py
@@ -0,0 +1,146 @@
+import os
+import random
+import tempfile
+import unittest
+
+import pandas as pd
+from credentialdigger.cli import cli
+from credentialdigger.client_postgres import PgClient
+from parameterized import param, parameterized
+
+REPO_URL = ''.join(random.choice('0123456789ABCDEF') for i in range(16))
+
+
+class TestGetDiscoveries(unittest.TestCase):
+    dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env')
+
+    @classmethod
+    def setUpClass(cls):
+        # Add rules
+        cli.main(['', 'add_rules', '--dotenv', cls.dotenv,
+                  'tests/functional_tests/test_rules.yml'])
+
+        # Set CSV temporary export path
+        cls.tmp_path = tempfile.mkdtemp()
+        cls.csv_path = os.path.join(cls.tmp_path, 'test.csv')
+
+        # Set up Postgres client
+        client = PgClient(dbname=os.getenv('POSTGRES_DB'),
+                          dbuser=os.getenv('POSTGRES_USER'),
+                          dbpassword=os.getenv('POSTGRES_PASSWORD'),
+                          dbhost=os.getenv('DBHOST'),
+                          dbport=os.getenv('DBPORT'))
+
+        client.add_repo(REPO_URL)
+
+        # Insert fake discoveries
+        discoveries = []
+        discoveries_count = 5
+        for state in ['new', 'false_positive', 'addressing',
+                      'not_relevant', 'fixed']:
+            for i in range(discoveries_count):
+                discovery = {
+                    'file_name': 'danger' if state == 'new' else 'fake_file',
+                    'commit_id': '0xtmp_commit_id',
+                    'line_number': '1',
+                    'snippet': 'tmp_snippet',
+                    'rule_id': 1,
+                    'state': state,
+                    'timestamp': '2021-08-05T01:13',
+                }
+                discoveries.append(discovery)
+            discoveries_count += 1
+        client.add_discoveries(discoveries, REPO_URL)
+        cls.client = client
+
+    @classmethod
+    def tearDownClass(cls):
+        """ Remove the repo and all its discoveries.
+        """
+        cls.client.delete_repo(REPO_URL)
+        cls.client.delete_discoveries(REPO_URL)
+        os.remove(cls.csv_path)
+
+    @parameterized.expand([
+        param(state='new', count=5),
+        param(state='false_positive', count=6),
+        param(state='addressing', count=7),
+        param(state='not_relevant', count=8),
+        param(state='fixed', count=9)
+    ])
+    def test_get_discoveries(self, state, count):
+        """ Test if we retrieve the correct number of discoveries for every
+        possible state value.
+
+        Parameters
+        ----------
+        state: str
+            The state to filter discoveries on
+        count: int
+            The expected number of discoveries to be returned
+        """
+        with self.assertRaises(SystemExit) as cm:
+            cli.main(
+                [
+                    '',
+                    'get_discoveries',
+                    REPO_URL,
+                    '--save',
+                    self.csv_path,
+                    '--state',
+                    state,
+                    '--dotenv',
+                    self.dotenv
+                ]
+            )
+        self.assertEqual(cm.exception.code, count)
+
+    @parameterized.expand([
+        param(file='danger', count=5),
+        param(file='fake_file', count=30)
+    ])
+    def test_get_discoveries_per_file(self, file, count):
+        """ Test if we retrieve the correct number of discoveries based on
+        filename input.
+
+        Parameters
+        ----------
+        file: str
+            The file name to filter discoveries on
+        count: int
+            The expected number of discoveries to be returned
+        """
+        with self.assertRaises(SystemExit) as cm:
+            cli.main(
+                [
+                    '',
+                    'get_discoveries',
+                    REPO_URL,
+                    '--save',
+                    self.csv_path,
+                    '--filename',
+                    file,
+                    '--dotenv',
+                    self.dotenv
+                ]
+            )
+        self.assertEqual(cm.exception.code, count)
+
+    def test_csv_written(self):
+        """ Test if the CLI command writes correctly the CSV file. """
+        with self.assertRaises(SystemExit):
+            cli.main(
+                [
+                    '',
+                    'get_discoveries',
+                    REPO_URL,
+                    '--save',
+                    self.csv_path,
+                    '--dotenv',
+                    self.dotenv
+                ]
+            )
+        data_frame = pd.read_csv(self.csv_path)
+        try:
+            assert data_frame.notna().values.all()
+        except AssertionError:
+            assert False, 'CSV file contains NaN'
diff --git a/tests/functional_tests/test_get_discoveries_sqlite.py b/tests/functional_tests/test_get_discoveries_sqlite.py
new file mode 100644
index 00000000..b64e21a8
--- /dev/null
+++ b/tests/functional_tests/test_get_discoveries_sqlite.py
@@ -0,0 +1,136 @@
+import os
+import tempfile
+import unittest
+
+import pandas as pd
+from credentialdigger.cli import cli
+from credentialdigger.client_sqlite import SqliteClient
+from parameterized import param, parameterized
+
+
+class TestGetDiscoveries(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        """ Set up a database and insert fake discoveries into it for testing
+        purposes.
+        """
+        # Set up sqlite database and CSV export temporary paths
+        cls.tmp_path = tempfile.mkdtemp()
+        cls.db_path = os.path.join(cls.tmp_path, 'test_db.sqlite')
+        cls.csv_path = os.path.join(cls.tmp_path, 'test.csv')
+
+        # Set up sqlite database
+        client = SqliteClient(cls.db_path)
+        client.add_rules_from_file('tests/functional_tests/test_rules.yml')
+        client.add_repo('test_repo')
+
+        # Insert fake discoveries
+        discoveries = []
+        discoveries_count = 5
+        for state in ['new', 'false_positive', 'addressing',
+                      'not_relevant', 'fixed']:
+            for i in range(discoveries_count):
+                discovery = {
+                    'file_name': 'danger' if state == 'new' else 'fake_file',
+                    'commit_id': '0xtmp_commit_id',
+                    'line_number': '1',
+                    'snippet': 'tmp_snippet',
+                    'rule_id': 1,
+                    'state': state,
+                    'timestamp': '2021-08-05T01:13',
+                }
+                discoveries.append(discovery)
+            discoveries_count += 1
+        client.add_discoveries(discoveries, 'test_repo')
+
+    @classmethod
+    def tearDownClass(cls):
+        """ Remove database and exported CSV after finishing tests.
+        """
+        os.remove(cls.db_path)
+        os.remove(cls.csv_path)
+
+    @parameterized.expand([
+        param(state='new', count=5),
+        param(state='false_positive', count=6),
+        param(state='addressing', count=7),
+        param(state='not_relevant', count=8),
+        param(state='fixed', count=9)
+    ])
+    def test_get_discoveries(self, state, count):
+        """ Test if we retrieve the correct number of discoveries for every
+        possible state value.
+
+        Parameters
+        ----------
+        state: str
+            The state to filter discoveries on
+        count: int
+            The expected number of discoveries to be returned
+        """
+        with self.assertRaises(SystemExit) as cm:
+            cli.main(
+                [
+                    '',
+                    'get_discoveries',
+                    'test_repo',
+                    '--sqlite',
+                    self.db_path,
+                    '--save',
+                    self.csv_path,
+                    '--state',
+                    state
+                ]
+            )
+        self.assertEqual(cm.exception.code, count)
+
+    @parameterized.expand([
+        param(file='danger', count=5),
+        param(file='fake_file', count=30)
+    ])
+    def test_get_discoveries_per_file(self, file, count):
+        """ Test if we retrieve the correct number of discoveries based on
+        filename input.
+
+        Parameters
+        ----------
+        file: str
+            The file name to filter discoveries on
+        count: int
+            The expected number of discoveries to be returned
+        """
+        with self.assertRaises(SystemExit) as cm:
+            cli.main(
+                [
+                    '',
+                    'get_discoveries',
+                    'test_repo',
+                    '--sqlite',
+                    self.db_path,
+                    '--save',
+                    self.csv_path,
+                    '--filename',
+                    file
+                ]
+            )
+        self.assertEqual(cm.exception.code, count)
+
+    def test_csv_written(self):
+        """ Test if the CLI command writes correctly the CSV file. """
+        with self.assertRaises(SystemExit):
+            cli.main(
+                [
+                    '',
+                    'get_discoveries',
+                    'test_repo',
+                    '--sqlite',
+                    self.db_path,
+                    '--save',
+                    self.csv_path
+                ]
+            )
+        data_frame = pd.read_csv(self.csv_path)
+        try:
+            assert data_frame.notna().values.all()
+        except AssertionError:
+            assert False, 'CSV file contains NaN'
diff --git a/ui/Dockerfile b/ui/Dockerfile
index 3db8ad68..0ecde17b 100644
--- a/ui/Dockerfile
+++ b/ui/Dockerfile
@@ -5,6 +5,8 @@ RUN apt-get update && apt-get install -y libhyperscan5 libpq-dev gunicorn3
 
 # Don't verify ssl for github enterprise
 RUN git config --global http.sslverify false
+# Docker Windows support
+RUN git config --global core.autocrlf false
 
 # Install Credential Digger
 RUN pip install credentialdigger
@@ -19,4 +21,4 @@ ARG SSL_private_key
 COPY . /credential-digger-ui
 WORKDIR /credential-digger-ui/
 RUN chmod +x run.sh
-CMD [ "./run.sh" ]
\ No newline at end of file
+CMD [ "./run.sh" ]
diff --git a/ui/backend/client_ui.py b/ui/backend/client_ui.py
index 5051be8b..c2a39ea9 100644
--- a/ui/backend/client_ui.py
+++ b/ui/backend/client_ui.py
@@ -40,18 +40,39 @@ def get_discoveries_count(self, query, params):
 
     def get_all_discoveries_count(self):
         """ Get the repositories together with their total number of
-        discoveries and the number of "new" ones.
+        discoveries.
 
        Returns
        -------
        list
-            A list of tuples containing (repo_url, total disc, new disc)
+            A list of tuples containing (repo_url, total discoveries, true
+            positives, false positives,
+            addressing, not_relevant, fixed)
        """
        query = '''SELECT repo_url,
-                   COUNT(*) as total,
-                   sum(case when STATE='new' then 1 else 0 end) as tp
-                   FROM discoveries
-                   GROUP BY repo_url;'''
+                   COUNT(*) AS total,
+                   sum(CASE
+                           WHEN STATE='new' THEN 1
+                           ELSE 0
+                       END) AS true_positive,
+                   sum(CASE
+                           WHEN STATE='false_positive' THEN 1
+                           ELSE 0
+                       END) AS false_positive,
+                   sum(CASE
+                           WHEN STATE='addressing' THEN 1
+                           ELSE 0
+                       END) AS addressing,
+                   sum(CASE
+                           WHEN STATE='not_relevant' THEN 1
+                           ELSE 0
+                       END) AS not_relevant,
+                   sum(CASE
+                           WHEN STATE='fixed' THEN 1
+                           ELSE 0
+                       END) AS fixed
+                   FROM discoveries
+                   GROUP BY repo_url;'''
        cursor = self.db.cursor()
        cursor.execute(query)
        result = cursor.fetchall()
diff --git a/ui/res/js/exportForm.js b/ui/res/js/exportForm.js
new file mode 100644
index 00000000..73d5bc4b
--- /dev/null
+++ b/ui/res/js/exportForm.js
@@ -0,0 +1,37 @@
+function changeAll() {
+  let checkBoxes = document.getElementsByName("check");
+  for (let i = 0; i < checkBoxes.length; i++) {
+    const cb = checkBoxes[i];
+    a_field_id = cb.nextSibling.nextSibling.id;
+    a_field_value = document.getElementById(a_field_id).innerHTML;
+    if (a_field_value != "(0)") cb.checked = !$("#cbAll").prop("checked");
+  }
+}
+
+function alternateCheckBoxes() {
+  let checkBoxes = document.getElementsByName("check");
+  for (let i = 0; i < checkBoxes.length; i++) {
+    let cb = checkBoxes[i];
+    if (cb.checked) {
+      $("#cbAll").prop("checked", false);
+      return;
+    }
+  }
+}
+
+/*
+  Make sure at least one checkbox is checked
+*/
+function validateCheckBoxes() {
+  let checkBoxes = document.getElementsByName("check");
+  for (let i = 0; i < checkBoxes.length; i++) {
+    let cb = checkBoxes[i];
+    if (cb.checked) {
+      document.getElementById("error_msg").hidden = true;
+      return true;
+    }
+  }
+  checkbox_All = $("#cbAll").prop("checked");
+  document.getElementById("error_msg").hidden = checkbox_All;
+  return checkbox_All;
+}
diff --git a/ui/res/js/repos.js b/ui/res/js/repos.js
index dbb50a86..e51eeb28 100755
--- a/ui/res/js/repos.js
+++ b/ui/res/js/repos.js
@@ -11,6 +11,7 @@ document.addEventListener("DOMContentLoaded", function () {
   initDeleteRepo();
   initModals();
   initAlternanteScanRescan();
+  initExportCSV();
});
 
/**
@@ -127,6 +128,26 @@ function initReposDataTable() {
           item.scan_active ? `Scanning...` : `Rescan`
         }
+
+
+
+
+
+