Merge pull request #194 from SAP/develop
release v4.1
marcorosa authored Aug 31, 2021
2 parents 0177635 + 7176b70 commit 47aac27
Showing 19 changed files with 635 additions and 72 deletions.
5 changes: 0 additions & 5 deletions THIRD-PARTY-NOTICES
@@ -109,11 +109,6 @@ Licensor: ExplosionAI UG
Website: https://github.com/explosion/srsly
License: MIT License

Component: tqdm
Licensor: Casper da Costa-Luis, Google Inc., Noam Yorav-Raphael, and contributors
Website: https://github.com/tqdm/tqdm
License: MIT License and MPLv2.0



--------------------------------------------------------------------------------------------------------------------------
3 changes: 1 addition & 2 deletions credentialdigger/cli/get_discoveries.py
@@ -133,13 +133,12 @@ def discoveries_to_csv(discoveries):
csv_writer.writeheader()
csv_writer.writerows(discoveries)
csv_data = stringIO.getvalue()
return csv_data
except IndexError as error:
logger.error(error)
except Exception as exception:
logger.exception(exception)

return csv_data


def export_csv(discoveries, client, save=False):
""" Export discoveries as a CSV file.
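The hunk above moves `return csv_data` inside the `try` block of `discoveries_to_csv`, so a failed export no longer falls through to a trailing `return csv_data` that can reference the variable before it is assigned. A minimal sketch of the corrected flow; the writer setup sits above the hunk and is filled in here as an assumption:

import csv
import io
import logging

logger = logging.getLogger(__name__)


def discoveries_to_csv(discoveries):
    try:
        # Assumed setup: the real code builds the writer above the hunk shown.
        string_io = io.StringIO()
        csv_writer = csv.DictWriter(string_io, fieldnames=discoveries[0].keys())
        csv_writer.writeheader()
        csv_writer.writerows(discoveries)
        csv_data = string_io.getvalue()
        # Returning inside the try: on failure the function falls through to
        # the handlers below instead of reaching a later `return csv_data`
        # with a possibly unassigned variable.
        return csv_data
    except IndexError as error:
        # e.g., an empty discoveries list
        logger.error(error)
    except Exception as exception:
        logger.exception(exception)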
33 changes: 19 additions & 14 deletions credentialdigger/client.py
@@ -8,13 +8,13 @@

import yaml
from github import Github
from tqdm import tqdm
from rich.progress import Progress

from .generator import ExtractorGenerator
from .models.model_manager import ModelManager
from .scanners.file_scanner import FileScanner
from .scanners.git_scanner import GitScanner
from .scanners.git_file_scanner import GitFileScanner
from .scanners.git_scanner import GitScanner
from .snippet_similarity import (build_embedding_model, compute_similarity,
compute_snippet_embedding)

@@ -1144,14 +1144,17 @@ def _scan(self, repo_url, scanner, models=None, force=False, debug=False,
discoveries_ids = list()
if debug:
logger.debug('Update database with these discoveries.')
for i in tqdm(range(len(new_discoveries))):
curr_d = new_discoveries[i]
new_id = self.add_discovery(
curr_d['file_name'], curr_d['commit_id'],
curr_d['line_number'], curr_d['snippet'], repo_url,
curr_d['rule_id'], curr_d['state'])
if new_id != -1 and curr_d['state'] != 'false_positive':
discoveries_ids.append(new_id)
with Progress() as progress:
inserting_task = progress.add_task('Inserting discoveries...',
total=len(new_discoveries))
for curr_d in new_discoveries:
new_id = self.add_discovery(
curr_d['file_name'], curr_d['commit_id'],
curr_d['line_number'], curr_d['snippet'], repo_url,
curr_d['rule_id'], curr_d['state'])
if new_id != -1 and curr_d['state'] != 'false_positive':
discoveries_ids.append(new_id)
progress.update(inserting_task, advance=1)
logger.debug(f'{len(discoveries_ids)} discoveries left for manual '
'review.')
else:
@@ -1191,11 +1194,13 @@ def _analyze_discovery(d):
if debug:
model_name = model_manager.model.__class__.__name__
logger.debug(f'Analyzing discoveries with model {model_name}')

false_positives = 0
for i in tqdm(range(len(discoveries))):
false_positives += _analyze_discovery(discoveries[i])

with Progress() as progress:
scanning_task = progress.add_task('Scanning discoveries...',
total=len(discoveries))
for curr_discovery in discoveries:
false_positives += _analyze_discovery(curr_discovery)
progress.update(scanning_task, advance=1)
logger.debug(f'Model {model_name} classified {false_positives} '
'discoveries.')
else:
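The same migration pattern recurs in generator.py and qlearning.py below: the tqdm-wrapped range is replaced by an explicit rich task that is advanced once per processed item. A minimal, generic sketch of that pattern; the item list and task description are placeholders, not code from this commit:

from rich.progress import Progress

items = ['a', 'b', 'c']  # stand-in for new_discoveries / dataset rows / patterns

# tqdm wrapped the iterable directly; rich.progress uses an explicit task
# with a known total, advanced by one after each item is handled.
with Progress() as progress:
    task = progress.add_task('Processing items...', total=len(items))
    for item in items:
        ...  # per-item work (add_discovery, pre-processing, Q-learning, etc.)
        progress.update(task, advance=1)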
17 changes: 10 additions & 7 deletions credentialdigger/generator/generator.py
@@ -11,7 +11,7 @@
import pkg_resources
import string_utils
from git import Repo as GitRepo
from tqdm import tqdm
from rich.progress import Progress

from .qlearning import compute_dataset
from .training import create_snippet_model
@@ -280,10 +280,13 @@ def _pre_process(raw_data):

data_list = []
# Preprocess the dataset with naming convention, etc.
for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
row_data = {}
for column in ['text', 'key', 'value']:
row_data[column] = _pre_process(row[column])
data_list.append(row_data)

with Progress() as progress:
preprocess_task = progress.add_task('Pre-processing dataset...',
total=data.shape[0])
for idx, row in data.iterrows():
row_data = {}
for column in ['text', 'key', 'value']:
row_data[column] = _pre_process(row[column])
data_list.append(row_data)
progress.update(preprocess_task, advance=1)
return pd.DataFrame(data=data_list)
48 changes: 27 additions & 21 deletions credentialdigger/generator/qlearning.py
@@ -2,8 +2,8 @@
import random

import numpy as np
from rich.progress import Progress
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from .stylometry import compute_vector, word_unigram_tf
from .transform import (build_dummy_dict, choose_applicable_transformation,
@@ -71,26 +71,32 @@ def compute_dataset(corpus, actions_n, states_n, alpha, gamma, epochs_basis=50,

dataset = []

# Apply Q-learning for each pattern
for pattern_index in tqdm(range(len(all_patterns))):
# Select a random extract and remove it from the corpus
reference_extract = corpus.pop(random.randrange(len(corpus)))
# Cut extracts too long
reference_extract = reference_extract[:extract_max_length]

# Increase epochs for more complex patterns
epochs = int(epochs_basis * (1 + (pattern_index / len(all_patterns))))
# Update epochs in args
args['epochs'] = epochs

# Compute the optimal modifications to the basic patterns
final_transformation, modification_dict = _optimal_transformation(
reference_extract, all_patterns[pattern_index], args)

# Generate the dataset, with optimal transformations
for i in range(epochs):
dataset += generate_data(all_patterns[pattern_index],
modification_dict)
with Progress() as progress:
patterns_count = len(all_patterns)
qlearn_task = progress.add_task('Apply Q-learning to patterns...',
total=patterns_count)
# Apply Q-learning for each pattern
for pattern_index in range(patterns_count):
# Select a random extract and remove it from the corpus
reference_extract = corpus.pop(random.randrange(len(corpus)))
# Cut extracts too long
reference_extract = reference_extract[:extract_max_length]

# Increase epochs for more complex patterns
epochs = int(epochs_basis *
(1 + (pattern_index / patterns_count)))
# Update epochs in args
args['epochs'] = epochs

# Compute the optimal modifications to the basic patterns
final_transformation, modification_dict = _optimal_transformation(
reference_extract, all_patterns[pattern_index], args)

# Generate the dataset, with optimal transformations
for i in range(epochs):
dataset += generate_data(all_patterns[pattern_index],
modification_dict)
progress.update(qlearn_task, advance=1)
return dataset


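The per-pattern epoch count in the hunk above grows linearly with the pattern index, from epochs_basis for the first pattern to just under twice that for the last. An illustrative check, using the default epochs_basis=50 from the signature above and an assumed patterns_count of 100:

# Illustrative values only; patterns_count is an assumption for this example.
epochs_basis = 50
patterns_count = 100

for pattern_index in (0, 50, 99):
    epochs = int(epochs_basis * (1 + pattern_index / patterns_count))
    print(pattern_index, epochs)  # 0 -> 50, 50 -> 75, 99 -> 99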
5 changes: 3 additions & 2 deletions credentialdigger/snippet_similarity.py
@@ -2,13 +2,14 @@
import os

import numpy as np
# In order not to raise tensorflow warnings, we need to set this environment
# variable before importing the `tensorflow` package
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow.keras.preprocessing.text
import tensorflow_hub as hub
import tensorflow_text

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def build_embedding_model():
""" Build model by stacking up a preprocessing layer
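The reordering above ensures the environment variable is assigned before the `tensorflow` package is imported, as the new comment in the hunk explains. A minimal sketch of the pattern, reduced to the two lines that must stay in this order:

import os

# TensorFlow reads this variable when the package is first imported, so it
# must be set before `import tensorflow`; '3' silences INFO, WARNING and
# ERROR messages from the C++ backend.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf  # noqa: E402 - deliberately imported after the env var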
5 changes: 2 additions & 3 deletions requirements.txt
@@ -15,9 +15,8 @@ requests
rich
scikit-learn
srsly>=2.4.0
tensorflow==2.5.0; python_version >= "3.8"
tensorflow==2.5.1; python_version >= "3.8"
tensorflow==2.4.2; python_version < "3.8"
tensorflow-text==2.5.0; python_version >= "3.8"
tensorflow-text==2.4.2; python_version < "3.8"
tf-models-official
tqdm
tf-models-official
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@ def requirements():

setuptools.setup(
name='credentialdigger',
version='4.0.5',
version='4.1.0',
author='SAP SE',
maintainer='Marco Rosa, Slim Trabelsi',
maintainer_email='[email protected], [email protected]',
146 changes: 146 additions & 0 deletions tests/functional_tests/test_get_discoveries_postgres.py
@@ -0,0 +1,146 @@
import os
import random
import tempfile
import unittest

import pandas as pd
from credentialdigger.cli import cli
from credentialdigger.client_postgres import PgClient
from parameterized import param, parameterized

REPO_URL = ''.join(random.choice('0123456789ABCDEF') for i in range(16))


class TestGetDiscoveries(unittest.TestCase):
dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env')

@classmethod
def setUpClass(cls):
# Add rules
cli.main(['', 'add_rules', '--dotenv', cls.dotenv,
'tests/functional_tests/test_rules.yml'])

# Set CSV temporary export path
cls.tmp_path = tempfile.mkdtemp()
cls.csv_path = os.path.join(cls.tmp_path, 'test.csv')

# Set up Postgres client
client = PgClient(dbname=os.getenv('POSTGRES_DB'),
dbuser=os.getenv('POSTGRES_USER'),
dbpassword=os.getenv('POSTGRES_PASSWORD'),
dbhost=os.getenv('DBHOST'),
dbport=os.getenv('DBPORT'))

client.add_repo(REPO_URL)

# Insert fake discoveries
discoveries = []
discoveries_count = 5
for state in ['new', 'false_positive', 'addressing',
'not_relevant', 'fixed']:
for i in range(discoveries_count):
discovery = {
'file_name': 'danger' if state == 'new' else 'fake_file',
'commit_id': '0xtmp_commit_id',
'line_number': '1',
'snippet': 'tmp_snippet',
'rule_id': 1,
'state': state,
'timestamp': '2021-08-05T01:13',
}
discoveries.append(discovery)
discoveries_count += 1
client.add_discoveries(discoveries, REPO_URL)
cls.client = client

@classmethod
def tearDownClass(cls):
""" Remove the repo and all its discoveries. """
cls.client.delete_repo(REPO_URL)
cls.client.delete_discoveries(REPO_URL)
os.remove(cls.csv_path)

@parameterized.expand([
param(state='new', count=5),
param(state='false_positive', count=6),
param(state='addressing', count=7),
param(state='not_relevant', count=8),
param(state='fixed', count=9)
])
def test_get_discoveries(self, state, count):
""" Test if we retrieve the correct number of discoveries for every
possible state value.
Parameters
----------
state: str
The state to filter discoveries on
count: int
The expected number of discoveries to be returned
"""
with self.assertRaises(SystemExit) as cm:
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--state',
state,
'--dotenv',
self.dotenv
]
)
self.assertEqual(cm.exception.code, count)

@parameterized.expand([
param(file='danger', count=5),
param(file='fake_file', count=30)
])
def test_get_discoveries_per_file(self, file, count):
""" Test if we retrieve the correct number of discoveries based on
filename input.
Parameters
----------
file: str
The file name to filter discoveries on
count: int
The expected number of discoveries to be returned
"""
with self.assertRaises(SystemExit) as cm:
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--filename',
file,
'--dotenv',
self.dotenv
]
)
self.assertEqual(cm.exception.code, count)

def test_csv_written(self):
""" Test if the CLI command writes correctly the CSV file. """
with self.assertRaises(SystemExit):
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--dotenv',
self.dotenv
]
)
data_frame = pd.read_csv(self.csv_path)
try:
assert data_frame.notna().values.all()
except AssertionError:
assert False, 'CSV file contains NaN'