Merge pull request #194 from SAP/develop
release v4.1
marcorosa authored Aug 31, 2021
2 parents 0177635 + 7176b70 commit 47aac27
Showing 19 changed files with 635 additions and 72 deletions.
5 changes: 0 additions & 5 deletions THIRD-PARTY-NOTICES
@@ -109,11 +109,6 @@ Licensor: ExplosionAI UG
Website: https://github.com/explosion/srsly
License: MIT License

Component: tqdm
Licensor: Casper da Costa-Luis, Google Inc., Noam Yorav-Raphael, and contributors
Website: https://github.com/tqdm/tqdm
License: MIT License and MPLv2.0



--------------------------------------------------------------------------------------------------------------------------
3 changes: 1 addition & 2 deletions credentialdigger/cli/get_discoveries.py
@@ -133,13 +133,12 @@ def discoveries_to_csv(discoveries):
csv_writer.writeheader()
csv_writer.writerows(discoveries)
csv_data = stringIO.getvalue()
return csv_data
except IndexError as error:
logger.error(error)
except Exception as exception:
logger.exception(exception)

return csv_data


def export_csv(discoveries, client, save=False):
""" Export discoveries as a CSV file.
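The hunk above moves `return csv_data` inside the `try` block of `discoveries_to_csv`, so a failed export no longer falls through to a trailing `return csv_data` that can reference the variable before it is assigned. A minimal sketch of the corrected flow; the writer setup sits above the hunk and is filled in here as an assumption:

import csv
import io
import logging

logger = logging.getLogger(__name__)


def discoveries_to_csv(discoveries):
    try:
        # Assumed setup: the real code builds the writer above the hunk shown.
        string_io = io.StringIO()
        csv_writer = csv.DictWriter(string_io, fieldnames=discoveries[0].keys())
        csv_writer.writeheader()
        csv_writer.writerows(discoveries)
        csv_data = string_io.getvalue()
        # Returning inside the try: on failure the function falls through to
        # the handlers below instead of reaching a later `return csv_data`
        # with a possibly unassigned variable.
        return csv_data
    except IndexError as error:
        # e.g., an empty discoveries list
        logger.error(error)
    except Exception as exception:
        logger.exception(exception)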
33 changes: 19 additions & 14 deletions credentialdigger/client.py
@@ -8,13 +8,13 @@

import yaml
from github import Github
from tqdm import tqdm
from rich.progress import Progress

from .generator import ExtractorGenerator
from .models.model_manager import ModelManager
from .scanners.file_scanner import FileScanner
from .scanners.git_scanner import GitScanner
from .scanners.git_file_scanner import GitFileScanner
from .scanners.git_scanner import GitScanner
from .snippet_similarity import (build_embedding_model, compute_similarity,
compute_snippet_embedding)

@@ -1144,14 +1144,17 @@ def _scan(self, repo_url, scanner, models=None, force=False, debug=False,
discoveries_ids = list()
if debug:
logger.debug('Update database with these discoveries.')
for i in tqdm(range(len(new_discoveries))):
curr_d = new_discoveries[i]
new_id = self.add_discovery(
curr_d['file_name'], curr_d['commit_id'],
curr_d['line_number'], curr_d['snippet'], repo_url,
curr_d['rule_id'], curr_d['state'])
if new_id != -1 and curr_d['state'] != 'false_positive':
discoveries_ids.append(new_id)
with Progress() as progress:
inserting_task = progress.add_task('Inserting discoveries...',
total=len(new_discoveries))
for curr_d in new_discoveries:
new_id = self.add_discovery(
curr_d['file_name'], curr_d['commit_id'],
curr_d['line_number'], curr_d['snippet'], repo_url,
curr_d['rule_id'], curr_d['state'])
if new_id != -1 and curr_d['state'] != 'false_positive':
discoveries_ids.append(new_id)
progress.update(inserting_task, advance=1)
logger.debug(f'{len(discoveries_ids)} discoveries left for manual '
'review.')
else:
@@ -1191,11 +1194,13 @@ def _analyze_discovery(d):
if debug:
model_name = model_manager.model.__class__.__name__
logger.debug(f'Analyzing discoveries with model {model_name}')

false_positives = 0
for i in tqdm(range(len(discoveries))):
false_positives += _analyze_discovery(discoveries[i])

with Progress() as progress:
scanning_task = progress.add_task('Scanning discoveries...',
total=len(discoveries))
for curr_discovery in discoveries:
false_positives += _analyze_discovery(curr_discovery)
progress.update(scanning_task, advance=1)
logger.debug(f'Model {model_name} classified {false_positives} '
'discoveries.')
else:
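The same migration pattern recurs in generator.py and qlearning.py below: the tqdm-wrapped range is replaced by an explicit rich task that is advanced once per processed item. A minimal, generic sketch of that pattern; the item list and task description are placeholders, not code from this commit:

from rich.progress import Progress

items = ['a', 'b', 'c']  # stand-in for new_discoveries / dataset rows / patterns

# tqdm wrapped the iterable directly; rich.progress uses an explicit task
# with a known total, advanced by one after each item is handled.
with Progress() as progress:
    task = progress.add_task('Processing items...', total=len(items))
    for item in items:
        ...  # per-item work (add_discovery, pre-processing, Q-learning, etc.)
        progress.update(task, advance=1)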
17 changes: 10 additions & 7 deletions credentialdigger/generator/generator.py
@@ -11,7 +11,7 @@
import pkg_resources
import string_utils
from git import Repo as GitRepo
from tqdm import tqdm
from rich.progress import Progress

from .qlearning import compute_dataset
from .training import create_snippet_model
@@ -280,10 +280,13 @@ def _pre_process(raw_data):

data_list = []
# Preprocess the dataset with naming convention, etc.
for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
row_data = {}
for column in ['text', 'key', 'value']:
row_data[column] = _pre_process(row[column])
data_list.append(row_data)

with Progress() as progress:
preprocess_task = progress.add_task('Pre-processing dataset...',
total=data.shape[0])
for idx, row in data.iterrows():
row_data = {}
for column in ['text', 'key', 'value']:
row_data[column] = _pre_process(row[column])
data_list.append(row_data)
progress.update(preprocess_task, advance=1)
return pd.DataFrame(data=data_list)
48 changes: 27 additions & 21 deletions credentialdigger/generator/qlearning.py
@@ -2,8 +2,8 @@
import random

import numpy as np
from rich.progress import Progress
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from .stylometry import compute_vector, word_unigram_tf
from .transform import (build_dummy_dict, choose_applicable_transformation,
@@ -71,26 +71,32 @@ def compute_dataset(corpus, actions_n, states_n, alpha, gamma, epochs_basis=50,

dataset = []

# Apply Q-learning for each pattern
for pattern_index in tqdm(range(len(all_patterns))):
# Select a random extract and remove it from the corpus
reference_extract = corpus.pop(random.randrange(len(corpus)))
# Cut extracts too long
reference_extract = reference_extract[:extract_max_length]

# Increase epochs for more complex patterns
epochs = int(epochs_basis * (1 + (pattern_index / len(all_patterns))))
# Update epochs in args
args['epochs'] = epochs

# Compute the optimal modifications to the basic patterns
final_transformation, modification_dict = _optimal_transformation(
reference_extract, all_patterns[pattern_index], args)

# Generate the dataset, with optimal transformations
for i in range(epochs):
dataset += generate_data(all_patterns[pattern_index],
modification_dict)
with Progress() as progress:
patterns_count = len(all_patterns)
qlearn_task = progress.add_task('Apply Q-learning to patterns...',
total=patterns_count)
# Apply Q-learning for each pattern
for pattern_index in range(patterns_count):
# Select a random extract and remove it from the corpus
reference_extract = corpus.pop(random.randrange(len(corpus)))
# Cut extracts too long
reference_extract = reference_extract[:extract_max_length]

# Increase epochs for more complex patterns
epochs = int(epochs_basis *
(1 + (pattern_index / patterns_count)))
# Update epochs in args
args['epochs'] = epochs

# Compute the optimal modifications to the basic patterns
final_transformation, modification_dict = _optimal_transformation(
reference_extract, all_patterns[pattern_index], args)

# Generate the dataset, with optimal transformations
for i in range(epochs):
dataset += generate_data(all_patterns[pattern_index],
modification_dict)
progress.update(qlearn_task, advance=1)
return dataset


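The per-pattern epoch count in the hunk above grows linearly with the pattern index, from epochs_basis for the first pattern to just under twice that for the last. An illustrative check, using the default epochs_basis=50 from the signature above and an assumed patterns_count of 100:

# Illustrative values only; patterns_count is an assumption for this example.
epochs_basis = 50
patterns_count = 100

for pattern_index in (0, 50, 99):
    epochs = int(epochs_basis * (1 + pattern_index / patterns_count))
    print(pattern_index, epochs)  # 0 -> 50, 50 -> 75, 99 -> 99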
5 changes: 3 additions & 2 deletions credentialdigger/snippet_similarity.py
@@ -2,13 +2,14 @@
import os

import numpy as np
# In order not to raise tensorflow warnings, we need to set this environment
# variable before importing the `tensorflow` package
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow.keras.preprocessing.text
import tensorflow_hub as hub
import tensorflow_text

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def build_embedding_model():
""" Build model by stacking up a preprocessing layer
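The reordering above ensures the environment variable is assigned before the `tensorflow` package is imported, as the new comment in the hunk explains. A minimal sketch of the pattern, reduced to the two lines that must stay in this order:

import os

# TensorFlow reads this variable when the package is first imported, so it
# must be set before `import tensorflow`; '3' silences INFO, WARNING and
# ERROR messages from the C++ backend.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf  # noqa: E402 - deliberately imported after the env var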
5 changes: 2 additions & 3 deletions requirements.txt
@@ -15,9 +15,8 @@ requests
rich
scikit-learn
srsly>=2.4.0
tensorflow==2.5.0; python_version >= "3.8"
tensorflow==2.5.1; python_version >= "3.8"
tensorflow==2.4.2; python_version < "3.8"
tensorflow-text==2.5.0; python_version >= "3.8"
tensorflow-text==2.4.2; python_version < "3.8"
tf-models-official
tqdm
tf-models-official
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@ def requirements():

setuptools.setup(
name='credentialdigger',
version='4.0.5',
version='4.1.0',
author='SAP SE',
maintainer='Marco Rosa, Slim Trabelsi',
maintainer_email='[email protected], [email protected]',
146 changes: 146 additions & 0 deletions tests/functional_tests/test_get_discoveries_postgres.py
@@ -0,0 +1,146 @@
import os
import random
import tempfile
import unittest

import pandas as pd
from credentialdigger.cli import cli
from credentialdigger.client_postgres import PgClient
from parameterized import param, parameterized

REPO_URL = ''.join(random.choice('0123456789ABCDEF') for i in range(16))


class TestGetDiscoveries(unittest.TestCase):
dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env')

@classmethod
def setUpClass(cls):
# Add rules
cli.main(['', 'add_rules', '--dotenv', cls.dotenv,
'tests/functional_tests/test_rules.yml'])

# Set CSV temporary export path
cls.tmp_path = tempfile.mkdtemp()
cls.csv_path = os.path.join(cls.tmp_path, 'test.csv')

# Set up Postgres client
client = PgClient(dbname=os.getenv('POSTGRES_DB'),
dbuser=os.getenv('POSTGRES_USER'),
dbpassword=os.getenv('POSTGRES_PASSWORD'),
dbhost=os.getenv('DBHOST'),
dbport=os.getenv('DBPORT'))

client.add_repo(REPO_URL)

# Insert fake discoveries
discoveries = []
discoveries_count = 5
for state in ['new', 'false_positive', 'addressing',
'not_relevant', 'fixed']:
for i in range(discoveries_count):
discovery = {
'file_name': 'danger' if state == 'new' else 'fake_file',
'commit_id': '0xtmp_commit_id',
'line_number': '1',
'snippet': 'tmp_snippet',
'rule_id': 1,
'state': state,
'timestamp': '2021-08-05T01:13',
}
discoveries.append(discovery)
discoveries_count += 1
client.add_discoveries(discoveries, REPO_URL)
cls.client = client

@classmethod
def tearDownClass(cls):
""" Remove the repo and all its discoveries. """
cls.client.delete_repo(REPO_URL)
cls.client.delete_discoveries(REPO_URL)
os.remove(cls.csv_path)

@parameterized.expand([
param(state='new', count=5),
param(state='false_positive', count=6),
param(state='addressing', count=7),
param(state='not_relevant', count=8),
param(state='fixed', count=9)
])
def test_get_discoveries(self, state, count):
""" Test if we retrieve the correct number of discoveries for every
possible state value.
Parameters
----------
state: str
The state to filter discoveries on
count: int
The expected number of discoveries to be returned
"""
with self.assertRaises(SystemExit) as cm:
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--state',
state,
'--dotenv',
self.dotenv
]
)
self.assertEqual(cm.exception.code, count)

@parameterized.expand([
param(file='danger', count=5),
param(file='fake_file', count=30)
])
def test_get_discoveries_per_file(self, file, count):
""" Test if we retrieve the correct number of discoveries based on
filename input.
Parameters
----------
file: str
The file name to filter discoveries on
count: int
The expected number of discoveries to be returned
"""
with self.assertRaises(SystemExit) as cm:
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--filename',
file,
'--dotenv',
self.dotenv
]
)
self.assertEqual(cm.exception.code, count)

def test_csv_written(self):
""" Test if the CLI command writes correctly the CSV file. """
with self.assertRaises(SystemExit):
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--dotenv',
self.dotenv
]
)
data_frame = pd.read_csv(self.csv_path)
try:
assert data_frame.notna().values.all()
except AssertionError:
assert False, 'CSV file contains NaN'