Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
41ec494
S3 support. Alternating cache files instead of renaming.
mazhurin Mar 5, 2021
0048aa5
aws jars
mazhurin Mar 5, 2021
baa8c27
s3 fix. Delete the previous parquet since s3 is eventually consistent.
mazhurin Mar 5, 2021
e957165
No s3 deletion.
mazhurin Mar 5, 2021
a2b3be6
use_storage option for request_set_cache
mazhurin Mar 5, 2021
c8582a7
a typo
mazhurin Mar 5, 2021
6def54b
Fix in filter_by
mazhurin Mar 5, 2021
4f4b641
rdd.count in request_cache to trigger DAG
mazhurin Mar 5, 2021
1389293
Collect() workaround in request_set_cache for memory only option.
mazhurin Mar 9, 2021
a01e95e
Extra count() removed
mazhurin Mar 9, 2021
90d8d34
First draft(not tested)
mazhurin Feb 15, 2021
ae0dd1f
Fix in cache.filter_by() for memory only
mazhurin Feb 15, 2021
b37d297
load test: do not challenge duplicated traffic
mkaranasou Feb 5, 2021
19e7bfb
split challenge to handle load test
mkaranasou Feb 5, 2021
232815c
bug fix in filtering out load test data
mkaranasou Feb 5, 2021
fde161f
dashboard models
mkaranasou Feb 11, 2021
364c52d
task changes in attack detection
mkaranasou Feb 11, 2021
91febac
Feedback pipeline - WIP
mkaranasou Feb 11, 2021
92380b5
helpers
mkaranasou Feb 11, 2021
ea0dba9
pipeline factory
mkaranasou Feb 11, 2021
3f9105e
id_request_sets -> uuid_request_set + additional dashboard models
mkaranasou Feb 12, 2021
dfb04f2
uuid_request_set
mkaranasou Feb 12, 2021
9cbbd4f
dashboard models changes
mkaranasou Feb 15, 2021
a9672bd
user config
mkaranasou Feb 16, 2021
36c3789
attack link to org and feedback save task - first pass
mkaranasou Feb 17, 2021
622a576
functional feedback pipeline
mkaranasou Feb 25, 2021
8175d55
er diagram
mkaranasou Mar 5, 2021
eb7eaf1
updating requirements
mkaranasou Mar 11, 2021
fb23de2
missed runtime filename
mkaranasou Mar 16, 2021
806b117
model transfer - do not link request sets
mkaranasou Mar 17, 2021
e866171
model transfer
mkaranasou Mar 17, 2021
0de11a5
model transfer request sets set to empty list
mkaranasou Mar 17, 2021
4bf9130
fix rebase
mkaranasou Mar 17, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

ip_cache/
6 changes: 3 additions & 3 deletions alembic/versions/88eb5854154f_add_id_group_in_request_sets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""add id_request_sets in request_sets
"""add uuid_request_set in request_sets

Revision ID: 88eb5854154f
Revises:
Expand All @@ -16,8 +16,8 @@


def upgrade():
    """Add the TEXT column ``uuid_request_set`` to ``request_sets``.

    Only the post-rename column is created; the earlier ``id_request_sets``
    name is no longer declared on the ``RequestSet`` model.
    """
    op.add_column('request_sets', sa.Column('uuid_request_set', sa.TEXT))


def downgrade():
    """Drop the ``uuid_request_set`` column added by ``upgrade``."""
    # ``drop_column`` is called directly on alembic's ``op`` proxy; the
    # previous ``op.op.drop_column`` spelling is not a valid attribute path.
    op.drop_column('request_sets', 'uuid_request_set')
Binary file added data/Baskerville ER Diagram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/jars/aws-java-sdk-1.7.4.jar
Binary file not shown.
Binary file added data/jars/hadoop-aws-2.7.1.jar
Binary file not shown.
19 changes: 19 additions & 0 deletions data/samples/sample_feedback_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"name": "FeedbackSchema",
"properties": {
"id_context": {
"type": "string"
},
"uuid_organization": {
"type": "string"
},
"feedback_context": {
"type": "object"
},
"feedback": {
"type": "object"
}
},
"required": ["id_context", "uuid_organization", "feedback_context", "feedback"],
"additionalProperties": false
}
10 changes: 4 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
jinja2==2.10
pgpubsub
jinja2>=2.10.1
numpy==1.14.3
PyYAML==3.12
cryptography==2.2.2
Expand All @@ -8,28 +7,27 @@ python-geoip==1.2
python-geoip-geolite2==2015.303
certifi==2018.4.16
ua-parser==0.8.0
bokeh==0.12.16
# bokeh==0.12.16
pandas==0.23.0
pycountry==18.2.23
scipy==1.1.0
matplotlib==2.2.2
seaborn==0.8.1
hdbscan==0.8.13
alembic==1.0.8
enum34==1.1.6
tzwhere==3.0.3
pytz==2014.10
sqlalchemy_utils==0.33.3
pyspark==2.4.4
es_retriever==1.0.0
# es_retriever==1.0.0
psutil==5.4.6
psycopg2==2.7.5
yellowbrick==0.8
dateparser==0.7.0
pymisp==2.4.93
attrs==18.1.0
warlock==1.3.0
jsonschema==2.6
jsonschema==2.6.0
stringcase==1.2.0
prometheus_client==0.5.0
grafanalib==0.5.3
Expand Down
1 change: 0 additions & 1 deletion requirements_unit_tests.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
jinja2==2.10
pgpubsub
numpy==1.14.3
PyYAML==3.12
Expand Down
182 changes: 182 additions & 0 deletions src/baskerville/db/dashboard_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# Copyright (c) 2020, eQualit.ie inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from baskerville.db import Base
from baskerville.db.models import utcnow, SerializableMixin
from sqlalchemy import Column, Integer, ForeignKey, DateTime, Enum, String, \
Boolean, BigInteger, Float, JSON, Text, TEXT
from sqlalchemy.orm import relationship
from passlib.apps import custom_app_context as pwd_context

from baskerville.util.enums import UserCategoryEnum, FeedbackEnum, \
FeedbackContextTypeEnum


class UserCategory(Base, SerializableMixin):
    """ORM mapping for the ``user_categories`` table.

    Each row holds one ``UserCategoryEnum`` value. ``users`` is the list of
    ``User`` rows whose ``id_category`` references this row.
    """

    __tablename__ = 'user_categories'

    id = Column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    category = Column(
        Enum(UserCategoryEnum),
    )

    # category 1 - * users (the other side is ``User.category``)
    users = relationship(
        'User',
        uselist=True,
        back_populates='category',
    )


class Organization(Base, SerializableMixin):
    """ORM mapping for the ``organizations`` table.

    ``users`` lists the ``User`` rows whose ``id_organization`` references
    this organization.
    """
    __tablename__ = 'organizations'

    # NOTE(review): both ``id`` and ``uuid`` are flagged ``primary_key``, so
    # the mapped primary key is the composite (id, uuid) - confirm that this
    # is intended rather than a plain unique constraint on ``uuid``.
    id = Column(BigInteger(), primary_key=True, autoincrement=True, unique=True)
    uuid = Column(String(300), primary_key=True, unique=True)
    name = Column(String(200), index=True)
    details = Column(TEXT())
    registered = Column(Boolean(), default=False)
    created_at = Column(DateTime(timezone=True), server_default=utcnow())
    updated_at = Column(
        DateTime(timezone=True), nullable=True, onupdate=utcnow()
    )

    # organization 1 - * users: ``User.id_organization`` is the foreign key,
    # so this side must be a collection. With ``uselist=False`` attaching a
    # second user would replace the first one and null its foreign key.
    users = relationship(
        'User', uselist=True, back_populates='organization'
    )


class User(Base, SerializableMixin):
    """ORM mapping for the ``users`` table.

    A user references one ``UserCategory`` (required) and optionally one
    ``Organization``. Passwords are never stored in clear text: use
    ``hash_password`` to set ``password_hash`` and ``verify_password`` to
    check a candidate against it.
    """
    __tablename__ = 'users'

    id = Column(BigInteger(), primary_key=True, autoincrement=True, unique=True)
    id_organization = Column(BigInteger(), ForeignKey('organizations.id'))
    id_category = Column(Integer, ForeignKey('user_categories.id'), nullable=False)
    username = Column(String(200), index=True)
    first_name = Column(String(200), index=True)
    last_name = Column(String(200), index=True)
    email = Column(String(256), unique=True, nullable=False)
    password_hash = Column(String(128))
    is_active = Column(Boolean())
    is_gitlab_login = Column(Boolean(), default=False)
    is_admin = Column(Boolean(), default=False)
    created_at = Column(DateTime(timezone=True), server_default=utcnow())
    updated_at = Column(
        DateTime(timezone=True), nullable=True, onupdate=utcnow()
    )

    # users * - 1 category
    category = relationship(
        'UserCategory',
        foreign_keys=id_category, back_populates='users'
    )
    # users * - 1 organization
    organization = relationship(
        'Organization',
        foreign_keys=id_organization, back_populates='users'
    )
    # user 1 - * runtimes: ``Runtime.id_user`` is the foreign key, so this
    # side is a collection. ``Runtime.user`` already back-populates this
    # attribute; the reverse ``back_populates='user'`` stays disabled here,
    # as it was before.
    runtimes = relationship(
        'Runtime',
        uselist=True,
    )

    # NOTE(review): presumably the column names to leave out when the row is
    # serialized - confirm against SerializableMixin.
    _remove = ['password_hash']

    def hash_password(self, password):
        """Hash ``password``, store the result and return it.

        :param str password: the clear-text password
        :return: the hash that was stored in ``password_hash``
        :rtype: str
        """
        # ``CryptContext.hash`` replaces the deprecated ``encrypt`` alias
        self.password_hash = pwd_context.hash(password)
        return self.password_hash

    def verify_password(self, password):
        """Check a clear-text ``password`` against ``password_hash``.

        :param str password: the clear-text password to check
        :return: the result of the passlib verification
        :rtype: bool
        """
        return pwd_context.verify(password, self.password_hash)


class FeedbackContext(Base, SerializableMixin):
    """ORM mapping for the ``feedback_contexts`` table.

    A context carries a ``FeedbackContextTypeEnum`` reason, a time window
    (``start`` / ``stop``) and free-text notes for one organization, which
    is identified by ``uuid_organization``.
    """

    __tablename__ = 'feedback_contexts'

    id = Column(
        BigInteger,
        primary_key=True,
        autoincrement=True,
        unique=True,
    )
    uuid_organization = Column(
        String(300),
        nullable=False,
    )

    # why the feedback is given, plus a free-text description
    reason = Column(Enum(FeedbackContextTypeEnum))
    reason_descr = Column(TEXT())

    # time window the context covers
    start = Column(DateTime(timezone=True))
    stop = Column(DateTime(timezone=True))

    ip_count = Column(Integer)
    notes = Column(TEXT)
    progress_report = Column(TEXT)

    # new contexts are flagged as pending by default
    pending = Column(
        Boolean(),
        default=True,
    )


class Feedback(Base, SerializableMixin):
    """ORM mapping for the ``feedback`` table.

    One row is a user's feedback on a single request set. It stores a copy
    of the request set's prediction data next to the ``FeedbackEnum`` value
    and references the ``FeedbackContext`` it was given in.
    """

    __tablename__ = 'feedback'

    id = Column(
        BigInteger,
        primary_key=True,
        autoincrement=True,
        unique=True,
    )

    # references: the context, the author and the request set
    id_feedback_context = Column(
        BigInteger(),
        ForeignKey('feedback_contexts.id'),
        nullable=False,
    )
    id_user = Column(
        BigInteger(),
        ForeignKey('users.id'),
        nullable=False,
    )
    uuid_request_set = Column(
        TEXT(),
        nullable=False,
    )

    # prediction data stored with the feedback
    prediction = Column(Integer, nullable=False)
    score = Column(Float, nullable=False)
    attack_prediction = Column(Float, nullable=False)
    low_rate = Column(Boolean(), nullable=True)
    ip = Column(String, nullable=False)
    target = Column(String, nullable=False)
    features = Column(JSON, nullable=False)

    feedback = Column(Enum(FeedbackEnum))
    start = Column(DateTime(timezone=True), nullable=False)
    stop = Column(DateTime(timezone=True), nullable=False)
    submitted = Column(Boolean(), default=False)

    # bookkeeping timestamps
    created_at = Column(
        DateTime(timezone=True),
        server_default=utcnow(),
    )
    updated_at = Column(
        DateTime(timezone=True),
        nullable=True,
        onupdate=utcnow(),
    )

    user = relationship('User', foreign_keys=id_user)
    # there is no database foreign key on uuid_request_set, so the join
    # condition is spelled out explicitly
    request_set = relationship(
        'RequestSet',
        primaryjoin='foreign(Feedback.uuid_request_set) == remote(RequestSet.uuid_request_set)',
    )
    feedback_context = relationship(
        'FeedbackContext',
        foreign_keys=id_feedback_context,
    )


class SubmittedFeedback(Base, SerializableMixin):
    """ORM mapping for the ``submitted_feedback`` table.

    A row holds one piece of feedback together with the uuid of the
    organization and of the request set it refers to. Both uuids are joined
    through explicit ``primaryjoin`` conditions, not database foreign keys.
    """

    __tablename__ = 'submitted_feedback'

    id = Column(
        BigInteger,
        primary_key=True,
        autoincrement=True,
        unique=True,
    )

    # not all feedback is part of an attack
    # NOTE(review): the column is nevertheless NOT NULL - confirm
    id_context = Column(
        BigInteger(),
        ForeignKey('feedback_contexts.id'),
        nullable=False,
    )
    uuid_organization = Column(String(300), nullable=False)
    uuid_request_set = Column(TEXT(), nullable=False)

    # prediction data stored with the feedback
    prediction = Column(Integer, nullable=False)
    score = Column(Float, nullable=False)
    attack_prediction = Column(Float, nullable=False)
    low_rate = Column(Boolean(), nullable=True)
    features = Column(JSON, nullable=True)

    feedback = Column(Enum(FeedbackEnum))
    start = Column(DateTime(timezone=True), nullable=True)
    stop = Column(DateTime(timezone=True), nullable=True)

    # timestamps
    submitted_at = Column(DateTime(timezone=True))
    created_at = Column(
        DateTime(timezone=True),
        server_default=utcnow(),
    )
    updated_at = Column(
        DateTime(timezone=True),
        nullable=True,
        onupdate=utcnow(),
    )

    organization = relationship(
        'Organization',
        primaryjoin='foreign(SubmittedFeedback.uuid_organization) == remote(Organization.uuid)',
    )
    request_set = relationship(
        'RequestSet',
        primaryjoin='foreign(SubmittedFeedback.uuid_request_set) == remote(RequestSet.uuid_request_set)',
    )

    # NOTE(review): 'stop' and 'created_at' are mapped above but not listed
    # here - confirm with the code that consumes ``columns`` whether that is
    # deliberate.
    columns = [
        'id', 'id_context',
        'uuid_organization', 'uuid_request_set',
        'prediction', 'score', 'attack_prediction', 'low_rate',
        'features', 'feedback',
        'start', 'submitted_at', 'updated_at',
    ]
50 changes: 20 additions & 30 deletions src/baskerville/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sqlalchemy.sql import expression

from baskerville.db import Base
from baskerville.util.helpers import SerializableMixin


LONG_TEXT_LEN = 4294000000
Expand All @@ -28,33 +29,6 @@ def pg_utcnow(element, compiler, **kw):
return "TIMEZONE('utc', CURRENT_TIMESTAMP)"


class SerializableMixin(object):
def as_dict(self, extra_cols=(), remove=()):
"""

:param set extra_cols:
:param set remove:
:return:
:rtype: dict[str, T]
"""
basic_attrs = {c.name: getattr(self, c.name)
for c in self.__table__.columns
if c not in remove}
extra_attrs = {}
if len(extra_cols) > 0:
for attr in extra_cols:
d = getattr(self, attr)
if d is None:
continue
if isinstance(d, list):
extra_attrs[attr] = [each.as_dict() for each in d]
else:
extra_attrs[attr] = d.as_dict()
basic_attrs.update(extra_attrs)

return basic_attrs


class Encryption(Base, SerializableMixin):
__tablename__ = 'encryption'

Expand All @@ -69,6 +43,7 @@ class Runtime(Base, SerializableMixin):

id = Column(BigInteger, primary_key=True)
id_encryption = Column(BigInteger, ForeignKey('encryption.id'))
id_user = Column(BigInteger, ForeignKey('users.id'))
start = Column(DateTime(timezone=True))
stop = Column(DateTime(timezone=True))
target = Column(TEXT(), nullable=True)
Expand All @@ -89,14 +64,23 @@ class Runtime(Base, SerializableMixin):
'Encryption',
foreign_keys=id_encryption, back_populates='runtimes'
)
# runtimes * - 1 users
try:
from baskerville.db.dashboard_models import User
except:
pass
user = relationship(
'User',
foreign_keys=id_user, back_populates='runtimes'
)


class RequestSet(Base, SerializableMixin):
__tablename__ = 'request_sets'

id = Column(BigInteger, primary_key=True)
id_runtime = Column(BigInteger, ForeignKey('runtimes.id'), nullable=True)
id_request_sets = Column(TEXT())
uuid_request_set = Column(TEXT())
target = Column(TEXT())
target_original = Column(TEXT())
ip = Column(String(45))
Expand Down Expand Up @@ -152,7 +136,7 @@ class RequestSet(Base, SerializableMixin):
)

columns = [
'id_request_sets',
'uuid_request_set',
'ip',
'target',
'target_original',
Expand Down Expand Up @@ -226,8 +210,9 @@ class ModelTrainingSetLink(Base, SerializableMixin):
class Attack(Base, SerializableMixin):
__tablename__ = 'attacks'

id = Column(BigInteger, primary_key=True)
id = Column(BigInteger, primary_key=True, autoincrement=True)
id_misp = Column(BigInteger)
uuid_org = Column(TEXT())
date = Column(DateTime(timezone=True))
start = Column(DateTime(timezone=True))
stop = Column(DateTime(timezone=True))
Expand All @@ -240,6 +225,7 @@ class Attack(Base, SerializableMixin):
sync_stop = Column(DateTime(timezone=True))
processed = Column(Integer)
notes = Column(TEXT)
progress_report = Column(TEXT)
analysis_notebook = Column(TEXT)

request_sets = relationship(
Expand All @@ -250,6 +236,10 @@ class Attack(Base, SerializableMixin):
'Attribute', secondary='attribute_attack_link',
back_populates='attacks'
)
organization = relationship(
'Organization',
primaryjoin='foreign(Attack.uuid_org) == remote(Organization.uuid)'
)


class Attribute(Base, SerializableMixin):
Expand Down
2 changes: 1 addition & 1 deletion src/baskerville/models/anomaly_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,4 +240,4 @@ def load(self, path, spark_session=None):
self.indexes = {}
for feature in self.categorical_string_features():
self.indexes[feature] = StringIndexerModel.load(self._get_index_path(path, feature))
return self
return self
Loading