Skip to content

Commit

Permalink
[no ci] Merge remote-tracking branch 'upstream/main' into pydriller
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Dec 8, 2023
2 parents 6a4bf7c + 6b78cf2 commit 87e1a80
Show file tree
Hide file tree
Showing 59 changed files with 4,676 additions and 6,475 deletions.
3 changes: 3 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,6 @@ ignore_missing_imports = True
[mypy-pydriller.*]
ignore_missing_imports = True

[mypy-base62.*]
ignore_missing_imports = True

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

CredSweeper is a tool to detect credentials in any directories or files.
CredSweeper could help users to detect unwanted exposure of credentials
(such as personal information, token, passwords, api keys etc.) in advance.
(such as tokens, passwords, api keys etc.) in advance.
By scanning lines, filtering, and optionally using an AI model,
CredSweeper reports the lines with possible credentials, where each line is located,
and the expected type of each credential.
Expand Down
14 changes: 7 additions & 7 deletions cicd/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
DATA: 19434458 valid lines. MARKUP: 74639 items
DATA: 19434458 valid lines. MARKUP: 74402 items
Category Positives Negatives Template
-------------------------- ----------- ----------- ----------
Authentication Key & Token 70 1 31
Generic Secret 1056 15 203
Generic Token 333 45 558
Other 1076 63510 635
Other 839 63510 635
Password 1405 110 4170
Predefined Pattern 326 2 40
Private Key 1001 1 3
Seed, Salt, Nonce 40 4 4
TOTAL: 5307 63688 5644
Detected Credentials: 5996
credsweeper result_cnt : 5338, lost_cnt : 0, true_cnt : 4441, false_cnt : 897
TOTAL: 5070 63688 5644
Detected Credentials: 5730
credsweeper result_cnt : 5102, lost_cnt : 0, true_cnt : 4207, false_cnt : 895
Category TP FP TN FN FPR FNR ACC PRC RCL F1
-------------------------- ---- ---- -------- ---- --------- --------- -------- -------- -------- --------
Authentication Key & Token 54 4 28 16 0.125 0.228571 0.803922 0.931034 0.771429 0.84375
Generic Secret 973 3 215 83 0.0137615 0.0785985 0.932496 0.996926 0.921402 0.957677
Generic Token 289 7 596 44 0.0116086 0.132132 0.945513 0.976351 0.867868 0.918919
Other 818 749 63396 258 0.0116767 0.239777 0.98456 0.522017 0.760223 0.618994
Other 584 747 63398 255 0.0116455 0.303933 0.984581 0.438768 0.696067 0.538249
Password 995 130 4150 410 0.0303738 0.291815 0.905013 0.884444 0.708185 0.786561
Predefined Pattern 309 2 40 17 0.0476191 0.0521472 0.94837 0.993569 0.947853 0.970173
Private Key 967 0 4 34 0.033966 0.966169 1 0.966034 0.982724
Seed, Salt, Nonce 36 2 6 4 0.25 0.1 0.875 0.947368 0.9 0.923077
4441 897 19428254 866 4.617e-05 0.163181 0.999909 0.83196 0.836819 0.834382
4207 895 19428493 863 4.606e-05 0.170217 0.99991 0.824579 0.829783 0.827173
2 changes: 1 addition & 1 deletion credsweeper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
'__version__'
]

__version__ = "1.5.9"
__version__ = "1.5.10"
3 changes: 2 additions & 1 deletion credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ def export_results(self) -> None:
x.line_data_list[0].line_num, #
x.severity, #
x.rule_name, #
x.line_data_list[0].value #
x.line_data_list[0].value_start, #
x.line_data_list[0].value_end #
))

if self.json_filename:
Expand Down
1 change: 1 addition & 0 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class DiffRowType(Enum):
UTF_8 = "utf_8"
UTF_16 = "utf_16"
LATIN_1 = "latin_1"
ASCII = "ascii"

DEFAULT_ENCODING = UTF_8

Expand Down
12 changes: 10 additions & 2 deletions credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .byte_scanner import ByteScanner
from .bzip2_scanner import Bzip2Scanner
from .docx_scanner import DocxScanner
from .eml_scanner import EmlScanner
from .encoder_scanner import EncoderScanner
from .gzip_scanner import GzipScanner
from .html_scanner import HtmlScanner
Expand Down Expand Up @@ -70,7 +71,7 @@ def scanner(self) -> Scanner:
return self.__scanner

@staticmethod
def get_deep_scanners(data: bytes) -> List[Any]:
def get_deep_scanners(data: bytes, file_type: Optional[str] = None) -> List[Any]:
"""Returns possibly scan methods for the data depends on content"""
deep_scanners: List[Any] = []
if Util.is_zip(data):
Expand All @@ -90,6 +91,13 @@ def get_deep_scanners(data: bytes) -> List[Any]:
deep_scanners.append(JksScanner)
elif Util.is_asn1(data):
deep_scanners.append(Pkcs12Scanner)
elif file_type in [".eml", ".mht"]:
if Util.is_eml(data):
deep_scanners.append(EmlScanner)
elif Util.is_html(data):
deep_scanners.append(HtmlScanner)
else:
deep_scanners = [ByteScanner]
else:
deep_scanners = [ByteScanner, EncoderScanner, HtmlScanner, XmlScanner, LangScanner]
return deep_scanners
Expand Down Expand Up @@ -130,7 +138,7 @@ def scan(self,
file_type=content_provider.file_type,
info=content_provider.file_path)
# iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
scanner_classes = self.get_deep_scanners(data)
scanner_classes = self.get_deep_scanners(data, content_provider.file_type)
for scan_class in scanner_classes:
new_candidates = scan_class.data_scan(self, data_provider, depth - 1, recursive_limit_size - len(data))
augment_candidates(candidates, new_candidates)
Expand Down
52 changes: 52 additions & 0 deletions credsweeper/deep_scanner/eml_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import email
import logging
from abc import ABC
from typing import List

from credsweeper.credentials import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider

logger = logging.getLogger(__name__)


class EmlScanner(AbstractScanner, ABC):
    """Scanner for e-mail (EML) data: looks into the plain text and HTML parts of a message"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> List[Candidate]:
        """Parses the data as an e-mail message and scans its textual parts.

        Args:
            data_provider: provider whose ``data`` bytes are parsed as a message
            depth: value passed through to ``represent_as_html`` for HTML parts
            recursive_limit_size: value passed through to ``represent_as_html`` for HTML parts

        Return:
            candidates collected from "text/plain" and "text/html" parts; when an exception
            occurs it is logged and the candidates gathered so far are returned

        """
        found = []

        try:
            message = email.message_from_bytes(data_provider.data)
            for section in message.walk():
                mime_type = section.get_content_type()
                # decode=True undoes the Content-Transfer-Encoding of the part
                payload = section.get_payload(decode=True)

                if "text/plain" == mime_type:
                    # plain text is scanned directly as bytes
                    text_provider = ByteContentProvider(content=payload,
                                                        file_path=data_provider.file_path,
                                                        file_type=data_provider.file_type,
                                                        info=f"{data_provider.info}|EML-TEXT")
                    found.extend(self.scanner.scan(text_provider))
                    continue

                if "text/html" != mime_type:
                    # other content types are not scanned here
                    continue

                # HTML is scanned only when it can be represented as lines
                html_provider = DataContentProvider(data=payload)
                if not html_provider.represent_as_html(depth, recursive_limit_size,
                                                       self.scanner.keywords_required_substrings_check):
                    continue
                lines_provider = StringContentProvider(lines=html_provider.lines,
                                                       line_numbers=html_provider.line_numbers,
                                                       file_path=data_provider.file_path,
                                                       file_type=data_provider.file_type,
                                                       info=f"{data_provider.info}|EML-HTML")
                found.extend(self.scanner.scan(lines_provider))
        except Exception as eml_exc:
            logger.error(f"{data_provider.file_path}:{eml_exc}")
        return found
4 changes: 2 additions & 2 deletions credsweeper/file_handler/data_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import yaml
from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning

from credsweeper.common.constants import DEFAULT_ENCODING
from credsweeper.common.constants import DEFAULT_ENCODING, ASCII
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.utils import Util
Expand Down Expand Up @@ -343,7 +343,7 @@ def represent_as_encoded(self) -> bool:
return False
try:
self.decoded = base64.b64decode( #
self.data.decode(encoding="ascii", errors="strict"). #
self.data.decode(encoding=ASCII, errors="strict"). #
translate(str.maketrans("", "", string.whitespace)), #
validate=True) #
except Exception as exc:
Expand Down
6 changes: 1 addition & 5 deletions credsweeper/filters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from credsweeper.filters.value_base64_data_check import ValueBase64DataCheck
from credsweeper.filters.value_blocklist_check import ValueBlocklistCheck
from credsweeper.filters.value_camel_case_check import ValueCamelCaseCheck
from credsweeper.filters.value_card_number_check import ValueCardNumberCheck
from credsweeper.filters.value_couple_keyword_check import ValueCoupleKeywordCheck
from credsweeper.filters.value_dictionary_keyword_check import ValueDictionaryKeywordCheck
from credsweeper.filters.value_dictionary_value_length_check import ValueDictionaryValueLengthCheck
Expand All @@ -17,8 +16,8 @@
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.filters.value_file_path_check import ValueFilePathCheck
from credsweeper.filters.value_first_word_check import ValueFirstWordCheck
from credsweeper.filters.value_github_check import ValueGitHubCheck
from credsweeper.filters.value_grafana_check import ValueGrafanaCheck
from credsweeper.filters.value_iban_check import ValueIbanCheck
from credsweeper.filters.value_ip_check import ValueIPCheck
from credsweeper.filters.value_json_web_token_check import ValueJsonWebTokenCheck
from credsweeper.filters.value_last_word_check import ValueLastWordCheck
Expand All @@ -30,8 +29,6 @@
from credsweeper.filters.value_pattern_check import ValuePatternCheck
from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck
from credsweeper.filters.value_pem_pattern_check import ValuePemPatternCheck
from credsweeper.filters.value_pii_check import ValuePIICheck
from credsweeper.filters.value_phone_check import ValuePhoneCheck
from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck
from credsweeper.filters.value_string_type_check import ValueStringTypeCheck
Expand All @@ -41,5 +38,4 @@
from credsweeper.filters.value_token_base64_check import ValueTokenBase64Check
from credsweeper.filters.value_token_check import ValueTokenCheck
from credsweeper.filters.value_useless_word_check import ValueUselessWordCheck
from credsweeper.filters.value_vin_check import ValueVinCheck
from credsweeper.filters.variable_not_allowed_pattern_check import VariableNotAllowedPatternCheck
3 changes: 2 additions & 1 deletion credsweeper/filters/value_allowlist_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ class ValueAllowlistCheck(Filter):
"""Check that patterns from the list is not present in the candidate value."""

ALLOWED = [
r"ENC\(.*\)", r"ENC\[.*\]", r"\$\{.*\}", r"#\{.*\}", r"\{\{.+\}\}", r"([.a-z0-9]|->)+\(.*\)", r"\S{0,5}\*{5,}"
r"ENC\(.*\)", r"ENC\[.*\]", r"\$\{.*\}", r"#\{.*\}", r"\{\{.+\}\}", r"([.a-z0-9]|->)+\(.*\)", r"\S{0,5}\*{5,}",
r".*@@@hl@@@(암호|비번|PW|PASS)@@@endhl@@@.*"
]
ALLOWED_PATTERN = re.compile( #
Util.get_regex_combine_or(ALLOWED), #
Expand Down
11 changes: 1 addition & 10 deletions credsweeper/filters/value_base64_data_check.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import base64
import contextlib
import string

Expand Down Expand Up @@ -40,14 +39,6 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
return True
# check whether decoded bytes have enough entropy
with contextlib.suppress(Exception):
value_len = len(value)
if 0x3 & value_len:
# Bitbucket client id is 18 chars length
pad_len = 4 - (0x3 & value_len)
value = value + ''.join(['='] * pad_len)
if '-' in value or '_' in value:
decoded = base64.urlsafe_b64decode(value)
else:
decoded = base64.standard_b64decode(value)
decoded = Util.decode_base64(value, padding_safe=True, urlsafe_detect=True)
return Util.is_ascii_entropy_validate(decoded)
return True
45 changes: 0 additions & 45 deletions credsweeper/filters/value_card_number_check.py

This file was deleted.

43 changes: 43 additions & 0 deletions credsweeper/filters/value_github_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import binascii
import contextlib

import base62

from credsweeper.common.constants import ASCII
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import Filter


class ValueGitHubCheck(Filter):
    """Validates the checksum embedded in a GitHub Classic Token"""

    def __init__(self, config: Config = None) -> None:
        # the filter needs no configuration
        pass

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Checks the trailing checksum of a token that looks like ``gh?_...``.

        The last 6 characters are decoded from base62 into a big-endian integer and
        compared with the CRC32 of the characters between the 4-character prefix and
        that tail. Any exception raised during the check is suppressed and the
        candidate is filtered.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, when need to filter candidate and False if left

        """
        # https://github.blog/2021-04-05-behind-githubs-new-authentication-token-formats/
        value = line_data.value
        if not value:
            return True
        with contextlib.suppress(Exception):
            has_prefix = value.startswith("gh") and '_' == value[3]
            if has_prefix:
                body_bytes = value[4:-6].encode(ASCII, errors="strict")
                computed = binascii.crc32(body_bytes)
                declared = int.from_bytes(base62.decodebytes(value[-6:]), "big")
                if declared == computed:
                    return False
        return True
6 changes: 3 additions & 3 deletions credsweeper/filters/value_grafana_check.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import base64
import contextlib
import json

from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import Filter
from credsweeper.utils import Util


class ValueGrafanaCheck(Filter):
Expand All @@ -30,11 +30,11 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
with contextlib.suppress(Exception):
if line_data.value.startswith("glc_"):
# Grafana Access Policy Token
decoded = base64.b64decode(line_data.value[4:])
decoded = Util.decode_base64(line_data.value[4:], padding_safe=True, urlsafe_detect=True)
keys = ["o", "n", "k", "m"]
else:
# Grafana Provisioned API Key
decoded = base64.b64decode(line_data.value)
decoded = Util.decode_base64(line_data.value, padding_safe=True, urlsafe_detect=True)
keys = ["n", "k", "id"]
if payload := json.loads(decoded):
for key in keys:
Expand Down
Loading

0 comments on commit 87e1a80

Please sign in to comment.