Filters, value sanitizer improvements (#585)

* improvements * upd * BM scores fix * Apply suggestions from code review
Samsung · Jul 16, 2024 · 16dd8ac · 16dd8ac
1 parent 9800cb6
commit 16dd8ac
Show file tree

Hide file tree

Showing 19 changed files with 528 additions and 89 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -114,15 +114,10 @@ jobs:
           ref: ${{ github.event.pull_request.head.sha }}
           path: temp/CredSweeper
 
-      - name: Patch benchmark for PR work
-        run: |
-          sed -i 's|CREDSWEEPER = "https://github.com/Samsung/CredSweeper.git"|CREDSWEEPER = "dummy://github.com/Samsung/CredSweeper.git"|' benchmark/common/constants.py
-          grep --with-filename --line-number 'dummy://github.com/Samsung/CredSweeper.git' benchmark/common/constants.py
-
       - name: Install CredSweeper
         run: |
           python -m pip install temp/CredSweeper
-          credsweeper_head=
+          python -m credsweeper --banner
 
       - name: Run CredSweeper tool
         run: |

diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -187,9 +187,9 @@ jobs:
             file_crc32_int=$((16#${file_crc32_hex}))
             crc32_int=$(( ${crc32_int} ^ ${file_crc32_int} ))
             done
-        version_with_crc="$(credsweeper --version | head -1) crc32:$(printf '%x' ${crc32_int})"
+        version_with_crc="$(python -m credsweeper --version | head -1) crc32:$(printf '%x' ${crc32_int})"
         echo "version_with_crc = '${version_with_crc}'"
-        banner=$(credsweeper --banner --path requirements.txt | head -1)
+        banner=$(python -m credsweeper --banner | head -1)
         echo "banner = '${banner}'"
         if ! [ -n "${version_with_crc}" ] && [ -n "${banner}" ] && [ "${version_with_crc}" == "${banner}" ]; then
             echo "'${version_with_crc}' != '${banner}'"

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
@@ -84,7 +84,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .jenkinsfile                1            58            1            7
 .jinja2                     1            64                         2
 .js                       659        536413          541         2645          336
-.json                     861      13670751          914        11012          143
+.json                     861      13670751          917        11012          143
 .jsp                       13          3202            1           42
 .jsx                        7           857                        19
 .jwt                        6             8            7
@@ -122,7 +122,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .mqh                        1          1023                         2
 .msg                        1         26644            1            1
 .mysql                      1            36                                      2
-.ndjson                     2          5006           72          352            2
+.ndjson                     2          5006           73          350            2
 .nix                        4           211                        12
 .nolint                     1             2                         1
 .odd                        1          1281                        57
@@ -222,23 +222,23 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      418         36162          467          920          384
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10333      16988573         8373        60441         5233
-credsweeper result_cnt : 7795, lost_cnt : 0, true_cnt : 7226, false_cnt : 569
+TOTAL:                  10333      16988573         8377        60439         5233
+credsweeper result_cnt : 7800, lost_cnt : 0, true_cnt : 7231, false_cnt : 569
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     123         3163          185         112   109     3   3345    14  0.000896  0.113821  0.995102  0.973214  0.886179  0.927660
 AWS Client ID                           168           13            0         160   160     0     13     8  0.000000  0.047619  0.955801  1.000000  0.952381  0.975610
 AWS Multi                                75           12            0          87    75    11      1     0  0.916667  0.000000  0.873563  0.872093  1.000000  0.931677
 AWS S3 Bucket                            61           25            0          87    61    24      1     0  0.960000  0.000000  0.720930  0.717647  1.000000  0.835616
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
-Auth                                    406         2726           77         371   350    21   2782    56  0.007492  0.137931  0.976005  0.943396  0.862069  0.900901
+Auth                                    407         2725           77         372   351    21   2781    56  0.007495  0.137592  0.976005  0.943548  0.862408  0.901155
 Azure Access Token                       19            0            0                 0     0      0    19            1.000000  0.000000            0.000000
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
 Bitbucket Client ID                     142         1813            9          46    27    18   1804   115  0.009879  0.809859  0.932281  0.600000  0.190141  0.288770
 Bitbucket Client Secret                 230          535           10          44    33    11    534   197  0.020183  0.856522  0.731613  0.750000  0.143478  0.240876
 Certificate                              25          459            1          21    20     1    459     5  0.002174  0.200000  0.987629  0.952381  0.800000  0.869565
-Credential                               91          155           74          87    84     3    226     7  0.013100  0.076923  0.968750  0.965517  0.923077  0.943820
+Credential                               91          155           74          90    87     3    226     4  0.013100  0.043956  0.978125  0.966667  0.956044  0.961326
 Docker Swarm Token                        2            0            0           2     2     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Dropbox App secret                       62          114            0          46    36     9    105    26  0.078947  0.419355  0.801136  0.800000  0.580645  0.672897
 Facebook Access Token                     0            1            0                 0     0      1     0  0.000000            1.000000
@@ -255,16 +255,16 @@ IPv6                                     33          131            0          3
 JSON Web Token                          284           11            2         274   271     3     10    13  0.230769  0.045775  0.946128  0.989051  0.954225  0.971326
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 14            6            0          10    10     0      6     4  0.000000  0.285714  0.800000  1.000000  0.714286  0.833333
-Key                                     483         8494          464         444   435     9   8949    48  0.001005  0.099379  0.993963  0.979730  0.900621  0.938511
+Key                                     483         8494          464         445   436     9   8949    47  0.001005  0.097308  0.994068  0.979775  0.902692  0.939655
 Nonce                                    83           53            0          85    79     6     47     4  0.113208  0.048193  0.926471  0.929412  0.951807  0.940476
 Other                                     0            0            5                 0     0      5     0  0.000000            1.000000
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1820         7475         2752        1681  1614    67  10160   206  0.006551  0.113187  0.977339  0.960143  0.886813  0.922022
+Password                               1823         7474         2752        1681  1614    67  10159   209  0.006552  0.114646  0.977094  0.960143  0.885354  0.921233
 Salt                                     42           76            2          38    38     0     78     4  0.000000  0.095238  0.966667  1.000000  0.904762  0.950000
 Secret                                 1358        28497          869        1234  1229     5  29361   129  0.000170  0.094993  0.995639  0.995948  0.905007  0.948302
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 Token                                   585         3972          439         519   511     8   4403    74  0.001814  0.126496  0.983587  0.984586  0.873504  0.925725
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
 URL Credentials                         194          125          251         184   184     0    376    10  0.000000  0.051546  0.982456  1.000000  0.948454  0.973545
-                                       8373        60441         5233        7937  7226   569  59872  1147  0.009414  0.136988  0.975063  0.927004  0.863012  0.893864
+                                       8377        60439         5233        7942  7231   569  59870  1146  0.009414  0.136803  0.975078  0.927051  0.863197  0.893985
diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -5,7 +5,7 @@
 
 class KeywordPattern:
     """Pattern set of keyword types"""
-    key_left = r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
+    key_left = r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
                r"(?P<keyword>"
     # there will be inserted a keyword
     key_right = r")" \

diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
@@ -31,6 +31,7 @@ class LineData:
     quotation_marks = ('"', "'", '`')
     comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
     bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
+    line_endings = re.compile(r"\\{1,8}[nr]")
     url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
     # some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
     # \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
@@ -180,6 +181,10 @@ def clean_bash_parameters(self) -> None:
             #  and value can be split by bash special characters
             if len(value_spl) > 1:
                 self.value = value_spl[0]
+        if ' ' not in self.value and ("\\n" in self.value or "\\r" in self.value):
+            value_whsp = self.line_endings.split(self.value)
+            if len(value_whsp) > 1:
+                self.value = value_whsp[0]
 
     def sanitize_variable(self) -> None:
         """Remove trailing spaces, dashes and quotations around the variable. Correct position."""

diff --git a/credsweeper/filters/value_atlassian_token_check.py b/credsweeper/filters/value_atlassian_token_check.py
@@ -32,8 +32,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
             if value.startswith("BBDC-"):
                 # Bitbucket HTTP Access Token
                 return ValueAtlassianTokenCheck.check_atlassian_struct(value[5:])
-            elif value.startswith("ATBB"):
+            elif value.startswith("AT"):
                 # Bitbucket App password
+                while "\\=" in value or "%3d" in value or "%3D" in value:
+                    # = sign may be escaped in URL https://www.rfc-editor.org/rfc/rfc3986
+                    value = value.replace('\\', '')
+                    value = value.replace('%3d', '=')
+                    value = value.replace('%3D', '=')
                 return ValueAtlassianTokenCheck.check_crc32_struct(value)
             else:
                 # Jira / Confluence PAT token
@@ -43,9 +48,10 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
     @staticmethod
     def check_crc32_struct(value: str) -> bool:
         """Returns False if value is valid for bitbucket app password structure 'payload:crc32'"""
-        crc32 = int(value[28:], 16)
-        data = value[:28].encode(ASCII)
-        if crc32 == binascii.crc32(data):
+        crc32 = int(value[-8:], 16)
+        data = value[:-8].encode(ASCII)
+        data_crc32 = binascii.crc32(data)
+        if crc32 == data_crc32:
             return False
         return True
 

diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py
@@ -12,6 +12,9 @@ class ValueFilePathCheck(Filter):
     Check if a value contains either '/' or ':\' separators (but not both)
     and does not have any special characters ( !$@`&*()+)
     """
+    base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
+    unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~"
+    unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"
 
     def __init__(self, config: Config = None) -> None:
         pass
@@ -30,25 +33,32 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         value = line_data.value
         contains_unix_separator = '/' in value
         if contains_unix_separator:
+            if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value:
+                # common case for url definition or aliases
+                return True
             # base64 encoded data might look like linux path
             min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
             # get minimal entropy to compare with shannon entropy of found value
             # min_entropy == 0 means that the value cannot be checked with the entropy due to high variance
-            if 0 == min_entropy or min_entropy > Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value):
-                for i in value:
-                    if i not in Chars.BASE64STD_CHARS.value:
-                        # value contains wrong BASE64STD_CHARS symbols
-                        break
-                else:
-                    # all symbols are from base64 alphabet
-                    contains_unix_separator = 1 < value.count('/')
+            for i in value:
+                if i not in self.base64_possible_set:
+                    # value contains a symbol outside the base64 and base64url alphabets, e.g. '.'
+                    break
             else:
-                # high entropy means base64 encoded data
-                contains_unix_separator = False
+                # all symbols are from base64 alphabet
+                entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
+                if 0 == min_entropy or min_entropy > entropy:
+                    contains_unix_separator = 1 < value.count('/')
+                else:
+                    # high entropy means base64 encoded data
+                    contains_unix_separator = False
+
             # low Shannon entropy suggests that the value may not be highly randomized base64 data
         contains_windows_separator = ':\\' in value
         if contains_unix_separator or contains_windows_separator:
-            for i in " !$@`&*()[]{}+=;,":
+            unusual_symbols_in_path = self.unusual_linux_symbols_in_path if contains_unix_separator \
+                else self.unusual_windows_symbols_in_path
+            for i in unusual_symbols_in_path:
                 if i in value:
                     # symbols that do not usually appear in a path
                     break

diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -220,8 +220,9 @@ def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]]
             probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
                                                             features_list)
         is_cred = probability > self.threshold
-        for i in range(len(is_cred)):
-            logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
-                         group_list[i][0])
+        if logger.isEnabledFor(logging.DEBUG):
+            for i in range(len(is_cred)):
+                logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
+                             group_list[i][0])
         # apply cast to float to avoid json export issue
         return is_cred, probability.astype(float)
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
@@ -911,9 +911,10 @@
   confidence: strong
   type: pattern
   values:
-    - (?<![0-9A-Za-z_+-])(?P<value>ATCTT3xFfGN0[a-zA-Z0-9_-]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-])
-  filter_type: TokenPattern
-  min_line_len: 183
+    - (?<![0-9A-Za-z_+-])(?P<value>ATCTT3xFfGN0[a-zA-Z0-9_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})(?![=0-9A-Za-z_+-])
+  filter_type:
+    - ValueAtlassianTokenCheck
+  min_line_len: 160
   required_substrings:
     - ATCTT3xFfGN0
   target:
@@ -997,9 +998,10 @@
   confidence: strong
   type: pattern
   values:
-    - (?<![0-9A-Za-z_+-])(?P<value>ATATT3xFfGF0[a-zA-Z0-9_-]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-])
-  filter_type: TokenPattern
-  min_line_len: 191
+    - (?<![0-9A-Za-z_+-])(?P<value>ATATT3xFfGF0[a-zA-Z0-9_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})(?![=0-9A-Za-z_+-])
+  filter_type:
+    - ValueAtlassianTokenCheck
+  min_line_len: 160
   required_substrings:
     - ATATT3xFfGF0
   target:

diff --git a/requirements.txt b/requirements.txt
@@ -13,7 +13,7 @@ PyYAML==6.0.1
 python-docx==1.1.0
 requests==2.32.0
 typing_extensions==4.9.0
-whatthepatch==1.0.5
+whatthepatch==1.0.6
 pdfminer.six==20231228
 password-strength==0.0.3.post2
 python-dateutil==2.8.2

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -7,14 +7,14 @@
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 425
-SAMPLES_CRED_LINE_COUNT: int = 442
+SAMPLES_CRED_COUNT: int = 429
+SAMPLES_CRED_LINE_COUNT: int = 446
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 383
+SAMPLES_POST_CRED_COUNT: int = 387
 
 # with option --doc
-SAMPLES_IN_DOC = 407
+SAMPLES_IN_DOC = 410
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25