Skip to content

Commit 06c54e7

Browse files
authored
Line info enhancement (#660)
* line info refactoring * excel style for info * bump actions/upload-artifact * optimize --doc scan * style
1 parent 7ee8853 commit 06c54e7

22 files changed

+1619
-1577
lines changed

.github/workflows/action.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
4545
- name: CredSweeper report
4646
if: always()
47-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
47+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
4848
with:
4949
name: output.json
5050
path: output.json

.github/workflows/benchmark.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -144,21 +144,21 @@ jobs:
144144
145145
- name: Upload CredSweeper log
146146
if: always()
147-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
147+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
148148
with:
149149
name: credsweeper
150150
path: credsweeper.${{ github.event.pull_request.head.sha }}.log
151151

152152
- name: Upload CredSweeper report
153153
if: always()
154-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
154+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
155155
with:
156156
name: report
157157
path: report.${{ github.event.pull_request.head.sha }}.json
158158

159159
- name: Upload benchmark output
160160
if: always()
161-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
161+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
162162
with:
163163
name: benchmark
164164
path: benchmark.${{ github.event.pull_request.head.sha }}.log

.github/workflows/check.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ jobs:
183183
184184
- name: FLAKE 8 reports
185185
if: ${{ failure() && steps.test_flake8.conclusion == 'failure' }}
186-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
186+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
187187
with:
188188
name: flake8_report
189189
path: flake8.txt

.github/workflows/fuzz.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
5555
- name: Store coverage report
5656
if: always()
57-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
57+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
5858
with:
5959
name: htmlcov
6060
path: htmlcov
@@ -81,7 +81,7 @@ jobs:
8181
8282
- name: New corpus upload
8383
if: ${{ env.NEW_CORPUS > 0 }}
84-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
84+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
8585
with:
8686
name: new_corpus
8787
path: new_corpus
@@ -104,7 +104,7 @@ jobs:
104104
105105
- name: Crash corpus upload
106106
if: ${{ env.CRASH_CORPUS > 0 }}
107-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
107+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
108108
with:
109109
name: crash_corpus
110110
path: crash_corpus

.github/workflows/test.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ jobs:
160160
161161
- name: HTML coverage reports
162162
if: always()
163-
uses: actions/upload-artifact@ff15f0306b3f739f7b6fd43fb5d26cd321bd4de5 # v3.2.1
163+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
164164
with:
165165
name: coverage_html-${{ matrix.python-version }}
166166
path: coverage_html

credsweeper/deep_scanner/bzip2_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def data_scan(
2929
bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
3030
file_path=new_path,
3131
file_type=Util.get_extension(new_path),
32-
info=f"{data_provider.info}|BZIP2|{new_path}")
32+
info=f"{data_provider.info}|BZIP2:{new_path}")
3333
new_limit = recursive_limit_size - len(bzip2_content_provider.data)
3434
bzip2_candidates = self.recursive_scan(bzip2_content_provider, depth, new_limit)
3535
return bzip2_candidates

credsweeper/deep_scanner/deep_scanner.py

+20-12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import datetime
22
import logging
3-
from pathlib import Path
43
from typing import List, Optional, Any, Tuple, Union
54

65
from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
@@ -77,22 +76,26 @@ def scanner(self) -> Scanner:
7776
return self.__scanner
7877

7978
@staticmethod
80-
def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
79+
def get_deep_scanners(data: bytes, file_type: str, depth: int) -> List[Any]:
8180
"""Returns possibly scan methods for the data depends on content"""
8281
deep_scanners: List[Any] = []
8382
if Util.is_zip(data):
84-
deep_scanners.append(ZipScanner)
83+
if 0 < depth:
84+
deep_scanners.append(ZipScanner)
8585
# probably, there might be a docx, xlxs and so on.
8686
# It might be scanned with text representation in third-party libraries.
8787
deep_scanners.append(XlsxScanner)
8888
deep_scanners.append(DocxScanner)
8989
deep_scanners.append(PptxScanner)
9090
elif Util.is_bzip2(data):
91-
deep_scanners.append(Bzip2Scanner)
91+
if 0 < depth:
92+
deep_scanners.append(Bzip2Scanner)
9293
elif Util.is_tar(data):
93-
deep_scanners.append(TarScanner)
94+
if 0 < depth:
95+
deep_scanners.append(TarScanner)
9496
elif Util.is_gzip(data):
95-
deep_scanners.append(GzipScanner)
97+
if 0 < depth:
98+
deep_scanners.append(GzipScanner)
9699
elif Util.is_pdf(data):
97100
deep_scanners.append(PdfScanner)
98101
elif Util.is_jks(data):
@@ -113,7 +116,10 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
113116
deep_scanners.append(MxfileScanner)
114117
deep_scanners.append(XmlScanner)
115118
else:
116-
deep_scanners = [EncoderScanner, LangScanner, ByteScanner]
119+
if 0 < depth:
120+
deep_scanners.append(EncoderScanner)
121+
deep_scanners.append(LangScanner)
122+
deep_scanners.append(ByteScanner)
117123
return deep_scanners
118124

119125
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -136,27 +142,29 @@ def scan(self,
136142
if isinstance(content_provider, TextContentProvider) or isinstance(content_provider, ByteContentProvider):
137143
# Feature to scan files which might be containers
138144
data = content_provider.data
145+
info = "FILE"
139146
elif isinstance(content_provider, DiffContentProvider) and content_provider.diff:
140147
candidates = self.scanner.scan(content_provider)
141148
# Feature to scan binary diffs
142149
diff = content_provider.diff[0].get("line")
143150
# the check for legal fix mypy issue
144151
if isinstance(diff, bytes):
145152
data = diff
153+
info = "DIFF"
146154
else:
147155
logger.warning(f"Content provider {type(content_provider)} does not support deep scan")
156+
info = "NA"
148157

149158
if data:
150159
data_provider = DataContentProvider(data=data,
151160
file_path=content_provider.file_path,
152161
file_type=content_provider.file_type,
153-
info=Path(content_provider.file_path).as_posix())
162+
info=content_provider.info or info)
154163
# iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
155-
scanner_classes = self.get_deep_scanners(data, content_provider.file_type)
164+
scanner_classes = self.get_deep_scanners(data, content_provider.file_type, depth)
156165
fallback = True
157166
for scan_class in scanner_classes:
158-
if new_candidates := scan_class.data_scan(self, data_provider, depth - 1,
159-
recursive_limit_size - len(data)):
167+
if new_candidates := scan_class.data_scan(self, data_provider, depth, recursive_limit_size - len(data)):
160168
augment_candidates(candidates, new_candidates)
161169
fallback = False
162170
if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data):
@@ -196,7 +204,7 @@ def recursive_scan(
196204
else:
197205
fallback = True
198206
# iterate for all possibly scanner methods
199-
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type)
207+
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
200208
for scanner_class in scanner_classes:
201209
if new_candidates := scanner_class.data_scan(self, data_provider, depth, recursive_limit_size):
202210
augment_candidates(candidates, new_candidates)

credsweeper/deep_scanner/encoder_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def data_scan(
2222
decoded_data_provider = DataContentProvider(data=data_provider.decoded,
2323
file_path=data_provider.file_path,
2424
file_type=data_provider.file_type,
25-
info=f"{data_provider.info}|ENCODED")
25+
info=f"{data_provider.info}|BASE64")
2626
new_limit = recursive_limit_size - len(decoded_data_provider.data)
2727
return self.recursive_scan(decoded_data_provider, depth, new_limit)
2828
return None

credsweeper/deep_scanner/gzip_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def data_scan(
3131
gzip_content_provider = DataContentProvider(data=f.read(),
3232
file_path=new_path,
3333
file_type=Util.get_extension(new_path),
34-
info=f"{data_provider.info}|GZIP|{new_path}")
34+
info=f"{data_provider.info}|GZIP:{new_path}")
3535
new_limit = recursive_limit_size - len(gzip_content_provider.data)
3636
gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
3737
return gzip_candidates

credsweeper/deep_scanner/jks_scanner.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ def data_scan(
2727
if keystore.private_keys or keystore.secret_keys:
2828
candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
2929
data_provider.file_type,
30-
f"{data_provider.info}:'{pw_probe}' - has keys")
30+
f"{data_provider.info}|JKS:'{pw_probe}' - has keys")
3131
else:
32-
candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
33-
data_provider.file_type,
34-
f"{data_provider.info}:'{pw_probe}' - default password")
32+
candidate = Candidate.get_dummy_candidate(
33+
self.config, data_provider.file_path, data_provider.file_type,
34+
f"{data_provider.info}|JKS:'{pw_probe}' - default password")
3535
candidates.append(candidate)
3636
except Exception as jks_exc:
3737
logger.debug(f"{data_provider.file_path}:{pw_probe}:{jks_exc}")

credsweeper/deep_scanner/pdf_scanner.py

+9-13
Original file line numberDiff line numberDiff line change
@@ -27,35 +27,31 @@ def data_scan(
2727
# pdfminer.six - splits text in table to many lines. Allows to walk through elements
2828
try:
2929
candidates = []
30-
pdf_lines = []
3130
for page in extract_pages(io.BytesIO(data_provider.data), laparams=LAParams()):
3231
for element in page:
3332
if isinstance(element, LTText):
3433
element_text = element.get_text().strip()
35-
if element_text:
36-
element_candidates = []
34+
if 0 < depth and element_text:
3735
if MIN_DATA_LEN < len(element_text):
3836
pdf_content_provider = DataContentProvider(
3937
data=element_text.encode(),
4038
file_path=data_provider.file_path,
4139
file_type=data_provider.file_type,
42-
info=f"{data_provider.info}|PDF[{page.pageid}]")
40+
info=f"{data_provider.info}|PDF:{page.pageid}")
4341
new_limit = recursive_limit_size - len(pdf_content_provider.data)
4442
element_candidates = self.recursive_scan(pdf_content_provider, depth, new_limit)
4543
candidates.extend(element_candidates)
46-
if not element_candidates:
47-
# skip to decrease duplicates of candidates
48-
pdf_lines.append(element_text)
44+
else:
45+
string_data_provider = StringContentProvider(lines=[element_text],
46+
file_path=data_provider.file_path,
47+
file_type=data_provider.file_type,
48+
info=f"{data_provider.info}|PDF:{page.pageid}")
49+
pdf_candidates = self.scanner.scan(string_data_provider)
50+
candidates.extend(pdf_candidates)
4951
elif isinstance(element, LTItem):
5052
pass
5153
else:
5254
logger.error(f"Unsupported {element}")
53-
string_data_provider = StringContentProvider(lines=pdf_lines,
54-
file_path=data_provider.file_path,
55-
file_type=data_provider.file_type,
56-
info=f"{data_provider.info}|PDF[{page.pageid}]")
57-
pdf_candidates = self.scanner.scan(string_data_provider)
58-
candidates.extend(pdf_candidates)
5955
return candidates
6056
except Exception as pdf_exc:
6157
logger.error(f"{data_provider.file_path}:{pdf_exc}")

credsweeper/deep_scanner/pkcs12_scanner.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,13 @@ def data_scan(
3131
self.config, #
3232
data_provider.file_path, #
3333
data_provider.file_type, #
34-
f"{data_provider.info}:'{pw_probe.decode()}' - has keys PKCS12")
34+
f"{data_provider.info}|PKCS12:'{pw_probe.decode()}' - has keys PKCS12")
3535
else:
3636
candidate = Candidate.get_dummy_candidate(
3737
self.config, #
3838
data_provider.file_path, #
3939
data_provider.file_type, #
40-
f"{data_provider.info}:'{pw_probe.decode()}' - default password PKCS12")
40+
f"{data_provider.info}|PKCS12:'{pw_probe.decode()}' - default password PKCS12")
4141
candidates.append(candidate)
4242
except Exception as pkcs_exc:
4343
logger.debug(f"{data_provider.file_path}:{pw_probe.decode()}:{pkcs_exc}")

credsweeper/deep_scanner/pptx_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def data_scan(
3434
string_data_provider = StringContentProvider(lines=pptx_lines,
3535
file_path=data_provider.file_path,
3636
file_type=data_provider.file_type,
37-
info=f"{data_provider.info}|PPTX[{n+1}]")
37+
info=f"{data_provider.info}|PPTX:{n+1}")
3838
pptx_candidates = self.scanner.scan(string_data_provider)
3939
candidates.extend(pptx_candidates)
4040
return candidates

credsweeper/deep_scanner/tar_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def data_scan(
3939
tar_content_provider = DataContentProvider(data=f.read(),
4040
file_path=data_provider.file_path,
4141
file_type=Util.get_extension(tfi.name),
42-
info=f"{data_provider.info}|TAR|{tfi.name}")
42+
info=f"{data_provider.info}|TAR:{tfi.name}")
4343
# Nevertheless, use extracted data size
4444
new_limit = recursive_limit_size - len(tar_content_provider.data)
4545
tar_candidates = self.recursive_scan(tar_content_provider, depth, new_limit)

credsweeper/deep_scanner/xlsx_scanner.py

+20-7
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
import pandas as pd
77

88
from credsweeper.credentials import Candidate
9+
from credsweeper.credentials.augment_candidates import augment_candidates
910
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
1011
from credsweeper.file_handler.data_content_provider import DataContentProvider
1112
from credsweeper.file_handler.string_content_provider import StringContentProvider
13+
from credsweeper.utils import Util
1214

1315
logger = logging.getLogger(__name__)
1416

@@ -26,15 +28,26 @@ def data_scan(
2628
candidates = []
2729
book = pd.read_excel(io.BytesIO(data_provider.data), sheet_name=None, header=None)
2830
for sheet_name, sheet_data in book.items():
31+
sheet_info = f"{data_provider.info}|{sheet_name}"
2932
# replace open xml carriage returns _x000D_ before line feed only
3033
df = sheet_data.replace(to_replace="_x000D_\n", value='\n', regex=True).fillna('').astype(str)
31-
sheet_lines = ['\t'.join(x) for x in df.values]
32-
string_data_provider = StringContentProvider(lines=sheet_lines,
33-
file_path=data_provider.file_path,
34-
file_type=data_provider.file_type,
35-
info=f"{data_provider.info}|xlsx:{sheet_name}")
36-
sheet_candidates = self.scanner.scan(string_data_provider)
37-
candidates.extend(sheet_candidates)
34+
for row_pos, row in enumerate(df.values):
35+
for col_pos, cell in enumerate(row):
36+
cell_info = f"{sheet_info}:{Util.get_excel_column_name(col_pos)}{row_pos + 1}"
37+
cell_provider = StringContentProvider(lines=cell.splitlines(),
38+
file_path=data_provider.file_path,
39+
file_type=data_provider.file_type,
40+
info=cell_info)
41+
cell_candidates = self.scanner.scan(cell_provider)
42+
candidates.extend(cell_candidates)
43+
row_line = '\t'.join(row)
44+
row_provider = StringContentProvider(lines=[row_line],
45+
file_path=data_provider.file_path,
46+
file_type=data_provider.file_type,
47+
info=f"{sheet_info}:R{row_pos + 1}")
48+
row_candidates = self.scanner.scan(row_provider)
49+
augment_candidates(candidates, row_candidates)
50+
3851
return candidates
3952
except Exception as xlsx_exc:
4053
logger.error(f"{data_provider.file_path}:{xlsx_exc}")

0 commit comments

Comments
 (0)