Skip to content

Commit 06c54e7

Browse files
authored
Line info enhancement (#660)
* line info refactoring * excel style for info * bump actions/upload-artifact * optimize --doc scan * style
1 parent 7ee8853 commit 06c54e7

22 files changed

+1619
-1577
lines changed

.github/workflows/action.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
4545
- name: CredSweeper report
4646
if: always()
47-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
47+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
4848
with:
4949
name: output.json
5050
path: output.json

.github/workflows/benchmark.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -144,21 +144,21 @@ jobs:
144144
145145
- name: Upload CredSweeper log
146146
if: always()
147-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
147+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
148148
with:
149149
name: credsweeper
150150
path: credsweeper.${{ github.event.pull_request.head.sha }}.log
151151

152152
- name: Upload CredSweeper report
153153
if: always()
154-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
154+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
155155
with:
156156
name: report
157157
path: report.${{ github.event.pull_request.head.sha }}.json
158158

159159
- name: Upload benchmark output
160160
if: always()
161-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
161+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
162162
with:
163163
name: benchmark
164164
path: benchmark.${{ github.event.pull_request.head.sha }}.log

.github/workflows/check.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ jobs:
183183
184184
- name: FLAKE 8 reports
185185
if: ${{ failure() && steps.test_flake8.conclusion == 'failure' }}
186-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
186+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
187187
with:
188188
name: flake8_report
189189
path: flake8.txt

.github/workflows/fuzz.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
5555
- name: Store coverage report
5656
if: always()
57-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
57+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
5858
with:
5959
name: htmlcov
6060
path: htmlcov
@@ -81,7 +81,7 @@ jobs:
8181
8282
- name: New corpus upload
8383
if: ${{ env.NEW_CORPUS > 0 }}
84-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
84+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
8585
with:
8686
name: new_corpus
8787
path: new_corpus
@@ -104,7 +104,7 @@ jobs:
104104
105105
- name: Crash corpus upload
106106
if: ${{ env.CRASH_CORPUS > 0 }}
107-
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
107+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
108108
with:
109109
name: crash_corpus
110110
path: crash_corpus

.github/workflows/test.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ jobs:
160160
161161
- name: HTML coverage reports
162162
if: always()
163-
uses: actions/upload-artifact@ff15f0306b3f739f7b6fd43fb5d26cd321bd4de5 # v3.2.1
163+
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
164164
with:
165165
name: coverage_html-${{ matrix.python-version }}
166166
path: coverage_html

credsweeper/deep_scanner/bzip2_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def data_scan(
2929
bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
3030
file_path=new_path,
3131
file_type=Util.get_extension(new_path),
32-
info=f"{data_provider.info}|BZIP2|{new_path}")
32+
info=f"{data_provider.info}|BZIP2:{new_path}")
3333
new_limit = recursive_limit_size - len(bzip2_content_provider.data)
3434
bzip2_candidates = self.recursive_scan(bzip2_content_provider, depth, new_limit)
3535
return bzip2_candidates

credsweeper/deep_scanner/deep_scanner.py

+20-12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import datetime
22
import logging
3-
from pathlib import Path
43
from typing import List, Optional, Any, Tuple, Union
54

65
from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
@@ -77,22 +76,26 @@ def scanner(self) -> Scanner:
7776
return self.__scanner
7877

7978
@staticmethod
80-
def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
79+
def get_deep_scanners(data: bytes, file_type: str, depth: int) -> List[Any]:
8180
"""Returns possibly scan methods for the data depends on content"""
8281
deep_scanners: List[Any] = []
8382
if Util.is_zip(data):
84-
deep_scanners.append(ZipScanner)
83+
if 0 < depth:
84+
deep_scanners.append(ZipScanner)
8585
# probably, there might be a docx, xlxs and so on.
8686
# It might be scanned with text representation in third-party libraries.
8787
deep_scanners.append(XlsxScanner)
8888
deep_scanners.append(DocxScanner)
8989
deep_scanners.append(PptxScanner)
9090
elif Util.is_bzip2(data):
91-
deep_scanners.append(Bzip2Scanner)
91+
if 0 < depth:
92+
deep_scanners.append(Bzip2Scanner)
9293
elif Util.is_tar(data):
93-
deep_scanners.append(TarScanner)
94+
if 0 < depth:
95+
deep_scanners.append(TarScanner)
9496
elif Util.is_gzip(data):
95-
deep_scanners.append(GzipScanner)
97+
if 0 < depth:
98+
deep_scanners.append(GzipScanner)
9699
elif Util.is_pdf(data):
97100
deep_scanners.append(PdfScanner)
98101
elif Util.is_jks(data):
@@ -113,7 +116,10 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
113116
deep_scanners.append(MxfileScanner)
114117
deep_scanners.append(XmlScanner)
115118
else:
116-
deep_scanners = [EncoderScanner, LangScanner, ByteScanner]
119+
if 0 < depth:
120+
deep_scanners.append(EncoderScanner)
121+
deep_scanners.append(LangScanner)
122+
deep_scanners.append(ByteScanner)
117123
return deep_scanners
118124

119125
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -136,27 +142,29 @@ def scan(self,
136142
if isinstance(content_provider, TextContentProvider) or isinstance(content_provider, ByteContentProvider):
137143
# Feature to scan files which might be containers
138144
data = content_provider.data
145+
info = "FILE"
139146
elif isinstance(content_provider, DiffContentProvider) and content_provider.diff:
140147
candidates = self.scanner.scan(content_provider)
141148
# Feature to scan binary diffs
142149
diff = content_provider.diff[0].get("line")
143150
# the check for legal fix mypy issue
144151
if isinstance(diff, bytes):
145152
data = diff
153+
info = "DIFF"
146154
else:
147155
logger.warning(f"Content provider {type(content_provider)} does not support deep scan")
156+
info = "NA"
148157

149158
if data:
150159
data_provider = DataContentProvider(data=data,
151160
file_path=content_provider.file_path,
152161
file_type=content_provider.file_type,
153-
info=Path(content_provider.file_path).as_posix())
162+
info=content_provider.info or info)
154163
# iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
155-
scanner_classes = self.get_deep_scanners(data, content_provider.file_type)
164+
scanner_classes = self.get_deep_scanners(data, content_provider.file_type, depth)
156165
fallback = True
157166
for scan_class in scanner_classes:
158-
if new_candidates := scan_class.data_scan(self, data_provider, depth - 1,
159-
recursive_limit_size - len(data)):
167+
if new_candidates := scan_class.data_scan(self, data_provider, depth, recursive_limit_size - len(data)):
160168
augment_candidates(candidates, new_candidates)
161169
fallback = False
162170
if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data):
@@ -196,7 +204,7 @@ def recursive_scan(
196204
else:
197205
fallback = True
198206
# iterate for all possibly scanner methods
199-
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type)
207+
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
200208
for scanner_class in scanner_classes:
201209
if new_candidates := scanner_class.data_scan(self, data_provider, depth, recursive_limit_size):
202210
augment_candidates(candidates, new_candidates)

credsweeper/deep_scanner/encoder_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def data_scan(
2222
decoded_data_provider = DataContentProvider(data=data_provider.decoded,
2323
file_path=data_provider.file_path,
2424
file_type=data_provider.file_type,
25-
info=f"{data_provider.info}|ENCODED")
25+
info=f"{data_provider.info}|BASE64")
2626
new_limit = recursive_limit_size - len(decoded_data_provider.data)
2727
return self.recursive_scan(decoded_data_provider, depth, new_limit)
2828
return None

credsweeper/deep_scanner/gzip_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def data_scan(
3131
gzip_content_provider = DataContentProvider(data=f.read(),
3232
file_path=new_path,
3333
file_type=Util.get_extension(new_path),
34-
info=f"{data_provider.info}|GZIP|{new_path}")
34+
info=f"{data_provider.info}|GZIP:{new_path}")
3535
new_limit = recursive_limit_size - len(gzip_content_provider.data)
3636
gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
3737
return gzip_candidates

credsweeper/deep_scanner/jks_scanner.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ def data_scan(
2727
if keystore.private_keys or keystore.secret_keys:
2828
candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
2929
data_provider.file_type,
30-
f"{data_provider.info}:'{pw_probe}' - has keys")
30+
f"{data_provider.info}|JKS:'{pw_probe}' - has keys")
3131
else:
32-
candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
33-
data_provider.file_type,
34-
f"{data_provider.info}:'{pw_probe}' - default password")
32+
candidate = Candidate.get_dummy_candidate(
33+
self.config, data_provider.file_path, data_provider.file_type,
34+
f"{data_provider.info}|JKS:'{pw_probe}' - default password")
3535
candidates.append(candidate)
3636
except Exception as jks_exc:
3737
logger.debug(f"{data_provider.file_path}:{pw_probe}:{jks_exc}")

credsweeper/deep_scanner/pdf_scanner.py

+9-13
Original file line numberDiff line numberDiff line change
@@ -27,35 +27,31 @@ def data_scan(
2727
# pdfminer.six - splits text in table to many lines. Allows to walk through elements
2828
try:
2929
candidates = []
30-
pdf_lines = []
3130
for page in extract_pages(io.BytesIO(data_provider.data), laparams=LAParams()):
3231
for element in page:
3332
if isinstance(element, LTText):
3433
element_text = element.get_text().strip()
35-
if element_text:
36-
element_candidates = []
34+
if 0 < depth and element_text:
3735
if MIN_DATA_LEN < len(element_text):
3836
pdf_content_provider = DataContentProvider(
3937
data=element_text.encode(),
4038
file_path=data_provider.file_path,
4139
file_type=data_provider.file_type,
42-
info=f"{data_provider.info}|PDF[{page.pageid}]")
40+
info=f"{data_provider.info}|PDF:{page.pageid}")
4341
new_limit = recursive_limit_size - len(pdf_content_provider.data)
4442
element_candidates = self.recursive_scan(pdf_content_provider, depth, new_limit)
4543
candidates.extend(element_candidates)
46-
if not element_candidates:
47-
# skip to decrease duplicates of candidates
48-
pdf_lines.append(element_text)
44+
else:
45+
string_data_provider = StringContentProvider(lines=[element_text],
46+
file_path=data_provider.file_path,
47+
file_type=data_provider.file_type,
48+
info=f"{data_provider.info}|PDF:{page.pageid}")
49+
pdf_candidates = self.scanner.scan(string_data_provider)
50+
candidates.extend(pdf_candidates)
4951
elif isinstance(element, LTItem):
5052
pass
5153
else:
5254
logger.error(f"Unsupported {element}")
53-
string_data_provider = StringContentProvider(lines=pdf_lines,
54-
file_path=data_provider.file_path,
55-
file_type=data_provider.file_type,
56-
info=f"{data_provider.info}|PDF[{page.pageid}]")
57-
pdf_candidates = self.scanner.scan(string_data_provider)
58-
candidates.extend(pdf_candidates)
5955
return candidates
6056
except Exception as pdf_exc:
6157
logger.error(f"{data_provider.file_path}:{pdf_exc}")

credsweeper/deep_scanner/pkcs12_scanner.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,13 @@ def data_scan(
3131
self.config, #
3232
data_provider.file_path, #
3333
data_provider.file_type, #
34-
f"{data_provider.info}:'{pw_probe.decode()}' - has keys PKCS12")
34+
f"{data_provider.info}|PKCS12:'{pw_probe.decode()}' - has keys PKCS12")
3535
else:
3636
candidate = Candidate.get_dummy_candidate(
3737
self.config, #
3838
data_provider.file_path, #
3939
data_provider.file_type, #
40-
f"{data_provider.info}:'{pw_probe.decode()}' - default password PKCS12")
40+
f"{data_provider.info}|PKCS12:'{pw_probe.decode()}' - default password PKCS12")
4141
candidates.append(candidate)
4242
except Exception as pkcs_exc:
4343
logger.debug(f"{data_provider.file_path}:{pw_probe.decode()}:{pkcs_exc}")

credsweeper/deep_scanner/pptx_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def data_scan(
3434
string_data_provider = StringContentProvider(lines=pptx_lines,
3535
file_path=data_provider.file_path,
3636
file_type=data_provider.file_type,
37-
info=f"{data_provider.info}|PPTX[{n+1}]")
37+
info=f"{data_provider.info}|PPTX:{n+1}")
3838
pptx_candidates = self.scanner.scan(string_data_provider)
3939
candidates.extend(pptx_candidates)
4040
return candidates

credsweeper/deep_scanner/tar_scanner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def data_scan(
3939
tar_content_provider = DataContentProvider(data=f.read(),
4040
file_path=data_provider.file_path,
4141
file_type=Util.get_extension(tfi.name),
42-
info=f"{data_provider.info}|TAR|{tfi.name}")
42+
info=f"{data_provider.info}|TAR:{tfi.name}")
4343
# Nevertheless, use extracted data size
4444
new_limit = recursive_limit_size - len(tar_content_provider.data)
4545
tar_candidates = self.recursive_scan(tar_content_provider, depth, new_limit)

credsweeper/deep_scanner/xlsx_scanner.py

+20-7
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
import pandas as pd
77

88
from credsweeper.credentials import Candidate
9+
from credsweeper.credentials.augment_candidates import augment_candidates
910
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
1011
from credsweeper.file_handler.data_content_provider import DataContentProvider
1112
from credsweeper.file_handler.string_content_provider import StringContentProvider
13+
from credsweeper.utils import Util
1214

1315
logger = logging.getLogger(__name__)
1416

@@ -26,15 +28,26 @@ def data_scan(
2628
candidates = []
2729
book = pd.read_excel(io.BytesIO(data_provider.data), sheet_name=None, header=None)
2830
for sheet_name, sheet_data in book.items():
31+
sheet_info = f"{data_provider.info}|{sheet_name}"
2932
# replace open xml carriage returns _x000D_ before line feed only
3033
df = sheet_data.replace(to_replace="_x000D_\n", value='\n', regex=True).fillna('').astype(str)
31-
sheet_lines = ['\t'.join(x) for x in df.values]
32-
string_data_provider = StringContentProvider(lines=sheet_lines,
33-
file_path=data_provider.file_path,
34-
file_type=data_provider.file_type,
35-
info=f"{data_provider.info}|xlsx:{sheet_name}")
36-
sheet_candidates = self.scanner.scan(string_data_provider)
37-
candidates.extend(sheet_candidates)
34+
for row_pos, row in enumerate(df.values):
35+
for col_pos, cell in enumerate(row):
36+
cell_info = f"{sheet_info}:{Util.get_excel_column_name(col_pos)}{row_pos + 1}"
37+
cell_provider = StringContentProvider(lines=cell.splitlines(),
38+
file_path=data_provider.file_path,
39+
file_type=data_provider.file_type,
40+
info=cell_info)
41+
cell_candidates = self.scanner.scan(cell_provider)
42+
candidates.extend(cell_candidates)
43+
row_line = '\t'.join(row)
44+
row_provider = StringContentProvider(lines=[row_line],
45+
file_path=data_provider.file_path,
46+
file_type=data_provider.file_type,
47+
info=f"{sheet_info}:R{row_pos + 1}")
48+
row_candidates = self.scanner.scan(row_provider)
49+
augment_candidates(candidates, row_candidates)
50+
3851
return candidates
3952
except Exception as xlsx_exc:
4053
logger.error(f"{data_provider.file_path}:{xlsx_exc}")

0 commit comments

Comments
 (0)