Skip to content

Commit

Permalink
parsing docx in --doc mode as text
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Oct 23, 2023
1 parent e20b010 commit af55e38
Show file tree
Hide file tree
Showing 21 changed files with 358 additions and 39 deletions.
3 changes: 3 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,6 @@ ignore_missing_imports = True

[mypy-password_strength.*]
ignore_missing_imports = True

[mypy-docx.*]
ignore_missing_imports = True
1 change: 1 addition & 0 deletions credsweeper/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(self, config: Dict[str, Any]) -> None:
self.exclude_patterns: List[re.Pattern] = [re.compile(pattern) for pattern in config["exclude"]["pattern"]]
self.exclude_paths: List[str] = config["exclude"]["path"]
self.exclude_containers: List[str] = config["exclude"]["containers"]
self.exclude_documents: List[str] = config["exclude"]["documents"]
self.exclude_extensions: List[str] = config["exclude"]["extension"]
self.exclude_lines: Set[str] = set(config["exclude"].get("lines", []))
self.exclude_values: Set[str] = set(config["exclude"].get("values", []))
Expand Down
3 changes: 3 additions & 0 deletions credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from credsweeper.utils import Util
from .byte_scanner import ByteScanner
from .bzip2_scanner import Bzip2Scanner
from .docx_scanner import DocxScanner
from .encoder_scanner import EncoderScanner
from .gzip_scanner import GzipScanner
from .html_scanner import HtmlScanner
Expand All @@ -34,6 +35,7 @@
class DeepScanner(
ByteScanner, #
Bzip2Scanner, #
DocxScanner, #
EncoderScanner, #
GzipScanner, #
HtmlScanner, #
Expand Down Expand Up @@ -71,6 +73,7 @@ def get_deep_scanners(data: bytes) -> List[Any]:
deep_scanners.append(ZipScanner)
# probably, there might be a docx, xlsx and so on.
# It might be scanned with text representation in third-party libraries.
deep_scanners.append(DocxScanner)
elif Util.is_bzip2(data):
deep_scanners.append(Bzip2Scanner)
elif Util.is_tar(data):
Expand Down
44 changes: 44 additions & 0 deletions credsweeper/deep_scanner/docx_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import io
import logging
from abc import ABC
from typing import List

import docx

from credsweeper.credentials import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider

logger = logging.getLogger(__name__)


class DocxScanner(AbstractScanner, ABC):
    """Implements docx scanning"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> List[Candidate]:
        """Tries to scan the paragraph text of a DOCX document as plain text lines.

        The raw bytes of ``data_provider`` are opened with python-docx, the text
        of every paragraph is split into lines, empty lines are dropped, and the
        remaining lines are scanned through a ``StringContentProvider`` whose
        ``info`` is the original info suffixed with ``|DOCX``.

        Args:
            data_provider: provider holding the raw bytes of the document
            depth: not used by this scanner
            recursive_limit_size: not used by this scanner

        Return:
            list of candidates found in the document text; the list is empty
            when the data cannot be processed

        """
        candidates: List[Candidate] = []

        try:
            docx_lines: List[str] = []

            doc = docx.Document(io.BytesIO(data_provider.data))
            for paragraph in doc.paragraphs:
                # a paragraph may contain line breaks - scan each non-empty line separately
                docx_lines.extend(line for line in paragraph.text.splitlines() if line)

            string_data_provider = StringContentProvider(lines=docx_lines,
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|DOCX")
            docx_candidates = self.scanner.scan(string_data_provider)
            candidates.extend(docx_candidates)
        except Exception as docx_exc:
            # Deliberately best-effort: any failure while opening or scanning the data
            # is only logged at debug level, and whatever was collected is returned.
            logger.debug(f"{data_provider.file_path}:{docx_exc}")
        return candidates
3 changes: 3 additions & 0 deletions credsweeper/file_handler/file_path_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ def check_exclude_file(config: Config, path: str) -> bool:
return True
if not config.depth and file_extension in config.exclude_containers:
return True
# either --depth or --doc enables scanning of all document extensions
if not (config.depth or config.doc) and file_extension in config.exclude_documents:
return True
return False

@staticmethod
Expand Down
9 changes: 6 additions & 3 deletions credsweeper/secret/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
"containers": [
".apk",
".bz2",
".docx",
".gz",
".pdf",
".tar",
".xlsx",
".zip"
],
"documents": [
".docx",
".pdf",
".xlsx"
],
"extension": [
".7z",
".aac",
Expand Down Expand Up @@ -71,6 +73,7 @@
"/__pycache__/",
"/node_modules/",
"/target/",
"/.venv/",
"/venv/"
],
"lines": [],
Expand Down
6 changes: 6 additions & 0 deletions docs/source/overall_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ When paths to scan are entered, get the files in that paths and the files are ex
- exclude
- pattern: Regex patterns to exclude scan.
- containers: Extensions in lower case of container files which might be scanned with the --depth option
- documents: Extensions in lower case of document files which might be scanned with the --doc or --depth option
- extension: Extensions in lower case to exclude scan.
- path: Paths to exclude scan.
- source_ext: List of extensions for scanning categorized as source files.
Expand All @@ -36,6 +37,11 @@ When paths to scan are entered, get the files in that paths and the files are ex
".zip",
...
],
"documents": [
".docx",
".pdf",
...
],
"extension": [
".7z",
".jpg",
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ openpyxl==3.1.2
pandas==2.0.3
# ^ the version supports by python 3.8
PyYAML==6.0.1
python-docx==1.0.1
requests==2.31.0
schwifty==2023.9.0
typing_extensions==4.8.0
Expand Down Expand Up @@ -48,3 +49,4 @@ types-python-dateutil
types-regex
types-humanfriendly
yapf

1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"password-strength", #
"pdfminer.six", #
"PyYAML", #
"python-docx", #
"requests", #
"scipy", #
"schwifty", #
Expand Down
6 changes: 3 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 120
SAMPLES_FILES_COUNT: int = 123

# credentials count after scan
SAMPLES_CRED_COUNT: int = 383
Expand All @@ -11,10 +11,10 @@
SAMPLES_POST_CRED_COUNT: int = 293

# with option --doc
SAMPLES_IN_DOC = 426
SAMPLES_IN_DOC = 431

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 16
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 21
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 3

Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def config() -> Config:
config_dict["validation"]["api_validation"] = False
config_dict["use_filters"] = True
config_dict["find_by_ext"] = False
config_dict["exclude"]["containers"] = [".gz", ".zip"]
config_dict["exclude"]["documents"] = [".docx", ".pdf"]
config_dict["exclude"]["extension"] = [".jpg", ".bmp"]
config_dict["depth"] = 0
config_dict["doc"] = False
config_dict["find_by_ext_list"] = [".txt", ".inf"]
Expand Down
168 changes: 144 additions & 24 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -6266,30 +6266,6 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.97709,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "password = Xdj@jcN834b.",
"line_num": 2,
"path": "tests/samples/password.docx",
"info": "tests/samples/password.docx|ZIP|word/document.xml|HTML",
"value": "Xdj@jcN834b.",
"value_start": 11,
"value_end": 23,
"variable": "password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 2.8208020839342964,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down Expand Up @@ -8144,6 +8120,102 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.94412,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6",
"line_num": 1,
"path": "tests/samples/sample.docx",
"info": "tests/samples/sample.docx|ZIP|word/document.xml|XML",
"value": "WeR15tr0n6",
"value_start": 77,
"value_end": 87,
"variable": "Password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 3.321928094887362,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Github Token",
"severity": "high",
"line_data_list": [
{
"line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"line_num": 2,
"path": "tests/samples/sample.docx",
"info": "tests/samples/sample.docx|DOCX",
"value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.632263329852917,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.94412,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6",
"line_num": 1,
"path": "tests/samples/sample.docx.gz",
"info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|ZIP|word/document.xml|XML",
"value": "WeR15tr0n6",
"value_start": 77,
"value_end": 87,
"variable": "Password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 3.321928094887362,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Github Token",
"severity": "high",
"line_data_list": [
{
"line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"line_num": 2,
"path": "tests/samples/sample.docx.gz",
"info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|DOCX",
"value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.632263329852917,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down Expand Up @@ -8192,6 +8264,30 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Github Token",
"severity": "high",
"line_data_list": [
{
"line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd",
"line_num": 1,
"path": "tests/samples/sample.pdf",
"info": "tests/samples/sample.pdf|PDF:1|RAW",
"value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.732263329852917,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down Expand Up @@ -8336,6 +8432,30 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Azure Secret Value",
"severity": "high",
"line_data_list": [
{
"line": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
"line_num": 1,
"path": "tests/samples/small.pdf",
"info": "tests/samples/small.pdf|PDF:1|RAW",
"value": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.620007704961091,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down
Loading

0 comments on commit af55e38

Please sign in to comment.