Skip to content

[SP-2587] Add directory simhash, modify concatenated names to remove extensions #119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Upcoming changes...

## [1.27.0] - 2025-06-30
### Added
- Add directory hash calculation to folder hasher
- Add rank-threshold option to folder scan command

## [1.26.3] - 2025-06-26
### Fixed
- Fixed crash in inspect subcommand when processing components that lack license information
Expand Down Expand Up @@ -570,4 +575,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[1.26.0]: https://github.com/scanoss/scanoss.py/compare/v1.25.2...v1.26.0
[1.26.1]: https://github.com/scanoss/scanoss.py/compare/v1.26.0...v1.26.1
[1.26.2]: https://github.com/scanoss/scanoss.py/compare/v1.26.1...v1.26.2
[1.26.3]: https://github.com/scanoss/scanoss.py/compare/v1.26.2...v1.26.3
[1.26.3]: https://github.com/scanoss/scanoss.py/compare/v1.26.2...v1.26.3
[1.27.0]: https://github.com/scanoss/scanoss.py/compare/v1.26.3...v1.27.0
9 changes: 9 additions & 0 deletions CLIENT_HELP.md
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,15 @@ The new `folder-scan` subcommand performs a comprehensive scan on an entire dire
scanoss-py folder-scan /path/to/folder -o folder-scan-results.json
```

**Options:**
- `--rank-threshold`: Only show results whose rank is at or below this threshold (e.g., `--rank-threshold 3` returns results with rank 1, 2, or 3). Lower rank values indicate higher-quality matches. Default: 5.
- `--format`: Result output format (json or cyclonedx, default: json)

**Example with rank threshold:**
```shell
scanoss-py folder-scan /path/to/folder --rank-threshold 3 -o folder-scan-results.json
```

### Container-Scan a Docker Image

The `container-scan` subcommand allows you to scan Docker container images for dependencies. This command extracts and analyzes dependencies from container images, helping you identify open source components within containerized applications.
Expand Down
8 changes: 3 additions & 5 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,11 @@ Performs a comprehensive scan of a directory using folder hashing to identify co
* - --output <file name>, -o <file name>
- Output result file name (optional - default STDOUT)
* - --format <format>, -f <format>
- Output format: {json} (optional - default json)
- Output format: {json, cyclonedx} (optional - default json)
* - --timeout <seconds>, -M <seconds>
- Timeout in seconds for API communication (optional - default 600)
* - --best-match, -bm
- Enable best match mode (optional - default: False)
* - --threshold <1-100>
- Threshold for result matching (optional - default: 100)
* - --rank-threshold <number>
- Only show results whose rank is at or below this threshold (e.g., --rank-threshold 3 returns results with rank 1, 2, or 3). Lower rank values indicate higher-quality matches (optional - default 5)
* - --settings <file>, -st <file>
- Settings file to use for scanning (optional - default scanoss.json)
* - --skip-settings-file, -stf
Expand Down
2 changes: 1 addition & 1 deletion src/scanoss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
THE SOFTWARE.
"""

__version__ = '1.26.3'
__version__ = '1.27.0'
52 changes: 29 additions & 23 deletions src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 16 additions & 25 deletions src/scanoss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from .components import Components
from .constants import (
DEFAULT_API_TIMEOUT,
DEFAULT_HFH_RANK_THRESHOLD,
DEFAULT_POST_SIZE,
DEFAULT_RETRY,
DEFAULT_TIMEOUT,
Expand Down Expand Up @@ -623,24 +624,16 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
'--format',
'-f',
type=str,
choices=['json'],
choices=['json', 'cyclonedx'],
default='json',
help='Result output format (optional - default: json)',
)
p_folder_scan.add_argument(
'--best-match',
'-bm',
action='store_true',
default=False,
help='Enable best match mode (optional - default: False)',
)
p_folder_scan.add_argument(
'--threshold',
'--rank-threshold',
type=int,
choices=range(1, 101),
metavar='1-100',
default=100,
help='Threshold for result matching (optional - default: 100)',
default=DEFAULT_HFH_RANK_THRESHOLD,
help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 '
'returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.',
)
p_folder_scan.set_defaults(func=folder_hashing_scan)

Expand Down Expand Up @@ -1455,7 +1448,7 @@ def utils_certloc(*_):
Run the "utils certloc" sub-command
:param _: ignored/unused
"""
import certifi # noqa: PLC0415,I001
import certifi # noqa: PLC0415,I001

print(f'CA Cert File: {certifi.where()}')

Expand All @@ -1466,11 +1459,11 @@ def utils_cert_download(_, args): # pylint: disable=PLR0912 # noqa: PLR0912
:param _: ignore/unused
:param args: Parsed arguments
"""
import socket # noqa: PLC0415,I001
import traceback # noqa: PLC0415,I001
from urllib.parse import urlparse # noqa: PLC0415,I001
import socket # noqa: PLC0415,I001
import traceback # noqa: PLC0415,I001
from urllib.parse import urlparse # noqa: PLC0415,I001

from OpenSSL import SSL, crypto # noqa: PLC0415,I001
from OpenSSL import SSL, crypto # noqa: PLC0415,I001

file = sys.stdout
if args.output:
Expand Down Expand Up @@ -1518,7 +1511,7 @@ def utils_pac_proxy(_, args):
:param _: ignore/unused
:param args: Parsed arguments
"""
from pypac.resolver import ProxyResolver # noqa: PLC0415,I001
from pypac.resolver import ProxyResolver # noqa: PLC0415,I001

if not args.pac:
print_stderr('Error: No pac file option specified.')
Expand Down Expand Up @@ -1592,7 +1585,7 @@ def crypto_algorithms(parser, args):
sys.exit(1)
except Exception as e:
if args.debug:
import traceback # noqa: PLC0415,I001
import traceback # noqa: PLC0415,I001

traceback.print_exc()
print_stderr(f'ERROR: {e}')
Expand Down Expand Up @@ -1634,7 +1627,7 @@ def crypto_hints(parser, args):
sys.exit(1)
except Exception as e:
if args.debug:
import traceback # noqa: PLC0415,I001
import traceback # noqa: PLC0415,I001

traceback.print_exc()
print_stderr(f'ERROR: {e}')
Expand Down Expand Up @@ -1676,7 +1669,7 @@ def crypto_versions_in_range(parser, args):
sys.exit(1)
except Exception as e:
if args.debug:
import traceback # noqa: PLC0415,I001
import traceback # noqa: PLC0415,I001

traceback.print_exc()
print_stderr(f'ERROR: {e}')
Expand Down Expand Up @@ -1965,11 +1958,9 @@ def folder_hashing_scan(parser, args):
config=scanner_config,
client=client,
scanoss_settings=scanoss_settings,
rank_threshold=args.rank_threshold,
)

scanner.best_match = args.best_match
scanner.threshold = args.threshold

if scanner.scan():
scanner.present(output_file=args.output, output_format=args.format)
except ScanossGrpcError as e:
Expand Down
2 changes: 2 additions & 0 deletions src/scanoss/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
DEFAULT_URL2 = 'https://api.scanoss.com' # default premium service URL

DEFAULT_API_TIMEOUT = 600

DEFAULT_HFH_RANK_THRESHOLD = 5
9 changes: 7 additions & 2 deletions src/scanoss/file_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import os
import sys
from pathlib import Path
from typing import List
from typing import List, Optional

from pathspec import GitIgnoreSpec

Expand Down Expand Up @@ -511,7 +511,7 @@ def get_filtered_files_from_folder(self, root: str) -> List[str]:
# Now filter the files and return the reduced list
return self.get_filtered_files_from_files(all_files, str(root_path))

def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]:
def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]:
"""
Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.

Expand Down Expand Up @@ -615,8 +615,13 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
# Default patterns for skipping directories
if not self.all_folders:
DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
DEFAULT_SKIPPED_DIR_EXT_LIST = (
DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT
)
for dir_name in DEFAULT_SKIPPED_DIR_LIST:
patterns.append(f'{dir_name}/')
for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST:
patterns.append(f'*{dir_extension}/')

# Custom patterns added in SCANOSS settings file
if self.scanoss_settings:
Expand Down
Loading