diff --git a/CHANGELOG.md b/CHANGELOG.md index 113cbca6..e14cfc90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Upcoming changes... +## [1.27.0] - 2025-06-30 +### Added +- Add directory hash calculation to folder hasher +- Add rank-threshold option to folder scan command + ## [1.26.3] - 2025-06-26 ### Fixed - Fixed crash in inspect subcommand when processing components that lack license information @@ -570,4 +575,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [1.26.0]: https://github.com/scanoss/scanoss.py/compare/v1.25.2...v1.26.0 [1.26.1]: https://github.com/scanoss/scanoss.py/compare/v1.26.0...v1.26.1 [1.26.2]: https://github.com/scanoss/scanoss.py/compare/v1.26.1...v1.26.2 -[1.26.3]: https://github.com/scanoss/scanoss.py/compare/v1.26.2...v1.26.3 \ No newline at end of file +[1.26.3]: https://github.com/scanoss/scanoss.py/compare/v1.26.2...v1.26.3 +[1.27.0]: https://github.com/scanoss/scanoss.py/compare/v1.26.3...v1.27.0 diff --git a/CLIENT_HELP.md b/CLIENT_HELP.md index 16e6aef0..12e2f164 100644 --- a/CLIENT_HELP.md +++ b/CLIENT_HELP.md @@ -485,6 +485,15 @@ The new `folder-scan` subcommand performs a comprehensive scan on an entire dire scanoss-py folder-scan /path/to/folder -o folder-scan-results.json ``` +**Options:** +- `--rank-threshold`: Filter results to only show those with rank value at or below this threshold (e.g., `--rank-threshold 3` returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches. +- `--format`: Result output format (json or cyclonedx, default: json) + +**Example with rank threshold:** +```shell +scanoss-py folder-scan /path/to/folder --rank-threshold 3 -o folder-scan-results.json +``` + ### Container-Scan a Docker Image The `container-scan` subcommand allows you to scan Docker container images for dependencies. 
This command extracts and analyzes dependencies from container images, helping you identify open source components within containerized applications. diff --git a/docs/source/index.rst b/docs/source/index.rst index f70f93d7..eb6a98f8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -249,13 +249,11 @@ Performs a comprehensive scan of a directory using folder hashing to identify co * - --output , -o - Output result file name (optional - default STDOUT) * - --format , -f - - Output format: {json} (optional - default json) + - Output format: {json, cyclonedx} (optional - default json) * - --timeout , -M - Timeout in seconds for API communication (optional - default 600) - * - --best-match, -bm - - Enable best match mode (optional - default: False) - * - --threshold <1-100> - - Threshold for result matching (optional - default: 100) + * - --rank-threshold + - Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches. * - --settings , -st - Settings file to use for scanning (optional - default scanoss.json) * - --skip-settings-file, -stf diff --git a/src/scanoss/__init__.py b/src/scanoss/__init__.py index 55786eb0..b6b742b4 100644 --- a/src/scanoss/__init__.py +++ b/src/scanoss/__init__.py @@ -22,4 +22,4 @@ THE SOFTWARE. 
""" -__version__ = '1.26.3' +__version__ = '1.27.0' diff --git a/src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py b/src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py index b81bf7e9..114a0bf8 100644 --- a/src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py +++ b/src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py @@ -16,28 +16,34 @@ from protoc_gen_swagger.options import annotations_pb2 as protoc__gen__swagger_dot_options_dot_annotations__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n.scanoss/api/scanning/v2/scanoss-scanning.proto\x12\x17scanoss.api.scanning.v2\x1a*scanoss/api/common/v2/scanoss-common.proto\x1a\x1cgoogle/api/annotations.proto\x1a,protoc-gen-swagger/options/annotations.proto\"\xff\x01\n\nHFHRequest\x12\x12\n\nbest_match\x18\x01 \x01(\x08\x12\x11\n\tthreshold\x18\x02 \x01(\x05\x12:\n\x04root\x18\x03 \x01(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\x1a\x8d\x01\n\x08\x43hildren\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x16\n\x0esim_hash_names\x18\x02 \x01(\t\x12\x18\n\x10sim_hash_content\x18\x03 \x01(\t\x12>\n\x08\x63hildren\x18\x04 \x03(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\"\xc1\x02\n\x0bHFHResponse\x12<\n\x07results\x18\x01 \x03(\x0b\x32+.scanoss.api.scanning.v2.HFHResponse.Result\x12\x35\n\x06status\x18\x02 \x01(\x0b\x32%.scanoss.api.common.v2.StatusResponse\x1a\x39\n\tComponent\x12\x0c\n\x04purl\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\t\x12\x0c\n\x04rank\x18\x03 \x01(\x05\x1a\x81\x01\n\x06Result\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x42\n\ncomponents\x18\x02 \x03(\x0b\x32..scanoss.api.scanning.v2.HFHResponse.Component\x12\x13\n\x0bprobability\x18\x03 \x01(\x02\x12\r\n\x05stage\x18\x04 \x01(\x05\x32\x81\x02\n\x08Scanning\x12q\n\x04\x45\x63ho\x12\".scanoss.api.common.v2.EchoRequest\x1a#.scanoss.api.common.v2.EchoResponse\" 
\x82\xd3\xe4\x93\x02\x1a\"\x15/api/v2/scanning/echo:\x01*\x12\x81\x01\n\x0e\x46olderHashScan\x12#.scanoss.api.scanning.v2.HFHRequest\x1a$.scanoss.api.scanning.v2.HFHResponse\"$\x82\xd3\xe4\x93\x02\x1e\"\x19/api/v2/scanning/hfh/scan:\x01*B\x8a\x02Z1github.com/scanoss/papi/api/scanningv2;scanningv2\x92\x41\xd3\x01\x12m\n\x18SCANOSS Scanning Service\"L\n\x10scanoss-scanning\x12#https://github.com/scanoss/scanning\x1a\x13support@scanoss.com2\x03\x32.0*\x01\x01\x32\x10\x61pplication/json:\x10\x61pplication/jsonR;\n\x03\x34\x30\x34\x12\x34\n*Returned when the resource does not exist.\x12\x06\n\x04\x9a\x02\x01\x07\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n.scanoss/api/scanning/v2/scanoss-scanning.proto\x12\x17scanoss.api.scanning.v2\x1a*scanoss/api/common/v2/scanoss-common.proto\x1a\x1cgoogle/api/annotations.proto\x1a,protoc-gen-swagger/options/annotations.proto\"\xc5\x03\n\nHFHRequest\x12:\n\x04root\x18\x01 \x01(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\x12\x16\n\x0erank_threshold\x18\x02 \x01(\x05\x12\x10\n\x08\x63\x61tegory\x18\x03 \x01(\t\x12\x13\n\x0bquery_limit\x18\x04 \x01(\x05\x1a\xbb\x02\n\x08\x43hildren\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x16\n\x0esim_hash_names\x18\x02 \x01(\t\x12\x18\n\x10sim_hash_content\x18\x03 \x01(\t\x12>\n\x08\x63hildren\x18\x04 \x03(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\x12\x1a\n\x12sim_hash_dir_names\x18\x05 \x01(\t\x12Y\n\x0flang_extensions\x18\x06 \x03(\x0b\x32@.scanoss.api.scanning.v2.HFHRequest.Children.LangExtensionsEntry\x1a\x35\n\x13LangExtensionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x05:\x02\x38\x01\"\xa3\x03\n\x0bHFHResponse\x12<\n\x07results\x18\x01 \x03(\x0b\x32+.scanoss.api.scanning.v2.HFHResponse.Result\x12\x35\n\x06status\x18\x02 \x01(\x0b\x32%.scanoss.api.common.v2.StatusResponse\x1a)\n\x07Version\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x1a\x94\x01\n\tComponent\x12\x0c\n\x04purl\x18\x01 
\x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0e\n\x06vendor\x18\x03 \x01(\t\x12>\n\x08versions\x18\x04 \x03(\x0b\x32,.scanoss.api.scanning.v2.HFHResponse.Version\x12\x0c\n\x04rank\x18\x05 \x01(\x05\x12\r\n\x05order\x18\x06 \x01(\x05\x1a]\n\x06Result\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x42\n\ncomponents\x18\x02 \x03(\x0b\x32..scanoss.api.scanning.v2.HFHResponse.Component2\x81\x02\n\x08Scanning\x12q\n\x04\x45\x63ho\x12\".scanoss.api.common.v2.EchoRequest\x1a#.scanoss.api.common.v2.EchoResponse\" \x82\xd3\xe4\x93\x02\x1a\"\x15/api/v2/scanning/echo:\x01*\x12\x81\x01\n\x0e\x46olderHashScan\x12#.scanoss.api.scanning.v2.HFHRequest\x1a$.scanoss.api.scanning.v2.HFHResponse\"$\x82\xd3\xe4\x93\x02\x1e\"\x19/api/v2/scanning/hfh/scan:\x01*B\x8a\x02Z1github.com/scanoss/papi/api/scanningv2;scanningv2\x92\x41\xd3\x01\x12m\n\x18SCANOSS Scanning Service\"L\n\x10scanoss-scanning\x12#https://github.com/scanoss/scanning\x1a\x13support@scanoss.com2\x03\x32.0*\x01\x01\x32\x10\x61pplication/json:\x10\x61pplication/jsonR;\n\x03\x34\x30\x34\x12\x34\n*Returned when the resource does not exist.\x12\x06\n\x04\x9a\x02\x01\x07\x62\x06proto3') -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'scanoss.api.scanning.v2.scanoss_scanning_pb2', globals()) -if _descriptor._USE_C_DESCRIPTORS == False: - - DESCRIPTOR._options = None - DESCRIPTOR._serialized_options = b'Z1github.com/scanoss/papi/api/scanningv2;scanningv2\222A\323\001\022m\n\030SCANOSS Scanning Service\"L\n\020scanoss-scanning\022#https://github.com/scanoss/scanning\032\023support@scanoss.com2\0032.0*\001\0012\020application/json:\020application/jsonR;\n\003404\0224\n*Returned when the resource does not exist.\022\006\n\004\232\002\001\007' - _SCANNING.methods_by_name['Echo']._options = None - _SCANNING.methods_by_name['Echo']._serialized_options = b'\202\323\344\223\002\032\"\025/api/v2/scanning/echo:\001*' - _SCANNING.methods_by_name['FolderHashScan']._options = 
None - _SCANNING.methods_by_name['FolderHashScan']._serialized_options = b'\202\323\344\223\002\036\"\031/api/v2/scanning/hfh/scan:\001*' - _HFHREQUEST._serialized_start=196 - _HFHREQUEST._serialized_end=451 - _HFHREQUEST_CHILDREN._serialized_start=310 - _HFHREQUEST_CHILDREN._serialized_end=451 - _HFHRESPONSE._serialized_start=454 - _HFHRESPONSE._serialized_end=775 - _HFHRESPONSE_COMPONENT._serialized_start=586 - _HFHRESPONSE_COMPONENT._serialized_end=643 - _HFHRESPONSE_RESULT._serialized_start=646 - _HFHRESPONSE_RESULT._serialized_end=775 - _SCANNING._serialized_start=778 - _SCANNING._serialized_end=1035 +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'scanoss.api.scanning.v2.scanoss_scanning_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'Z1github.com/scanoss/papi/api/scanningv2;scanningv2\222A\323\001\022m\n\030SCANOSS Scanning Service\"L\n\020scanoss-scanning\022#https://github.com/scanoss/scanning\032\023support@scanoss.com2\0032.0*\001\0012\020application/json:\020application/jsonR;\n\003404\0224\n*Returned when the resource does not exist.\022\006\n\004\232\002\001\007' + _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._loaded_options = None + _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._serialized_options = b'8\001' + _globals['_SCANNING'].methods_by_name['Echo']._loaded_options = None + _globals['_SCANNING'].methods_by_name['Echo']._serialized_options = b'\202\323\344\223\002\032\"\025/api/v2/scanning/echo:\001*' + _globals['_SCANNING'].methods_by_name['FolderHashScan']._loaded_options = None + _globals['_SCANNING'].methods_by_name['FolderHashScan']._serialized_options = b'\202\323\344\223\002\036\"\031/api/v2/scanning/hfh/scan:\001*' + _globals['_HFHREQUEST']._serialized_start=196 + _globals['_HFHREQUEST']._serialized_end=649 + 
_globals['_HFHREQUEST_CHILDREN']._serialized_start=334 + _globals['_HFHREQUEST_CHILDREN']._serialized_end=649 + _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._serialized_start=596 + _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._serialized_end=649 + _globals['_HFHRESPONSE']._serialized_start=652 + _globals['_HFHRESPONSE']._serialized_end=1071 + _globals['_HFHRESPONSE_VERSION']._serialized_start=784 + _globals['_HFHRESPONSE_VERSION']._serialized_end=825 + _globals['_HFHRESPONSE_COMPONENT']._serialized_start=828 + _globals['_HFHRESPONSE_COMPONENT']._serialized_end=976 + _globals['_HFHRESPONSE_RESULT']._serialized_start=978 + _globals['_HFHRESPONSE_RESULT']._serialized_end=1071 + _globals['_SCANNING']._serialized_start=1074 + _globals['_SCANNING']._serialized_end=1331 # @@protoc_insertion_point(module_scope) diff --git a/src/scanoss/cli.py b/src/scanoss/cli.py index 418a68e8..fc618eb4 100644 --- a/src/scanoss/cli.py +++ b/src/scanoss/cli.py @@ -54,6 +54,7 @@ from .components import Components from .constants import ( DEFAULT_API_TIMEOUT, + DEFAULT_HFH_RANK_THRESHOLD, DEFAULT_POST_SIZE, DEFAULT_RETRY, DEFAULT_TIMEOUT, @@ -623,24 +624,16 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915 '--format', '-f', type=str, - choices=['json'], + choices=['json', 'cyclonedx'], default='json', help='Result output format (optional - default: json)', ) p_folder_scan.add_argument( - '--best-match', - '-bm', - action='store_true', - default=False, - help='Enable best match mode (optional - default: False)', - ) - p_folder_scan.add_argument( - '--threshold', + '--rank-threshold', type=int, - choices=range(1, 101), - metavar='1-100', - default=100, - help='Threshold for result matching (optional - default: 100)', + default=DEFAULT_HFH_RANK_THRESHOLD, + help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 ' + 'returns results with rank 1, 2, or 3). 
Lower rank values indicate higher quality matches.', ) p_folder_scan.set_defaults(func=folder_hashing_scan) @@ -1455,7 +1448,7 @@ def utils_certloc(*_): Run the "utils certloc" sub-command :param _: ignored/unused """ - import certifi # noqa: PLC0415,I001 + import certifi # noqa: PLC0415,I001 print(f'CA Cert File: {certifi.where()}') @@ -1466,11 +1459,11 @@ def utils_cert_download(_, args): # pylint: disable=PLR0912 # noqa: PLR0912 :param _: ignore/unused :param args: Parsed arguments """ - import socket # noqa: PLC0415,I001 - import traceback # noqa: PLC0415,I001 - from urllib.parse import urlparse # noqa: PLC0415,I001 + import socket # noqa: PLC0415,I001 + import traceback # noqa: PLC0415,I001 + from urllib.parse import urlparse # noqa: PLC0415,I001 - from OpenSSL import SSL, crypto # noqa: PLC0415,I001 + from OpenSSL import SSL, crypto # noqa: PLC0415,I001 file = sys.stdout if args.output: @@ -1518,7 +1511,7 @@ def utils_pac_proxy(_, args): :param _: ignore/unused :param args: Parsed arguments """ - from pypac.resolver import ProxyResolver # noqa: PLC0415,I001 + from pypac.resolver import ProxyResolver # noqa: PLC0415,I001 if not args.pac: print_stderr('Error: No pac file option specified.') @@ -1592,7 +1585,7 @@ def crypto_algorithms(parser, args): sys.exit(1) except Exception as e: if args.debug: - import traceback # noqa: PLC0415,I001 + import traceback # noqa: PLC0415,I001 traceback.print_exc() print_stderr(f'ERROR: {e}') @@ -1634,7 +1627,7 @@ def crypto_hints(parser, args): sys.exit(1) except Exception as e: if args.debug: - import traceback # noqa: PLC0415,I001 + import traceback # noqa: PLC0415,I001 traceback.print_exc() print_stderr(f'ERROR: {e}') @@ -1676,7 +1669,7 @@ def crypto_versions_in_range(parser, args): sys.exit(1) except Exception as e: if args.debug: - import traceback # noqa: PLC0415,I001 + import traceback # noqa: PLC0415,I001 traceback.print_exc() print_stderr(f'ERROR: {e}') @@ -1965,11 +1958,9 @@ def folder_hashing_scan(parser, args): 
config=scanner_config, client=client, scanoss_settings=scanoss_settings, + rank_threshold=args.rank_threshold, ) - scanner.best_match = args.best_match - scanner.threshold = args.threshold - if scanner.scan(): scanner.present(output_file=args.output, output_format=args.format) except ScanossGrpcError as e: diff --git a/src/scanoss/constants.py b/src/scanoss/constants.py index 1dd9bd61..92fc15b7 100644 --- a/src/scanoss/constants.py +++ b/src/scanoss/constants.py @@ -12,3 +12,5 @@ DEFAULT_URL2 = 'https://api.scanoss.com' # default premium service URL DEFAULT_API_TIMEOUT = 600 + +DEFAULT_HFH_RANK_THRESHOLD = 5 \ No newline at end of file diff --git a/src/scanoss/file_filters.py b/src/scanoss/file_filters.py index f7c1950b..d52ea386 100644 --- a/src/scanoss/file_filters.py +++ b/src/scanoss/file_filters.py @@ -25,7 +25,7 @@ import os import sys from pathlib import Path -from typing import List +from typing import List, Optional from pathspec import GitIgnoreSpec @@ -511,7 +511,7 @@ def get_filtered_files_from_folder(self, root: str) -> List[str]: # Now filter the files and return the reduced list return self.get_filtered_files_from_files(all_files, str(root_path)) - def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]: + def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]: """ Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings. 
@@ -615,8 +615,13 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]: # Default patterns for skipping directories if not self.all_folders: DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS + DEFAULT_SKIPPED_DIR_EXT_LIST = ( + DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT + ) for dir_name in DEFAULT_SKIPPED_DIR_LIST: patterns.append(f'{dir_name}/') + for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST: + patterns.append(f'*{dir_extension}/') # Custom patterns added in SCANOSS settings file if self.scanoss_settings: diff --git a/src/scanoss/scanners/folder_hasher.py b/src/scanoss/scanners/folder_hasher.py index f8654859..ad1bad32 100644 --- a/src/scanoss/scanners/folder_hasher.py +++ b/src/scanoss/scanners/folder_hasher.py @@ -15,7 +15,7 @@ MINIMUM_FILE_COUNT = 8 MINIMUM_CONCATENATED_NAME_LENGTH = 32 -MINIMUM_FILE_NAME_LENGTH = 32 +MAXIMUM_FILE_NAME_LENGTH = 32 class DirectoryNode: @@ -35,7 +35,7 @@ class DirectoryFile: Represents a file in the directory tree for folder hashing. 
""" - def __init__(self, path: str, key: bytes, key_str: str): + def __init__(self, path: str, key: List[bytes], key_str: str): self.path = path self.key = key self.key_str = key_str @@ -77,7 +77,7 @@ class FolderHasher: def __init__( self, scan_dir: str, - config: Optional[FolderHasherConfig] = None, + config: FolderHasherConfig, scanoss_settings: Optional[ScanossSettings] = None, ): self.base = ScanossBase( @@ -140,7 +140,7 @@ def _build_root_node(self, path: str) -> DirectoryNode: root_node = DirectoryNode(str(root)) all_files = [ - f for f in root.rglob('*') if f.is_file() and len(f.name.encode('utf-8')) <= MINIMUM_FILE_NAME_LENGTH + f for f in root.rglob('*') if f.is_file() and len(f.name.encode('utf-8')) <= MAXIMUM_FILE_NAME_LENGTH ] filtered_files = self.file_filters.get_filtered_files_from_files(all_files, str(root)) @@ -185,7 +185,7 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict: Recursively compute folder hash data for a directory node. The hash data includes the path identifier, simhash for file names, - simhash for file content, and children node hash information. + simhash for file content, directory hash, language extensions, and children node hash information. Args: node (DirectoryNode): The directory node to compute the hash for. @@ -194,11 +194,22 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict: dict: The computed hash data for the node. 
""" hash_data = self._hash_calc(node) + + # Safely calculate relative path + try: + node_path = Path(node.path).resolve() + scan_dir_path = Path(self.scan_dir).resolve() + rel_path = node_path.relative_to(scan_dir_path) + except ValueError: + # If relative_to fails, use the node path as is or a fallback + rel_path = Path(node.path).name if node.path else Path('.') return { - 'path_id': node.path, + 'path_id': str(rel_path), 'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None, 'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None, + 'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None, + 'lang_extensions': hash_data['lang_extensions'], 'children': [self._hash_calc_from_node(child) for child in node.children.values()], } @@ -215,9 +226,12 @@ def _hash_calc(self, node: DirectoryNode) -> dict: node (DirectoryNode): The directory node containing file items. Returns: - dict: A dictionary with 'name_hash' and 'content_hash' keys. + dict: A dictionary with 'name_hash', 'content_hash', 'dir_hash', and 'lang_extensions' keys. 
def _hash_calc(self, node: DirectoryNode) -> dict:
    """
    Compute the simhash values and extension statistics for a directory node.

    Files are de-duplicated by their ``key_str``. For every unique file the
    method records the file name, the file name without its extension, a
    per-extension counter and each directory component of the file's parent
    path (with a leading ``self.scan_dir`` prefix removed).

    Args:
        node (DirectoryNode): The directory node containing file items.

    Returns:
        dict: A dictionary with 'name_hash', 'content_hash', 'dir_hash', and 'lang_extensions' keys.
            All four values are None when fewer than MINIMUM_FILE_COUNT unique files were
            collected, or when the sorted, concatenated file names are shorter than
            MINIMUM_CONCATENATED_NAME_LENGTH bytes.
    """
    processed_hashes = set()
    unique_file_names = set()
    unique_directories = set()
    extension_map = {}
    file_hashes = []
    selected_names = []

    # Loop-invariant: the scan root without trailing separators, used to strip it from file directories.
    scan_root = self.scan_dir.rstrip(os.path.sep) if self.scan_dir else ''
    ignored_parts = {'', '.', '..'}

    for file in node.files:
        key_str = file.key_str
        if key_str in processed_hashes:
            continue

        file_name = os.path.basename(file.path)
        file_name_without_extension, extension = os.path.splitext(file_name)
        current_directory = os.path.dirname(file.path)

        # splitext() yields '' or '.ext'; a lone '.' carries no extension name.
        if len(extension) > 1:
            ext_without_dot = extension[1:]
            extension_map[ext_without_dot] = extension_map.get(ext_without_dot, 0) + 1

        # Strings are immutable, so the stripped value has to be assigned back. Only a leading,
        # component-aligned occurrence of the scan root is removed, so a directory that merely
        # contains the same text elsewhere in its path is left intact.
        if scan_root and (
            current_directory == scan_root or current_directory.startswith(scan_root + os.path.sep)
        ):
            current_directory = current_directory[len(scan_root):]
        current_directory = current_directory.lstrip(os.path.sep)

        unique_directories.update(
            part for part in current_directory.split(os.path.sep) if part not in ignored_parts
        )

        processed_hashes.add(key_str)
        unique_file_names.add(file_name_without_extension)
        selected_names.append(file_name)
        file_hashes.append(file.key)

    if len(selected_names) < MINIMUM_FILE_COUNT:
        return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}

    # The minimum-length gate is evaluated on the full file names (extensions included, no separator).
    joined_full_names = ''.join(sorted(selected_names))
    if len(joined_full_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH:
        return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}

    # Concatenate the unique file names without the extensions, adding a space and sorting them alphabetically
    unique_file_names_list = sorted(unique_file_names)
    concatenated_names = ' '.join(unique_file_names_list)

    # We do the same for the directory names, adding a space and sorting them alphabetically
    unique_directories_list = sorted(unique_directories)
    concatenated_directories = ' '.join(unique_directories_list)

    names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
    dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
    content_simhash = fingerprint(vectorize_bytes(file_hashes))

    # Debug logging similar to Go implementation
    self.base.print_debug(f'Unique file names: {unique_file_names_list}')
    self.base.print_debug(f'Unique directories: {unique_directories_list}')
    self.base.print_debug(f'{dir_simhash:x}/{names_simhash:x} - {content_simhash:x} - {extension_map}')

    return {
        'name_hash': names_simhash,
        'content_hash': content_simhash,
        'dir_hash': dir_simhash,
        'lang_extensions': extension_map,
    }
scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss. + rank_threshold (int): Get results with rank below this threshold (default: 5). """ self.base = ScanossBase( debug=config.debug, @@ -88,8 +92,7 @@ def __init__( self.scan_dir = scan_dir self.client = client self.scan_results = None - self.best_match = False - self.threshold = 100 + self.rank_threshold = rank_threshold def scan(self) -> Optional[Dict]: """ @@ -100,8 +103,7 @@ def scan(self) -> Optional[Dict]: """ hfh_request = { 'root': self.folder_hasher.hash_directory(self.scan_dir), - 'threshold': self.threshold, - 'best_match': self.best_match, + 'rank_threshold': self.rank_threshold, } spinner = Spinner('Scanning folder...') @@ -161,7 +163,50 @@ def _format_plain_output(self) -> str: ) def _format_cyclonedx_output(self) -> str: - raise NotImplementedError('CycloneDX output is not implemented') + if not self.scanner.scan_results: + return '' + try: + if 'results' not in self.scanner.scan_results or not self.scanner.scan_results['results']: + self.base.print_stderr('ERROR: No scan results found') + return '' + + first_result = self.scanner.scan_results['results'][0] + + best_match_components = [c for c in first_result.get('components', []) if c.get('order') == 1] + if not best_match_components: + self.base.print_stderr('ERROR: No best match component found') + return '' + + best_match_component = best_match_components[0] + if not best_match_component.get('versions'): + self.base.print_stderr('ERROR: No versions found for best match component') + return '' + + best_match_version = best_match_component['versions'][0] + purl = best_match_component['purl'] + + get_dependencies_json_request = { + 'files': [ + { + 'file': f'{best_match_component["name"]}:{best_match_version["version"]}', + 'purls': [{'purl': purl, 'requirement': best_match_version['version']}], + } + ] + } + + decorated_scan_results = self.scanner.client.get_dependencies(get_dependencies_json_request) + + cdx = 
CycloneDx(self.base.debug, self.output_file) + scan_results = {} + for f in decorated_scan_results['files']: + scan_results[f['file']] = [f] + if not cdx.produce_from_json(scan_results, self.output_file): + error_msg = 'ERROR: Failed to produce CycloneDX output' + self.base.print_stderr(error_msg) + raise ValueError(error_msg) + except Exception as e: + self.base.print_stderr(f'ERROR: Failed to get license information: {e}') + return None def _format_spdxlite_output(self) -> str: raise NotImplementedError('SPDXlite output is not implemented')