diff --git a/src/scanoss/file_filters.py b/src/scanoss/file_filters.py index f7c1950..d52ea38 100644 --- a/src/scanoss/file_filters.py +++ b/src/scanoss/file_filters.py @@ -25,7 +25,7 @@ import os import sys from pathlib import Path -from typing import List +from typing import List, Optional from pathspec import GitIgnoreSpec @@ -511,7 +511,7 @@ def get_filtered_files_from_folder(self, root: str) -> List[str]: # Now filter the files and return the reduced list return self.get_filtered_files_from_files(all_files, str(root_path)) - def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]: + def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]: """ Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings. @@ -615,8 +615,13 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]: # Default patterns for skipping directories if not self.all_folders: DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS + DEFAULT_SKIPPED_DIR_EXT_LIST = ( + DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT + ) for dir_name in DEFAULT_SKIPPED_DIR_LIST: patterns.append(f'{dir_name}/') + for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST: + patterns.append(f'*{dir_extension}/') # Custom patterns added in SCANOSS settings file if self.scanoss_settings: diff --git a/src/scanoss/scanners/folder_hasher.py b/src/scanoss/scanners/folder_hasher.py index f865485..46cb753 100644 --- a/src/scanoss/scanners/folder_hasher.py +++ b/src/scanoss/scanners/folder_hasher.py @@ -35,7 +35,7 @@ class DirectoryFile: Represents a file in the directory tree for folder hashing. """ - def __init__(self, path: str, key: bytes, key_str: str): + def __init__(self, path: str, key: List[bytes], key_str: str): self.path = path self.key = key self.key_str = key_str @@ -77,7 +77,7 @@ class FolderHasher: def __init__( self, scan_dir: str, - config: Optional[FolderHasherConfig] = None, + config: FolderHasherConfig, scanoss_settings: Optional[ScanossSettings] = None, ): self.base = ScanossBase( @@ -194,11 +194,13 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict: dict: The computed hash data for the node. """ hash_data = self._hash_calc(node) + rel_path = Path(node.path).relative_to(self.scan_dir) return { - 'path_id': node.path, + 'path_id': str(rel_path), 'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None, 'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None, + 'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None, 'children': [self._hash_calc_from_node(child) for child in node.children.values()], } @@ -218,6 +220,8 @@ def _hash_calc(self, node: DirectoryNode) -> dict: dict: A dictionary with 'name_hash' and 'content_hash' keys. """ processed_hashes = set() + unique_file_names = set() + unique_directories = set() file_hashes = [] selected_names = [] @@ -225,37 +229,45 @@ def _hash_calc(self, node: DirectoryNode) -> dict: key_str = file.key_str if key_str in processed_hashes: continue - processed_hashes.add(key_str) - selected_names.append(os.path.basename(file.path)) + file_name = os.path.basename(file.path) + file_name_without_extension, _ = os.path.splitext(file_name) + current_directory = os.path.dirname(file.path) + + last_directory = Path(current_directory).name or Path(self.scan_dir).name - file_key = bytes(file.key) - file_hashes.append(file_key) + processed_hashes.add(key_str) + unique_file_names.add(file_name_without_extension) + unique_directories.add(last_directory) + selected_names.append(file_name) + file_hashes.append(file.key) if len(selected_names) < MINIMUM_FILE_COUNT: - return { - 'name_hash': None, - 'content_hash': None, - } + return {'name_hash': None, 'content_hash': None, 'dir_hash': None} selected_names.sort() concatenated_names = ''.join(selected_names) if len(concatenated_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH: - return { - 'name_hash': None, - 'content_hash': None, - } + return {'name_hash': None, 'content_hash': None, 'dir_hash': None} + + # Concatenate the unique file names without the extensions, adding a space and sorting them alphabetically + unique_file_names_list = list(unique_file_names) + unique_file_names_list.sort() + concatenated_names = ' '.join(unique_file_names_list) + + # We do the same for the directory names, adding a space and sorting them alphabetically + unique_directories_list = list(unique_directories) + unique_directories_list.sort() + concatenated_directories = ' '.join(unique_directories_list) names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8'))) + dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8'))) content_simhash = fingerprint(vectorize_bytes(file_hashes)) - return { - 'name_hash': names_simhash, - 'content_hash': content_simhash, - } + return {'name_hash': names_simhash, 'content_hash': content_simhash, 'dir_hash': dir_simhash} - def present(self, output_format: str = None, output_file: str = None): + def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None): """Present the hashed tree in the selected format""" self.presenter.present(output_format=output_format, output_file=output_file)