Skip to content

[SP-2587] Add directory simhash, modify concatenated names to remove extensions #119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/scanoss/file_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import os
import sys
from pathlib import Path
from typing import List
from typing import List, Optional

from pathspec import GitIgnoreSpec

Expand Down Expand Up @@ -511,7 +511,7 @@ def get_filtered_files_from_folder(self, root: str) -> List[str]:
# Now filter the files and return the reduced list
return self.get_filtered_files_from_files(all_files, str(root_path))

def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]:
def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]:
"""
Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.

Expand Down Expand Up @@ -615,8 +615,13 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
# Default patterns for skipping directories
if not self.all_folders:
DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
DEFAULT_SKIPPED_DIR_EXT_LIST = (
DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT
)
for dir_name in DEFAULT_SKIPPED_DIR_LIST:
patterns.append(f'{dir_name}/')
for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST:
patterns.append(f'*{dir_extension}/')

# Custom patterns added in SCANOSS settings file
if self.scanoss_settings:
Expand Down
52 changes: 32 additions & 20 deletions src/scanoss/scanners/folder_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class DirectoryFile:
Represents a file in the directory tree for folder hashing.
"""

def __init__(self, path: str, key: bytes, key_str: str):
def __init__(self, path: str, key: List[bytes], key_str: str):
self.path = path
self.key = key
self.key_str = key_str
Expand Down Expand Up @@ -77,7 +77,7 @@ class FolderHasher:
def __init__(
self,
scan_dir: str,
config: Optional[FolderHasherConfig] = None,
config: FolderHasherConfig,
scanoss_settings: Optional[ScanossSettings] = None,
):
self.base = ScanossBase(
Expand Down Expand Up @@ -194,11 +194,13 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict:
dict: The computed hash data for the node.
"""
hash_data = self._hash_calc(node)
rel_path = Path(node.path).relative_to(self.scan_dir)

return {
'path_id': node.path,
'path_id': str(rel_path),
'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None,
'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None,
'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None,
'children': [self._hash_calc_from_node(child) for child in node.children.values()],
}

Expand All @@ -218,44 +220,54 @@ def _hash_calc(self, node: DirectoryNode) -> dict:
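dict: A dictionary with 'name_hash', 'content_hash' and 'dir_hash' keys (each value is None when the node has too few files or too short a concatenated name to hash).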
"""
processed_hashes = set()
unique_file_names = set()
unique_directories = set()
file_hashes = []
selected_names = []

for file in node.files:
key_str = file.key_str
if key_str in processed_hashes:
continue
processed_hashes.add(key_str)

selected_names.append(os.path.basename(file.path))
file_name = os.path.basename(file.path)
file_name_without_extension, _ = os.path.splitext(file_name)
current_directory = os.path.dirname(file.path)

last_directory = Path(current_directory).name or Path(self.scan_dir).name

file_key = bytes(file.key)
file_hashes.append(file_key)
processed_hashes.add(key_str)
unique_file_names.add(file_name_without_extension)
unique_directories.add(last_directory)
selected_names.append(file_name)
file_hashes.append(file.key)

if len(selected_names) < MINIMUM_FILE_COUNT:
return {
'name_hash': None,
'content_hash': None,
}
return {'name_hash': None, 'content_hash': None, 'dir_hash': None}

selected_names.sort()
concatenated_names = ''.join(selected_names)

if len(concatenated_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH:
return {
'name_hash': None,
'content_hash': None,
}
return {'name_hash': None, 'content_hash': None, 'dir_hash': None}

# Sort the unique file names (extensions removed) alphabetically, then join them with a single space
unique_file_names_list = list(unique_file_names)
unique_file_names_list.sort()
concatenated_names = ' '.join(unique_file_names_list)

# Sort the unique directory names alphabetically, then join them with a single space
unique_directories_list = list(unique_directories)
unique_directories_list.sort()
concatenated_directories = ' '.join(unique_directories_list)

names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
content_simhash = fingerprint(vectorize_bytes(file_hashes))

return {
'name_hash': names_simhash,
'content_hash': content_simhash,
}
return {'name_hash': names_simhash, 'content_hash': content_simhash, 'dir_hash': dir_simhash}

def present(self, output_format: str = None, output_file: str = None):
def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
"""Present the hashed tree in the selected format"""
self.presenter.present(output_format=output_format, output_file=output_file)

Expand Down