Skip to content

Commit 830a3f9

Browse files
committed
feat: SP-2487 Implement path obfuscation on Winnowing class
1 parent ea534cf commit 830a3f9

File tree

2 files changed

+127
-1
lines changed

2 files changed

+127
-1
lines changed

src/main/java/com/scanoss/Winnowing.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,11 @@
2828
import lombok.*;
2929
import lombok.extern.slf4j.Slf4j;
3030
import org.apache.commons.codec.digest.DigestUtils;
31+
import org.apache.commons.io.FilenameUtils;
3132
import org.apache.tika.Tika;
3233
import org.apache.tika.mime.MediaType;
3334
import org.apache.tika.mime.MediaTypeRegistry;
35+
import org.jetbrains.annotations.NotNull;
3436

3537
import java.io.ByteArrayInputStream;
3638
import java.io.File;
@@ -68,6 +70,18 @@ public class Winnowing {
6870
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
6971
@Builder.Default
7072
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
73+
@Builder.Default
74+
private Map<String, String> obfuscationMap = new HashMap<>();
75+
76+
/**
77+
* Resolves the real file path for a given obfuscated path.
78+
*
79+
* @param obfuscatedPath the obfuscated path
80+
* @return the real file path corresponding to the provided obfuscated path, or null if no mapping exists
81+
*/
82+
public String deobfuscateFilePath(@NotNull String obfuscatedPath) {
83+
return obfuscationMap.get(obfuscatedPath);
84+
}
7185

7286
/**
7387
* Calculate the WFP (fingerprint) for the given file
@@ -112,7 +126,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
112126
char[] fileContents = (new String(contents, Charset.defaultCharset())).toCharArray();
113127
String fileMD5 = DigestUtils.md5Hex(contents);
114128
StringBuilder wfpBuilder = new StringBuilder();
115-
// TODO add obfuscation of the filename here
129+
130+
if (obfuscate) {
131+
filename = obfuscateFilePath(filename);
132+
}
133+
116134
wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
117135
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
118136
return wfpBuilder.toString();
@@ -180,6 +198,42 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
180198
return wfpBuilder.toString();
181199
}
182200

201+
/**
202+
* Obfuscates the given file path by replacing it with a generated unique identifier while
203+
* retaining its original file extension. The obfuscated path can be used to mask
204+
* sensitive or easily guessable file names.
205+
*
206+
* @param originalPath the original file path to be obfuscated; must not be null
207+
* @return the obfuscated file path with a unique identifier and the original file extension
208+
*/
209+
private String obfuscateFilePath(@NotNull String originalPath) {
210+
final String extension = extractExtension(originalPath);
211+
212+
// Generate a unique identifier for the obfuscated file
213+
final int mapIndex = obfuscationMap.size();
214+
215+
final String obfuscatedPath = mapIndex + extension;
216+
this.obfuscationMap.put(obfuscatedPath, originalPath);
217+
return obfuscatedPath;
218+
}
219+
220+
/**
221+
* Extracts file extension from the given path, including the leading dot.
222+
*
223+
* @param path the file path or name (must not be null)
224+
* @return the file extension with leading dot (e.g., ".txt") or empty string if no extension
225+
*/
226+
private String extractExtension(@NotNull String path) {
227+
try {
228+
String extractedExtension = FilenameUtils.getExtension(path).trim();
229+
return extractedExtension.isEmpty() ? "" : "." + extractedExtension;
230+
} catch (IllegalArgumentException e) {
231+
log.debug("Could not extract extension from filename '{}': {}",
232+
path, e.getMessage());
233+
return "";
234+
}
235+
}
236+
183237
/**
184238
* Determine if a file/contents should be skipped for snippet generation or not
185239
* @param filename filename for the contents (optional)

src/test/java/com/scanoss/TestWinnowing.java

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525

2626
import com.scanoss.exceptions.WinnowingException;
27+
import com.scanoss.utils.WinnowingUtils;
2728
import lombok.extern.slf4j.Slf4j;
2829
import org.junit.After;
2930
import org.junit.Before;
@@ -265,4 +266,75 @@ public void TestWinnowingFileFailures() {
265266

266267
log.info("Finished {} -->", methodName);
267268
}
269+
270+
@Test
271+
public void TestWinnowingObfuscationFileWithExtension() {
272+
String methodName = new Object() {
273+
}.getClass().getEnclosingMethod().getName();
274+
log.info("<-- Starting {}", methodName);
275+
276+
Winnowing winnowing = Winnowing.builder().obfuscate(true).build();
277+
278+
String fileWithExtension = "testing/data/test-file.txt";
279+
280+
String wfpWithExtension = winnowing.wfpForFile(fileWithExtension, fileWithExtension);
281+
assertNotNull("Expected a result from WFP with extension", wfpWithExtension);
282+
283+
String obfuscatedPathWithExtension = WinnowingUtils.extractFilePathFromWFP(wfpWithExtension);
284+
assertNotNull("Should have found an obfuscated path in WFP with extension", obfuscatedPathWithExtension);
285+
286+
String originalPathWithExtension = winnowing.deobfuscateFilePath(obfuscatedPathWithExtension);
287+
assertNotNull("Should be able to retrieve original path with extension", originalPathWithExtension);
288+
assertEquals("Original path should match input file with extension", fileWithExtension, originalPathWithExtension);
289+
290+
log.info("Finished {} -->", methodName);
291+
}
292+
293+
@Test
294+
public void TestWinnowingObfuscationFileWithoutExtension() {
295+
String methodName = new Object() {
296+
}.getClass().getEnclosingMethod().getName();
297+
log.info("<-- Starting {}", methodName);
298+
299+
Winnowing winnowing = Winnowing.builder().obfuscate(true).build();
300+
301+
String fileWithoutExtension = "testing/data/nbproject";
302+
303+
String wfpWithoutExtension = winnowing.wfpForFile(fileWithoutExtension, fileWithoutExtension);
304+
305+
String obfuscatedPathWithoutExtension = WinnowingUtils.extractFilePathFromWFP(wfpWithoutExtension);
306+
assertNotNull("Should have found an obfuscated path in WFP without extension", obfuscatedPathWithoutExtension);
307+
assertEquals("Obfuscated path should be a string with value '0'", "0", obfuscatedPathWithoutExtension);
308+
309+
String originalPathWithoutExtension = winnowing.deobfuscateFilePath(obfuscatedPathWithoutExtension);
310+
assertNotNull("Should be able to retrieve original path without extension", originalPathWithoutExtension);
311+
assertEquals("Original path should match input file without extension", fileWithoutExtension, originalPathWithoutExtension);
312+
313+
log.info("Finished {} -->", methodName);
314+
}
315+
316+
@Test
317+
public void TestDeobfuscateFilePathEmpty() {
318+
String methodName = new Object() {
319+
}.getClass().getEnclosingMethod().getName();
320+
log.info("<-- Starting {}", methodName);
321+
322+
Winnowing winnowing = Winnowing.builder().build();
323+
assertNull("Should return null when given an empty obfuscated path", winnowing.deobfuscateFilePath(""));
324+
325+
log.info("Finished {} -->", methodName);
326+
}
327+
328+
@Test
329+
public void TestDeobfuscateFilePathInvalid() {
330+
String methodName = new Object() {
331+
}.getClass().getEnclosingMethod().getName();
332+
log.info("<-- Starting {}", methodName);
333+
334+
Winnowing winnowing = Winnowing.builder().build();
335+
assertNull("Should return null for a non-existent obfuscated path", winnowing.deobfuscateFilePath("invalidPath"));
336+
337+
log.info("Finished {} -->", methodName);
338+
}
268339
}
340+

0 commit comments

Comments
 (0)