Skip to content

Commit

Permalink
added snippet line length limit option
Browse files Browse the repository at this point in the history
  • Loading branch information
eeisegn committed Jul 7, 2023
1 parent de8b6c8 commit e57f4d8
Show file tree
Hide file tree
Showing 12 changed files with 124 additions and 46 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Upcoming changes...

## [0.2.0] - 2023-07-04
## [0.4.0] - 2023-07-07

### Added
- Added long snippet generation check limit (`snippetLimit`)
- Added command line option: `--snippet-limit` to support it

## [0.2.0] - 2023-07-04

### Added
- First pass at the following Classes
- Fingerprinting ([Winnowing](src/main/java/com/scanoss/Winnowing.java))
- Scanning ([Scanner](src/main/java/com/scanoss/Scanner.java))
- REST Interface ([ScanApi](src/main/java/com/scanoss/rest/ScanApi.java))
- JSON Utils ([JsonUtils](src/main/java/com/scanoss/utils/JsonUtils.java))
- CLI ([CommandLine](src/main/java/com/scanoss/cli/CommandLine.java))

[0.0.1]: https://github.com/scanoss/scanoss.java/compare/v0.0.0...v0.2.0
[0.2.0]: https://github.com/scanoss/scanoss.java/compare/v0.0.0...v0.2.0
[0.4.0]: https://github.com/scanoss/scanoss.java/compare/v0.2.0...v0.4.0
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.scanoss</groupId>
<artifactId>scanoss</artifactId>
<version>0.3.0</version>
<version>0.4.0</version>
<packaging>jar</packaging>
<name>scanoss.java</name>
<url>https://github.com/scanoss/scanoss.java</url>
Expand Down
13 changes: 9 additions & 4 deletions scanoss-cli.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ if [ "$b_dir" = "" ]; then
b_dir=.
fi
export b_dir

# Search in the 'target' directory for the CLI jar file to execute
jar_file=$(find "$b_dir/target" -name "scanoss*jar-with-dependencies.jar" -print | sort | tail -1)
# Default log level of info
LOG_LEVEL="${DEFAULT_LOG_LEVEL:-info}"
export LOG_LEVEL
jar_file=
if [ -d "$b_dir/target" ] ; then
# Search in the 'target' directory for the CLI jar file to execute
jar_file=$(find "$b_dir/target" -name "scanoss*jar-with-dependencies.jar" -print | sort | tail -1)
fi
if [ "$jar_file" = "" ] ; then
# Nothing there, so search the full subfolder tree
jar_file=$(find "$b_dir" -name "scanoss*jar-with-dependencies.jar" -print | sort | tail -1)
Expand All @@ -24,4 +29,4 @@ if [ "$jar_file" = "" ] ; then
fi
fi
export jar_file
exec java -jar "$jar_file" "$@"
exec java -Dorg.slf4j.simpleLogger.defaultLogLevel="$LOG_LEVEL" -jar "$jar_file" "$@"
28 changes: 5 additions & 23 deletions src/main/java/com/scanoss/Scanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ public class Scanner {
private String scanFlags; // Scan flags to pass to the API
private String sbomType; // SBOM type (identify/ignore)
private String sbom; // SBOM to supply while scanning
private int snippetLimit; // Size limit for a single line of generated snippet
private Winnowing winnowing;
private ScanApi scanApi;
private ScanFileProcessor scanFileProcessor;
Expand All @@ -86,7 +87,7 @@ public class Scanner {
@SuppressWarnings("unused")
private Scanner(Boolean skipSnippets, Boolean allExtensions, Boolean obfuscate, Boolean hpsm,
Boolean hiddenFilesFolders, Boolean allFolders, Integer numThreads, Integer timeout, Integer retryLimit,
String url, String apiKey, String scanFlags, String sbomType, String sbom,
String url, String apiKey, String scanFlags, String sbomType, String sbom, Integer snippetLimit,
Winnowing winnowing, ScanApi scanApi,
ScanFileProcessor scanFileProcessor, WfpFileProcessor wfpFileProcessor
) {
Expand All @@ -104,33 +105,14 @@ private Scanner(Boolean skipSnippets, Boolean allExtensions, Boolean obfuscate,
this.scanFlags = scanFlags;
this.sbomType = sbomType;
this.sbom = sbom;
this.snippetLimit = snippetLimit;
this.winnowing = Objects.requireNonNullElseGet(winnowing, () ->
Winnowing.builder().skipSnippets(skipSnippets).allExtensions(allExtensions).obfuscate(obfuscate).hpsm(hpsm).build());
// if (winnowing == null) {
// this.winnowing = Winnowing.builder().skipSnippets(skipSnippets).allExtensions(allExtensions).obfuscate(obfuscate).hpsm(hpsm).build();
// } else {
// this.winnowing = winnowing;
// }
Winnowing.builder().skipSnippets(skipSnippets).allExtensions(allExtensions).obfuscate(obfuscate).hpsm(hpsm).snippetLimit(snippetLimit).build());
this.scanApi = Objects.requireNonNullElseGet(scanApi, () ->
ScanApi.builder().url(url).apiKey(apiKey).timeout(timeout).retryLimit(retryLimit).flags(scanFlags).scanType(sbomType).sbom(sbom).build());
// if (scanApi == null) {
// this.scanApi = ScanApi.builder().url(url).apiKey(apiKey).timeout(timeout).retryLimit(retryLimit).flags(scanFlags).build();
// } else {
// this.scanApi = scanApi;
// }
this.scanFileProcessor = Objects.requireNonNullElseGet(scanFileProcessor, () ->
ScanFileProcessor.builder().winnowing(this.winnowing).scanApi(this.scanApi).build());
// if (scanFileProcessor == null) {
// this.scanFileProcessor = ScanFileProcessor.builder().winnowing(this.winnowing).scanApi(this.scanApi).build();
// } else {
// this.scanFileProcessor = scanFileProcessor;
// }
this.wfpFileProcessor = Objects.requireNonNullElseGet(wfpFileProcessor, () -> WfpFileProcessor.builder().winnowing(this.winnowing).build());
// if (wfpFileProcessor == null) {
// this.wfpFileProcessor = WfpFileProcessor.builder().winnowing(this.winnowing).build();
// } else {
// this.wfpFileProcessor = wfpFileProcessor;
// }
}

/**
Expand Down Expand Up @@ -214,7 +196,7 @@ private Boolean filterFile(String name) {
* @return Updated (if necessary) path
*/
private String stripDirectory(String scanDir, String path) {
int length = scanDir.endsWith(File.pathSeparator) ? scanDir.length() : scanDir.length() + 1;
int length = scanDir.endsWith(File.separator) ? scanDir.length() : scanDir.length() + 1;
if (length > 0 && path.startsWith(scanDir)) {
return path.substring(length);
}
Expand Down
1 change: 1 addition & 0 deletions src/main/java/com/scanoss/ScanossConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class ScanossConstants {
static final int WINDOW = 64; // Winnowing Window size. Do NOT Modify
static final long MAX_CRC32 = 4294967296L;
static final int MIN_FILE_SIZE = 256; // Minimum size for a file to be considered for snippet generation
static final int MAX_LONG_LINE_CHARS = 1000; // Maximum length of a single source line to be considered source code

// File extensions to ignore snippets for
static final List<String> SKIP_SNIPPET_EXT = Arrays.asList(
Expand Down
86 changes: 73 additions & 13 deletions src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
import java.util.zip.CRC32C;
import java.util.zip.Checksum;

import static com.scanoss.ScanossConstants.MAX_LONG_LINE_CHARS;

/**
* SCANOSS Winnowing Class
* <p>
Expand All @@ -65,6 +67,8 @@ public class Winnowing {
private Boolean obfuscate = Boolean.FALSE; // Obfuscate file path
@Builder.Default
private Boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
@Builder.Default
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation

/**
* Calculate the WFP (fingerprint) for the given file
Expand Down Expand Up @@ -141,10 +145,15 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
if (minHash != lastHash) {
String minHashHex = crc32cHex(minHash);
if (lastLine != line) {
if (outputBuilder.length() > 0) {
wfpBuilder.append(outputBuilder).append("\n");
int obLength = outputBuilder.length();
if (obLength > 0) {
if (snippetLimit > 0 && obLength > snippetLimit) {
log.debug("Skipping snippet line as it's too big ({}): {}", filename, outputBuilder);
} else {
wfpBuilder.append(outputBuilder).append("\n");
}
}
outputBuilder.delete(0, outputBuilder.length());
outputBuilder.delete(0, obLength);
outputBuilder.append(String.format("%d=%s", line, minHashHex));
} else {
outputBuilder.append(",").append(minHashHex);
Expand All @@ -158,8 +167,13 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
}
}
}
if (outputBuilder.length() > 0) {
wfpBuilder.append(outputBuilder).append("\n");
int obLength = outputBuilder.length();
if (obLength > 0) {
if (snippetLimit > 0 && obLength > snippetLimit) {
log.debug("Skipping snippet line as it's too big ({}) {} - {}: {}", filename, snippetLimit, obLength, outputBuilder);
} else {
wfpBuilder.append(outputBuilder).append("\n");
}
}
return wfpBuilder.toString();
}
Expand All @@ -177,6 +191,14 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
log.trace("Generating snippets for all extensions: {}", filename);
return false;
}
if (contents.length <= ScanossConstants.MIN_FILE_SIZE) {
log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length);
return true;
}
if (contents[0] == '{' || contents[0] == '<') {
log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename);
return true;
}
if (!filename.isEmpty()) {
String lowerFilename = filename.toLowerCase();
for (String ending : ScanossConstants.SKIP_SNIPPET_EXT) {
Expand All @@ -186,17 +208,42 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
}
}
}
if (contents.length <= ScanossConstants.MIN_FILE_SIZE) {
log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length);
return true;
}
if (contents[0] == '{' || contents[0] == '<') {
log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename);
return true;
}
// TODO do we still want this?
// Check to see if the first newline is very far away. If so, it's another hint this could be a binary/data file
// for (int i = 0; i < contents.length; i++) {
// if (contents[i] == '\n') {
// return false;
// } else if (i > MAX_LONG_LINE_CHARS) {
// log.trace("Skipping snippets due to file line being too long: {} - {}", filename, MAX_LONG_LINE_CHARS);
// return true;
// }
// }
// TODO do we want to skip a whole file if some of it is a large single line?
// StringBuilder outputBuilder = new StringBuilder();
// for (char c: contents) {
// if (c == '\n') { // New line, check line length
// if (outputBuilder.length() > MAX_LONG_LINE_CHARS) {
// log.trace("Skipping snippets due to file line being too long: {} - {}", filename, MAX_LONG_LINE_CHARS);
// return true;
// }
// outputBuilder.setLength(0); // empty the string again
// } else {
// outputBuilder.append(c);
// }
// }
// if (outputBuilder.length() > MAX_LONG_LINE_CHARS) { // Check the last string length
// log.trace("Skipping snippets due to file line being too long: {} - {}", filename, MAX_LONG_LINE_CHARS);
// return true;
// }
return false;
}

/**
* Try to detect if this is a text file or not
*
* @param f File to check
* @return <code>true/false</code> if is/is not a text file, <code>null</code> if something went wrong
*/
private Boolean isTextFile(File f) {
try {
String type = tika.detect(f);
Expand All @@ -213,6 +260,13 @@ private Boolean isTextFile(File f) {
return null;
}

/**
* Check if the file contents represent a text file
*
* @param f File being checked
* @param contentBytes File Contents
* @return <code>true</code> if a text file, <code>false</code> otherwise
*/
private Boolean isTextContent(File f, byte[] contentBytes) {
ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(contentBytes);
try {
Expand All @@ -224,6 +278,12 @@ private Boolean isTextContent(File f, byte[] contentBytes) {
return false;
}

/**
* Check if this media type is text based
*
* @param mediaType Media Type
* @return <code>true</code> if a text file, <code>false</code> otherwise
*/
private Boolean isTextMediaType(MediaType mediaType) {
if (mediaType == null) {
return false;
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/com/scanoss/cli/ScanCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ class ScanCommandLine implements Runnable {
@picocli.CommandLine.Option(names = {"-n", "--ignore"}, description = "Ignore components specified in the SBOM file")
private String ignoreSbom;

@picocli.CommandLine.Option(names = {"--snippet-limit"}, description = "Length of single line snippet limit (0 for unlimited, default 1000)")
private int snippetLimit = 1000;

@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to scan")
private String fileFolder;

Expand Down Expand Up @@ -136,7 +139,7 @@ public void run() {
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey)
.retryLimit(retryLimit).timeout(timeoutLimit).scanFlags(scanFlags)
.sbomType(sbomType).sbom(sbom)
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit)
.build();
File f = new File(fileFolder);
if (!f.exists()) {
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/com/scanoss/cli/WfpCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ public class WfpCommandLine implements Runnable {
@picocli.CommandLine.Option(names = {"-T", "--threads"}, description = "Number of parallel threads to use")
private int numThreads = 5;

@picocli.CommandLine.Option(names = {"--snippet-limit"}, description = "Length of single line snippet limit (0 for unlimited, default 1000)")
private int snippetLimit = 1000;

@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to fingerprint")
private String fileFolder;

Expand All @@ -63,7 +66,8 @@ public void run() {
printMsg(err, String.format("Running with %d threads.", numThreads));
}
}
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions).hiddenFilesFolders(allHidden).numThreads(numThreads).build();
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
.hiddenFilesFolders(allHidden).numThreads(numThreads).snippetLimit(snippetLimit).build();
if (f.isFile()) {
wfpFile(fileFolder);
} else if (f.isDirectory()) {
Expand Down
1 change: 0 additions & 1 deletion src/main/java/com/scanoss/rest/ScanApi.java
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ public String scan(String wfp, String context, int scanID) throws ScanApiExcepti
throw new ScanApiException("SCANOSS API request timed out for " + url, e);
}
log.debug("Connection timeout {} (retry {}). Sleeping, then trying again...", timeout, retry);
//noinspection BusyWait
TimeUnit.SECONDS.sleep(RETRY_FAIL_SLEEP_TIME); // Sleep ? seconds before trying again
} catch (IOException | InterruptedException | NullPointerException e) {
throw new ScanApiException(String.format("Problem encountered scanning: %d - %s against %s", scanID, uuid, url), e);
Expand Down
6 changes: 6 additions & 0 deletions src/test/java/com/scanoss/TestWinnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ public void TestWinnowingFileSkipSnippets() {
assertNotNull("Expected a result from WFP", wfp);
assertEquals("file=d7cfce9cff6d109c6b0249233ee26368,345,testing/data/json-file.c", wfp.trim());

file = "testing/data/source-file-with-long-line.c";
wfp = winnowing.wfpForFile(file, file);
log.info("WFP for Long C file: {}", wfp);
assertNotNull("Expected a result from WFP", wfp);
assertFalse("Should NOT have snippets here", snippetPat.matcher(wfp).matches());

winnowing.setSkipSnippets(true);
file = "src/test/java/com/scanoss/TestWinnowing.java";
wfp = winnowing.wfpForFile(file, file);
Expand Down
Empty file removed testing/data/sample.java
Empty file.
12 changes: 12 additions & 0 deletions testing/data/source-file-with-long-line.c

Large diffs are not rendered by default.

0 comments on commit e57f4d8

Please sign in to comment.