Skip to content

Commit e57f4d8

Browse files
committed
added snippet line length limit option
1 parent de8b6c8 commit e57f4d8

File tree

12 files changed

+124
-46
lines changed

12 files changed

+124
-46
lines changed

CHANGELOG.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1111

1212
- Upcoming changes...
1313

14-
## [0.2.0] - 2023-07-04
14+
## [0.4.0] - 2023-07-07
1515

1616
### Added
17+
- Added long snippet generation check limit (`snippetLimit`)
18+
- Added command line option: `--snippet-limit` to support it
19+
20+
## [0.2.0] - 2023-07-04
1721

22+
### Added
1823
- First pass at the following Classes
1924
- Fingerprinting ([Winnowing](src/main/java/com/scanoss/Winnowing.java))
2025
- Scanning ([Scanner](src/main/java/com/scanoss/Scanner.java))
2126
- REST Interface ([ScanApi](src/main/java/com/scanoss/rest/ScanApi.java))
2227
- JSON Utils ([JsonUtils](src/main/java/com/scanoss/utils/JsonUtils.java))
2328
- CLI ([CommandLine](src/main/java/com/scanoss/cli/CommandLine.java))
2429

25-
[0.0.1]: https://github.com/scanoss/scanoss.java/compare/v0.0.0...v0.2.0
30+
[0.2.0]: https://github.com/scanoss/scanoss.java/compare/v0.0.0...v0.2.0
31+
[0.4.0]: https://github.com/scanoss/scanoss.java/compare/v0.2.0...v0.4.0

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>com.scanoss</groupId>
88
<artifactId>scanoss</artifactId>
9-
<version>0.3.0</version>
9+
<version>0.4.0</version>
1010
<packaging>jar</packaging>
1111
<name>scanoss.java</name>
1212
<url>https://github.com/scanoss/scanoss.java</url>

scanoss-cli.sh

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,14 @@ if [ "$b_dir" = "" ]; then
1212
b_dir=.
1313
fi
1414
export b_dir
15-
16-
# Search in the 'target' directory for the CLI jar file to execute
17-
jar_file=$(find "$b_dir/target" -name "scanoss*jar-with-dependencies.jar" -print | sort | tail -1)
15+
# Default log level of info
16+
LOG_LEVEL="${DEFAULT_LOG_LEVEL:-info}"
17+
export LOG_LEVEL
18+
jar_file=
19+
if [ -d "$b_dir/target" ] ; then
20+
# Search in the 'target' directory for the CLI jar file to execute
21+
jar_file=$(find "$b_dir/target" -name "scanoss*jar-with-dependencies.jar" -print | sort | tail -1)
22+
fi
1823
if [ "$jar_file" = "" ] ; then
1924
# Nothing there, so search the full subfolder tree
2025
jar_file=$(find "$b_dir" -name "scanoss*jar-with-dependencies.jar" -print | sort | tail -1)
@@ -24,4 +29,4 @@ if [ "$jar_file" = "" ] ; then
2429
fi
2530
fi
2631
export jar_file
27-
exec java -jar "$jar_file" "$@"
32+
exec java -Dorg.slf4j.simpleLogger.defaultLogLevel="$LOG_LEVEL" -jar "$jar_file" "$@"

src/main/java/com/scanoss/Scanner.java

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ public class Scanner {
7878
private String scanFlags; // Scan flags to pass to the API
7979
private String sbomType; // SBOM type (identify/ignore)
8080
private String sbom; // SBOM to supply while scanning
81+
private int snippetLimit; // Size limit for a single line of generated snippet
8182
private Winnowing winnowing;
8283
private ScanApi scanApi;
8384
private ScanFileProcessor scanFileProcessor;
@@ -86,7 +87,7 @@ public class Scanner {
8687
@SuppressWarnings("unused")
8788
private Scanner(Boolean skipSnippets, Boolean allExtensions, Boolean obfuscate, Boolean hpsm,
8889
Boolean hiddenFilesFolders, Boolean allFolders, Integer numThreads, Integer timeout, Integer retryLimit,
89-
String url, String apiKey, String scanFlags, String sbomType, String sbom,
90+
String url, String apiKey, String scanFlags, String sbomType, String sbom, Integer snippetLimit,
9091
Winnowing winnowing, ScanApi scanApi,
9192
ScanFileProcessor scanFileProcessor, WfpFileProcessor wfpFileProcessor
9293
) {
@@ -104,33 +105,14 @@ private Scanner(Boolean skipSnippets, Boolean allExtensions, Boolean obfuscate,
104105
this.scanFlags = scanFlags;
105106
this.sbomType = sbomType;
106107
this.sbom = sbom;
108+
this.snippetLimit = snippetLimit;
107109
this.winnowing = Objects.requireNonNullElseGet(winnowing, () ->
108-
Winnowing.builder().skipSnippets(skipSnippets).allExtensions(allExtensions).obfuscate(obfuscate).hpsm(hpsm).build());
109-
// if (winnowing == null) {
110-
// this.winnowing = Winnowing.builder().skipSnippets(skipSnippets).allExtensions(allExtensions).obfuscate(obfuscate).hpsm(hpsm).build();
111-
// } else {
112-
// this.winnowing = winnowing;
113-
// }
110+
Winnowing.builder().skipSnippets(skipSnippets).allExtensions(allExtensions).obfuscate(obfuscate).hpsm(hpsm).snippetLimit(snippetLimit).build());
114111
this.scanApi = Objects.requireNonNullElseGet(scanApi, () ->
115112
ScanApi.builder().url(url).apiKey(apiKey).timeout(timeout).retryLimit(retryLimit).flags(scanFlags).scanType(sbomType).sbom(sbom).build());
116-
// if (scanApi == null) {
117-
// this.scanApi = ScanApi.builder().url(url).apiKey(apiKey).timeout(timeout).retryLimit(retryLimit).flags(scanFlags).build();
118-
// } else {
119-
// this.scanApi = scanApi;
120-
// }
121113
this.scanFileProcessor = Objects.requireNonNullElseGet(scanFileProcessor, () ->
122114
ScanFileProcessor.builder().winnowing(this.winnowing).scanApi(this.scanApi).build());
123-
// if (scanFileProcessor == null) {
124-
// this.scanFileProcessor = ScanFileProcessor.builder().winnowing(this.winnowing).scanApi(this.scanApi).build();
125-
// } else {
126-
// this.scanFileProcessor = scanFileProcessor;
127-
// }
128115
this.wfpFileProcessor = Objects.requireNonNullElseGet(wfpFileProcessor, () -> WfpFileProcessor.builder().winnowing(this.winnowing).build());
129-
// if (wfpFileProcessor == null) {
130-
// this.wfpFileProcessor = WfpFileProcessor.builder().winnowing(this.winnowing).build();
131-
// } else {
132-
// this.wfpFileProcessor = wfpFileProcessor;
133-
// }
134116
}
135117

136118
/**
@@ -214,7 +196,7 @@ private Boolean filterFile(String name) {
214196
* @return Updated (if necessary) path
215197
*/
216198
private String stripDirectory(String scanDir, String path) {
217-
int length = scanDir.endsWith(File.pathSeparator) ? scanDir.length() : scanDir.length() + 1;
199+
int length = scanDir.endsWith(File.separator) ? scanDir.length() : scanDir.length() + 1;
218200
if (length > 0 && path.startsWith(scanDir)) {
219201
return path.substring(length);
220202
}

src/main/java/com/scanoss/ScanossConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ public class ScanossConstants {
1616
static final int WINDOW = 64; // Winnowing Window size. Do NOT Modify
1717
static final long MAX_CRC32 = 4294967296L;
1818
static final int MIN_FILE_SIZE = 256; // Minimum size for a file to be considered for snippet generation
19+
static final int MAX_LONG_LINE_CHARS = 1000; // Maximum length of a single source line to be considered source code
1920

2021
// File extensions to ignore snippets for
2122
static final List<String> SKIP_SNIPPET_EXT = Arrays.asList(

src/main/java/com/scanoss/Winnowing.java

Lines changed: 73 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
import java.util.zip.CRC32C;
4343
import java.util.zip.Checksum;
4444

45+
import static com.scanoss.ScanossConstants.MAX_LONG_LINE_CHARS;
46+
4547
/**
4648
* SCANOSS Winnowing Class
4749
* <p>
@@ -65,6 +67,8 @@ public class Winnowing {
6567
private Boolean obfuscate = Boolean.FALSE; // Obfuscate file path
6668
@Builder.Default
6769
private Boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
70+
@Builder.Default
71+
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
6872

6973
/**
7074
* Calculate the WFP (fingerprint) for the given file
@@ -141,10 +145,15 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
141145
if (minHash != lastHash) {
142146
String minHashHex = crc32cHex(minHash);
143147
if (lastLine != line) {
144-
if (outputBuilder.length() > 0) {
145-
wfpBuilder.append(outputBuilder).append("\n");
148+
int obLength = outputBuilder.length();
149+
if (obLength > 0) {
150+
if (snippetLimit > 0 && obLength > snippetLimit) {
151+
log.debug("Skipping snippet line as it's too big ({}): {}", filename, outputBuilder);
152+
} else {
153+
wfpBuilder.append(outputBuilder).append("\n");
154+
}
146155
}
147-
outputBuilder.delete(0, outputBuilder.length());
156+
outputBuilder.delete(0, obLength);
148157
outputBuilder.append(String.format("%d=%s", line, minHashHex));
149158
} else {
150159
outputBuilder.append(",").append(minHashHex);
@@ -158,8 +167,13 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
158167
}
159168
}
160169
}
161-
if (outputBuilder.length() > 0) {
162-
wfpBuilder.append(outputBuilder).append("\n");
170+
int obLength = outputBuilder.length();
171+
if (obLength > 0) {
172+
if (snippetLimit > 0 && obLength > snippetLimit) {
173+
log.debug("Skipping snippet line as it's too big ({}) {} - {}: {}", filename, snippetLimit, obLength, outputBuilder);
174+
} else {
175+
wfpBuilder.append(outputBuilder).append("\n");
176+
}
163177
}
164178
return wfpBuilder.toString();
165179
}
@@ -177,6 +191,14 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
177191
log.trace("Generating snippets for all extensions: {}", filename);
178192
return false;
179193
}
194+
if (contents.length <= ScanossConstants.MIN_FILE_SIZE) {
195+
log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length);
196+
return true;
197+
}
198+
if (contents[0] == '{' || contents[0] == '<') {
199+
log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename);
200+
return true;
201+
}
180202
if (!filename.isEmpty()) {
181203
String lowerFilename = filename.toLowerCase();
182204
for (String ending : ScanossConstants.SKIP_SNIPPET_EXT) {
@@ -186,17 +208,42 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
186208
}
187209
}
188210
}
189-
if (contents.length <= ScanossConstants.MIN_FILE_SIZE) {
190-
log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length);
191-
return true;
192-
}
193-
if (contents[0] == '{' || contents[0] == '<') {
194-
log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename);
195-
return true;
196-
}
211+
// TODO do we still want this?
212+
// Check to see if the first newline is very far away. If so, it's another hint this could be a binary/data file
213+
// for (int i = 0; i < contents.length; i++) {
214+
// if (contents[i] == '\n') {
215+
// return false;
216+
// } else if (i > MAX_LONG_LINE_CHARS) {
217+
// log.trace("Skipping snippets due to file line being too long: {} - {}", filename, MAX_LONG_LINE_CHARS);
218+
// return true;
219+
// }
220+
// }
221+
// TODO do we want to skip a whole file if some of it is a large single line?
222+
// StringBuilder outputBuilder = new StringBuilder();
223+
// for (char c: contents) {
224+
// if (c == '\n') { // New line, check line length
225+
// if (outputBuilder.length() > MAX_LONG_LINE_CHARS) {
226+
// log.trace("Skipping snippets due to file line being too long: {} - {}", filename, MAX_LONG_LINE_CHARS);
227+
// return true;
228+
// }
229+
// outputBuilder.setLength(0); // empty the string again
230+
// } else {
231+
// outputBuilder.append(c);
232+
// }
233+
// }
234+
// if (outputBuilder.length() > MAX_LONG_LINE_CHARS) { // Check the last string length
235+
// log.trace("Skipping snippets due to file line being too long: {} - {}", filename, MAX_LONG_LINE_CHARS);
236+
// return true;
237+
// }
197238
return false;
198239
}
199240

241+
/**
242+
* Try to detect if this is a text file or not
243+
*
244+
* @param f File to check
245+
* @return <code>true/false</code> if is/is not a text file, <code>null</code> if something went wrong
246+
*/
200247
private Boolean isTextFile(File f) {
201248
try {
202249
String type = tika.detect(f);
@@ -213,6 +260,13 @@ private Boolean isTextFile(File f) {
213260
return null;
214261
}
215262

263+
/**
264+
* Check if the file contents are text
265+
*
266+
* @param f File being checked
267+
* @param contentBytes File Contents
268+
* @return <code>true</code> if a text file, <code>false</code> otherwise
269+
*/
216270
private Boolean isTextContent(File f, byte[] contentBytes) {
217271
ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(contentBytes);
218272
try {
@@ -224,6 +278,12 @@ private Boolean isTextContent(File f, byte[] contentBytes) {
224278
return false;
225279
}
226280

281+
/**
282+
* Check if this media type is text based
283+
*
284+
* @param mediaType Media Type
285+
* @return <code>true</code> if a text file, <code>false</code> otherwise
286+
*/
227287
private Boolean isTextMediaType(MediaType mediaType) {
228288
if (mediaType == null) {
229289
return false;

src/main/java/com/scanoss/cli/ScanCommandLine.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class ScanCommandLine implements Runnable {
8787
@picocli.CommandLine.Option(names = {"-n", "--ignore"}, description = "Ignore components specified in the SBOM file")
8888
private String ignoreSbom;
8989

90+
@picocli.CommandLine.Option(names = {"--snippet-limit"}, description = "Length of single line snippet limit (0 for unlimited, default 1000)")
91+
private int snippetLimit = 1000;
92+
9093
@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to scan")
9194
private String fileFolder;
9295

@@ -136,7 +139,7 @@ public void run() {
136139
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
137140
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey)
138141
.retryLimit(retryLimit).timeout(timeoutLimit).scanFlags(scanFlags)
139-
.sbomType(sbomType).sbom(sbom)
142+
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit)
140143
.build();
141144
File f = new File(fileFolder);
142145
if (!f.exists()) {

src/main/java/com/scanoss/cli/WfpCommandLine.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ public class WfpCommandLine implements Runnable {
4040
@picocli.CommandLine.Option(names = {"-T", "--threads"}, description = "Number of parallel threads to use")
4141
private int numThreads = 5;
4242

43+
@picocli.CommandLine.Option(names = {"--snippet-limit"}, description = "Length of single line snippet limit (0 for unlimited, default 1000)")
44+
private int snippetLimit = 1000;
45+
4346
@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to fingerprint")
4447
private String fileFolder;
4548

@@ -63,7 +66,8 @@ public void run() {
6366
printMsg(err, String.format("Running with %d threads.", numThreads));
6467
}
6568
}
66-
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions).hiddenFilesFolders(allHidden).numThreads(numThreads).build();
69+
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
70+
.hiddenFilesFolders(allHidden).numThreads(numThreads).snippetLimit(snippetLimit).build();
6771
if (f.isFile()) {
6872
wfpFile(fileFolder);
6973
} else if (f.isDirectory()) {

src/main/java/com/scanoss/rest/ScanApi.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,6 @@ public String scan(String wfp, String context, int scanID) throws ScanApiExcepti
178178
throw new ScanApiException("SCANOSS API request timed out for " + url, e);
179179
}
180180
log.debug("Connection timeout {} (retry {}). Sleeping, then trying again...", timeout, retry);
181-
//noinspection BusyWait
182181
TimeUnit.SECONDS.sleep(RETRY_FAIL_SLEEP_TIME); // Sleep ? seconds before trying again
183182
} catch (IOException | InterruptedException | NullPointerException e) {
184183
throw new ScanApiException(String.format("Problem encountered scanning: %d - %s against %s", scanID, uuid, url), e);

src/test/java/com/scanoss/TestWinnowing.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,12 @@ public void TestWinnowingFileSkipSnippets() {
162162
assertNotNull("Expected a result from WFP", wfp);
163163
assertEquals("file=d7cfce9cff6d109c6b0249233ee26368,345,testing/data/json-file.c", wfp.trim());
164164

165+
file = "testing/data/source-file-with-long-line.c";
166+
wfp = winnowing.wfpForFile(file, file);
167+
log.info("WFP for Long C file: {}", wfp);
168+
assertNotNull("Expected a result from WFP", wfp);
169+
assertFalse("Should NOT have snippets here", snippetPat.matcher(wfp).matches());
170+
165171
winnowing.setSkipSnippets(true);
166172
file = "src/test/java/com/scanoss/TestWinnowing.java";
167173
wfp = winnowing.wfpForFile(file, file);

0 commit comments

Comments
 (0)