Skip to content

Commit cbbb141

Browse files
committed
SP-431 Adds HPSM support
Co-authored-by: Alejandro Perez <[email protected]>
1 parent db510fb commit cbbb141

File tree

9 files changed

+213
-32
lines changed

9 files changed

+213
-32
lines changed

.idea/misc.xml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1111

1212
- Upcoming changes...
1313

14+
## [0.7.0] - 2024-04-04
15+
### Added
16+
- Add HPSM support
17+
1418
## [0.6.1] - 2024-04-01
1519
### Changed
1620
- Fixed issue with SBOM ingestion
@@ -74,3 +78,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7478
[0.5.5]: https://github.com/scanoss/scanoss.java/compare/v0.5.4...v0.5.5
7579
[0.6.0]: https://github.com/scanoss/scanoss.java/compare/v0.5.5...v0.6.0
7680
[0.6.1]: https://github.com/scanoss/scanoss.java/compare/v0.6.0...v0.6.1
81+
[0.7.0]: https://github.com/scanoss/scanoss.java/compare/v0.6.1...v0.7.0

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Include in a maven project using:
1616
<dependency>
1717
<groupId>com.scanoss</groupId>
1818
<artifactId>scanoss</artifactId>
19-
<version>0.6.0</version>
19+
<version>0.7.0</version>
2020
</dependency>
2121
```
2222

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>com.scanoss</groupId>
88
<artifactId>scanoss</artifactId>
9-
<version>0.6.1</version>
9+
<version>0.7.0</version>
1010
<packaging>jar</packaging>
1111
<name>scanoss.java</name>
1212
<url>https://github.com/scanoss/scanoss.java</url>

src/main/java/com/scanoss/Winnowing.java

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,9 @@
2323
package com.scanoss;
2424

2525
import com.scanoss.exceptions.WinnowingException;
26-
import lombok.Builder;
27-
import lombok.Getter;
28-
import lombok.NonNull;
29-
import lombok.Setter;
26+
import com.scanoss.utils.Hpsm;
27+
import com.scanoss.utils.WinnowingUtils;
28+
import lombok.*;
3029
import lombok.extern.slf4j.Slf4j;
3130
import org.apache.commons.codec.digest.DigestUtils;
3231
import org.apache.tika.Tika;
@@ -66,7 +65,7 @@ public class Winnowing {
6665
@Builder.Default
6766
private Boolean obfuscate = Boolean.FALSE; // Obfuscate file path
6867
@Builder.Default
69-
private Boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
68+
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
7069
@Builder.Default
7170
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation
7271

@@ -98,7 +97,6 @@ public String wfpForFile(@NonNull String filePath, @NonNull String path) throws
9897
}
9998
}
10099

101-
102100
/**
103101
* Generate a WFP for the given file contents
104102
*
@@ -119,7 +117,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
119117
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
120118
return wfpBuilder.toString();
121119
}
122-
// TODO add HPSM support here
120+
121+
if(this.isHpsm()){
122+
wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
123+
}
124+
123125
String gram = "";
124126
List<Long> window = new ArrayList<>();
125127
char normalized;
@@ -133,7 +135,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
133135
line++;
134136
normalized = 0;
135137
} else {
136-
normalized = normalize(c);
138+
normalized = WinnowingUtils.normalize(c);
137139
}
138140
if (normalized > 0) {
139141
gram += normalized;
@@ -180,7 +182,6 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
180182

181183
/**
182184
* Determine if a file/contents should be skipped for snippet generation or not
183-
*
184185
* @param filename filename for the contents (optional)
185186
* @param contents file contents
186187
* @return <code>true</code> if we should skip snippets, <code>false</code> otherwise
@@ -300,7 +301,6 @@ private Boolean isTextMediaType(MediaType mediaType) {
300301
return mediaTypes.stream().anyMatch(mt -> mt.getType().equals("text"));
301302
}
302303

303-
304304
/**
305305
* Convert the given number to a Little Endian encoded byte
306306
*
@@ -316,24 +316,6 @@ private byte[] toLittleEndian(long number) {
316316
return b;
317317
}
318318

319-
/**
320-
* Normalise the given character
321-
*
322-
* @param c character to normalise
323-
* @return normalised character
324-
*/
325-
private char normalize(char c) {
326-
if (c < '0' || c > 'z') {
327-
return 0;
328-
} else if (c <= '9' || c >= 'a') {
329-
return c;
330-
} else if (c >= 'A' && c <= 'Z') {
331-
return (char) (c + 32);
332-
} else {
333-
return 0;
334-
}
335-
}
336-
337319
/**
338320
* Calculate the CRC32 for the given string
339321
*

src/main/java/com/scanoss/cli/ScanCommandLine.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ class ScanCommandLine implements Runnable {
100100
@picocli.CommandLine.Option(names = {"--proxy"}, description = "HTTP Proxy URL (optional)")
101101
private String proxyString;
102102

103+
@picocli.CommandLine.Option(names = {"-H", "--hpsm"}, description = "Use High Precision Snippet Matching algorithm")
104+
private boolean enableHpsm = false;
105+
103106
@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to scan")
104107
private String fileFolder;
105108

@@ -160,7 +163,7 @@ public void run() {
160163
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
161164
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey)
162165
.retryLimit(retryLimit).timeout(Duration.ofSeconds(timeoutLimit)).scanFlags(scanFlags)
163-
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy)
166+
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy).hpsm(enableHpsm)
164167
.build();
165168
File f = new File(fileFolder);
166169
if (!f.exists()) {
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
package com.scanoss.utils;
2+
3+
import java.nio.charset.StandardCharsets;
4+
import java.util.ArrayList;
5+
import java.util.List;
6+
7+
/**
8+
* SCANOSS Hpsm Class
9+
* <p>
10+
* The Hpsm class provides all the necessary implementations to generate HPSM fingerprint for a given file or contents.
11+
* </p>
12+
*/
13+
public class Hpsm {
14+
15+
// CRC8 table, Polynomial, initial CRC and post CRC XOR value.
16+
private static final int CRC8_MAXIM_DOW_TABLE_SIZE = 0x100;
17+
private static final int CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C; // 0x31 reflected
18+
private static final int CRC8_MAXIM_DOW_INITIAL = 0x00; // 0x00 reflected
19+
private static final int CRC8_MAXIM_DOW_FINAL = 0x00; // 0x00 reflected
20+
private static int[] crc8MaximDowTable = new int[CRC8_MAXIM_DOW_TABLE_SIZE];
21+
22+
private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII);
23+
24+
/**
25+
* Calculates the HPSM value for the given content, represented as an array of bytes.
26+
* This method performs normalization on the content, calculates CRC8 for each line,
27+
* and returns the hexadecimal representation of the CRC8 values.
28+
*
29+
* @param content the content as an array of bytes
30+
* @return the HPSM value in hexadecimal format
31+
*/
32+
public static String calcHpsm(byte[] content) {
33+
List<Integer> listNormalized = new ArrayList<>();
34+
List<Integer> crcLines = new ArrayList<>();
35+
36+
int lastLine = 0;
37+
crc8MaximDowGenerateTable();
38+
39+
for (int i = 0; i < content.length ; i++) {
40+
char c = (char) content[i];
41+
if (c == '\n') { // When there is a new line
42+
if (!listNormalized.isEmpty()) {
43+
crcLines.add(crc8MaximDowBuffer(convertListToByteArray(listNormalized)));
44+
listNormalized.clear();
45+
} else if (lastLine + 1 == i) {
46+
crcLines.add(0xFF);
47+
} else if (i - lastLine > 1) {
48+
crcLines.add(0x00);
49+
}
50+
lastLine = i;
51+
} else {
52+
int cNormalized = WinnowingUtils.normalize(c);
53+
if (cNormalized != 0) listNormalized.add(cNormalized);
54+
}
55+
}
56+
57+
return convertToHex(convertListToByteArray(crcLines));
58+
}
59+
60+
/**
61+
* Calculates CRC-8 using the Maxim/Dallas polynomial without using a lookup table.
62+
* This method is suitable for applications where memory constraints are critical
63+
* and a lookup table cannot be afforded.
64+
*
65+
* @param crc The current CRC value.
66+
* @param b The byte to be processed.
67+
* @return The updated CRC value after processing the byte.
68+
*/
69+
private static int crc8MaximDowByteNoTable(int crc, int b) {
70+
crc ^= b;
71+
for (int count = 0; count < 8; count++) {
72+
boolean isSet = (crc & 0x01) != 0;
73+
crc >>= 1;
74+
if (isSet) crc ^= CRC8_MAXIM_DOW_POLYNOMIAL;
75+
}
76+
return crc;
77+
}
78+
79+
/**
80+
* Generates a lookup table for CRC-8 using the Maxim/Dallas polynomial.
81+
* The generated table is used for faster CRC calculations.
82+
*/
83+
private static void crc8MaximDowGenerateTable() {
84+
for (int i = 0; i < CRC8_MAXIM_DOW_TABLE_SIZE; i++) {
85+
crc8MaximDowTable[i] = crc8MaximDowByteNoTable(0, i);
86+
}
87+
}
88+
89+
/**
90+
* Calculates CRC-8 using the Maxim/Dow polynomial with a lookup table.
91+
* This method utilizes a pre-generated lookup table for faster CRC calculations.
92+
*
93+
* @param crc The current CRC value.
94+
* @param b The byte to be processed.
95+
* @return The updated CRC value after processing the byte.
96+
*/
97+
private static int crc8MaximDowByte(int crc, int b) {
98+
int index = b ^ crc;
99+
return crc8MaximDowTable[index] ^ (crc >> 8);
100+
}
101+
102+
/**
103+
* Calculates CRC-8 for a buffer of bytes using the Maxim/Dallas polynomial.
104+
*
105+
* @param buffer The buffer containing bytes for CRC calculation.
106+
* @return The CRC-8 value for the given buffer.
107+
*/
108+
private static int crc8MaximDowBuffer(byte[] buffer) {
109+
int crc = CRC8_MAXIM_DOW_INITIAL;
110+
for (byte b : buffer) {
111+
crc = crc8MaximDowByte(crc, b & 0xFF); // Convert byte to unsigned integer
112+
}
113+
crc ^= CRC8_MAXIM_DOW_FINAL;
114+
return crc;
115+
}
116+
117+
/**
118+
* Converts a list of integers to a byte array.
119+
*
120+
* @param integerList The list of integers to be converted.
121+
* @return The byte array representing the converted integers.
122+
*/
123+
private static byte[] convertListToByteArray(List<Integer> integerList) {
124+
byte[] byteArray = new byte[integerList.size()];
125+
for (int i = 0; i < integerList.size(); i++) {
126+
byteArray[i] = integerList.get(i).byteValue();
127+
}
128+
return byteArray;
129+
}
130+
131+
/**
132+
* Converts an array of bytes to its hexadecimal representation.
133+
*
134+
* @param bytes the array of bytes to be converted
135+
* @return the hexadecimal representation of the input byte array
136+
*/
137+
private static String convertToHex(byte [] bytes) {
138+
byte[] hexChars = new byte[bytes.length * 2];
139+
for (int j = 0; j < bytes.length; j++) {
140+
int v = bytes[j] & 0xFF;
141+
hexChars[j * 2] = HEX_ARRAY[v >>> 4];
142+
hexChars[j * 2 + 1] = HEX_ARRAY[v & 0x0F];
143+
}
144+
return new String(hexChars, StandardCharsets.UTF_8).toLowerCase();
145+
}
146+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package com.scanoss.utils;
2+
3+
public class WinnowingUtils {

    /**
     * Normalise the given character for fingerprinting.
     * <p>
     * Digits and lowercase ASCII letters are returned unchanged, uppercase ASCII letters are
     * folded to lowercase, and every other character is mapped to 0.
     * </p>
     *
     * @param c character to normalise
     * @return normalised character, or 0 if the character is discarded
     */
    public static char normalize(char c) {
        boolean digit = c >= '0' && c <= '9';
        boolean lowercase = c >= 'a' && c <= 'z';
        if (digit || lowercase) {
            return c;
        }
        boolean uppercase = c >= 'A' && c <= 'Z';
        if (uppercase) {
            // ASCII upper and lower case letters are 32 code points apart
            return (char) (c + 32);
        }
        return 0;
    }
}

src/test/java/com/scanoss/TestWinnowing.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
import java.io.IOException;
3434
import java.util.Arrays;
3535
import java.util.regex.Pattern;
36-
3736
import static org.junit.Assert.*;
3837

3938
@Slf4j
@@ -92,6 +91,28 @@ public void TestWinnowingPositive() {
9291
log.info("Finished {} -->", methodName);
9392
}
9493

94+
@Test
95+
public void TestWinnowingContentsHPSM() {
96+
String methodName = new Object() {
97+
}.getClass().getEnclosingMethod().getName();
98+
log.info("<-- Starting {}", methodName);
99+
Winnowing winnowing = Winnowing.builder().hpsm(true).build();
100+
101+
102+
byte[] contents = "sample c code with lots of code that we should analyse\nAnd even more code to get connected.\nAnd we need to get this as long as possible, in order to trigger snippet matching.\nHere comes more code to help get this working.\nPlease help get this across the line. We need all the help we can get.\n".getBytes();
103+
String wfp = winnowing.wfpForContents("local-file.c", false, contents);
104+
assertNotNull(wfp);
105+
assertFalse(wfp.isEmpty());
106+
assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" +
107+
"hpsm=df13c104d4\n" +
108+
"3=0ed5027a,a9442399,d019b836\n" +
109+
"4=613d56c0\n" +
110+
"5=828b5fe0\n",wfp);
111+
log.info("TestWinnowingContents - WFP contents: {}", wfp);
112+
113+
log.info("Finished {} -->", methodName);
114+
}
115+
95116
@Test
96117
public void TestWinnowingContents() {
97118
String methodName = new Object() {

0 commit comments

Comments
 (0)