Skip to content

Commit

Permalink
SP-431 Adds HPSM support
Browse files Browse the repository at this point in the history
Co-authored-by: Alejandro Perez <[email protected]>
  • Loading branch information
agustingroh committed Apr 4, 2024
1 parent db510fb commit cbbb141
Show file tree
Hide file tree
Showing 9 changed files with 213 additions and 32 deletions.
2 changes: 2 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Upcoming changes...

## [0.7.0] - 2024-04-04
### Added
- Add HPSM support

## [0.6.1] - 2024-04-01
### Changed
- Fixed issue with SBOM ingestion
Expand Down Expand Up @@ -74,3 +78,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[0.5.5]: https://github.com/scanoss/scanoss.java/compare/v0.5.4...v0.5.5
[0.6.0]: https://github.com/scanoss/scanoss.java/compare/v0.5.5...v0.6.0
[0.6.1]: https://github.com/scanoss/scanoss.java/compare/v0.6.0...v0.6.1
[0.7.0]: https://github.com/scanoss/scanoss.java/compare/v0.6.1...v0.7.0
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Include in a maven project using:
<dependency>
<groupId>com.scanoss</groupId>
<artifactId>scanoss</artifactId>
<version>0.6.0</version>
<version>0.7.0</version>
</dependency>
```

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.scanoss</groupId>
<artifactId>scanoss</artifactId>
<version>0.6.1</version>
<version>0.7.0</version>
<packaging>jar</packaging>
<name>scanoss.java</name>
<url>https://github.com/scanoss/scanoss.java</url>
Expand Down
38 changes: 10 additions & 28 deletions src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
package com.scanoss;

import com.scanoss.exceptions.WinnowingException;
import lombok.Builder;
import lombok.Getter;
import lombok.NonNull;
import lombok.Setter;
import com.scanoss.utils.Hpsm;
import com.scanoss.utils.WinnowingUtils;
import lombok.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.Tika;
Expand Down Expand Up @@ -66,7 +65,7 @@ public class Winnowing {
@Builder.Default
private Boolean obfuscate = Boolean.FALSE; // Obfuscate file path
@Builder.Default
private Boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
@Builder.Default
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation

Expand Down Expand Up @@ -98,7 +97,6 @@ public String wfpForFile(@NonNull String filePath, @NonNull String path) throws
}
}


/**
* Generate a WFP for the given file contents
*
Expand All @@ -119,7 +117,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
return wfpBuilder.toString();
}
// TODO add HPSM support here

if(this.isHpsm()){
wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
}

String gram = "";
List<Long> window = new ArrayList<>();
char normalized;
Expand All @@ -133,7 +135,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
line++;
normalized = 0;
} else {
normalized = normalize(c);
normalized = WinnowingUtils.normalize(c);
}
if (normalized > 0) {
gram += normalized;
Expand Down Expand Up @@ -180,7 +182,6 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c

/**
* Determine if a file/contents should be skipped for snippet generation or not
*
* @param filename filename for the contents (optional)
* @param contents file contents
* @return <code>true</code> if we should skip snippets, <code>false</code> otherwise
Expand Down Expand Up @@ -300,7 +301,6 @@ private Boolean isTextMediaType(MediaType mediaType) {
return mediaTypes.stream().anyMatch(mt -> mt.getType().equals("text"));
}


/**
* Convert the given number to a Little Endian encoded byte
*
Expand All @@ -316,24 +316,6 @@ private byte[] toLittleEndian(long number) {
return b;
}

/**
* Normalise the given character for fingerprinting.
* <p>
* Only ASCII digits and letters survive: digits and lower-case letters are returned unchanged,
* upper-case letters are folded to lower case, and every other character maps to <code>0</code>.
* </p>
*
* @param c character to normalise
* @return normalised character, or <code>0</code> if the character should be ignored
*/
private char normalize(char c) {
// Anything outside the '0'..'z' range can never be a digit or a letter
if (c < '0' || c > 'z') {
return 0;
// Within '0'..'z': at or below '9' is a digit, at or above 'a' is a lower-case letter
} else if (c <= '9' || c >= 'a') {
return c;
// Upper-case letters are shifted by 32 to their lower-case ASCII counterpart
} else if (c >= 'A' && c <= 'Z') {
return (char) (c + 32);
// Remaining characters sit between '9' and 'A' or between 'Z' and 'a' (punctuation)
} else {
return 0;
}
}

/**
* Calculate the CRC32 for the given string
*
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/com/scanoss/cli/ScanCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ class ScanCommandLine implements Runnable {
@picocli.CommandLine.Option(names = {"--proxy"}, description = "HTTP Proxy URL (optional)")
private String proxyString;

@picocli.CommandLine.Option(names = {"-H", "--hpsm"}, description = "Use High Precision Snippet Matching algorithm")
private boolean enableHpsm = false;

@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to scan")
private String fileFolder;

Expand Down Expand Up @@ -160,7 +163,7 @@ public void run() {
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey)
.retryLimit(retryLimit).timeout(Duration.ofSeconds(timeoutLimit)).scanFlags(scanFlags)
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy)
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy).hpsm(enableHpsm)
.build();
File f = new File(fileFolder);
if (!f.exists()) {
Expand Down
146 changes: 146 additions & 0 deletions src/main/java/com/scanoss/utils/Hpsm.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package com.scanoss.utils;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
* SCANOSS Hpsm Class
* <p>
* The Hpsm class provides all the necessary implementations to generate HPSM fingerprint for a given file or contents.
* </p>
*/
public class Hpsm {

    // CRC-8 (Maxim/Dow) parameters: table size, polynomial, initial CRC and post-CRC XOR value.
    private static final int CRC8_MAXIM_DOW_TABLE_SIZE = 0x100;
    private static final int CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C; // 0x31 reflected
    private static final int CRC8_MAXIM_DOW_INITIAL = 0x00; // 0x00 reflected
    private static final int CRC8_MAXIM_DOW_FINAL = 0x00; // 0x00 reflected

    // Value emitted for a newline that directly follows the previous newline position.
    private static final int EMPTY_LINE_MARKER = 0xFF;
    // Value emitted for a line that has characters, none of which survive normalisation.
    private static final int NO_NORMALIZED_CHARS_MARKER = 0x00;

    // Lookup table built exactly once at class initialisation and never modified afterwards.
    private static final int[] CRC8_MAXIM_DOW_TABLE = crc8MaximDowGenerateTable();

    /**
     * Calculates the HPSM value for the given content, represented as an array of bytes.
     * <p>
     * Each newline-terminated line is normalised with {@link WinnowingUtils#normalize(char)} and
     * reduced to a single CRC-8 byte; the bytes are returned as one lower-case hexadecimal string
     * (two characters per line). Characters after the last newline are not part of the result.
     * </p>
     *
     * @param content the content as an array of bytes (must not be <code>null</code>)
     * @return the HPSM value in lower-case hexadecimal format (empty if there is no newline)
     * @throws NullPointerException if <code>content</code> is <code>null</code>
     */
    public static String calcHpsm(byte[] content) {
        StringBuilder hex = new StringBuilder();
        int crc = CRC8_MAXIM_DOW_INITIAL;
        boolean lineHasNormalizedChars = false;
        // NOTE(review): starts at 0 rather than -1, so a newline at index 0 emits nothing and a
        // newline at index 1 always counts as "empty". Preserved so existing fingerprints stay stable.
        int lastLine = 0;

        for (int i = 0; i < content.length; i++) {
            char c = (char) content[i];
            if (c == '\n') { // When there is a new line
                if (lineHasNormalizedChars) {
                    appendHexByte(hex, crc ^ CRC8_MAXIM_DOW_FINAL);
                    crc = CRC8_MAXIM_DOW_INITIAL;
                    lineHasNormalizedChars = false;
                } else if (lastLine + 1 == i) {
                    appendHexByte(hex, EMPTY_LINE_MARKER);
                } else if (i - lastLine > 1) {
                    appendHexByte(hex, NO_NORMALIZED_CHARS_MARKER);
                }
                lastLine = i;
            } else {
                char normalized = WinnowingUtils.normalize(c);
                if (normalized != 0) {
                    // Fold the character into the running CRC instead of buffering the whole line
                    crc = crc8MaximDowByte(crc, normalized & 0xFF);
                    lineHasNormalizedChars = true;
                }
            }
        }
        return hex.toString();
    }

    /**
     * Calculates CRC-8 using the Maxim/Dallas polynomial without using a lookup table.
     * Used here only to populate the lookup table.
     *
     * @param crc The current CRC value.
     * @param b The byte to be processed.
     * @return The updated CRC value after processing the byte.
     */
    private static int crc8MaximDowByteNoTable(int crc, int b) {
        crc ^= b;
        for (int count = 0; count < 8; count++) {
            boolean isSet = (crc & 0x01) != 0;
            crc >>= 1;
            if (isSet) {
                crc ^= CRC8_MAXIM_DOW_POLYNOMIAL;
            }
        }
        return crc;
    }

    /**
     * Generates the lookup table for CRC-8 using the Maxim/Dallas polynomial.
     *
     * @return a new table with one pre-computed CRC per possible byte value
     */
    private static int[] crc8MaximDowGenerateTable() {
        int[] table = new int[CRC8_MAXIM_DOW_TABLE_SIZE];
        for (int i = 0; i < table.length; i++) {
            table[i] = crc8MaximDowByteNoTable(0, i);
        }
        return table;
    }

    /**
     * Calculates CRC-8 using the Maxim/Dow polynomial with the pre-generated lookup table.
     *
     * @param crc The current CRC value (0..255).
     * @param b The unsigned byte to be processed (0..255).
     * @return The updated CRC value after processing the byte.
     */
    private static int crc8MaximDowByte(int crc, int b) {
        // The CRC never exceeds 8 bits, so the table entry alone is the complete result
        return CRC8_MAXIM_DOW_TABLE[(b ^ crc) & 0xFF];
    }

    /**
     * Appends the low 8 bits of the given value as two lower-case hexadecimal digits.
     *
     * @param out destination buffer
     * @param value value whose low byte is appended
     */
    private static void appendHexByte(StringBuilder out, int value) {
        int v = value & 0xFF;
        out.append(Character.forDigit(v >>> 4, 16));
        out.append(Character.forDigit(v & 0x0F, 16));
    }
}
22 changes: 22 additions & 0 deletions src/main/java/com/scanoss/utils/WinnowingUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package com.scanoss.utils;

public class WinnowingUtils {

    /**
     * Normalise a character for fingerprinting.
     * <p>
     * ASCII digits and lower-case letters are kept as they are, upper-case ASCII letters are
     * converted to lower case, and any other character is mapped to <code>0</code>.
     * </p>
     *
     * @param c character to normalise
     * @return the normalised character, or <code>0</code> when the character is to be ignored
     */
    public static char normalize(char c) {
        boolean digit = c >= '0' && c <= '9';
        boolean lowerCase = c >= 'a' && c <= 'z';
        if (digit || lowerCase) {
            return c;
        }
        boolean upperCase = c >= 'A' && c <= 'Z';
        if (upperCase) {
            // Upper- and lower-case ASCII letters are exactly 32 code points apart
            return (char) (c + 32);
        }
        return (char) 0;
    }
}
23 changes: 22 additions & 1 deletion src/test/java/com/scanoss/TestWinnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import static org.junit.Assert.*;

@Slf4j
Expand Down Expand Up @@ -92,6 +91,28 @@ public void TestWinnowingPositive() {
log.info("Finished {} -->", methodName);
}

/**
 * Verifies that enabling HPSM adds an <code>hpsm=</code> line to the generated WFP,
 * placed between the <code>file=</code> header and the snippet fingerprints.
 */
@Test
public void TestWinnowingContentsHPSM() {
    String methodName = new Object() {
    }.getClass().getEnclosingMethod().getName();
    log.info("<-- Starting {}", methodName);
    Winnowing winnowing = Winnowing.builder().hpsm(true).build();

    // Explicit charset so the fingerprinted bytes do not depend on the platform default
    byte[] contents = "sample c code with lots of code that we should analyse\nAnd even more code to get connected.\nAnd we need to get this as long as possible, in order to trigger snippet matching.\nHere comes more code to help get this working.\nPlease help get this across the line. We need all the help we can get.\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String wfp = winnowing.wfpForContents("local-file.c", false, contents);
    assertNotNull(wfp);
    // Log before the content assertions so the actual WFP is visible when one of them fails
    log.info("{} - WFP contents: {}", methodName, wfp);
    assertFalse(wfp.isEmpty());
    assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" +
            "hpsm=df13c104d4\n" +
            "3=0ed5027a,a9442399,d019b836\n" +
            "4=613d56c0\n" +
            "5=828b5fe0\n", wfp);

    log.info("Finished {} -->", methodName);
}

@Test
public void TestWinnowingContents() {
String methodName = new Object() {
Expand Down

0 comments on commit cbbb141

Please sign in to comment.