Skip to content

Commit

Permalink
Standardize input/output read name buffer separator.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Jan 7, 2025
1 parent bad9ece commit 91c1cce
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,22 @@

public class NameTokenisationDecode {
//TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests

// use a single byte to separate the names in the decoded buffer; this particular byte is chosen because the
// calling code in the CRAM reader for read names already assumes the decompressed data will be a block of
// BYTE_ARRAY_STOP '\0' separated names
public final static byte NAME_SEPARATOR = 0;
public final static CharSequence LOCAL_NAME_SEPARATOR_CHARSEQUENCE = new String(new byte[] {NAME_SEPARATOR});
public static final int UNCOMPRESSED_LENGTH_ADJUSTMENT = 1;

public static final int DEFAULT_POSITION_ALLOCATION = 30;

// the input must be a ByteBuffer containing the read names, separated by the NAME_SEPARATOR byte, WITHOUT
// a terminating separator
//TODO: this needs a stop byte parameter
/**
* Returns a byte[] containing the read names, each separated by a NAME_SEPARATOR byte, including a terminating
* separator.
*/
//TODO: the caller needs to be able to specify the stop/separator byte via a parameter
public byte[] uncompress(final ByteBuffer inBuffer) {
inBuffer.order(ByteOrder.LITTLE_ENDIAN);
final int uncompressedLength = inBuffer.getInt() - UNCOMPRESSED_LENGTH_ADJUSTMENT;
final int uncompressedLength = inBuffer.getInt();

final int numNames = inBuffer.getInt() & 0xFFFFFFFF;
final int useArith = inBuffer.get() & 0xFF;
Expand All @@ -46,11 +46,8 @@ public byte[] uncompress(final ByteBuffer inBuffer) {
final List<List<String>> decodedNameTokens = new ArrayList<>(numNames);
final ByteBuffer decodedNames = CompressionUtils.allocateByteBuffer(uncompressedLength);
for (int i = 0; i < numNames; i++) {
final byte[] b = decodeSingleName(tokenStreams, decodedNameTokens, i);
decodedNames.put(b);
if (i != numNames - 1) {
decodedNames.put((byte) '\0');
}
decodedNames.put(decodeSingleName(tokenStreams, decodedNameTokens, i));
decodedNames.put((byte) NAME_SEPARATOR);
}
return decodedNames.array();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,17 @@ public class NameTokenisationEncode {
private int maxStringValueLength; // longest *String* value for any token

/**
* Output format is the read names, separated by the NAME_SEPARATOR byte, WITHOUT a terminating separator
* Input buffer format is the read names, separated by the NAME_SEPARATOR byte, including a terminating separator.
*/
//TODO: the caller needs to be able to specify the stop byte via parameter
public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) {
// strictly speaking, keeping this list isn't necessary, but since the first thing that we need to write
// to the output stream is the number of names, we have to scan the entire input anyway to count them,
// so just extract them while we're scanning
final List<String> namesToEncode = extractInputNames(inBuffer, CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE);
final int numNames = namesToEncode.size();
// compensate for the separator at the end of the last name that is not present in the local implementation
final int uncompressedDataSize = Integer.max(0, inBuffer.limit() + NameTokenisationDecode.UNCOMPRESSED_LENGTH_ADJUSTMENT);
final int uncompressedDataSize = Integer.max(0, inBuffer.limit());

//TODO: guess max size -> str.length*2 + 10000 (from htscodecs javascript code)
// what if this is exceeded ?
Expand Down Expand Up @@ -179,20 +180,20 @@ private List<EncodeToken> tokeniseName(
}

// extract the individual names from the input buffer and return in a list
// TODO: this needs a parameter to specify the separator
private static List<String> extractInputNames(final ByteBuffer inBuffer, final int preAllocationSize) {
final List<String> names = new ArrayList(preAllocationSize);
for (int lastPosition = inBuffer.position(); inBuffer.hasRemaining();) {
final byte currentByte = inBuffer.get();
if (currentByte == NameTokenisationDecode.NAME_SEPARATOR || inBuffer.position() == inBuffer.limit()) {
if (currentByte == NameTokenisationDecode.NAME_SEPARATOR) {
final int length = inBuffer.position() - lastPosition;
final byte[] bytes = new byte[length];
inBuffer.position(lastPosition);
inBuffer.get(bytes, 0, length); // consume the string + the terminator
names.add(new String(
bytes,
0,
//TODO: special case handling end of the buffer, where there is no trailing separator
length - (inBuffer.position() == inBuffer.limit() ? 0 : 1),
length - 1, // don't include the separator in the string
StandardCharsets.UTF_8));
lastPosition = inBuffer.position();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,8 @@ private static String getUnCompressedFileNameFromCompressedFileName(final String

// translate an htslib interop stream into the stream format used by the htsjdk name tokenization codec
private ByteBuffer convertHTSLIBToHTSJDKStreamFormat(final ByteBuffer htslibBuffer) {
// don't include the terminating delimiter that htslib interop streams use because htsjdk doesn't use a terminator
final ByteBuffer translatedBuffer = ByteBuffer.allocate(htslibBuffer.limit() - 1);
for (int i = 0; i < htslibBuffer.limit() - 1; i++) {
final ByteBuffer translatedBuffer = ByteBuffer.allocate(htslibBuffer.limit());
for (int i = 0; i < htslibBuffer.limit(); i++) {
if (htslibBuffer.get(i) == HTSLIB_NAME_SEPARATOR) {
translatedBuffer.put(i, NameTokenisationDecode.NAME_SEPARATOR);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public Object[][] getNameTokenisationTestData() {
"20FUKAAXX100202:1:45:12798:104365" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"20GAVAAXX100126:3:23:6419:199245" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"20FUKAAXX100202:8:48:6663:137967" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"20FUKAAXX100202:6:68:17726:162601"); //TODO: no trailing separator...for now
"20FUKAAXX100202:6:68:17726:162601" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE);

// a subset of read names from
// src/test/resources/htsjdk/samtools/longreads/NA12878.m64020_190210_035026.chr21.5011316.5411316.unmapped.bam
Expand All @@ -69,12 +69,12 @@ public Object[][] getNameTokenisationTestData() {
"m64020_190210_035026/147719983/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"m64020_190210_035026/60883331/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"m64020_190210_035026/1116165/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"m64020_190210_035026/75893199/ccs"); //TODO:no trailing separator for now
"m64020_190210_035026/75893199/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE);

// source: https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups
readNamesList.add(
"H0164ALXX140820:2:1101:10003:23460" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
"H0164ALXX140820:2:1101:15118:25288"); //TODO: no trailing separator...for now
"H0164ALXX140820:2:1101:15118:25288" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE);

final List<Object[]> testCases = new ArrayList<>();
for (final String readName : readNamesList) {
Expand Down

0 comments on commit 91c1cce

Please sign in to comment.