Standardize input/output read name buffer separator.

samtools · Jan 7, 2025 · 91c1cce · 91c1cce
1 parent bad9ece
commit 91c1cce
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 22 deletions.
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
@@ -15,22 +15,22 @@
 
 public class NameTokenisationDecode {
     //TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests
-
     // use a single byte to separate the names in the decoded buffer; this particular byte is chosen because the
     // calling code in the CRAM reader for read names already assumes the decompressed data will be a block of
     // BYTE_ARRAY_STOP '\0' separated names
     public final static byte NAME_SEPARATOR = 0;
     public final static CharSequence LOCAL_NAME_SEPARATOR_CHARSEQUENCE = new String(new byte[] {NAME_SEPARATOR});
-    public static final int UNCOMPRESSED_LENGTH_ADJUSTMENT = 1;
 
     public static final int DEFAULT_POSITION_ALLOCATION = 30;
 
-    // the input must be a ByteBuffer containing the read names, separated by the NAME_SEPARATOR byte, WITHOUT
-    // a terminating separator
-    //TODO: this needs a stop byte parameter
+    /**
+     * Returns a byte[] containing the read names, each terminated by a NAME_SEPARATOR byte (that is, the last
+     * name is also followed by a separator).
+     */
+    //TODO: the caller needs to be able to specify the stop/separator byte via a parameter
     public byte[] uncompress(final ByteBuffer inBuffer) {
         inBuffer.order(ByteOrder.LITTLE_ENDIAN);
-        final int uncompressedLength = inBuffer.getInt() - UNCOMPRESSED_LENGTH_ADJUSTMENT;
+        final int uncompressedLength = inBuffer.getInt();
 
         final int numNames = inBuffer.getInt() & 0xFFFFFFFF;
         final int useArith = inBuffer.get() & 0xFF;
@@ -46,11 +46,8 @@ public byte[] uncompress(final ByteBuffer inBuffer) {
         final List<List<String>> decodedNameTokens = new ArrayList<>(numNames);
         final ByteBuffer decodedNames = CompressionUtils.allocateByteBuffer(uncompressedLength);
         for (int i = 0; i < numNames; i++) {
-            final byte[] b = decodeSingleName(tokenStreams, decodedNameTokens, i);
-            decodedNames.put(b);
-            if (i != numNames - 1) {
-                decodedNames.put((byte) '\0');
-            }
+            decodedNames.put(decodeSingleName(tokenStreams, decodedNameTokens, i));
+            decodedNames.put((byte) NAME_SEPARATOR);
         }
         return decodedNames.array();
     }

diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java
@@ -46,16 +46,17 @@ public class NameTokenisationEncode {
     private int maxStringValueLength; // longest *String* value for any token
 
     /**
-     * Output format is the read names, separated by the NAME_SEPARATOR byte, WITHOUT a terminating separator
+     * The input buffer must contain the read names, each terminated by the NAME_SEPARATOR byte (including the last name).
      */
+    //TODO: the caller needs to be able to specify the stop byte via parameter
     public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) {
         // strictly speaking, keeping this list isn't necessary, but since the first thing that we need to write
         // to the output stream is the number of names, we have to scan the entire input anyway to count them,
         // so just extract them while we're scanning
         final List<String> namesToEncode = extractInputNames(inBuffer, CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE);
         final int numNames = namesToEncode.size();
         // compensate for the separator at the end of the last name that is not present in the local implementation
-        final int uncompressedDataSize = Integer.max(0, inBuffer.limit() + NameTokenisationDecode.UNCOMPRESSED_LENGTH_ADJUSTMENT);
+        final int uncompressedDataSize = Integer.max(0, inBuffer.limit());
 
         //TODO: guess max size -> str.length*2 + 10000 (from htscodecs javascript code)
         // what if this is exceeded ?
@@ -179,20 +180,20 @@ private List<EncodeToken> tokeniseName(
     }
 
     // extract the individual names from the input buffer and return in a list
+    // TODO: this needs a parameter to specify the separator
     private static List<String> extractInputNames(final ByteBuffer inBuffer, final int preAllocationSize) {
         final List<String> names = new ArrayList(preAllocationSize);
         for (int lastPosition = inBuffer.position(); inBuffer.hasRemaining();) {
             final byte currentByte = inBuffer.get();
-            if (currentByte == NameTokenisationDecode.NAME_SEPARATOR || inBuffer.position() == inBuffer.limit()) {
+            if (currentByte == NameTokenisationDecode.NAME_SEPARATOR) {
                 final int length = inBuffer.position() - lastPosition;
                 final byte[] bytes = new byte[length];
                 inBuffer.position(lastPosition);
                 inBuffer.get(bytes, 0, length);  // consume the string + the terminator
                 names.add(new String(
                         bytes,
                         0,
-                        //TODO: special case handling end of the buffer, where there is no trailing separator
-                        length - (inBuffer.position() == inBuffer.limit() ? 0 : 1),
+                        length - 1, // don't include the separator in the string
                         StandardCharsets.UTF_8));
                 lastPosition = inBuffer.position();
             }

diff --git a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
@@ -136,9 +136,8 @@ private static String getUnCompressedFileNameFromCompressedFileName(final String
 
     // translate an htslib interop stream into the stream format used by the htsjdk name tokenization codec
     private ByteBuffer convertHTSLIBToHTSJDKStreamFormat(final ByteBuffer htslibBuffer) {
-        // don't include the terminating delimiter that htslib interop streams use because htsjdk doesn't use a terminator
-        final ByteBuffer translatedBuffer = ByteBuffer.allocate(htslibBuffer.limit() - 1);
-        for (int i = 0; i < htslibBuffer.limit() - 1; i++) {
+        final ByteBuffer translatedBuffer = ByteBuffer.allocate(htslibBuffer.limit());
+        for (int i = 0; i < htslibBuffer.limit(); i++) {
             if (htslibBuffer.get(i) == HTSLIB_NAME_SEPARATOR) {
                 translatedBuffer.put(i, NameTokenisationDecode.NAME_SEPARATOR);
             } else {

diff --git a/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java b/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java
@@ -44,7 +44,7 @@ public Object[][] getNameTokenisationTestData() {
                 "20FUKAAXX100202:1:45:12798:104365" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
                 "20GAVAAXX100126:3:23:6419:199245" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
                 "20FUKAAXX100202:8:48:6663:137967" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
-                "20FUKAAXX100202:6:68:17726:162601"); //TODO: no trailing separator...for now
+                "20FUKAAXX100202:6:68:17726:162601" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE);
 
         // a subset of read names from
         // src/test/resources/htsjdk/samtools/longreads/NA12878.m64020_190210_035026.chr21.5011316.5411316.unmapped.bam
@@ -69,12 +69,12 @@ public Object[][] getNameTokenisationTestData() {
                 "m64020_190210_035026/147719983/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
                 "m64020_190210_035026/60883331/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
                 "m64020_190210_035026/1116165/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
-                "m64020_190210_035026/75893199/ccs"); //TODO:no trailing separator for now
+                "m64020_190210_035026/75893199/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE);
 
         // source: https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups
         readNamesList.add(
                 "H0164ALXX140820:2:1101:10003:23460" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE +
-                "H0164ALXX140820:2:1101:15118:25288"); //TODO: no trailing separator...for now
+                "H0164ALXX140820:2:1101:15118:25288" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE);
 
         final List<Object[]> testCases = new ArrayList<>();
         for (final String readName : readNamesList) {