Update name separator handling.

samtools · Jan 7, 2025 · ce90f4d · ce90f4d
1 parent 91c1cce
commit ce90f4d
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 72 deletions.
diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java
@@ -14,21 +14,22 @@
 // so we don't have to repeatedly interconvert them when fetching from this list
 
 public class NameTokenisationDecode {
-    //TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests
-    // use a single byte to separate the names in the decoded buffer; this particular byte is chosen because the
-    // calling code in the CRAM reader for read names already assumes the decompressed data will be a block of
-    // BYTE_ARRAY_STOP '\0' separated names
+    //TODO: lift this value to a common location since it's used by the encoder, the decoder, and the tests
+    // use a single byte to separate the names in the buffer used for encoding/decoding; this particular byte is
+    // chosen because the calling code in the CRAM reader for read names already assumes the decompressed
+    // data will be a block of BYTE_ARRAY_STOP '\0' separated names
     public final static byte NAME_SEPARATOR = 0;
-    public final static CharSequence LOCAL_NAME_SEPARATOR_CHARSEQUENCE = new String(new byte[] {NAME_SEPARATOR});
 
     public static final int DEFAULT_POSITION_ALLOCATION = 30;
 
     /**
-     * Return is a byte[] containing the read names, each separated by a NAME_SEPARATOR byte, including a terminating
-     * separator.
+     * Uncompress the compressed name data in the input buffer. Return is a byte[] containing the read names,
+     * each separated by the byte value specified by nameSeparator, including a terminating separator.
+     * @param inBuffer the buffer to uncompress
+     * @param nameSeparator the name separator byte to use in the output buffer
+     * @return the uncompressed read names
      */
-    //TODO: the caller needs to be able to specify the stop/separator byte via a parameter
-    public byte[] uncompress(final ByteBuffer inBuffer) {
+    public byte[] uncompress(final ByteBuffer inBuffer, final byte nameSeparator) {
         inBuffer.order(ByteOrder.LITTLE_ENDIAN);
         final int uncompressedLength = inBuffer.getInt();
 
@@ -47,7 +48,7 @@ public byte[] uncompress(final ByteBuffer inBuffer) {
         final ByteBuffer decodedNames = CompressionUtils.allocateByteBuffer(uncompressedLength);
         for (int i = 0; i < numNames; i++) {
             decodedNames.put(decodeSingleName(tokenStreams, decodedNameTokens, i));
-            decodedNames.put((byte) NAME_SEPARATOR);
+            decodedNames.put(nameSeparator);
         }
         return decodedNames.array();
     }

diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java
@@ -18,10 +18,6 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-//TODO:its super wasteful (but simpler) to always store the accumulated tokens as Strings, since this results
-// int lots of String<-> int interconversions
-//TODO: enforce a maximum of 128 tokens
-
 /**
  * A very naive implementation of a name tokenization encoder.
  *
@@ -46,21 +42,28 @@ public class NameTokenisationEncode {
     private int maxStringValueLength; // longest *String* value for any token
 
     /**
-     * Input buffer format is the read names, separated by the NAME_SEPARATOR byte, including a terminal separator.
+     * Compress the input buffer of read names.
+     * @param inBuffer formatted as read names separated by the byte specified by the nameSeparator parameter
+     * @param useArith true if the arithmetic coder should be used
+     * @param nameSeparator name separator
+     * @return the compressed buffer
      */
-    //TODO: the caller needs to be able to specify the stop byte via parameter
-    public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) {
+    public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith, final byte nameSeparator) {
         // strictly speaking, keeping this list isn't necessary, but since the first thing that we need to write
         // to the output stream is the number of names, we have to scan the entire input anyway to count them,
         // so just extract them while we're scanning
-        final List<String> namesToEncode = extractInputNames(inBuffer, CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE);
+        final List<String> namesToEncode = extractInputNames(
+                inBuffer,
+                CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE,
+                nameSeparator);
         final int numNames = namesToEncode.size();
-        // compensate for the separator at the end of the last name that is not present in the local implementation
         final int uncompressedDataSize = Integer.max(0, inBuffer.limit());
 
-        //TODO: guess max size -> str.length*2 + 10000 (from htscodecs javascript code)
-        // what if this is exceeded ?
-        final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer((inBuffer.limit()*2)+10000);
+        // pre-allocate the output buffer; we don't know how big it will be. instead of implementing a wrapper around
+        // the ByteBuffer to allow for dynamic resizing, over-allocate; if the writer ever exceeds this, writing will
+        // fail with an exception, but it would indicate a serious error somewhere in the writer
+        final int outputLen = inBuffer.limit() * 2;
+        final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer(outputLen);
         outBuffer.putInt(uncompressedDataSize);
         outBuffer.putInt(numNames);
         outBuffer.put((byte)(useArith == true ? 1 : 0));
@@ -91,7 +94,9 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) {
             serializeTokenStreams(streamsForPosition, outBuffer, useArith);
         }
 
-        outBuffer.flip(); // set the limit to current position, and reset position to '0'
+        // set the limit to current position (important because we initially dramatically over-allocated the buffer,
+        // so make sure the caller doesn't go past the actual limit), and reset position to '0'
+        outBuffer.flip();
         return outBuffer;
     }
 
@@ -180,12 +185,14 @@ private List<EncodeToken> tokeniseName(
     }
 
     // extract the individual names from the input buffer and return in a list
-    // TODO: this needs a parameter to specify the separator
-    private static List<String> extractInputNames(final ByteBuffer inBuffer, final int preAllocationSize) {
+    private static List<String> extractInputNames(
+            final ByteBuffer inBuffer,
+            final int preAllocationSize,
+            final byte nameSeparator) {
         final List<String> names = new ArrayList(preAllocationSize);
         for (int lastPosition = inBuffer.position(); inBuffer.hasRemaining();) {
             final byte currentByte = inBuffer.get();
-            if (currentByte == NameTokenisationDecode.NAME_SEPARATOR) {
+            if (currentByte == nameSeparator) {
                 final int length = inBuffer.position() - lastPosition;
                 final byte[] bytes = new byte[length];
                 inBuffer.position(lastPosition);

diff --git a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java
@@ -29,7 +29,7 @@ public class NameTokenizationInteropTest extends HtsjdkTest {
 
     @DataProvider(name = "allNameTokInteropTests")
     public Object[][] getAllNameTokenizationInteropTests() throws IOException {
-        // compressed path (htslib interop preCompressed file), raw (unCompressed) path, useArith (used for round tripping only)
+        // raw (unCompressed) path, useArith
         final List<Object[]> testCases = new ArrayList<>();
         for (final Path preCompressedInteropPath : getPreCompressedInteropNameTokTestPaths()) {
             for (boolean useArith: new boolean[]{true, false}) {
@@ -51,16 +51,22 @@ public void testNameTokRoundTrip(
         try (final InputStream unCompressedInteropStream = Files.newInputStream(unCompressedInteropPath)) {
             // convert the uncompressed data from htslib to the unCompressed format used to pass data in/out of the htsjdk name tok codec
             final ByteBuffer unCompressedInteropBytes = convertHTSLIBToHTSJDKStreamFormat(
-                    ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream))
+                    ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream)),
+                    NameTokenisationDecode.NAME_SEPARATOR
             );
 
             // Use htsjdk to compress the uncompressed data with the provided useArith flag
             final NameTokenisationEncode nameEncoder = new NameTokenisationEncode();
-            final ByteBuffer compressedHtsjdkBytes = nameEncoder.compress(unCompressedInteropBytes, useArith);
+            final ByteBuffer compressedHtsjdkBytes = nameEncoder.compress(
+                    unCompressedInteropBytes,
+                    useArith,
+                    NameTokenisationDecode.NAME_SEPARATOR);
 
             // Now use htsjdk to uncompress the data we just compressed
             final NameTokenisationDecode nameDecoder = new NameTokenisationDecode();
-            final ByteBuffer unCompressedHtsjdkBytes = ByteBuffer.wrap(nameDecoder.uncompress(compressedHtsjdkBytes));
+            final ByteBuffer unCompressedHtsjdkBytes = ByteBuffer.wrap(nameDecoder.uncompress(
+                    compressedHtsjdkBytes,
+                    NameTokenisationDecode.NAME_SEPARATOR));
 
             // compare to the original (ByteBuffers have to have identical positions in order to be equal (!),
             // so rewind both buffers before comparing)
@@ -93,12 +99,15 @@ public void testNameTokUnCompress(
             final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream));
             // convert the uncompressed data from htslib to the unCompressed format used to pass data in/out of the htsjdk name tok codec
             final ByteBuffer uncompressedInteropBytes = convertHTSLIBToHTSJDKStreamFormat(
-                    ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream))
+                    ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream)),
+                    NameTokenisationDecode.NAME_SEPARATOR
             );
 
             // Use htsjdk to uncompress the precompressed file from htscodecs repo
             final NameTokenisationDecode nameTokenisationDecode = new NameTokenisationDecode();
-            final ByteBuffer uncompressedHtsjdkBytes = ByteBuffer.wrap(nameTokenisationDecode.uncompress(preCompressedInteropBytes));
+            final ByteBuffer uncompressedHtsjdkBytes = ByteBuffer.wrap(
+                    nameTokenisationDecode.uncompress(preCompressedInteropBytes, NameTokenisationDecode.NAME_SEPARATOR)
+            );
 
             // Compare the htsjdk uncompressed bytes with the original input file from htscodecs repo
             Assert.assertEquals(uncompressedHtsjdkBytes, uncompressedInteropBytes);
@@ -135,11 +144,11 @@ private static String getUnCompressedFileNameFromCompressedFileName(final String
     }
 
     // translate an htslib interop stream into the stream format used by the htsjdk name tokenization codec
-    private ByteBuffer convertHTSLIBToHTSJDKStreamFormat(final ByteBuffer htslibBuffer) {
+    private ByteBuffer convertHTSLIBToHTSJDKStreamFormat(final ByteBuffer htslibBuffer, final byte newSeparator) {
         final ByteBuffer translatedBuffer = ByteBuffer.allocate(htslibBuffer.limit());
         for (int i = 0; i < htslibBuffer.limit(); i++) {
             if (htslibBuffer.get(i) == HTSLIB_NAME_SEPARATOR) {
-                translatedBuffer.put(i, NameTokenisationDecode.NAME_SEPARATOR);
+                translatedBuffer.put(i, newSeparator);
             } else {
                 translatedBuffer.put(i, htslibBuffer.get(i));
             }