From b73b922e18ad9395ccd2c681ed7cf47ac87ede60 Mon Sep 17 00:00:00 2001 From: yash-puligundla Date: Wed, 7 Feb 2024 13:55:16 -0500 Subject: [PATCH] Add FQZComp Decoder --- .../compression/fqzcomp/FQZCompDecode.java | 301 ++++++++++++++++++ .../compression/fqzcomp/FQZGlobalFlags.java | 28 ++ .../cram/compression/fqzcomp/FQZModel.java | 56 ++++ .../cram/compression/fqzcomp/FQZParam.java | 205 ++++++++++++ .../cram/compression/fqzcomp/FQZState.java | 88 +++++ .../NameTokenisationDecode.java | 164 ++++++++++ .../NameTokenisationEncode.java | 287 +++++++++++++++++ .../nametokenisation/TokenStreams.java | 125 ++++++++ .../nametokenisation/tokens/EncodeToken.java | 38 +++ .../cram/compression/range/RangeCoder.java | 4 +- .../samtools/cram/FQZCompInteropTest.java | 80 +++++ .../cram/NameTokenizationInteropTest.java | 134 ++++++++ .../NameTokenisationTest.java | 100 ++++++ 13 files changed, 1608 insertions(+), 2 deletions(-) create mode 100644 src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModel.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java create mode 100644 src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java create mode 100644 src/test/java/htsjdk/samtools/cram/FQZCompInteropTest.java create mode 100644 src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java create mode 100644 src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java new file mode 100644 index 0000000000..5933a7bcaf --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZCompDecode.java @@ -0,0 +1,301 @@ +package htsjdk.samtools.cram.compression.fqzcomp; + +import htsjdk.samtools.cram.CRAMException; +import htsjdk.samtools.cram.compression.CompressionUtils; +import htsjdk.samtools.cram.compression.range.ByteModel; +import htsjdk.samtools.cram.compression.range.RangeCoder; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +public class FQZCompDecode { + private static final int NUMBER_OF_SYMBOLS = 256; + + public static ByteBuffer uncompress( final ByteBuffer inBuffer) { + final int bufferLength = CompressionUtils.readUint7(inBuffer); + final int version = inBuffer.get() & 0xFF; + if (version != 5) { + throw new CRAMException("Invalid FQZComp format version number: " + version); + } + final FQZGlobalFlags globalFlags = new FQZGlobalFlags(inBuffer.get() & 0xFF); + final int numParamBlock = globalFlags.isMultiParam()?inBuffer.get() : 1; + int maxSelector = (numParamBlock > 1) ? (numParamBlock - 1) : 0; + final int[] selectorTable = new int[NUMBER_OF_SYMBOLS]; + if (globalFlags.hasSelectorTable()) { + maxSelector = inBuffer.get() & 0xFF; + readArray(inBuffer, selectorTable, NUMBER_OF_SYMBOLS); + } else { + for (int i = 0; i < numParamBlock; i++) { + selectorTable[i] = i; + } + for (int i = numParamBlock; i < NUMBER_OF_SYMBOLS; i++) { + selectorTable[i] = numParamBlock - 1; + } + } + final List fqzParamList = new ArrayList(numParamBlock); + int maxSymbols = 0; // maximum number of distinct Quality values across all param sets + for (int p=0; p < numParamBlock; p++){ + fqzParamList.add(p,decodeFQZSingleParam(inBuffer)); + if(maxSymbols < fqzParamList.get(p).getMaxSymbols()){ + maxSymbols = fqzParamList.get(p).getMaxSymbols(); + } + } + + // main decode loop + int i = 0; + final FQZState fqzState = new FQZState(); + final RangeCoder rangeCoder = new RangeCoder(); + rangeCoder.rangeDecodeStart(inBuffer); + final FQZModel model = fqzCreateModels(maxSymbols, maxSelector); + final List QualityLengths = new ArrayList<>(); + FQZParam params = null; + int last = 0; + final int[] rev = null; + final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer(bufferLength); + while (i 0) + rle[j++] = run; + } + last = run; + } + + // Now expand runs in rle to table, noting 255 is max run + int i = 0; + j = 0; + z = 0; + int part; + while (z < size) { + int run_len = 0; + do { + part = rle[j++]; + run_len += part; + } while (part == 255); + + while (run_len-- > 0) + table[z++] = i; + i++; + } + } + + public static FQZModel fqzCreateModels(final int maxSymbols, final int maxSelector){ + final FQZModel fqzModel = new FQZModel(); + fqzModel.setQuality(new ByteModel[1 << 16]); + for (int i = 0; i < (1 << 16); i++) { + fqzModel.getQuality()[i] = new ByteModel(maxSymbols + 1); // +1 as max value not num. values + } + fqzModel.setLength(new ByteModel[4]); + for (int i = 0; i < 4; i++) { + fqzModel.getLength()[i] = new ByteModel(NUMBER_OF_SYMBOLS); + } + fqzModel.setReverse(new ByteModel(2)); + fqzModel.setDuplicate(new ByteModel(2)); + if (maxSelector > 0) { + fqzModel.setSelector(new ByteModel(maxSelector + 1)); + } + return fqzModel; + } + + // If duplicate returns 1, else 0 + public static void decodeFQZNewRecord( + final ByteBuffer inBuffer, + final RangeCoder rangeCoder, + final FQZModel model, + final FQZState state, + final int maxSelector, + final boolean doReverse, + final int[] selectorTable, + final List fqzParamList, + final int[] rev){ + + // Parameter selector + if (maxSelector > 0) { + state.setSelector(model.getSelector().modelDecode(inBuffer, rangeCoder)); + } else { + state.setSelector(0); + } + state.setSelectorTable(selectorTable[state.getSelector()]); + final FQZParam params = fqzParamList.get(state.getSelectorTable()); + + // Reset contexts at the start of each new record + int len; + if (params.getFixedLen() >= 0) { + // Not fixed or fixed but first record + len = model.getLength()[0].modelDecode(inBuffer, rangeCoder); + len |= model.getLength()[1].modelDecode(inBuffer, rangeCoder) << 8; + len |= model.getLength()[2].modelDecode(inBuffer, rangeCoder) << 16; + len |= model.getLength()[3].modelDecode(inBuffer, rangeCoder) << 24; + if (params.getFixedLen() > 0) { + params.setFixedLen(-len); + } + } else { + len = -params.getFixedLen(); + } + state.setRecordLength(len); + if (doReverse) { + rev[state.getRecordNumber()] = model.getReverse().modelDecode(inBuffer, rangeCoder); + } + state.setIsDuplicate(false); + if (params.isDoDedup()) { + if (model.getDuplicate().modelDecode(inBuffer, rangeCoder) != 0) { + state.setIsDuplicate(true); + } + } + state.setBases(len); // number of remaining bytes in this record + state.setDelta(0); + state.setQualityContext(0); + state.setPreviousQuality(0); + state.setRecordNumber(state.getRecordNumber() + 1); + } + + public static int fqzUpdateContext(final FQZParam params, + final FQZState state, + final int quality){ + + int last = params.getContext(); + state.setQualityContext(((state.getQualityContext() << params.getQualityContextShift()) + params.getQualityContextTable()[quality]) >>> 0); + last += ((state.getQualityContext() & ((1 << params.getQualityContextBits()) - 1)) << params.getQualityContextLocation()) >>> 0; + + if (params.isDoPos()) + last += params.getPositionContextTable()[Math.min(state.getBases(), 1023)] << params.getPositionContextLocation(); + + if (params.isDoDelta()) { + last += params.getDeltaContextTable()[Math.min(state.getDelta(), 255)] << params.getDeltaContextLocation(); + state.setDelta(state.getDelta()+ ((state.getPreviousQuality() != quality) ? 1 : 0)); + state.setPreviousQuality(quality); + } + if (params.isDoSel()) + last += state.getSelector() << params.getSelectorContextLocation(); + state.setBases(state.getBases()-1); + return last & 0xffff; + } + + public static FQZParam decodeFQZSingleParam(ByteBuffer inBuffer) { + final FQZParam param = new FQZParam(); + param.setContext((inBuffer.get() & 0xFF) | ((inBuffer.get() & 0xFF) << 8)); + param.setParameterFlags(inBuffer.get() & 0xFF); + param.setMaxSymbols(inBuffer.get() & 0xFF); + final int x = inBuffer.get() & 0xFF; + param.setQualityContextBits(x >> 4); + param.setQualityContextShift(x & 0x0F); + final int y = inBuffer.get() & 0xFF; + param.setQualityContextLocation(y >> 4); + param.setSelectorContextLocation(y & 0x0F); + final int z = inBuffer.get() & 0xFF; + param.setPositionContextLocation(z >> 4); + param.setDeltaContextLocation(z & 0x0F); + + // Read Quality Map. Example: "unbin" Illumina Qualities + param.setQualityMap(new int[NUMBER_OF_SYMBOLS]); + if (param.isDoQmap()) { + for (int i = 0; i < param.getMaxSymbols(); i++) { + param.getQualityMap()[i] = inBuffer.get() & 0xFF; + } + } else { + for (int i = 0; i < NUMBER_OF_SYMBOLS; i++) { + param.getQualityMap()[i] = i; + } + } + + // Read tables + param.setQualityContextTable(new int[1024]); + if (param.getQualityContextBits() > 0 && param.isDoQtab()) { + readArray(inBuffer, param.getQualityContextTable(), NUMBER_OF_SYMBOLS); + } else { + for (int i = 0; i < NUMBER_OF_SYMBOLS; i++) { + param.getQualityContextTable()[i] = i; // NOP + } + } + param.setPositionContextTable(new int[1024]); + if (param.isDoPos()) { + readArray(inBuffer, param.getPositionContextTable(), 1024); + } + param.setDeltaContextTable(new int[NUMBER_OF_SYMBOLS]); + if (param.isDoDelta()) { + readArray(inBuffer, param.getDeltaContextTable(), NUMBER_OF_SYMBOLS); + } + return param; + } + + public static void reverseQualities( + final ByteBuffer outBuffer, + final int bufferLength, + final int[] rev, + final List QualityLengths + ){ + int rec = 0; + int idx = 0; + while (idx< bufferLength) { + if (rev[rec]==1) { + int j = 0; + int k = QualityLengths.get(rec) - 1; + while (j < k) { + byte tmp = outBuffer.get(idx + j); + outBuffer.put(idx + j,outBuffer.get(idx + k)); + outBuffer.put(idx + k, tmp); + j++; + k--; + } + } + idx += QualityLengths.get(rec++); + } + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java new file mode 100644 index 0000000000..937b7eed62 --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZGlobalFlags.java @@ -0,0 +1,28 @@ +package htsjdk.samtools.cram.compression.fqzcomp; + +public class FQZGlobalFlags { + public static final int MULTI_PARAM_FLAG_MASK = 0x01; + public static final int SELECTOR_TABLE_FLAG_MASK = 0x02; + public static final int DO_REVERSE_FLAG_MASK = 0x04; + + private int globalFlags; + + public FQZGlobalFlags(final int globalFlags) { + this.globalFlags = globalFlags; + } + + // returns True if more than one parameter block is present + public boolean isMultiParam(){ + return ((globalFlags & MULTI_PARAM_FLAG_MASK)!=0); + } + + // returns True if the parameter selector is mapped through selector table + public boolean hasSelectorTable(){ + return ((globalFlags & SELECTOR_TABLE_FLAG_MASK)!=0); + } + + public boolean doReverse(){ + return ((globalFlags & DO_REVERSE_FLAG_MASK)!=0); + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModel.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModel.java new file mode 100644 index 0000000000..047c387a2a --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZModel.java @@ -0,0 +1,56 @@ +package htsjdk.samtools.cram.compression.fqzcomp; + +import htsjdk.samtools.cram.compression.range.ByteModel; + +public class FQZModel { + + private ByteModel[] quality; // Primary model for quality values + private ByteModel[] length; // Read length models with the context 0-3 being successive byte numbers (little endian order) + private ByteModel reverse; // indicates which strings to reverse + private ByteModel duplicate; // Indicates if this whole string is a duplicate of the last one + private ByteModel selector; // Used if gflags.multi_param or pflags.do_sel are defined. + + public FQZModel() { + } + + public ByteModel[] getQuality() { + + return quality; + } + + public void setQuality(ByteModel[] quality) { + this.quality = quality; + } + + public ByteModel[] getLength() { + return length; + } + + public void setLength(ByteModel[] length) { + this.length = length; + } + + public ByteModel getReverse() { + return reverse; + } + + public void setReverse(ByteModel reverse) { + this.reverse = reverse; + } + + public ByteModel getDuplicate() { + return duplicate; + } + + public void setDuplicate(ByteModel duplicate) { + this.duplicate = duplicate; + } + + public ByteModel getSelector() { + return selector; + } + + public void setSelector(ByteModel selector) { + this.selector = selector; + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java new file mode 100644 index 0000000000..eaf9b9d08c --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZParam.java @@ -0,0 +1,205 @@ +package htsjdk.samtools.cram.compression.fqzcomp; + +public class FQZParam { + private int context; + private int parameterFlags; // Per-parameter block bit-flags + // TODO: rename - follow names from spec. These flags should be set using parameterFlags value + private boolean doDedup; + private int fixedLen; + private boolean doSel; + private boolean doQmap; + private boolean doPos; + private boolean doDelta; + private boolean doQtab; + + private int maxSymbols; // Total number of distinct quality values + private int qualityContextBits; // Total number of bits for Quality context + private int qualityContextShift; // Left bit shift per successive quality in quality context + private int qualityContextLocation; // Bit position of quality context + private int selectorContextLocation; // Bit position of selector context + private int positionContextLocation; // Bit position of position context + private int deltaContextLocation; // Bit position of delta context + private int[] qualityMap; // Map for unbinning quality values. + private int[] qualityContextTable; // Quality context lookup table + private int[] positionContextTable; // Position context lookup table + private int[] deltaContextTable; // Delta context lookup table + + private static final int DEDUP_FLAG_MASK = 0x02; + private static final int FIXED_LEN_FLAG_MASK = 0x04; + private static final int SEL_FLAG_MASK = 0x08; + private static final int QMAP_FLAG_MASK = 0x10; + private static final int PTAB_FLAG_MASK = 0x20; + private static final int DTAB_FLAG_MASK = 0x40; + private static final int QTAB_FLAG_MASK = 0x80; + + public FQZParam() { + } + + public int getContext() { + return context; + } + + public int getParameterFlags() { + return parameterFlags; + } + + public boolean isDoDedup() { + return doDedup; + } + + public int getFixedLen() { + return fixedLen; + } + + public boolean isDoSel() { + return doSel; + } + + public boolean isDoQmap() { + return doQmap; + } + + public boolean isDoPos() { + return doPos; + } + + public boolean isDoDelta() { + return doDelta; + } + + public boolean isDoQtab() { + return doQtab; + } + + public int getMaxSymbols() { + return maxSymbols; + } + + public int getQualityContextBits() { + return qualityContextBits; + } + + public int getQualityContextShift() { + return qualityContextShift; + } + + public int getQualityContextLocation() { + return qualityContextLocation; + } + + public int getSelectorContextLocation() { + return selectorContextLocation; + } + + public int getPositionContextLocation() { + return positionContextLocation; + } + + public int getDeltaContextLocation() { + return deltaContextLocation; + } + + public int[] getQualityMap() { + return qualityMap; + } + + public int[] getQualityContextTable() { + return qualityContextTable; + } + + public int[] getPositionContextTable() { + return positionContextTable; + } + + public int[] getDeltaContextTable() { + return deltaContextTable; + } + + public void setContext(int context) { + this.context = context; + } + + public void setParameterFlags(int parameterFlags) { + this.parameterFlags = parameterFlags; + setDoDedup((parameterFlags & DEDUP_FLAG_MASK) != 0); + setFixedLen(parameterFlags & FIXED_LEN_FLAG_MASK); + setDoSel((parameterFlags & SEL_FLAG_MASK) != 0); + setDoQmap((parameterFlags & QMAP_FLAG_MASK) != 0); + setDoPos((parameterFlags & PTAB_FLAG_MASK) != 0); + setDoDelta((parameterFlags & DTAB_FLAG_MASK) != 0); + setDoQtab((parameterFlags & QTAB_FLAG_MASK) != 0); + } + + public void setDoDedup(boolean doDedup) { + this.doDedup = doDedup; + } + + public void setFixedLen(int fixedLen) { + this.fixedLen = fixedLen; + } + + public void setDoSel(boolean doSel) { + this.doSel = doSel; + } + + public void setDoQmap(boolean doQmap) { + this.doQmap = doQmap; + } + + public void setDoPos(boolean doPos) { + this.doPos = doPos; + } + + public void setDoDelta(boolean doDelta) { + this.doDelta = doDelta; + } + + public void setDoQtab(boolean doQtab) { + this.doQtab = doQtab; + } + + public void setMaxSymbols(int maxSymbols) { + this.maxSymbols = maxSymbols; + } + + public void setQualityContextBits(int qualityContextBits) { + this.qualityContextBits = qualityContextBits; + } + + public void setQualityContextShift(int qualityContextShift) { + this.qualityContextShift = qualityContextShift; + } + + public void setQualityContextLocation(int qualityContextLocation) { + this.qualityContextLocation = qualityContextLocation; + } + + public void setSelectorContextLocation(int selectorContextLocation) { + this.selectorContextLocation = selectorContextLocation; + } + + public void setPositionContextLocation(int positionContextLocation) { + this.positionContextLocation = positionContextLocation; + } + + public void setDeltaContextLocation(int deltaContextLocation) { + this.deltaContextLocation = deltaContextLocation; + } + + public void setQualityMap(int[] qualityMap) { + this.qualityMap = qualityMap; + } + + public void setQualityContextTable(int[] qualityContextTable) { + this.qualityContextTable = qualityContextTable; + } + + public void setPositionContextTable(int[] positionContextTable) { + this.positionContextTable = positionContextTable; + } + + public void setDeltaContextTable(int[] deltaContextTable) { + this.deltaContextTable = deltaContextTable; + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java new file mode 100644 index 0000000000..3a4981029c --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/fqzcomp/FQZState.java @@ -0,0 +1,88 @@ +package htsjdk.samtools.cram.compression.fqzcomp; + +public class FQZState { + private int qualityContext; // Qual-only sub-context + private int previousQuality; // Previous quality value + private int delta; // Running delta (quality vs previousQuality) + private int bases; // Number of bases left in current record + private int selector; // Current parameter selector value (0 if unused) + private int selectorTable; // "stab" tabulated copy of s + private int recordLength; // Length of current string + private boolean isDuplicate; // This string is a duplicate of last + private int recordNumber; // Record number + + public FQZState() { + } + + public int getQualityContext() { + return qualityContext; + } + + public void setQualityContext(int qualityContext) { + this.qualityContext = qualityContext; + } + + public int getPreviousQuality() { + return previousQuality; + } + + public void setPreviousQuality(int previousQuality) { + this.previousQuality = previousQuality; + } + + public int getDelta() { + return delta; + } + + public void setDelta(int delta) { + this.delta = delta; + } + + public int getBases() { + return bases; + } + + public void setBases(int bases) { + this.bases = bases; + } + + public int getSelector() { + return selector; + } + + public void setSelector(int selector) { + this.selector = selector; + } + + public int getSelectorTable() { + return selectorTable; + } + + public void setSelectorTable(int selectorTable) { + this.selectorTable = selectorTable; + } + + public int getRecordLength() { + return recordLength; + } + + public void setRecordLength(int recordLength) { + this.recordLength = recordLength; + } + + public boolean getIsDuplicate() { + return isDuplicate; + } + + public void setIsDuplicate(boolean isDuplicate) { + this.isDuplicate = isDuplicate; + } + + public int getRecordNumber() { + return recordNumber; + } + + public void setRecordNumber(int recordNumber) { + this.recordNumber = recordNumber; + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java new file mode 100644 index 0000000000..61d935aad1 --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java @@ -0,0 +1,164 @@ +package htsjdk.samtools.cram.compression.nametokenisation; + +import htsjdk.samtools.cram.CRAMException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; +import java.util.StringJoiner; + +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_TYPE; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_STRING; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_CHAR; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS0; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DZLEN; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DUP; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA0; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_MATCH; +import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_END; + +public class NameTokenisationDecode { + + public static String uncompress(final ByteBuffer inBuffer) { + return uncompress(inBuffer, "\n"); + } + + public static String uncompress( + final ByteBuffer inBuffer, + final String separator) { + inBuffer.order(ByteOrder.LITTLE_ENDIAN); + final int uncompressedLength = inBuffer.getInt() & 0xFFFFFFFF; //unused variable. Following the spec + final int numNames = inBuffer.getInt() & 0xFFFFFFFF; + final int useArith = inBuffer.get() & 0xFF; + TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames); + List> tokensList = new ArrayList<>(numNames); + for(int i = 0; i < numNames; i++) { + tokensList.add(new ArrayList<>()); + } + StringJoiner decodedNamesJoiner = new StringJoiner(separator); + for (int i = 0; i < numNames; i++) { + decodedNamesJoiner.add(decodeSingleName(tokenStreams, tokensList, i)); + } + String uncompressedNames = decodedNamesJoiner.toString(); + if (uncompressedLength == uncompressedNames.length() + separator.length()){ + return uncompressedNames + separator; + } + return uncompressedNames; + } + + private static String decodeSingleName( + final TokenStreams tokenStreams, + final List> tokensList, + final int currentNameIndex) { + + // The information about whether a name is a duplicate or not + // is obtained from the list of tokens at tokenStreams[0,0] + byte nameType = tokenStreams.getTokenStreamByteBuffer(0,TOKEN_TYPE).get(); + final ByteBuffer distBuffer = tokenStreams.getTokenStreamByteBuffer(0,nameType).order(ByteOrder.LITTLE_ENDIAN); + final int dist = distBuffer.getInt() & 0xFFFFFFFF; + final int prevNameIndex = currentNameIndex - dist; + if (nameType == TOKEN_DUP){ + tokensList.add(currentNameIndex, tokensList.get(prevNameIndex)); + return String.join("", tokensList.get(currentNameIndex)); + } + int tokenPosition = 1; // At position 0, we get nameType information + byte type; + StringBuilder decodedNameBuilder = new StringBuilder(); + do { + type = tokenStreams.getTokenStreamByteBuffer(tokenPosition, TOKEN_TYPE).get(); + String currentToken = ""; + switch(type){ + case TOKEN_CHAR: + final char currentTokenChar = (char) tokenStreams.getTokenStreamByteBuffer(tokenPosition, TOKEN_CHAR).get(); + currentToken = String.valueOf(currentTokenChar); + break; + case TOKEN_STRING: + currentToken = readString(tokenStreams.getTokenStreamByteBuffer(tokenPosition, TOKEN_STRING)); + break; + case TOKEN_DIGITS: + currentToken = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS); + break; + case TOKEN_DIGITS0: + final String digits0Token = getDigitsToken(tokenStreams, tokenPosition, TOKEN_DIGITS0); + final int lenDigits0Token = tokenStreams.getTokenStreamByteBuffer(tokenPosition, TOKEN_DZLEN).get() & 0xFF; + currentToken = leftPadNumber(digits0Token, lenDigits0Token); + break; + case TOKEN_DELTA: + currentToken = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA); + break; + case TOKEN_DELTA0: + final String delta0Token = getDeltaToken(tokenStreams, tokenPosition, tokensList, prevNameIndex, TOKEN_DELTA0); + final int lenDelta0Token = tokensList.get(prevNameIndex).get(tokenPosition-1).length(); + currentToken = leftPadNumber(delta0Token, lenDelta0Token); + break; + case TOKEN_MATCH: + currentToken = tokensList.get(prevNameIndex).get(tokenPosition-1); + break; + default: + break; + } + tokensList.get(currentNameIndex).add(tokenPosition-1,currentToken); + decodedNameBuilder.append(currentToken); + tokenPosition++; + } while (type!= TOKEN_END); + return decodedNameBuilder.toString(); + } + + private static String getDeltaToken( + final TokenStreams tokenStreams, + final int tokenPosition, + final List> tokensList, + final int prevNameIndex, + final byte tokenType) { + if (!(tokenType == TOKEN_DELTA || tokenType == TOKEN_DELTA0)){ + throw new CRAMException(String.format("Invalid tokenType : %s. " + + "tokenType must be either TOKEN_DELTA or TOKEN_DELTA0", tokenType)); + } + int prevToken; + try { + prevToken = Integer.parseInt(tokensList.get(prevNameIndex).get(tokenPosition -1)); + } catch (final NumberFormatException e) { + final String exceptionMessageSubstring = (tokenType == TOKEN_DELTA) ? "DIGITS or DELTA" : "DIGITS0 or DELTA0"; + throw new CRAMException(String.format("The token in the prior name must be of type %s", + exceptionMessageSubstring), e); + } + final int deltaTokenValue = tokenStreams.getTokenStreamByteBuffer(tokenPosition,tokenType).get() & 0xFF; + return Long.toString(prevToken + deltaTokenValue); + } + + private static String getDigitsToken( + final TokenStreams tokenStreams, + final int tokenPosition, + final byte tokenType ) { + if (!(tokenType == TOKEN_DIGITS || tokenType == TOKEN_DIGITS0)){ + throw new CRAMException(String.format("Invalid tokenType : %s. " + + "tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0", tokenType)); + } + final ByteBuffer digitsByteBuffer = tokenStreams.getTokenStreamByteBuffer(tokenPosition, tokenType).order(ByteOrder.LITTLE_ENDIAN); + final long digits = digitsByteBuffer.getInt() & 0xFFFFFFFFL; + return Long.toString(digits); + } + + private static String readString(final ByteBuffer inputBuffer) { + // spec: We fetch one byte at a time from the value byte stream, + // appending to the name buffer until the byte retrieved is zero. + StringBuilder resultStringBuilder = new StringBuilder(); + byte currentByte = inputBuffer.get(); + while (currentByte != 0) { + resultStringBuilder.append((char) currentByte); + currentByte = inputBuffer.get(); + } + return resultStringBuilder.toString(); + } + + private static String leftPadNumber(String value, final int len) { + // return value such that it is at least len bytes long with leading zeros + while (value.length() < len) { + value = "0" + value; + } + return value; + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java new file mode 100644 index 0000000000..4a07f8422f --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java @@ -0,0 +1,287 @@ +package htsjdk.samtools.cram.compression.nametokenisation; + +import htsjdk.samtools.cram.compression.CompressionUtils; +import htsjdk.samtools.cram.compression.nametokenisation.tokens.EncodeToken; +import htsjdk.samtools.cram.compression.range.RangeEncode; +import htsjdk.samtools.cram.compression.range.RangeParams; +import htsjdk.samtools.cram.compression.rans.RANSEncode; +import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode; +import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Params; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class NameTokenisationEncode { + + private int maxToken; + private int maxLength; + + public ByteBuffer compress(final ByteBuffer inBuffer){ + return compress(inBuffer, 0); + } + + public ByteBuffer compress(final ByteBuffer inBuffer, final int useArith){ + maxToken = 0; + maxLength = 0; + ArrayList names = new ArrayList<>(); + int lastPosition = inBuffer.position(); + + // convert buffer to array of names + while(inBuffer.hasRemaining()){ + byte currentByte = inBuffer.get(); + if ((currentByte) == '\n' || inBuffer.position()==inBuffer.limit()){ + int length = inBuffer.position() - lastPosition; + byte[] bytes = new byte[length]; + inBuffer.position(lastPosition); + inBuffer.get(bytes, 0, length); + names.add(new String(bytes, StandardCharsets.UTF_8).trim()); + lastPosition = inBuffer.position(); + } + } + + final int numNames = names.size(); + // guess max size -> str.length*2 + 10000 (from htscodecs javascript code) + ByteBuffer outBuffer = allocateOutputBuffer((inBuffer.limit()*2)+10000); + outBuffer.putInt(inBuffer.limit()); + outBuffer.putInt(numNames); + outBuffer.put((byte)useArith); + + // Instead of List> for tokensList like we did in Decoder, we use List> + // as we also need to store the TOKEN_TYPE, relative value when compared to prev name's token + // along with the token value. + List> tokensList = new ArrayList<>(numNames); + HashMap nameIndexMap = new HashMap<>(); + int[] tokenFrequencies = new int[256]; + for(int nameIndex = 0; nameIndex < numNames; nameIndex++) { + tokeniseName(tokensList, nameIndexMap, tokenFrequencies, names.get(nameIndex), nameIndex); + } + for (int tokenPosition = 0; tokenPosition < maxToken; tokenPosition++) { + List tokenStream = new ArrayList(TokenStreams.TOTAL_TOKEN_TYPES); + for (int i = 0; i < TokenStreams.TOTAL_TOKEN_TYPES; i++) { + tokenStream.add(ByteBuffer.allocate(numNames* maxLength).order(ByteOrder.LITTLE_ENDIAN)); + } + fillByteStreams(tokenStream,tokensList,tokenPosition,numNames); + serializeByteStreams(tokenStream,useArith,outBuffer); + } + + // sets limit to current position and position to '0' + outBuffer.flip(); + return outBuffer; + } + + private void tokeniseName(final List> tokensList, + HashMap nameIndexMap, + int[] tokenFrequencies, + final String name, + final int currentNameIndex) { + int currMaxLength = 0; + + // always compare against last name only + final int prevNameIndex = currentNameIndex - 1; + tokensList.add(new ArrayList<>()); + if (nameIndexMap.containsKey(name)) { + // TODO: Add Test to cover this code + tokensList.get(currentNameIndex).add(new EncodeToken(String.valueOf(currentNameIndex - nameIndexMap.get(name)), String.valueOf(currentNameIndex - nameIndexMap.get(name)),TokenStreams.TOKEN_DUP)); + } else { + tokensList.get(currentNameIndex).add(new EncodeToken(String.valueOf(currentNameIndex == 0 ? 0 : 1),String.valueOf(currentNameIndex == 0 ? 0 : 1),TokenStreams.TOKEN_DIFF)); + } + // Get the list of tokens `tok` for the current name + nameIndexMap.put(name, currentNameIndex); + String regex = "([a-zA-Z0-9]{1,9})|([^a-zA-Z0-9]+)"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(name); + List tok = new ArrayList<>(); + while (matcher.find()) { + tok.add(matcher.group()); + } + for (int i = 0; i < tok.size(); i++) { + // In the list of tokens, all the tokens are offset by 1 + // because at position "0", we have a token that provides info if the name is a DIFF or DUP + // token 0 = DIFF vs DUP + int tokenIndex = i + 1; + byte type = TokenStreams.TOKEN_STRING; + String str = tok.get(i); // absolute value of the token + String val = tok.get(i); // relative value of the token (comparing to prevname's token at the same token position) + if (tok.get(i).matches("^0+[0-9]*$")) { + type = TokenStreams.TOKEN_DIGITS0; + } else if (tok.get(i).matches("^[0-9]+$")) { + type = TokenStreams.TOKEN_DIGITS; + } else if (tok.get(i).length() == 1) { + type = TokenStreams.TOKEN_CHAR; + } + + // compare the current token with token from the previous name at the current token's index + // if there exists a previous name and a token at the corresponding index of the previous name + if (prevNameIndex >=0 && tokensList.get(prevNameIndex).size() > tokenIndex) { + EncodeToken prevToken = tokensList.get(prevNameIndex).get(tokenIndex); + if (prevToken.getActualTokenValue().equals(tok.get(i))) { + type = TokenStreams.TOKEN_MATCH; + val = ""; + } else if (type==TokenStreams.TOKEN_DIGITS + && (prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA)) { + int v = Integer.parseInt(val); + int s = Integer.parseInt(prevToken.getActualTokenValue()); + int d = v - s; + tokenFrequencies[tokenIndex]++; + if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > currentNameIndex / 2) { + type = TokenStreams.TOKEN_DELTA; + val = String.valueOf(d); + } + } else if (type==TokenStreams.TOKEN_DIGITS0 && prevToken.getActualTokenValue().length() == val.length() + && (prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS0 || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA0)) { + int d = Integer.parseInt(val) - Integer.parseInt(prevToken.getActualTokenValue()); + tokenFrequencies[tokenIndex]++; + if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > currentNameIndex / 2) { + type = TokenStreams.TOKEN_DELTA0; + val = String.valueOf(d); + } + } + } + tokensList.get(currentNameIndex).add(new EncodeToken(str, val, type)); + + if (currMaxLength < val.length() + 3) { + // TODO: check this? Why isn't unint32 case handled? + // +3 for integers; 5 -> (Uint32)5 (from htscodecs javascript code) + currMaxLength = val.length() + 3; + } + } + + tokensList.get(currentNameIndex).add(new EncodeToken("","",TokenStreams.TOKEN_END)); + final int currMaxToken = tokensList.get(currentNameIndex).size(); + if (maxToken < currMaxToken) + maxToken = currMaxToken; + if (maxLength < currMaxLength) + maxLength = currMaxLength; + } + + public void fillByteStreams( + final List tokenStream, + final List> tokensList, + final int tokenPosition, + final int numNames) { + + // Fill tokenStreams object using tokensList + for (int nameIndex = 0; nameIndex < numNames; nameIndex++) { + if (tokenPosition > 0 && tokensList.get(nameIndex).get(0).getTokenType() == TokenStreams.TOKEN_DUP) { + continue; + } + if (tokensList.get(nameIndex).size() <= tokenPosition) { + continue; + } + EncodeToken encodeToken = tokensList.get(nameIndex).get(tokenPosition); + byte type = encodeToken.getTokenType(); + tokenStream.get(TokenStreams.TOKEN_TYPE).put(type); + switch (type) { + case TokenStreams.TOKEN_DIFF: + tokenStream.get(TokenStreams.TOKEN_DIFF).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + break; + + case TokenStreams.TOKEN_DUP: + tokenStream.get(TokenStreams.TOKEN_DUP).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + break; + + case TokenStreams.TOKEN_STRING: + writeString(tokenStream.get(TokenStreams.TOKEN_STRING),encodeToken.getRelativeTokenValue()); + break; + + case TokenStreams.TOKEN_CHAR: + tokenStream.get(TokenStreams.TOKEN_CHAR).put(encodeToken.getRelativeTokenValue().getBytes()[0]); + break; + + case TokenStreams.TOKEN_DIGITS: + tokenStream.get(TokenStreams.TOKEN_DIGITS).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + break; + + case TokenStreams.TOKEN_DIGITS0: + tokenStream.get(TokenStreams.TOKEN_DIGITS0).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + tokenStream.get(TokenStreams.TOKEN_DZLEN).put((byte) encodeToken.getRelativeTokenValue().length()); + break; + + case TokenStreams.TOKEN_DELTA: + tokenStream.get(TokenStreams.TOKEN_DELTA).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue())); + break; + + case TokenStreams.TOKEN_DELTA0: + tokenStream.get(TokenStreams.TOKEN_DELTA0).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue())); + break; + } + } + } + + private static void writeString(final ByteBuffer tokenStreamBuffer, final String val) { + byte[] bytes = val.getBytes(); + tokenStreamBuffer.put(bytes); + tokenStreamBuffer.put((byte) 0); + } + + public static ByteBuffer tryCompress(final ByteBuffer src, final int useArith) { + // compress with different formatFlags + // and return the compressed output ByteBuffer with the least number of bytes + int bestcompressedByteLength = 1 << 30; + ByteBuffer compressedByteBuffer = null; + int[] formatFlagsList = {0, 1, 64, 65, 128, 129, 193+8}; + for (int formatFlags : formatFlagsList) { + if ((formatFlags & 1) != 0 && src.remaining() < 100) + continue; + + if ((formatFlags & 8) != 0 && (src.remaining() % 4) != 0) + continue; + + ByteBuffer tmpByteBuffer = null; + try { + if (useArith!=0) { + // Encode using Range + RangeEncode rangeEncode = new RangeEncode(); + src.rewind(); + tmpByteBuffer = rangeEncode.compress(src,new RangeParams(formatFlags)); + + } else { + // Encode using RANS + RANSEncode ransEncode = new RANSNx16Encode(); + src.rewind(); + tmpByteBuffer = ransEncode.compress(src, new RANSNx16Params(formatFlags)); + } + } catch (final Exception ignored) {} + if (tmpByteBuffer != null && bestcompressedByteLength > tmpByteBuffer.remaining()) { + bestcompressedByteLength = tmpByteBuffer.remaining(); + compressedByteBuffer = tmpByteBuffer; + } + } + return compressedByteBuffer; + } + + protected void serializeByteStreams( + final List tokenStream, + final int useArith, + final ByteBuffer outBuffer) { + + // Compress and serialise tokenStreams + for (int tokenType = 0; tokenType <= TokenStreams.TOKEN_END; tokenType++) { + if (tokenStream.get(tokenType).remaining() > 0) { + outBuffer.put((byte) (tokenType + ((tokenType == 0) ? 128 : 0))); + ByteBuffer tempOutByteBuffer = tryCompress(tokenStream.get(tokenType), useArith); + CompressionUtils.writeUint7(tempOutByteBuffer.limit(),outBuffer); + outBuffer.put(tempOutByteBuffer); + } + } + } + + protected ByteBuffer allocateOutputBuffer(final int inSize) { + + // same as the allocateOutputBuffer in RANS4x8Encode and RANSNx16Encode + // TODO: de-duplicate + final int compressedSize = (int) (1.05 * inSize + 257 * 257 * 3 + 9); + final ByteBuffer outputBuffer = ByteBuffer.allocate(compressedSize); + if (outputBuffer.remaining() < compressedSize) { + throw new RuntimeException("Failed to allocate sufficient buffer size for Range coder."); + } + outputBuffer.order(ByteOrder.LITTLE_ENDIAN); + return outputBuffer; + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java new file mode 100644 index 0000000000..deed459022 --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java @@ -0,0 +1,125 @@ +package htsjdk.samtools.cram.compression.nametokenisation; + +import htsjdk.samtools.cram.CRAMException; +import htsjdk.samtools.cram.compression.CompressionUtils; +import htsjdk.samtools.cram.compression.range.RangeDecode; +import htsjdk.samtools.cram.compression.rans.RANSDecode; +import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +public class TokenStreams { + + public static final byte TOKEN_TYPE = 0x00; + public static final byte TOKEN_STRING = 0x01; + public static final byte TOKEN_CHAR = 0x02; + public static final byte TOKEN_DIGITS0 = 0x03; + public static final byte TOKEN_DZLEN = 0x04; + public static final byte TOKEN_DUP = 0x05; + public static final byte TOKEN_DIFF = 0x06; + public static final byte TOKEN_DIGITS = 0x07; + public static final byte TOKEN_DELTA = 0x08; + public static final byte TOKEN_DELTA0 = 0x09; + public static final byte TOKEN_MATCH = 0x0A; + public static final byte TOKEN_END = 0x0C; + public static final int TOTAL_TOKEN_TYPES = 13; + + private static final int NEW_TOKEN_FLAG_MASK = 0x80; + private static final int DUP_TOKEN_FLAG_MASK = 0x40; + private static final int TYPE_TOKEN_FLAG_MASK = 0x3F; + + private final List> tokenStreams; + + public TokenStreams() { + tokenStreams = new ArrayList<>(TOTAL_TOKEN_TYPES); + for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) { + tokenStreams.add(new ArrayList<>()); + } + } + + public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final int numNames) { + // The outer index corresponds to type of the token + // and the inner index corresponds to the position of the token in a name (starting at index 1) + // Each element in this list of lists is a Token (ie, a ByteBuffer) + + // TokenStreams[type = TOKEN_TYPE(0x00), pos = 0] contains a ByteBuffer of length = number of names + // This ByteBuffer helps determine if each of the names is a TOKEN_DUP or TOKEN_DIFF + // when compared with the previous name + + // TokenStreams[type = TOKEN_TYPE(0x00), pos = all except 0] + // contains a ByteBuffer of length = number of names + // This ByteBuffer helps determine the type of each of the token at the specicfied pos + + this(); + int tokenPosition = -1; + while (inputByteBuffer.hasRemaining()) { + final byte tokenTypeFlags = inputByteBuffer.get(); + final boolean isNewToken = ((tokenTypeFlags & NEW_TOKEN_FLAG_MASK) != 0); + final boolean isDupToken = ((tokenTypeFlags & DUP_TOKEN_FLAG_MASK) != 0); + final int tokenType = (tokenTypeFlags & TYPE_TOKEN_FLAG_MASK); + if (tokenType < 0 || tokenType > TOKEN_END) { + throw new CRAMException("Invalid Token tokenType: " + tokenType); + } + if (isNewToken) { + tokenPosition++; + if (tokenPosition > 0) { + // If newToken and not the first newToken + // Ensure that the size of tokenStream for each type of token = tokenPosition + // by adding an empty ByteBuffer if needed + for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) { + final List currTokenStream = tokenStreams.get(i); + if (currTokenStream.size() < tokenPosition) { + currTokenStream.add(ByteBuffer.allocate(0)); + } + if (currTokenStream.size() < tokenPosition) { + throw new CRAMException("TokenStream is missing Token(s) at Token Type: " + i); + } + } + } + } + if ((isNewToken) && (tokenType != TOKEN_TYPE)) { + + // Spec: if we have a byte stream B5,DIGIT S but no B5,T Y P E + // then we assume the contents of B5,T Y P E consist of one DIGITS tokenType + // followed by as many MATCH types as are needed. + final ByteBuffer typeDataByteBuffer = ByteBuffer.allocate(numNames); + for (int i = 0; i < numNames; i++) { + typeDataByteBuffer.put((byte) TOKEN_MATCH); + } + typeDataByteBuffer.rewind(); + typeDataByteBuffer.put(0, (byte) tokenType); + tokenStreams.get(0).add(typeDataByteBuffer); + } + if (isDupToken) { + final int dupPosition = inputByteBuffer.get() & 0xFF; + final int dupType = inputByteBuffer.get() & 0xFF; + final ByteBuffer dupTokenStream = tokenStreams.get(dupType).get(dupPosition).duplicate(); + tokenStreams.get(tokenType).add(tokenPosition,dupTokenStream); + } else { + final int clen = CompressionUtils.readUint7(inputByteBuffer); + final byte[] dataBytes = new byte[clen]; + inputByteBuffer.get(dataBytes, 0, clen); // offset in the dst byte array + final ByteBuffer uncompressedDataByteBuffer; + if (useArith != 0) { + RangeDecode rangeDecode = new RangeDecode(); + uncompressedDataByteBuffer = rangeDecode.uncompress(ByteBuffer.wrap(dataBytes)); + + } else { + RANSDecode ransdecode = new RANSNx16Decode(); + uncompressedDataByteBuffer = ransdecode.uncompress(ByteBuffer.wrap(dataBytes)); + } + this.getTokenStreamByType(tokenType).add(tokenPosition,uncompressedDataByteBuffer); + } + } + } + + public List getTokenStreamByType(final int tokenType) { + return tokenStreams.get(tokenType); + } + + public ByteBuffer getTokenStreamByteBuffer(final int tokenPosition, final int tokenType) { + return tokenStreams.get(tokenType).get(tokenPosition); + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java new file mode 100644 index 0000000000..4e7cb0288a --- /dev/null +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/tokens/EncodeToken.java @@ -0,0 +1,38 @@ +package htsjdk.samtools.cram.compression.nametokenisation.tokens; + +public class EncodeToken { + + private String actualTokenValue; + private String relativeTokenValue; + private byte tokenType; + + public EncodeToken(String str, String val, byte type) { + this.actualTokenValue = str; + this.relativeTokenValue = val; + this.tokenType = type; + } + + public String getActualTokenValue() { + return actualTokenValue; + } + + public void setActualTokenValue(String actualTokenValue) { + this.actualTokenValue = actualTokenValue; + } + + public String getRelativeTokenValue() { + return relativeTokenValue; + } + + public void setRelativeTokenValue(String relativeTokenValue) { + this.relativeTokenValue = relativeTokenValue; + } + + public byte getTokenType() { + return tokenType; + } + + public void setTokenType(byte tokenType) { + this.tokenType = tokenType; + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java b/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java index a7d7b21828..f0d7d82911 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java +++ b/src/main/java/htsjdk/samtools/cram/compression/range/RangeCoder.java @@ -11,7 +11,7 @@ public class RangeCoder { private boolean carry; private int cache; - protected RangeCoder() { + public RangeCoder() { // Spec: RangeEncodeStart this.low = 0; this.range = Constants.MAX_RANGE; // 4 bytes of all 1's @@ -21,7 +21,7 @@ protected RangeCoder() { this.cache = 0; } - protected void rangeDecodeStart(final ByteBuffer inBuffer){ + public void rangeDecodeStart(final ByteBuffer inBuffer){ for (int i = 0; i < 5; i++){ code = (code << 8) + (inBuffer.get() & 0xFF); } diff --git a/src/test/java/htsjdk/samtools/cram/FQZCompInteropTest.java b/src/test/java/htsjdk/samtools/cram/FQZCompInteropTest.java new file mode 100644 index 0000000000..ab9ad4a517 --- /dev/null +++ b/src/test/java/htsjdk/samtools/cram/FQZCompInteropTest.java @@ -0,0 +1,80 @@ +package htsjdk.samtools.cram; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.cram.compression.CompressionUtils; +import htsjdk.samtools.cram.compression.fqzcomp.FQZCompDecode; +import org.apache.commons.compress.utils.IOUtils; +import org.testng.Assert; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class FQZCompInteropTest extends HtsjdkTest { + + public static final String COMPRESSED_FQZCOMP_DIR = "fqzcomp"; + + // uses the available compressed interop test files + @DataProvider(name = "decodeOnlyTestCases") + public Object[][] getDecodeOnlyTestCases() throws IOException { + + // params: + // compressed testfile path, uncompressed testfile path, + // FQZComp decoder + final List testCases = new ArrayList<>(); + for (Path path : CRAMInteropTestUtils.getInteropCompressedFilePaths(COMPRESSED_FQZCOMP_DIR)) { + Object[] objects = new Object[]{ + path, + CRAMInteropTestUtils.getUnCompressedFilePath(path), + new FQZCompDecode() + }; + testCases.add(objects); + } + return testCases.toArray(new Object[][]{}); + } + + @Test(description = "Test if CRAM Interop Test Data is available") + public void testHtsCodecsCorpusIsAvailable() { + if (!CRAMInteropTestUtils.isInteropTestDataAvailable()) { + throw new SkipException(String.format("CRAM Interop Test Data is not available at %s", + CRAMInteropTestUtils.INTEROP_TEST_FILES_PATH)); + } + } + + @Test ( + dependsOnMethods = "testHtsCodecsCorpusIsAvailable", + dataProvider = "decodeOnlyTestCases", + description = "Uncompress the existing compressed file using htsjdk FQZComp and compare it with the original file.") + public void testDecodeOnly( + final Path compressedFilePath, + final Path uncompressedInteropPath, + final FQZCompDecode fqzcompDecode) throws IOException { + try (final InputStream uncompressedInteropStream = Files.newInputStream(uncompressedInteropPath); + final InputStream preCompressedInteropStream = Files.newInputStream(compressedFilePath) + ) { + // preprocess the uncompressed data (to match what the htscodecs-library test harness does) + // by filtering out the embedded newlines, and then round trip through FQZComp codec + // and compare the results + final ByteBuffer uncompressedInteropBytes = CompressionUtils.wrap(CRAMInteropTestUtils.filterEmbeddedNewlines(IOUtils.toByteArray(uncompressedInteropStream))); + final ByteBuffer preCompressedInteropBytes = CompressionUtils.wrap(IOUtils.toByteArray(preCompressedInteropStream)); + + // Use htsjdk to uncompress the precompressed file from htscodecs repo + final ByteBuffer uncompressedHtsjdkBytes = fqzcompDecode.uncompress(preCompressedInteropBytes); + + // Compare the htsjdk uncompressed bytes with the original input file from htscodecs repo + Assert.assertEquals(uncompressedHtsjdkBytes, uncompressedInteropBytes); + } catch (final NoSuchFileException ex){ + throw new SkipException("Skipping testDecodeOnly as either input file " + + "or precompressed file is missing.", ex); + } + } + +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java new file mode 100644 index 0000000000..8a4aa0e22b --- /dev/null +++ b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java @@ -0,0 +1,134 @@ +package htsjdk.samtools.cram; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode; +import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationEncode; +import org.apache.commons.compress.utils.IOUtils; +import org.testng.Assert; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class NameTokenizationInteropTest extends HtsjdkTest { + public static final String COMPRESSED_TOK_DIR = "tok3"; + + @DataProvider(name = "allNameTokenizationFiles") + public Object[][] getAllNameTokenizationCodecsForRoundTrip() throws IOException { + + // params: + // compressed testfile path, uncompressed testfile path, NameTokenization encoder, NameTokenization decoder + final List testCases = new ArrayList<>(); + for (Path path : getInteropNameTokenizationCompressedFiles()) { + Object[] objects = new Object[]{ + path, + getNameTokenizationUnCompressedFilePath(path), + new NameTokenisationEncode(), + new NameTokenisationDecode() + }; + testCases.add(objects); + } + return testCases.toArray(new Object[][]{}); + } + + @Test(description = "Test if CRAM Interop Test Data is available") + public void testGetHTSCodecsCorpus() { + if (!CRAMInteropTestUtils.isInteropTestDataAvailable()) { + throw new SkipException(String.format("CRAM Interop Test Data is not available at %s", + CRAMInteropTestUtils.INTEROP_TEST_FILES_PATH)); + } + } + + @Test ( + dependsOnMethods = "testGetHTSCodecsCorpus", + dataProvider = "allNameTokenizationFiles", + description = "Roundtrip using htsjdk NameTokenization Codec. Compare the output with the original file" ) + public void testRangeRoundTrip( + final Path precompressedFilePath, + final Path uncompressedFilePath, + final NameTokenisationEncode nameTokenisationEncode, + final NameTokenisationDecode nameTokenisationDecode) throws IOException { + try(final InputStream preCompressedInteropStream = Files.newInputStream(precompressedFilePath); + final InputStream unCompressedInteropStream = Files.newInputStream(uncompressedFilePath)){ + final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream)); + final ByteBuffer unCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream)); + ByteBuffer compressedHtsjdkBytes = nameTokenisationEncode.compress(unCompressedInteropBytes); + String decompressedHtsjdkString = nameTokenisationDecode.uncompress(compressedHtsjdkBytes); + ByteBuffer decompressedHtsjdkBytes = StandardCharsets.UTF_8.encode(decompressedHtsjdkString); + unCompressedInteropBytes.rewind(); + Assert.assertEquals(decompressedHtsjdkBytes, unCompressedInteropBytes); + } catch (final NoSuchFileException ex){ + throw new SkipException("Skipping testRangeRoundTrip as either the input precompressed file " + + "or the uncompressed file is missing.", ex); + } + } + + + + @Test ( + dependsOnMethods = "testGetHTSCodecsCorpus", + dataProvider = "allNameTokenizationFiles", + description = "Compress the original file using htsjdk NameTokenization Codec and compare it with the existing compressed file. " + + "Uncompress the existing compressed file using htsjdk NameTokenization Codec and compare it with the original file.") + public void testtNameTokenizationPreCompressed( + final Path compressedFilePath, + final Path uncompressedFilePath, + final NameTokenisationEncode unsusednameTokenisationEncode, + final NameTokenisationDecode nameTokenisationDecode) throws IOException { + try(final InputStream preCompressedInteropStream = Files.newInputStream(compressedFilePath); + final InputStream unCompressedInteropStream = Files.newInputStream(uncompressedFilePath)){ + final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream)); + final ByteBuffer unCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(unCompressedInteropStream)); + + // Use htsjdk to uncompress the precompressed file from htscodecs repo + final String uncompressedHtsjdkString = nameTokenisationDecode.uncompress(preCompressedInteropBytes); + ByteBuffer uncompressedHtsjdkBytes = StandardCharsets.UTF_8.encode(uncompressedHtsjdkString); + + // Compare the htsjdk uncompressed bytes with the original input file from htscodecs repo + Assert.assertEquals(uncompressedHtsjdkBytes, unCompressedInteropBytes); + } catch (final NoSuchFileException ex){ + throw new SkipException("Skipping testNameTokenizationPrecompressed as either input file " + + "or precompressed file is missing.", ex); + } + + } + + // return a list of all NameTokenization encoded test data files in the htscodecs/tests/names/tok3 directory + private List getInteropNameTokenizationCompressedFiles() throws IOException { + final List paths = new ArrayList<>(); + Files.newDirectoryStream( + CRAMInteropTestUtils.getInteropTestDataLocation().resolve("names/"+COMPRESSED_TOK_DIR), + path -> Files.isRegularFile(path)) + .forEach(path -> paths.add(path)); + return paths; + } + + // Given a compressed test file path, return the corresponding uncompressed file path + public static final Path getNameTokenizationUnCompressedFilePath(final Path compressedInteropPath) { + String uncompressedFileName = getUncompressedFileName(compressedInteropPath.getFileName().toString()); + // Example compressedInteropPath: ../names/tok3/01.names.1 => unCompressedFilePath: ../names/01.names + return compressedInteropPath.getParent().getParent().resolve(uncompressedFileName); + } + + public static final String getUncompressedFileName(final String compressedFileName) { + // Returns original filename from compressed file name + int lastDotIndex = compressedFileName.lastIndexOf("."); + if (lastDotIndex >= 0) { + return compressedFileName.substring(0, lastDotIndex); + } else { + throw new CRAMException("The format of the compressed File Name is not as expected. " + + "The name of the compressed file should contain a period followed by a number that" + + "indicates type of compression. Actual compressed file name = "+ compressedFileName); + } + } + +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java b/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java new file mode 100644 index 0000000000..29e487f64a --- /dev/null +++ b/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java @@ -0,0 +1,100 @@ +package htsjdk.samtools.cram.compression.nametokenisation; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class NameTokenisationTest extends HtsjdkTest { + + private static class TestDataEnvelope { + public final byte[] testArray; + public TestDataEnvelope(final byte[] testdata) { + this.testArray = testdata; + } + public String toString() { + return String.format("Array of size %d", testArray.length); + } + } + + @DataProvider(name="nameTokenisation") + public Object[][] getNameTokenisationTestData() { + + List readNamesList = new ArrayList<>(); + readNamesList.add(""); + + // a subset of read names from + // src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.first.8000.bam + readNamesList.add("20FUKAAXX100202:6:27:4968:125377\n" + + "20FUKAAXX100202:6:27:4986:125375\n" + + "20FUKAAXX100202:5:62:8987:1929\n" + + "20GAVAAXX100126:1:28:4295:139802\n" + + "20FUKAAXX100202:4:23:8516:117251\n" + + "20FUKAAXX100202:6:23:6442:37469\n" + + "20FUKAAXX100202:8:24:10477:24196\n" + + "20GAVAAXX100126:8:63:5797:158250\n" + + "20FUKAAXX100202:1:45:12798:104365\n" + + "20GAVAAXX100126:3:23:6419:199245\n" + + "20FUKAAXX100202:8:48:6663:137967\n" + + "20FUKAAXX100202:6:68:17726:162601"); + + // a subset of read names from + // src/test/resources/htsjdk/samtools/longreads/NA12878.m64020_190210_035026.chr21.5011316.5411316.unmapped.bam + readNamesList.add("m64020_190210_035026/44368402/ccs\n"); + readNamesList.add("m64020_190210_035026/44368402/ccs"); + readNamesList.add("m64020_190210_035026/44368402/ccs\n" + + "m64020_190210_035026/124127126/ccs\n" + + "m64020_190210_035026/4981311/ccs\n" + + "m64020_190210_035026/80022195/ccs\n" + + "m64020_190210_035026/17762104/ccs\n" + + "m64020_190210_035026/62981096/ccs\n" + + "m64020_190210_035026/86968803/ccs\n" + + "m64020_190210_035026/46400955/ccs\n" + + "m64020_190210_035026/137561592/ccs\n" + + "m64020_190210_035026/52233471/ccs\n" + + "m64020_190210_035026/97127189/ccs\n" + + "m64020_190210_035026/115278035/ccs\n" + + "m64020_190210_035026/155256324/ccs\n" + + "m64020_190210_035026/163644151/ccs\n" + + "m64020_190210_035026/162728365/ccs\n" + + "m64020_190210_035026/160238116/ccs\n" + + "m64020_190210_035026/147719983/ccs\n" + + "m64020_190210_035026/60883331/ccs\n" + + "m64020_190210_035026/1116165/ccs\n" + + "m64020_190210_035026/75893199/ccs"); + + // source: https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups + readNamesList.add( + "H0164ALXX140820:2:1101:10003:23460\n" + + "H0164ALXX140820:2:1101:15118:25288"); + + final List testCases = new ArrayList<>(); + for (String readName : readNamesList) { + Object[] objects = new Object[]{ + new NameTokenisationEncode(), + new NameTokenisationDecode(), + new TestDataEnvelope(readName.getBytes())}; + testCases.add(objects); + } + return testCases.toArray(new Object[][]{}); + } + + @Test(dataProvider = "nameTokenisation") + public void testRoundTrip( + final NameTokenisationEncode nameTokenisationEncode, + final NameTokenisationDecode nameTokenisationDecode, + final TestDataEnvelope td) { + ByteBuffer uncompressedBuffer = ByteBuffer.wrap(td.testArray); + ByteBuffer compressedBuffer = nameTokenisationEncode.compress(uncompressedBuffer, 0); + String decompressedNames = nameTokenisationDecode.uncompress(compressedBuffer); + ByteBuffer decompressedNamesBuffer = StandardCharsets.UTF_8.encode(decompressedNames); + uncompressedBuffer.rewind(); + Assert.assertEquals(decompressedNamesBuffer, uncompressedBuffer); + } + +} \ No newline at end of file