diff --git a/.dockstore.yml b/.dockstore.yml
index ce89e992252..62fca1abc19 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -183,6 +183,7 @@ workflows:
- master
- ah_var_store
- rsa_vs_1245
+ - hatcher_testing_parquet
tags:
- /.*/
- name: GvsPrepareRangesCallset
diff --git a/Dockerfile b/Dockerfile
index a513863128d..22f319086ad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.1.0
+ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0
# stage 1 for constructing the GATK zip
FROM ${BASE_DOCKER} AS gradleBuild
@@ -93,8 +93,6 @@ RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
conda clean -afy && \
- find /opt/miniconda/ -follow -type f -name '*.a' -delete && \
- find /opt/miniconda/ -follow -type f -name '*.pyc' -delete && \
rm -rf /root/.cache/pip
CMD ["bash", "--init-file", "/gatk/gatkenv.rc"]
diff --git a/build.gradle b/build.gradle
index f6600e81595..cec759e23c5 100644
--- a/build.gradle
+++ b/build.gradle
@@ -3,7 +3,7 @@
buildscript {
repositories {
mavenCentral()
- }
+ }
}
plugins {
@@ -16,6 +16,7 @@ plugins {
id "com.github.johnrengelman.shadow" version "8.1.1" //used to build the shadow and sparkJars
id "com.github.ben-manes.versions" version "0.12.0" //used for identifying dependencies that need updating
id 'com.palantir.git-version' version '0.5.1' //version helper
+ id 'org.sonatype.gradle.plugins.scan' version '2.6.1' // scans for security vulnerabilities in our dependencies
}
@@ -56,20 +57,20 @@ repositories {
mavenLocal()
}
-final htsjdkVersion = System.getProperty('htsjdk.version','3.0.5')
-final picardVersion = System.getProperty('picard.version','3.1.0')
+final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0')
+final picardVersion = System.getProperty('picard.version','3.1.1')
final barclayVersion = System.getProperty('barclay.version','5.0.0')
-final sparkVersion = System.getProperty('spark.version', '3.3.1')
-final hadoopVersion = System.getProperty('hadoop.version', '3.3.1')
-final disqVersion = System.getProperty('disq.version','0.3.6')
-final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.0')
-final bigQueryVersion = System.getProperty('bigQuery.version', '2.31.0')
-final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.41.0')
-final guavaVersion = System.getProperty('guava.version', '32.1.2-jre')
+final sparkVersion = System.getProperty('spark.version', '3.5.0')
+final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
+final disqVersion = System.getProperty('disq.version','0.3.8')
+final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.1')
+final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
+final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
+final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
final log4j2Version = System.getProperty('log4j2Version', '2.17.1')
final testNGVersion = '7.0.0'
-final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.0'
+final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.8'
final baseJarName = 'gatk'
final secondaryBaseJarName = 'hellbender'
@@ -109,7 +110,7 @@ def resolveLargeResourceStubFiles(largeResourcesFolder, buildPrerequisitesMessag
} catch (IOException e) {
throw new GradleException(
"An IOException occurred while attempting to execute the command $gitLFSExecCommand."
- + " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage", e)
+ + " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage", e)
}
}
@@ -187,8 +188,10 @@ configurations.all {
}
tasks.withType(JavaCompile) {
- options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose']
- options.encoding = 'UTF-8'
+// Temporarily drop -Werror so the build does not fail on deprecation warnings
+// options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose']
+ options.compilerArgs = ['-proc:none', '-Xlint:all', '-Xdiags:verbose']
+ options.encoding = 'UTF-8'
}
sourceSets {
@@ -267,12 +270,12 @@ dependencies {
// are routed to log4j
implementation 'org.apache.logging.log4j:log4j-jcl:' + log4j2Version
- implementation 'org.apache.commons:commons-lang3:3.5'
- implementation 'org.apache.commons:commons-math3:3.5'
+ implementation 'org.apache.commons:commons-lang3:3.14.0'
+ implementation 'org.apache.commons:commons-math3:3.6.1'
implementation 'org.hipparchus:hipparchus-stat:2.0'
- implementation 'org.apache.commons:commons-collections4:4.1'
- implementation 'org.apache.commons:commons-vfs2:2.0'
- implementation 'org.apache.commons:commons-configuration2:2.4'
+ implementation 'org.apache.commons:commons-collections4:4.4'
+ implementation 'org.apache.commons:commons-vfs2:2.9.0'
+ implementation 'org.apache.commons:commons-configuration2:2.9.0'
constraints {
implementation('org.apache.commons:commons-text') {
version {
@@ -287,7 +290,7 @@ dependencies {
implementation 'commons-io:commons-io:2.5'
implementation 'org.reflections:reflections:0.9.10'
- implementation 'it.unimi.dsi:fastutil:7.0.6'
+ implementation 'it.unimi.dsi:fastutil:7.0.13'
implementation 'org.broadinstitute:hdf5-java-bindings:1.1.0-hdf5_2.11.0'
implementation 'org.broadinstitute:gatk-native-bindings:1.0.0'
@@ -297,8 +300,8 @@ dependencies {
exclude group: 'org.apache.commons'
}
- //there is no mllib_2.12.15:3.3.0, so stay use 2.12:3.3.0
- implementation ('org.apache.spark:spark-mllib_2.12:3.3.0') {
+ // TODO: migrate to mllib_2.12.15?
+ implementation ('org.apache.spark:spark-mllib_2.12:' + sparkVersion) {
// JUL is used by Google Dataflow as the backend logger, so exclude jul-to-slf4j to avoid a loop
exclude module: 'jul-to-slf4j'
exclude module: 'javax.servlet'
@@ -344,15 +347,22 @@ dependencies {
implementation 'org.broadinstitute:gatk-bwamem-jni:1.0.4'
implementation 'org.broadinstitute:gatk-fermilite-jni:1.2.0'
- implementation 'org.broadinstitute:http-nio:0.1.0-rc1'
+ implementation 'org.broadinstitute:http-nio:1.1.0'
// Required for COSMIC Funcotator data source:
- implementation 'org.xerial:sqlite-jdbc:3.36.0.3'
+ implementation 'org.xerial:sqlite-jdbc:3.44.1.0'
// natural sort
implementation('net.grey-panther:natural-comparator:1.1')
implementation('com.fasterxml.jackson.module:jackson-module-scala_2.12:2.9.8')
+ // parquet writing
+ implementation('org.apache.parquet:parquet-common:1.13.1')
+ implementation('org.apache.parquet:parquet-encoding:1.13.1')
+ implementation('org.apache.parquet:parquet-column:1.13.1')
+ implementation('org.apache.parquet:parquet-hadoop:1.13.1')
+ implementation 'org.apache.parquet:parquet-avro:1.13.1'
+
testUtilsImplementation sourceSets.main.output
testUtilsImplementation 'org.testng:testng:' + testNGVersion
testUtilsImplementation 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion
@@ -426,20 +436,20 @@ final runtimeAddOpens = [
'java.base/jdk.internal.module=ALL-UNNAMED',
'java.base/java.lang.module=ALL-UNNAMED',
'java.security.jgss/sun.security.krb5=ALL-UNNAMED'
- ]
+]
final testAddOpens = [
'java.prefs/java.util.prefs=ALL-UNNAMED' // required for jacoco tasks
]
run {
- // transform the list of runtime configuration --add-opens args into command line argument format
- final runtimeJVMArgs = runtimeAddOpens.stream()
- .flatMap(openSpec -> ['--add-opens', openSpec].stream())
- .toList()
- // add in any other required args
- runtimeJVMArgs.add('-Dio.netty.tryReflectionSetAccessible=true')
- jvmArgs = runtimeJVMArgs
+ // transform the list of runtime configuration --add-opens args into command line argument format
+ final runtimeJVMArgs = runtimeAddOpens.stream()
+ .flatMap(openSpec -> ['--add-opens', openSpec].stream())
+ .toList()
+ // add in any other required args
+ runtimeJVMArgs.add('-Dio.netty.tryReflectionSetAccessible=true')
+ jvmArgs = runtimeJVMArgs
}
test {
@@ -981,6 +991,18 @@ task gatkValidateGeneratedWdl(dependsOn: [gatkWDLGen, shadowJar]) {
}
}
+// scan-gradle-plugin security vulnerability scan
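+// Run "./gradlew ossIndexAudit" to check the project's dependencies against the Sonatype OSS Index.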
+ossIndexAudit {
+ allConfigurations = false // if true, audits the dependencies in all resolvable configurations; false by default, meaning only 'compileClasspath', 'runtimeClasspath', 'releaseCompileClasspath' and 'releaseRuntimeClasspath' are considered
+ useCache = true // true by default
+ outputFormat = 'DEFAULT' // optional; other values are 'DEPENDENCY_GRAPH' (prints a dependency graph of direct/transitive dependencies) and 'JSON_CYCLONE_DX_1_4' (prints a CycloneDX 1.4 SBOM in JSON format)
+ showAll = false // if true, prints all dependencies; false by default, meaning only dependencies with vulnerabilities are printed
+ printBanner = true // if true, prints an ASCII text banner; true by default
+
+ // ossIndexAudit can be configured to exclude vulnerabilities from matching
+ // excludeVulnerabilityIds = ['39d74cc8-457a-4e57-89ef-a258420138c5'] // list containing ids of vulnerabilities to be ignored
+ // excludeCoordinates = ['commons-fileupload:commons-fileupload:1.3'] // list containing coordinate of components which if vulnerable should be ignored
+}
/**
*This specifies what artifacts will be built and uploaded when performing a maven upload.
diff --git a/build_docker_remote.sh b/build_docker_remote.sh
index 72534a02b03..7eada9565c3 100755
--- a/build_docker_remote.sh
+++ b/build_docker_remote.sh
@@ -1,12 +1,26 @@
#!/usr/bin/env bash
#
-# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built in the cloud rather than locally. Pushing to dockerhub is not supported by this script.
+# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built
+# in the cloud rather than locally. Pushing to dockerhub is not supported by this script.
#
-# If you are pushing an image to our release repositories, be sure that you've followed
-# the setup instructions here:
-# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_docker
-# and here:
-# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_gcloud
+# By default the images are pushed to the following GCR repository:
+#
+# us.gcr.io/broad-dsde-methods/broad-gatk-snapshots/gatk-remote-builds
+#
+# with a name like "${YOUR_USERNAME}-${GITHUB_TAG}-${GIT_HASH_FOR_TAG}"
+#
+# This script should not be used to push to our release repositories. After
+# you've built and staged an image, run the scripts/docker/release_prebuilt_docker_image.sh
+# script to push to the release repositories.
+#
+# Usage: build_docker_remote.sh -e
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java
--- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java
@@ -218,12 +216,14 @@ public final class GenomicsDBImport extends GATKTool {
public static final String MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL = "max-num-intervals-to-import-in-parallel";
public static final String MERGE_CONTIGS_INTO_NUM_PARTITIONS = "merge-contigs-into-num-partitions";
public static final String BYPASS_FEATURE_READER = "bypass-feature-reader";
+ public static final String VCF_HEADER_OVERRIDE = "header";
public static final int INTERVAL_LIST_SIZE_WARNING_THRESHOLD = 100;
public static final int ARRAY_COLUMN_BOUNDS_START = 0;
public static final int ARRAY_COLUMN_BOUNDS_END = 1;
public static final String SHARED_POSIXFS_OPTIMIZATIONS = GenomicsDBArgumentCollection.SHARED_POSIXFS_OPTIMIZATIONS;
public static final String USE_GCS_HDFS_CONNECTOR = GenomicsDBArgumentCollection.USE_GCS_HDFS_CONNECTOR;
+ public static final String AVOID_NIO = "avoid-nio";
@Argument(fullName = WORKSPACE_ARG_LONG_NAME,
doc = "Workspace for GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " +
@@ -239,7 +239,7 @@ public final class GenomicsDBImport extends GATKTool {
"when using the "+INTERVAL_LIST_LONG_NAME+" option. " +
"Either this or "+WORKSPACE_ARG_LONG_NAME+" must be specified. " +
"Must point to an existing workspace.",
- mutex = {WORKSPACE_ARG_LONG_NAME})
+ mutex = {WORKSPACE_ARG_LONG_NAME, VCF_HEADER_OVERRIDE})
private String incrementalImportWorkspace;
@Argument(fullName = SEGMENT_SIZE_ARG_LONG_NAME,
@@ -254,7 +254,7 @@ public final class GenomicsDBImport extends GATKTool {
" data for only a single sample. Either this or " + SAMPLE_NAME_MAP_LONG_NAME +
" must be specified.",
optional = true,
- mutex = {SAMPLE_NAME_MAP_LONG_NAME})
+ mutex = {SAMPLE_NAME_MAP_LONG_NAME, AVOID_NIO})
private List<String> variantPaths;
@Argument(fullName = VCF_BUFFER_SIZE_ARG_NAME,
@@ -364,6 +364,13 @@ public final class GenomicsDBImport extends GATKTool {
optional = true)
private boolean sharedPosixFSOptimizations = false;
+ @Argument(fullName = VCF_HEADER_OVERRIDE,
+ doc = "Specify a vcf file to use instead of reading and combining headers from the input vcfs",
+ optional = true,
+ mutex ={INCREMENTAL_WORKSPACE_ARG_LONG_NAME}
+ )
+ private FeatureInput<VariantContext> headerOverride = null;
+
@Argument(fullName = BYPASS_FEATURE_READER,
doc = "Use htslib to read input VCFs instead of GATK's FeatureReader. This will reduce memory usage and potentially speed up " +
"the import. Lower memory requirements may also enable parallelism through " + MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL +
@@ -371,6 +378,15 @@ public final class GenomicsDBImport extends GATKTool {
optional = true)
private boolean bypassFeatureReader = false;
+ @Argument(fullName = AVOID_NIO,
+ doc = "Do not attempt to open the input vcf file paths in java. This can only be used with " + BYPASS_FEATURE_READER
+ + ". It allows operating on file systems which GenomicsDB understands how to open but GATK does not. This will disable "
+ + "many of the sanity checks.",
+ mutex = {StandardArgumentDefinitions.VARIANT_LONG_NAME}
+ )
+ @Advanced
+ private boolean avoidNio = false;
+
@Argument(fullName = USE_GCS_HDFS_CONNECTOR,
doc = "Use the GCS HDFS Connector instead of the native GCS SDK client with gs:// URLs.",
optional = true)
@@ -440,10 +456,6 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
// Path to combined VCF header file to be written by GenomicsDBImporter
private String vcfHeaderFile;
- // GenomicsDB callset map protobuf structure containing all callset names
- // used to write the callset json file on traversal success
- private GenomicsDBCallsetsMapProto.CallsetMappingPB callsetMappingPB;
-
//in-progress batchCount
private int batchCount = 1;
@@ -463,11 +475,14 @@ public void onStartup() {
initializeWorkspaceAndToolMode();
assertVariantPathsOrSampleNameFileWasSpecified();
assertOverwriteWorkspaceAndIncrementalImportMutuallyExclusive();
+ assertAvoidNioConditionsAreValid();
initializeHeaderAndSampleMappings();
initializeIntervals();
super.onStartup();
}
+
+
private void initializeWorkspaceAndToolMode() {
if (incrementalImportWorkspace != null && !incrementalImportWorkspace.isEmpty()) {
doIncrementalImport = true;
@@ -495,6 +510,24 @@ private void assertVariantPathsOrSampleNameFileWasSpecified(){
}
}
+ private void assertAvoidNioConditionsAreValid() {
+ if (avoidNio && (!bypassFeatureReader || headerOverride == null)) {
+ final List<String> missing = new ArrayList<>();
+ if(!bypassFeatureReader){
+ missing.add(BYPASS_FEATURE_READER);
+ }
+ if(headerOverride == null){
+ missing.add(VCF_HEADER_OVERRIDE);
+ }
+ final String missingArgs = String.join(" and ", missing);
+
+ // this may produce an exception with slightly awkward grammar, but that's acceptable
+ throw new CommandLineException.MissingArgument(missingArgs, "If --" + AVOID_NIO + " is set then --" + BYPASS_FEATURE_READER
+ + " and --" + VCF_HEADER_OVERRIDE + " must also be specified.");
+
+ }
+ }
+
private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importer,
List intervals) {
GenomicsDBVidMapProto.VidMappingPB vidMapPB = importer.getProtobufVidMapping();
@@ -523,32 +556,37 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe
*/
private void initializeHeaderAndSampleMappings() {
// Only one of -V and --sampleNameMapFile may be specified
- if (variantPaths != null && variantPaths.size() > 0) {
+ if (variantPaths != null && !variantPaths.isEmpty()) {
// -V was specified
final List<VCFHeader> headers = new ArrayList<>(variantPaths.size());
sampleNameMap = new SampleNameMap();
- for (final String variantPathString : variantPaths) {
- final Path variantPath = IOUtils.getPath(variantPathString);
- if (bypassFeatureReader) {
- GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
- }
- final VCFHeader header = getHeaderFromPath(variantPath, null);
- Utils.validate(header != null, "Null header was found in " + variantPath + ".");
- assertGVCFHasOnlyOneSample(variantPathString, header);
- headers.add(header);
- final String sampleName = header.getGenotypeSamples().get(0);
- try {
- sampleNameMap.addSample(sampleName, new URI(variantPathString));
- }
- catch(final URISyntaxException e) {
- throw new UserException("Malformed URI "+e.toString(), e);
+ if(headerOverride == null) {
+ for (final String variantPathString : variantPaths) {
+ final Path variantPath = IOUtils.getPath(variantPathString);
+ if (bypassFeatureReader) { // avoid-nio can't be set here because it requires headerOverride
+ GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
+ }
+ final VCFHeader header = getHeaderFromPath(variantPath);
+ Utils.validate(header != null, "Null header was found in " + variantPath + ".");
+ assertGVCFHasOnlyOneSample(variantPathString, header);
+ headers.add(header);
+
+ final String sampleName = header.getGenotypeSamples().get(0);
+ try {
+ sampleNameMap.addSample(sampleName, new URI(variantPathString));
+ } catch (final URISyntaxException e) {
+ throw new UserException("Malformed URI " + e.getMessage(), e);
+ }
}
+ mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
+ mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
+ } else {
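+ // A header override was supplied, so take the header lines and sequence dictionary from it instead of merging the per-input headers.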
+ final VCFHeader header = getHeaderFromPath(headerOverride.toPath());
+ mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
+ mergedHeaderSequenceDictionary = header.getSequenceDictionary();
}
- mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
- mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
-
} else if (sampleNameMapFile != null) {
// --sampleNameMap was specified
@@ -556,31 +594,34 @@ private void initializeHeaderAndSampleMappings() {
//the resulting database will have incorrect sample names
//see https://github.com/broadinstitute/gatk/issues/3682 for more information
// The SampleNameMap class guarantees that the samples will be sorted correctly.
- sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader);
+ sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile),
+ bypassFeatureReader && !avoidNio);
final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey();
- final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
- final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample);
- final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath);
+ final VCFHeader header;
+ if(headerOverride == null){
+ final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
+ header = getHeaderFromPath(firstVCFPath);
+ } else {
+ header = getHeaderFromPath(headerOverride.toPath());
+ }
//getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
- mergedHeaderLines = new LinkedHashSet(header.getMetaDataInInputOrder());
+ mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
mergedHeaderSequenceDictionary = header.getSequenceDictionary();
mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
- }
- else if (getIntervalsFromExistingWorkspace){
+ } else if (getIntervalsFromExistingWorkspace){
final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
IOUtils.assertPathsAreReadable(vcfHeader);
final String header = GenomicsDBUtils.readEntireFile(vcfHeader);
try {
File tempHeader = IOUtils.createTempFile("tempheader", ".vcf");
- Files.write(tempHeader.toPath(), header.getBytes(StandardCharsets.UTF_8));
+ Files.writeString(tempHeader.toPath(), header);
mergedHeaderSequenceDictionary = VCFFileReader.getSequenceDictionary(tempHeader);
} catch (final IOException e) {
- throw new UserException("Unable to create temporary header file to get sequence dictionary");
+ throw new UserException("Unable to create temporary header file to get sequence dictionary", e);
}
- }
- else {
+ } else {
throw new UserException(StandardArgumentDefinitions.VARIANT_LONG_NAME+" or "+
SAMPLE_NAME_MAP_LONG_NAME+" must be specified unless "+
INTERVAL_LIST_LONG_NAME+" is specified");
@@ -599,8 +640,12 @@ else if (getIntervalsFromExistingWorkspace){
}
}
- private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) {
- try(final FeatureReader<VariantContext> reader = getReaderFromPath(variantPath, variantIndexPath)) {
+ private VCFHeader getHeaderFromPath(final Path variantPath) {
+ //TODO make this mangling unnecessary
+ final String variantURI = variantPath.toAbsolutePath().toUri().toString();
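+ // Open the VCF without requiring an index, wrapping the data and index streams in the cloud prefetching buffers.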
+ try(final FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
+ BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
+ BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer))) {
return (VCFHeader) reader.getHeader();
} catch (final IOException e) {
throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e);
@@ -633,9 +678,9 @@ private void writeIntervalListToFile() {
@Override
public void onTraversalStart() {
String workspaceDir = overwriteCreateOrCheckWorkspace();
- vidMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
- callsetMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
- vcfHeaderFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
+ vidMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
+ callsetMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
+ vcfHeaderFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
if (getIntervalsFromExistingWorkspace) {
// intervals may be null if merge-contigs-into-num-partitions was used to create the workspace
// if so, we need to wait for vid to be generated before writing out the interval list
@@ -775,7 +820,7 @@ private List generateIntervalListFromWorkspace() {
final int start = Integer.parseInt(partitionInfo[1]);
final int end = Integer.parseInt(partitionInfo[2]);
return new SimpleInterval(contig, start, end);
- }).filter(o -> o != null).collect(Collectors.toList());
+ }).filter(Objects::nonNull).collect(Collectors.toList());
}
private ImportConfig createImportConfig(final int batchSize) {
@@ -785,7 +830,7 @@ private ImportConfig createImportConfig(final int batchSize) {
GenomicsDBImportConfiguration.ImportConfiguration.newBuilder();
importConfigurationBuilder.addAllColumnPartitions(partitions);
importConfigurationBuilder.setSizePerColumnPartition(vcfBufferSizePerSample);
- importConfigurationBuilder.setFailIfUpdating(true && !doIncrementalImport);
+ importConfigurationBuilder.setFailIfUpdating(!doIncrementalImport);
importConfigurationBuilder.setSegmentSize(segmentSize);
importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation);
importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations);
@@ -936,7 +981,7 @@ private FeatureReader getReaderFromPath(final Path variantPath,
/* Anonymous FeatureReader subclass that wraps returned iterators to ensure that the GVCFs do not
* contain MNPs.
*/
- return new FeatureReader<VariantContext>() {
+ return new FeatureReader<>() {
/** Iterator that asserts that variants are not MNPs. */
class NoMnpIterator implements CloseableTribbleIterator<VariantContext> {
private final CloseableTribbleIterator<VariantContext> inner;
@@ -971,7 +1016,8 @@ public VariantContext next() {
return new NoMnpIterator(reader.query(chr, start, end));
}
- @Override public CloseableTribbleIterator<VariantContext> iterator() throws IOException {
+ @Override
+ public CloseableTribbleIterator<VariantContext> iterator() throws IOException {
return new NoMnpIterator(reader.iterator());
}
};
@@ -990,7 +1036,7 @@ public VariantContext next() {
* @return The workspace directory
*/
private String overwriteCreateOrCheckWorkspace() {
- String workspaceDir = genomicsDBGetAbsolutePath(workspace);
+ String workspaceDir = GATKGenomicsDBUtils.genomicsDBGetAbsolutePath(workspace);
// From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspacevid
// returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first.
// returnCode = -1 : path was not a directory
@@ -1016,7 +1062,7 @@ private String overwriteCreateOrCheckWorkspace() {
}
static class UnableToCreateGenomicsDBWorkspace extends UserException {
- private static final long serialVersionUID = 1L;
+ @Serial private static final long serialVersionUID = 1L;
UnableToCreateGenomicsDBWorkspace(final String message){
super(message);
@@ -1028,7 +1074,7 @@ static class UnableToCreateGenomicsDBWorkspace extends UserException {
* dictionary (as returned by {@link #getBestAvailableSequenceDictionary})
* to parse/verify them. Does nothing if no intervals were specified.
*/
- protected void initializeIntervals() {
+ void initializeIntervals() {
if (intervalArgumentCollection.intervalsSpecified()) {
if (getIntervalsFromExistingWorkspace || doIncrementalImport) {
logger.warn(INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" was set, so ignoring specified intervals." +
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
index 234fee68b3b..83f1048f535 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
@@ -3,7 +3,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import java.util.ArrayList;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
index d5c1eafc248..304e385441c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
@@ -5,7 +5,7 @@
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.FeatureContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
index a81a57d8be3..8ab3b6b9d6f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
@@ -2,7 +2,7 @@
import htsjdk.samtools.util.Locatable;
import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
public class ExtractCohortFilterRecord implements Locatable {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
index ffa1e44e77e..623c0678902 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
@@ -2,7 +2,7 @@
import htsjdk.samtools.util.Locatable;
import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import java.util.Objects;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
index 8f929262b7a..e5cda0ef47e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
@@ -3,7 +3,7 @@
import htsjdk.samtools.util.Locatable;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
public class ReferenceRecord implements Locatable, Comparable<ReferenceRecord> {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
index 1919215d953..14013647d1f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
@@ -9,6 +9,7 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.apache.parquet.schema.MessageTypeParser;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.DeprecatedFeature;
@@ -24,6 +25,14 @@
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
import org.broadinstitute.hellbender.utils.IntervalUtils;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
+import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
+import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
+
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
@@ -175,6 +184,48 @@ public final class CreateVariantIngestFiles extends VariantWalker {
private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<>();
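+ // Parquet schemas for the PARQUET output mode: variant (vet) rows, reference range rows, and VCF header rows.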
+ public final MessageType variantRowSchema = MessageTypeParser
+ .parseMessageType("""
+ message VariantRow {
+ required int64 sample_id;
+ required int64 location;
+ required binary ref (UTF8);
+ required binary alt (UTF8);
+ optional binary AS_RAW_MQ (UTF8);
+ optional binary AS_RAW_MQRankSum (UTF8);
+ optional binary AS_QUALapprox (UTF8);
+ optional binary AS_RAW_ReadPosRankSum (UTF8);
+ optional binary AS_SB_TABLE (UTF8);
+ optional binary AS_VarDP (UTF8);
+ required binary call_GT (UTF8);
+ optional binary call_AD (UTF8);
+ optional binary call_DP (UTF8);
+ required int64 call_GQ;
+ optional binary call_PGT (UTF8);
+ optional binary call_PID (UTF8);
+ optional binary call_PS (UTF8);
+ optional binary call_PL (UTF8);
+ }
+ """);
+
+ public final MessageType refRangesRowSchema = MessageTypeParser
+ .parseMessageType("""
+ message RefRangesRow {
+ required int64 sample_id;
+ required int64 location;
+ required int64 length;
+ required binary state (UTF8);
+ }
+ """);
+
+ public final MessageType headersRowSchema = MessageTypeParser
+ .parseMessageType("""
+ message HeaderRow {
+ required int64 sample_id;
+ required binary headerLineHash (UTF8);
+ }
+ """);
+
// getGenotypes() returns list of lists for all samples at variant
// assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0
public static boolean isNoCall(VariantContext variant) {
@@ -291,15 +342,15 @@ public void onTraversalStart() {
SAMSequenceDictionary seqDictionary = initializeGQConfigurationAndIntervals();
if (enableReferenceRanges && !refRangesRowsExist) {
- refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
+ refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences, refRangesRowSchema);
}
if (enableVet && !vetRowsExist) {
- vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific, skipLoadingVqsrFields);
+ vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific, skipLoadingVqsrFields, variantRowSchema);
}
if (enableVCFHeaders && !vcfHeaderRowsExist) {
buildAllVcfLineHeaders();
- vcfHeaderLineScratchCreator = new VcfHeaderLineScratchCreator(sampleId, projectID, datasetName);
+ vcfHeaderLineScratchCreator = new VcfHeaderLineScratchCreator(sampleId, projectID, datasetName, outputDir, outputType, headersRowSchema);
}
logger.info("enableReferenceRanges = {}, enableVet = {}, enableVCFHeaders = {}",
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
index 0876d860f5b..7a643f8abd1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
@@ -1,9 +1,14 @@
package org.broadinstitute.hellbender.tools.gvs.ingest;
+import com.google.protobuf.Descriptors;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.Path;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.schema.MessageType;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.GQStateEnum;
@@ -14,12 +19,16 @@
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsReferenceParquetFileWriter;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsVariantParquetFileWriter;
+import org.json.JSONObject;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.concurrent.ExecutionException;
public final class RefCreator {
@@ -31,6 +40,7 @@ public final class RefCreator {
private final boolean writeReferenceRanges;
private final Long sampleId;
+ private GvsReferenceParquetFileWriter refRangesParquetFileWriter = null;
private SimpleInterval previousInterval;
private final Set gqStatesToIgnore;
private final GenomeLocSortedSet coverageLocSortedSet;
@@ -43,7 +53,7 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr
return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}
- public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, Set gqStatesToIgnore, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, final String datasetName, final boolean storeCompressedReferences) {
+ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, Set gqStatesToIgnore, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, final String datasetName, final boolean storeCompressedReferences, final MessageType parquetSchema) {
this.sampleId = sampleId;
this.outputType = outputType;
this.writeReferenceRanges = writeReferenceRanges;
@@ -65,11 +75,16 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
case TSV:
refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
break;
- case AVRO:
+ case AVRO: // TODO: when is this output type actually used?
refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
break;
+ case PARQUET:
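+ // Write reference ranges as Snappy-compressed Parquet using the schema supplied by the caller.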
+ refRangesParquetFileWriter = new GvsReferenceParquetFileWriter(new Path(refOutputFile.toURI()), parquetSchema, false, CompressionCodecName.SNAPPY);
+ break;
}
}
+ } catch (final FileAlreadyExistsException fs) {
+ throw new UserException("This reference parquet file already exists", fs);
} catch (final IOException ioex) {
throw new UserException("Could not create reference range outputs", ioex);
}
@@ -104,20 +119,44 @@ public void apply(VariantContext variant, List intervalsToWrite) thro
int localStart = start;
while ( localStart <= end ) {
int length = Math.min(end - localStart + 1, IngestConstants.MAX_REFERENCE_BLOCK_BASES);
- if (storeCompressedReferences) {
- refRangesWriter.writeCompressed(
- SchemaUtils.encodeCompressedRefBlock(variantChr, localStart, length,
+ switch(outputType) {
+ case BQ:
+ try {
+ if (storeCompressedReferences) {
+ refRangesWriter.writeCompressed(
+ SchemaUtils.encodeCompressedRefBlock(variantChr, localStart, length,
+ getGQStateEnum(variant.getGenotype(0).getGQ()).getCompressedValue()),
+ sampleId
+ );
+ } else {
+ refRangesWriter.write(SchemaUtils.encodeLocation(variantChr, localStart),
+ sampleId,
+ length,
+ getGQStateEnum(variant.getGenotype(0).getGQ()).getValue()
+ );
+ }
+ } catch (IOException ex) {
+ throw new IOException("BQ exception", ex);
+ }
+ break;
+ case PARQUET:
+ if (storeCompressedReferences) {
+ JSONObject record = GvsReferenceParquetFileWriter.writeCompressed(
+ SchemaUtils.encodeCompressedRefBlock(variantChr, localStart, length,
getGQStateEnum(variant.getGenotype(0).getGQ()).getCompressedValue()),
- sampleId
- );
- } else {
- refRangesWriter.write(SchemaUtils.encodeLocation(variantChr, localStart),
- sampleId,
- length,
- getGQStateEnum(variant.getGenotype(0).getGQ()).getValue()
- );
+ sampleId
+ );
+ refRangesParquetFileWriter.write(record);
+ } else {
+ JSONObject record = GvsReferenceParquetFileWriter.writeJson(SchemaUtils.encodeLocation(variantChr, localStart), sampleId, length, getGQStateEnum(variant.getGenotype(0).getGQ()).getValue());
+ refRangesParquetFileWriter.write(record);
+ }
+ break;
+
}
+
+
localStart = localStart + length ;
}
@@ -267,6 +306,13 @@ public void commitData() {
if (writeReferenceRanges && refRangesWriter != null) {
refRangesWriter.commitData();
}
+ } else if (outputType == CommonCode.OutputType.PARQUET && refRangesParquetFileWriter != null) {
+ try {
+ refRangesParquetFileWriter.close();
+ } catch (IOException exception) {
+ System.out.println("ERROR CLOSING PARQUET FILE: ");
+ exception.printStackTrace();
+ }
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VcfHeaderLineScratchCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VcfHeaderLineScratchCreator.java
index d7f4ab91b51..50fbefe1409 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VcfHeaderLineScratchCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VcfHeaderLineScratchCreator.java
@@ -1,25 +1,44 @@
package org.broadinstitute.hellbender.tools.gvs.ingest;
import com.google.protobuf.Descriptors;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.schema.MessageType;
import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
+import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
+import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsHeaderParquetFileWriter;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsReferenceParquetFileWriter;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsVariantParquetFileWriter;
+import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;
import org.json.JSONObject;
+import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ExecutionException;
+import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
+
+
public class VcfHeaderLineScratchCreator {
+ private final CommonCode.OutputType outputType;
private final Long sampleId;
private final String projectId;
private final String datasetName;
private PendingBQWriter vcfHeaderBQJsonWriter = null;
+ private GvsHeaderParquetFileWriter vcfHeaderParquetFileWriter = null;
private static final String NON_SCRATCH_TABLE_NAME = "vcf_header_lines";
private static final String SCRATCH_TABLE_NAME = "vcf_header_lines_scratch";
+ private static final String HEADER_FILETYPE_PREFIX = "header_file";
+
+
public static boolean doScratchRowsExistFor(String projectId, String datasetName, Long sampleId) {
return BigQueryUtils.doRowsExistFor(projectId, datasetName, "vcf_header_lines_scratch", "sample_id", sampleId);
}
@@ -36,16 +55,34 @@ private static boolean doNonScratchRowsExistFor(String projectId, String dataset
return BigQueryUtils.doRowsExistFor(projectId, datasetName, NON_SCRATCH_TABLE_NAME, "vcf_header_lines_hash", headerLineHash);
}
- public VcfHeaderLineScratchCreator(Long sampleId, String projectId, String datasetName) {
+ public VcfHeaderLineScratchCreator(Long sampleId, String projectId, String datasetName, File outputDirectory, CommonCode.OutputType outputType, MessageType headersRowSchema) {
try {
this.sampleId = sampleId;
+ this.outputType = outputType;
this.projectId = projectId;
this.datasetName = datasetName;
+ // String PREFIX_SEPARATOR = "_"; // TODO should this be moved to a common place?
+
if (projectId == null || datasetName == null) {
throw new UserException("Must specify project-id and dataset-name.");
}
- vcfHeaderBQJsonWriter = new PendingBQWriter(projectId, datasetName, SCRATCH_TABLE_NAME);
+
+ switch (outputType) {
+
+ case BQ:
+ if (projectId == null || datasetName == null) {
+ throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
+ }
+ vcfHeaderBQJsonWriter = new PendingBQWriter(projectId, datasetName, SCRATCH_TABLE_NAME);
+ break;
+ case PARQUET:
+ // TODO ensure that there doesn't need to be a table_number or sampleIdentifierForOutputFileName--it's all tables/samples, yes?
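+ // Header line hashes are written to a single header_file.parquet in the output directory.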
+ final File parquetOutputFile = new File(outputDirectory, HEADER_FILETYPE_PREFIX + ".parquet");
+ vcfHeaderParquetFileWriter = new GvsHeaderParquetFileWriter(new Path(parquetOutputFile.toURI()), headersRowSchema, false, CompressionCodecName.SNAPPY);
+ break;
+
+ }
}
catch (Exception e) {
throw new UserException("Could not create VCF Header Scratch Table Writer", e);
@@ -55,21 +92,41 @@ public VcfHeaderLineScratchCreator(Long sampleId, String projectId, String datas
public void apply(Map<String, Boolean> allLineHeaders) throws IOException {
for (final Map.Entry<String, Boolean> headerChunk : allLineHeaders.entrySet()) {
- try {
- // if this header chunk has already been added to the scratch table, only add an association between the
- // sample_id and the hash, no need to rewrite the header chunk to the DB
- String chunkHash = Utils.calcMD5(headerChunk.getKey());
- Boolean isExpectedUnique = headerChunk.getValue();
- boolean vcfScratchHeaderRowsExist = doScratchRowsExistFor(this.projectId, this.datasetName, chunkHash);
- boolean vcfNonScratchHeaderRowsExist = doNonScratchRowsExistFor(this.projectId, this.datasetName, chunkHash);
- if (vcfScratchHeaderRowsExist || vcfNonScratchHeaderRowsExist) {
- vcfHeaderBQJsonWriter.addJsonRow(createJson(this.sampleId, null, chunkHash, isExpectedUnique));
- }
- else {
- vcfHeaderBQJsonWriter.addJsonRow(createJson(this.sampleId, headerChunk.getKey(), chunkHash, isExpectedUnique));
- }
- } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) {
- throw new IOException("BQ exception", ex);
+ switch (outputType) {
+ case BQ:
+ try {
+ // if this header chunk has already been added to the scratch table, only add an association between the
+ // sample_id and the hash, no need to rewrite the header chunk to the DB
+ String chunkHash = Utils.calcMD5(headerChunk.getKey());
+ Boolean isExpectedUnique = headerChunk.getValue();
+ boolean vcfScratchHeaderRowsExist = doScratchRowsExistFor(this.projectId, this.datasetName, chunkHash);
+ boolean vcfNonScratchHeaderRowsExist = doNonScratchRowsExistFor(this.projectId, this.datasetName, chunkHash);
+ if (vcfScratchHeaderRowsExist || vcfNonScratchHeaderRowsExist) {
+ vcfHeaderBQJsonWriter.addJsonRow(createJson(this.sampleId, null, chunkHash, isExpectedUnique));
+ }
+ else {
+ vcfHeaderBQJsonWriter.addJsonRow(createJson(this.sampleId, headerChunk.getKey(), chunkHash, isExpectedUnique));
+ }
+ } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) {
+ throw new IOException("BQ exception", ex);
+ }
+ break;
+ case PARQUET:
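+ // For PARQUET output, write one row per (sample_id, header line hash); the header text itself is not persisted here.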
+ String chunkHash = Utils.calcMD5(headerChunk.getKey());
+ Boolean isExpectedUnique = headerChunk.getValue();
+ JSONObject record = vcfHeaderParquetFileWriter.writeJson(this.sampleId, chunkHash);
+ vcfHeaderParquetFileWriter.write(record);
+
+ //boolean vcfScratchHeaderRowsExist = doScratchRowsExistFor(this.projectId, this.datasetName, chunkHash);
+ //boolean vcfNonScratchHeaderRowsExist = doNonScratchRowsExistFor(this.projectId, this.datasetName, chunkHash);
+ // why is there no isExpectedUnique check here?
+ //if (vcfScratchHeaderRowsExist || vcfNonScratchHeaderRowsExist) {
+ //vcfHeaderParquetFileWriter.writeJson(this.sampleId, chunkHash);
+ //}
+ //else {
+ //vcfHeaderParquetFileWriter.writeJson(this.sampleId, chunkHash);
+ //}
+ break;
}
}
}
@@ -87,9 +144,16 @@ public JSONObject createJson(Long sampleId, String headerChunk, String headerHas
}
public void commitData() {
- if (vcfHeaderBQJsonWriter != null) {
+ if (outputType == CommonCode.OutputType.BQ && vcfHeaderBQJsonWriter != null) {
vcfHeaderBQJsonWriter.flushBuffer();
vcfHeaderBQJsonWriter.commitWriteStreams();
+ } else if (outputType == CommonCode.OutputType.PARQUET && vcfHeaderParquetFileWriter != null) {
+ try {
+ vcfHeaderParquetFileWriter.close();
+ } catch (IOException exception) {
+ System.out.println("ERROR CLOSING PARQUET FILE: ");
+ exception.printStackTrace();
+ }
}
}
@@ -105,5 +169,4 @@ public void closeTool() {
vcfHeaderBQJsonWriter.close();
}
}
-
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java
index aec23295f7d..0aa7366e63e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java
@@ -3,12 +3,17 @@
import com.google.protobuf.Descriptors;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.schema.MessageType;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsVariantParquetFileWriter;
import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;
import org.json.JSONObject;
@@ -26,6 +31,7 @@ public class VetCreator {
private SimpleXSVWriter vetWriter = null;
private final Long sampleId;
private PendingBQWriter vetBQJsonWriter = null;
+ private GvsVariantParquetFileWriter vetParquetFileWriter = null;
private final boolean forceLoadingFromNonAlleleSpecific;
private final boolean skipLoadingVqsrFields;
@@ -36,7 +42,7 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr
return BigQueryUtils.doRowsExistFor(projectId, datasetName, VET_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}
- public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields) {
+ public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields, final MessageType parquetSchema) {
this.sampleId = sampleId;
this.outputType = outputType;
this.forceLoadingFromNonAlleleSpecific = forceLoadingFromNonAlleleSpecific;
@@ -57,7 +63,14 @@ public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
final File vetOutputFile = new File(outputDirectory, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + IngestConstants.FILETYPE);
vetWriter = new SimpleXSVWriter(vetOutputFile.toPath(), IngestConstants.SEPARATOR);
vetWriter.setHeaderLine(getHeaders());
+ break;
+ case PARQUET:
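+ // Write variant (vet) rows as Snappy-compressed Parquet using the schema supplied by the caller.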
+ final File parquetOutputFile = new File(outputDirectory, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + ".parquet");
+ vetParquetFileWriter = new GvsVariantParquetFileWriter(new Path(parquetOutputFile.toURI()), parquetSchema, false, CompressionCodecName.SNAPPY);
+ break;
}
+ } catch (final FileAlreadyExistsException fs) {
+ throw new UserException("This variants parquet file already exists", fs);
} catch (final IOException ioex) {
throw new UserException("Could not create vet outputs", ioex);
}
@@ -84,6 +97,9 @@ public void apply(VariantContext variant) throws IOException {
);
vetWriter.getNewLineBuilder().setRow(row).write();
break;
+ case PARQUET:
+ vetParquetFileWriter.write(createJson(location, variant, sampleId));
+ break;
}
}
@@ -128,6 +144,13 @@ public void commitData() {
if (outputType == CommonCode.OutputType.BQ && vetBQJsonWriter != null) {
vetBQJsonWriter.flushBuffer();
vetBQJsonWriter.commitWriteStreams();
+ } else if (outputType == CommonCode.OutputType.PARQUET && vetParquetFileWriter != null) {
+ try {
+ vetParquetFileWriter.close();
+ } catch (IOException exception) {
+ System.out.println("ERROR CLOSING PARQUET FILE: ");
+ exception.printStackTrace();
+ }
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
index a82ad989d26..6df437f01e2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
@@ -4,7 +4,7 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
index de6a914e42c..f1f71a6eb7f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.spark;
+import com.google.common.io.Files;
import htsjdk.samtools.*;
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.seekablestream.SeekableFileStream;
@@ -18,7 +19,6 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.ReadConstants;
-import org.codehaus.plexus.util.FileUtils;
import picard.cmdline.programgroups.OtherProgramGroup;
import java.io.*;
@@ -166,7 +166,7 @@ private static void assertBamIsCoordinateSorted(final SAMFileHeader header) {
private static void assertIsBam(final File inputBam) {
if(!BamFileIoUtils.isBamFile(inputBam)) {
throw new UserException.BadInput("A splitting index is only relevant for a bam file, but a "
- + "file with extension "+ FileUtils.getExtension(inputBam.getName()) + " was specified.");
+ + "file with extension "+ Files.getFileExtension(inputBam.getName()) + " was specified.");
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 95bb083f03b..de91be5e5b6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -89,7 +89,10 @@ public enum ComplexVariantSubtype {
piDUP_RF,
dDUP,
dDUP_iDEL,
- INS_iDEL
+ INS_iDEL,
+ CTX_PP_QQ,
+ CTX_PQ_QP,
+ CTX_INV
}
// not defined in output vcf header but used in internal id that is currently output in the ID column
@@ -163,6 +166,7 @@ public enum ComplexVariantSubtype {
public static final String NONCODING_BREAKPOINT = "PREDICTED_NONCODING_BREAKPOINT";
public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS";
public static final String TSS_DUP = "PREDICTED_TSS_DUP";
+ public static final String PARTIAL_DISPERSED_DUP = "PREDICTED_PARTIAL_DISPERSED_DUP";
// SVTYPE classes
public enum StructuralVariantAnnotationType {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
index 6c016f84436..2f51a7df114 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
@@ -4,7 +4,7 @@
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
-import com.netflix.servo.util.VisibleForTesting;
+import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.*;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.SequenceUtil;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
index 8edcd595881..0f95878b389 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
Utils.nonNull(dictionary);
validatePosition(contigA, positionA, dictionary);
validatePosition(contigB, positionB, dictionary);
- Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
- "End coordinate cannot precede start");
+ // CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
+ // B references the source sequence.
+ if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+ Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
+ "End coordinate cannot precede start");
+ }
}
private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
index da4e66f3fb6..9ca5056a679 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
@@ -31,6 +31,7 @@
*/
public class ClosestSVFinder {
+ protected final boolean sortOutput;
protected final Map truthIdToItemMap;
protected final Map idToClusterMap;
private final SVConcordanceLinkage linkage;
@@ -49,7 +50,9 @@ public class ClosestSVFinder {
*/
public ClosestSVFinder(final SVConcordanceLinkage linkage,
final Function collapser,
+ final boolean sortOutput,
final SAMSequenceDictionary dictionary) {
+ this.sortOutput = sortOutput;
this.linkage = Utils.nonNull(linkage);
this.collapser = Utils.nonNull(collapser);
outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary));
@@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage,
lastItemContig = null;
}
+ /**
+ * Sorts output by default
+ */
+ public ClosestSVFinder(final SVConcordanceLinkage linkage,
+ final Function collapser,
+ final SAMSequenceDictionary dictionary) {
+ this(linkage, collapser, true, dictionary);
+ }
+
/**
* Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered.
* @param force flushes all variants in the output buffer regardless of position
@@ -70,8 +82,12 @@ public List flush(final boolean force) {
.map(c -> new ClosestPair(c.getItem(), c.getClosest()))
.map(collapser)
.collect(Collectors.toList());
- outputBuffer.addAll(collapsedRecords);
- return flushBuffer(force);
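+ // When sorting is disabled, the collapsed records are returned directly and the
+ // position-sorted output buffer is bypassed; callers must then handle any required ordering.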
+ if (sortOutput) {
+ outputBuffer.addAll(collapsedRecords);
+ return flushBuffer(force);
+ } else {
+ return collapsedRecords;
+ }
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
index 6c36592b6ae..16b5142fb84 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
@@ -4,7 +4,7 @@
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.aeonbits.owner.util.Collections;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
index b0ff4985cec..1d9949b219e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
@@ -3,7 +3,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
index 931a6bb5256..7045f4678ec 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
@@ -4,7 +4,7 @@
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.broadinstitute.barclay.argparser.Argument;
import org.apache.commons.lang3.tuple.Triple;
import org.broadinstitute.barclay.help.DocumentedFeature;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
index addbd29dfd7..96a749de50c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
@@ -4,7 +4,7 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
index 9c2d99b97b7..3ef5ade7148 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
@@ -4,7 +4,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
index 9d78efd6d1b..f8bb3b3cea9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
@@ -7,7 +7,7 @@
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
index 5b4af7ec0ff..ef01d6fbf62 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
@@ -5,7 +5,7 @@
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.ReferenceContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
index 0c4a07f015a..acf1c74bf9c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
@@ -4,6 +4,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.annotator.InfoFieldAnnotation;
@@ -149,7 +150,7 @@ private String establishReadGroupFlowOrder(final LocalContext localContext, fina
// if here, no flow order was found. may we use a default?
if ( isActualFlowOrderRequired() ) {
localContext.generateAnnotation = false;
- flowMissingOneShotLogger.warn("this.getClass().getSimpleName() + \" annotation will not be calculated, no '\" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + \"' argument provided\"");
+ flowMissingOneShotLogger.warn(this.getClass().getSimpleName() + " annotation will not be calculated, no '" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + "' argument provided");
}
return FlowBasedRead.DEFAULT_FLOW_ORDER;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
index 44b07f6fcfe..93420133913 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
@@ -1,7 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.contamination;
import htsjdk.samtools.util.OverlapDetector;
-import org.apache.commons.lang.mutable.MutableDouble;
+import org.apache.commons.lang3.mutable.MutableDouble;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.optim.univariate.UnivariatePointValuePair;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
index 1a023488270..607f6691c58 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
@@ -13,7 +13,7 @@
import java.util.stream.IntStream;
public class ContaminationSegmenter {
- public static final Range ALT_FRACTIONS_FOR_SEGMENTATION = Range.between(0.1, 0.9);
+ public static final Range ALT_FRACTIONS_FOR_SEGMENTATION = Range.of(0.1, 0.9);
public static final double KERNEL_SEGMENTER_LINEAR_COST = 1.0;
public static final double KERNEL_SEGMENTER_LOG_LINEAR_COST = 1.0;
public static final int KERNEL_SEGMENTER_DIMENSION = 100;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
index b3ddc3e21b6..2014ce3030e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
@@ -2,12 +2,14 @@
import htsjdk.samtools.*;
import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.util.Precision;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -81,6 +83,19 @@
@ExperimentalFeature
public final class FlowFeatureMapper extends ReadWalker {
+ static class CopyAttrInfo {
+ public final String name;
+ public final VCFHeaderLineType type;
+ public final String desc;
+
+ public CopyAttrInfo(final String spec) {
+ final String[] toks = spec.split(",");
+ name = toks[0];
+ type = toks.length > 1 ? VCFHeaderLineType.valueOf(toks[1]) : VCFHeaderLineType.String;
+ desc = toks.length > 2 ? StringUtils.join(Arrays.copyOfRange(toks, 2, toks.length), ",") : ("copy-attr: " + name);
+ }
+ }
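+ // Illustrative example of the spec parsed above (hypothetical value): "rq,Float,read quality copy"
+ // yields name="rq", type=VCFHeaderLineType.Float, desc="read quality copy", while a bare "rq"
+ // falls back to type String and desc "copy-attr: rq".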
+
private static final Logger logger = LogManager.getLogger(FlowFeatureMapper.class);
private static final String VCB_SOURCE = "fm";
@@ -101,8 +116,18 @@ public final class FlowFeatureMapper extends ReadWalker {
private static final String VCF_SMQ_RIGHT = "X_SMQ_RIGHT";
private static final String VCF_SMQ_LEFT_MEAN = "X_SMQ_LEFT_MEAN";
private static final String VCF_SMQ_RIGHT_MEAN = "X_SMQ_RIGHT_MEAN";
+ private static final String VCF_ADJACENT_REF_DIFF = "X_ADJACENT_REF_DIFF";
+
private static final Double LOWEST_PROB = 0.0001;
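+ // SAM FLAG bit 0x200 marks reads that failed platform/vendor quality checks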
+ private static final int VENDOR_QUALITY_CHECK_FLAG = 0x200;
+
+ private static final String INCLUDE_QC_FAILED_READS_FULL_NAME = "include-qc-failed-reads";
+
+ final private List<CopyAttrInfo> copyAttrInfo = new LinkedList<>();
+
+ // order here is according to SequenceUtil.VALID_BASES_UPPER
+ final private String scoreForBaseNames[] = new String[SequenceUtil.VALID_BASES_UPPER.length];
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
@@ -131,6 +156,10 @@ public final class FlowFeatureMapper extends ReadWalker {
@Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true)
public boolean floorBlocks = false;
+ @Advanced
+ @Argument(fullName=INCLUDE_QC_FAILED_READS_FULL_NAME, doc = "include reads with QC failed flag", optional = true)
+ public boolean includeQcFailedReads = false;
+
@ArgumentCollection
public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();
@@ -175,6 +204,9 @@ protected static class MappedFeature implements Comparable {
int smqLeftMean;
int smqRightMean;
+ double scoreForBase[];
+ boolean adjacentRefDiff;
+
public MappedFeature(GATKRead read, FlowFeatureMapperArgumentCollection.MappingFeatureEnum type, byte[] readBases,
byte[] refBases, int readBasesOffset, int start, int offsetDelta) {
this.read = read;
@@ -315,8 +347,20 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT, 1, VCFHeaderLineType.Integer, "Ordinal Median quality of N bases to the right of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_LEFT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the left of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the right of the feature"));
- for ( String name : fmArgs.copyAttr ) {
- headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + name, 1, VCFHeaderLineType.String, "copy-attr: " + name));
+ for ( String spec : fmArgs.copyAttr ) {
+ final CopyAttrInfo info = new CopyAttrInfo(spec);
+ headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + info.name, 1, info.type, info.desc));
+ copyAttrInfo.add(info);
+ }
+
+ // validation mode?
+ if ( fmArgs.reportAllAlts ) {
+ for ( int baseIndex = 0 ; baseIndex < SequenceUtil.VALID_BASES_UPPER.length ; baseIndex++ ) {
+ headerInfo.add(new VCFInfoHeaderLine(scoreNameForBase(baseIndex), 1, VCFHeaderLineType.Float, "Base specific mapping score"));
+ }
+ }
+ if ( fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff ) {
+ headerInfo.add(new VCFInfoHeaderLine(VCF_ADJACENT_REF_DIFF, 1, VCFHeaderLineType.Flag, "Adjacent base filter indication: indel in the adjacent 5 bases to the considered base on the read"));
}
final VCFHeader vcfHeader = new VCFHeader(headerInfo);
@@ -324,6 +368,13 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f
return vcfHeader;
}
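+ // Lazily builds and caches the per-base INFO attribute names used in --report-all-alts mode
+ // (one per entry of SequenceUtil.VALID_BASES_UPPER, e.g. the base-score attribute suffixed with "_A").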
+ private String scoreNameForBase(int baseIndex) {
+ if ( scoreForBaseNames[baseIndex] == null ) {
+ scoreForBaseNames[baseIndex] = VCF_SCORE + "_" + new String(new byte[]{SequenceUtil.VALID_BASES_UPPER[baseIndex]});
+ }
+ return scoreForBaseNames[baseIndex];
+ }
+
@Override
public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
@@ -337,6 +388,11 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext,
return;
}
+ // include qc-failed reads?
+ if ( ((read.getFlags() & VENDOR_QUALITY_CHECK_FLAG) != 0) && !includeQcFailedReads ) {
+ return;
+ }
+
// flush queues up to this read
flushQueue(read, referenceContext);
@@ -349,6 +405,20 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext,
// score the feature
fr.score = scoreFeature(fr);
+ // score for validation mode?
+ if ( fmArgs.reportAllAlts) {
+ fr.scoreForBase = new double[SequenceUtil.VALID_BASES_UPPER.length];
+ for ( int baseIndex = 0 ; baseIndex < fr.scoreForBase.length ; baseIndex++ ) {
+ final byte base = SequenceUtil.VALID_BASES_UPPER[baseIndex];
+ boolean incl = base != fr.readBases[0];
+ if ( incl ) {
+ fr.scoreForBase[baseIndex] = scoreFeature(fr, base);
+ } else {
+ fr.scoreForBase[baseIndex] = Double.NaN;
+ }
+ }
+ }
+
// emit feature if filters in
if ( filterFeature(fr) ) {
featureQueue.add(fr);
@@ -413,14 +483,17 @@ private void enrichFeature(final MappedFeature fr) {
}
private double scoreFeature(final MappedFeature fr) {
+ return scoreFeature(fr, (byte)0);
+ }
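+ // altBase == 0 scores the feature against the reference as-is; a non-zero altBase is temporarily
+ // substituted into fr.refBases[0] inside buildHaplotypes() (and restored there), so that each
+ // candidate alt can be scored in --report-all-alts mode.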
+ private double scoreFeature(final MappedFeature fr, byte altBase) {
// build haplotypes
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), fr.read);
- final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder);
+ final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder, altBase);
// create flow read
- final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder,
- rgInfo.maxClass, fbargs);
+ final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+
final int diffLeft = haplotypes[0].getStart() - flowRead.getStart() + fr.offsetDelta;
final int diffRight = flowRead.getEnd() - haplotypes[0].getEnd();
flowRead.applyBaseClipping(Math.max(0, diffLeft), Math.max(diffRight, 0), false);
@@ -524,16 +597,24 @@ public static double computeLikelihoodLocal(final FlowBasedRead read, final Flow
return result;
}
- private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder) {
+ private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder, byte altBase) {
// build bases for flow haplotypes
// NOTE!!!: this code assumes length of feature on read and reference is the same
// this is true for SNP but not for INDELs - it will have to be re-written!
// TODO: write for INDEL
- final byte[] bases = fr.read.getBasesNoCopy();
+ byte[] bases = fr.read.getBasesNoCopy();
int offset = fr.readBasesOffset;
int refStart = fr.start;
int refModOfs = 0;
+
+ // install alt base?
+ byte orgBase = 0;
+ if ( altBase != 0 ) {
+ orgBase = fr.refBases[0];
+ fr.refBases[0] = altBase;
+ }
+
if ( offset > 0 ) {
// reach into hmer before
offset--;
@@ -569,6 +650,11 @@ private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final Strin
new FlowBasedHaplotype(refHaplotype, flowOrder)
};
+ // restore changes
+ if ( altBase != 0 ) {
+ fr.refBases[0] = orgBase;
+ }
+
// return
return result;
}
@@ -590,7 +676,11 @@ private void emitFeature(final MappedFeature fr) {
// create alleles
final Collection<Allele> alleles = new LinkedList<>();
- alleles.add(Allele.create(fr.readBases, false));
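+ // In report-all-alts mode the read base may equal the reference base; since an ALT allele
+ // identical to REF is not valid, a "*" placeholder allele is emitted in that case.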
+ if ( fmArgs.reportAllAlts && Arrays.equals(fr.readBases, fr.refBases) ) {
+ alleles.add(Allele.create("*".getBytes(), false));
+ } else {
+ alleles.add(Allele.create(fr.readBases, false));
+ }
alleles.add(Allele.create(fr.refBases, true));
// create variant context builder
@@ -625,11 +715,34 @@ private void emitFeature(final MappedFeature fr) {
vcb.attribute(VCF_SMQ_RIGHT_MEAN, fr.smqRightMean);
}
- for ( String name : fmArgs.copyAttr ) {
- if ( fr.read.hasAttribute(name) ) {
- vcb.attribute(fmArgs.copyAttrPrefix + name, fr.read.getAttributeAsString(name));
+ for ( CopyAttrInfo info : copyAttrInfo ) {
+ if ( fr.read.hasAttribute(info.name) ) {
+ final String attrName = fmArgs.copyAttrPrefix + info.name;
+ if ( info.type == VCFHeaderLineType.Integer ) {
+ vcb.attribute(attrName, fr.read.getAttributeAsInteger(info.name));
+ } else if ( info.type == VCFHeaderLineType.Float ) {
+ vcb.attribute(attrName, fr.read.getAttributeAsFloat(info.name));
+ } else {
+ vcb.attribute(attrName, fr.read.getAttributeAsString(info.name));
+ }
+ }
+ }
+
+ // validation mode?
+ if ( fmArgs.reportAllAlts ) {
+ if ( fr.scoreForBase != null ) {
+ for (int baseIndex = 0; baseIndex < SequenceUtil.VALID_BASES_UPPER.length; baseIndex++) {
+ if (!Double.isNaN(fr.scoreForBase[baseIndex])) {
+ vcb.attribute(scoreNameForBase(baseIndex), String.format("%.5f", fr.scoreForBase[baseIndex]));
+ }
+ }
}
}
+ if ( fr.adjacentRefDiff ) {
+ vcb.attribute(VCF_ADJACENT_REF_DIFF, true);
+ }
+
+ // build it!
final VariantContext vc = vcb.make();
// write to file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
index 22c13b00878..03660746d2b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
@@ -32,7 +32,7 @@ enum MappingFeatureEnum {
/**
* attributes to copy from bam
**/
- @Argument(fullName = "copy-attr", doc = "attributes to copy from bam", optional = true)
+ @Argument(fullName = "copy-attr", doc = "attributes to copy from bam. format ,,. types: Integer, Float, String, Character, Flag", optional = true)
public List copyAttr = new LinkedList<>();
/**
@@ -116,4 +116,18 @@ enum MappingFeatureEnum {
@Hidden
@Argument(fullName = "surrounding-mean-quality-size", doc = "number of bases around the feature to calculate surrounding mean quality", optional = true)
public Integer surroundingMeanQualitySize = null;
+
+ /**
+ * validation mode - if not specified, this feature is off
+ **/
+ @Hidden
+ @Argument(fullName = "report-all-alts", doc = "In this mode (aka validation mode), every base of every read in the input CRAM and interval is reported, and an X_SCORE value is calculated for all 3 possible alts", optional = true)
+ public boolean reportAllAlts = false;
+
+ /**
+ * adjacent-ref-diff mode - if not specified, this feature is off
+ **/
+ @Hidden
+ @Argument(fullName = "tag-bases-with-adjacent-ref-diff", doc = "In this mode bases that have an adjacent difference from the reference on the same read are not discarded, and tagged with X_ADJACENT_REF_DIFFm", optional = true)
+ public boolean tagBasesWithAdjacentRefDiff = false;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
index 05c952c317a..3f23cb5a6e6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
@@ -1,7 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.featuremapping;
import htsjdk.samtools.CigarElement;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
@@ -21,22 +21,34 @@
public class SNVMapper implements FeatureMapper {
- final int identBefore;
- final int identAfter;
+ final int surroundBefore;
+ final int surroundAfter;
final int minCigarElementLength;
final LevenshteinDistance levDistance = new LevenshteinDistance();
final Integer smqSize;
final Integer smqSizeMean;
+ final boolean ignoreSurround;
+ final int spanBefore;
+ final int spanAfter;
+
+ final FlowFeatureMapperArgumentCollection fmArgs;
+
public SNVMapper(FlowFeatureMapperArgumentCollection fmArgs) {
- identBefore = fmArgs.snvIdenticalBases;
- identAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : identBefore;
- minCigarElementLength = identBefore + 1 + identAfter;
+ surroundBefore = fmArgs.snvIdenticalBases;
+ surroundAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : surroundBefore;
smqSize = fmArgs.surroundingMediaQualitySize;
smqSizeMean = fmArgs.surroundingMeanQualitySize;
+ this.fmArgs = fmArgs;
+
+ // ignore surround
+ ignoreSurround = fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff;
+ spanBefore = ignoreSurround ? 0 : surroundBefore;
+ spanAfter = ignoreSurround ? 0 : surroundAfter;
+ minCigarElementLength = spanBefore + 1 + spanAfter;
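+ // In report-all-alts / tag-adjacent-ref-diff modes the surrounding-identity span no longer
+ // restricts which positions are scanned (spanBefore/spanAfter are 0); surroundBefore/surroundAfter
+ // are still used when mapping features, to set the adjacentRefDiff flag instead of dropping them.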
// adjust minimal read length
- FlowBasedRead.setMinimalReadLength(1 + 1 + identAfter);
+ FlowBasedRead.setMinimalReadLength(1 + 1 + spanAfter);
}
@Override
@@ -93,30 +105,44 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons
if ( length >= minCigarElementLength &&
cigarElement.getOperator().consumesReadBases() &&
cigarElement.getOperator().consumesReferenceBases() ) {
- readOfs += identBefore;
- refOfs += identBefore;
- for ( int ofs = identBefore ; ofs < length - identAfter ; ofs++, readOfs++, refOfs++ ) {
+ readOfs += spanBefore;
+ refOfs += spanBefore;
+ for ( int ofs = spanBefore ; ofs < length - spanAfter ; ofs++, readOfs++, refOfs++ ) {
- if ( ref[refOfs] != 'N' && bases[readOfs] != ref[refOfs] ) {
+ if ( ref[refOfs] != 'N' && (fmArgs.reportAllAlts || (bases[readOfs] != ref[refOfs])) ) {
// check that this is really a SNV (must be surrounded by identical ref)
boolean surrounded = true;
- for ( int i = 0 ; i < identBefore && surrounded ; i++ ) {
- if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) {
+ for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) {
+ final int bIndex = readOfs-1-i;
+ final int rIndex = refOfs-1-i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
- for ( int i = 0 ; i < identAfter && surrounded ; i++ ) {
- if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) {
+ for (int i = 0; i < surroundAfter && surrounded ; i++ ) {
+ final int bIndex = readOfs+1+i;
+ final int rIndex = refOfs+1+i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
- if ( !surrounded ) {
+ if ( (!fmArgs.reportAllAlts && !fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded ) {
continue;
}
// add this feature
FlowFeatureMapper.MappedFeature feature = FlowFeatureMapper.MappedFeature.makeSNV(read, readOfs, ref[refOfs], referenceContext.getStart() + refOfs, readOfs - refOfs);
+ if ( (fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded )
+ feature.adjacentRefDiff = true;
feature.nonIdentMBasesOnRead = nonIdentMBases;
feature.refEditDistance = refEditDistance;
if ( !read.isReverseStrand() )
@@ -166,8 +192,8 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons
features.add(feature);
}
}
- readOfs += identAfter;
- refOfs += identAfter;
+ readOfs += spanAfter;
+ refOfs += spanAfter;
} else {
@@ -243,8 +269,8 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen
cigarElement.getOperator().consumesReferenceBases() ) {
// break out if not enough clearing
- if ( (start < referenceContext.getStart() + refOfs + identBefore) ||
- (start >= referenceContext.getStart() + refOfs + length - identAfter) )
+ if ( (start < referenceContext.getStart() + refOfs + spanBefore) ||
+ (start >= referenceContext.getStart() + refOfs + length - spanAfter) )
return FilterStatus.Filtered;
int delta = start - (referenceContext.getStart() + refOfs);
@@ -255,13 +281,25 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen
// check that this is really a SNV (must be surrounded by identical ref)
boolean surrounded = true;
- for ( int i = 0 ; i < identBefore && surrounded ; i++ ) {
- if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) {
+ for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) {
+ final int bIndex = readOfs-1-i;
+ final int rIndex = refOfs-1-i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
- for ( int i = 0 ; i < identAfter && surrounded ; i++ ) {
- if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) {
+ for (int i = 0; i < surroundAfter && surrounded ; i++ ) {
+ final int bIndex = readOfs+1+i;
+ final int rIndex = refOfs+1+i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
index 7efb5687ac3..23550fd75a6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
@@ -4,7 +4,7 @@
import com.google.common.primitives.Ints;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
index cee262d277c..7b32156c40a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
@@ -5,7 +5,7 @@
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.Tuple;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
new file mode 100644
index 00000000000..f3191fbaadb
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
@@ -0,0 +1,925 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import com.opencsv.CSVReader;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.math3.util.Precision;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.featuremapping.FlowFeatureMapper;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerUtils;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentLikelihoodEngine;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LikelihoodEngineArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadLikelihoodCalculationEngine;
+import org.broadinstitute.hellbender.utils.BaseUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
+import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
+import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.read.*;
+import org.broadinstitute.hellbender.utils.report.GATKReport;
+import org.broadinstitute.hellbender.utils.report.GATKReportTable;
+
+import java.io.*;
+import java.text.DecimalFormat;
+import java.util.*;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Converts Ultima reads into flow-based annotation, and provides some general statistics regarding
+ * quality and errors relative to the reference. Ultima data is flow-based, and thus the original computed
+ * quality refers to each flow, rather than each base. In the Ultima cram/bam, there is a per-base representation
+ * of the original flow qualities, where the original quality is distributed along each flow (homopolymer).
+ * In order to reconstitute the original flow information, the tool incorporates the information encoded
+ * in the Ultima cram/bam, and outputs both the read in flow space, as well as a conversion of the aligned
+ * reference portion into flow space, and an alignment score.
+ *
+ * Input
+ *
+ * - Ultima aligned SAM/BAM/CRAM
+ *
+ *
+ * Output
+ *
+ * - Per read ground truth information CSV and a ground truth scoring quality report, in GATK report format
+ *
+ *
+ * CSV Output Description
+ * csv with the read representation in flow space. The csv includes the following columns:
+ * ReadName
+ * ReadKey : The signal of the read at each flow according to the flow order
+ * ReadIsReversed : Whether the read is reversed in the alignment
+ * ReadMQ : The mapping quality of the read
+ * ReadRQ : The read rq value
+ * GroundTruthKey : The aligned reference section, translated into per-flow signals
+ * ReadSequence
+ * Score : A flow-based alignment score. Since the alignment is per-flow, in the case that there’s a cycle skip, the read and reference flow signals will not be aligned, and therefore the score will be inaccurate.
+ * NormalizedScore: A flow-based normalized alignment score
+ * ErrorProbability : The error of each flow (corresponds to the signals in ReadKey)
+ * ReadKeyLength
+ * GroundTruthKeyLength
+ * CycleSkipStatus : One of NS (Non Skip), PCS (Possible Cycle Skip), or CS (Cycle Skip)
+ * Cigar
+ * LowestQBaseTP
+ *
+ * GATK Report Description
+ * In the quality report (optional), the following tables are included:
+ *
+ * qualReport:error rate per qual : The error rate for each quality. Columns:
+ *
+ * - qual: The encoded quality
+ *
+ * - count: The number of times the quality was observed
+ *
+ * - error: The error rate of the flows with this qual
+ *
+ * - phred: the error translated into a phred score
+ *
+ *
+ * qual_hmerReport:error rate per qual by hmer. The error rate for each quality and hmer combination. Columns:
+ *
+ * - qual: The encoded quality
+ *
+ * - hmer: The hmer length
+ *
+ * - count: The number of times the quality was observed
+ *
+ * - error: The error rate of the flows with this qual
+ *
+ *
+ * qual_hmer_deviation_base_Report:error rate per qual by hmer and deviation. The count of errors for each qual, hmer, deviation and base
+ *
+ * - qual: The encoded quality
+ *
+ * - hmer: The hmer length
+ *
+ * - deviation: The deviation (difference in signal, relative to the reference)
+ *
+ * - base: The base
+ *
+ * - count: The number of times the deviation was observed
+ *
+ *
+ * Phred/qual statistics per flow position report. Various statistics for each flow position in relationship to the found quality value. Columns:
+ *
+ * - flow - flow position
+ * - count - count of observations
+ * - min - minimal observed quality
+ * - max - maximal observed quality
+ * - mean - mean observed value
+ * - median - median observed value
+ * - std - standard deviation
+ * - p1...Pn - percentile columns, according to the --quality-percentiles parameter
+ *
+ *
+ * Usage examples
+ *
+ * gatk GroundTruthScorer \
+ * -I input.bam \
+ * -R reference.fasta.gz \
+ * -L chr20 \
+ * --output-csv output.csv \
+ * --report-file report.txt \
+ * --omit-zeros-from-report \ (optional)
+ * --features-file dbsnp.chr9.vcf.gz \ (optional)
+ * --genome-prior genome_prior.csv (optional)
+ *
+ *
+ * {@GATK.walkertype ReadWalker}
+ */
+
+@CommandLineProgramProperties(
+ summary = "Ground Truth Scorer",
+ oneLineSummary = "Score reads against a reference/ground truth",
+ programGroup = FlowBasedProgramGroup.class
+)
+@DocumentedFeature
+@ExperimentalFeature
+public class GroundTruthScorer extends ReadWalker {
+ private static final Logger logger = LogManager.getLogger(GroundTruthScorer.class);
+
+ public static final String OUTPUT_CSV_LONG_NAME = "output-csv";
+ public static final String REPORT_FILE_LONG_NAME = "report-file";
+ public static final String USE_SOFTCLIPPED_BASES_LONG_NAME = "use-softclipped-bases";
+ public static final String GENOME_PRIOR_LONG_NAME = "genome-prior";
+ public static final String FEATURES_FILE_LONG_NAME = "features-file";
+ public static final String NORMALIZED_SCORE_THRESHOLD_LONG_NAME = "normalized-score-threshold";
+ public static final String ADD_MEAN_CALL_LONG_NAME = "add-mean-call";
+ public static final String GT_NO_OUTPUT_LONG_NAME = "gt-no-output";
+ public static final String OMIT_ZEROS_FROM_REPORT = "omit-zeros-from-report";
+ public static final String QUALITY_PERCENTILES = "quality-percentiles";
+ public static final String EXCLUDE_ZERO_FLOWS = "exclude-zero-flows";
+
+ private static final int QUAL_VALUE_MAX = 60;
+ private static final int HMER_VALUE_MAX = 100; //TODO: This should become a parameter
+ private static final int BASE_VALUE_MAX = FlowBasedRead.DEFAULT_FLOW_ORDER.length() - 1;
+
+ private static final double NORMALIZED_SCORE_THRESHOLD_DEFAULT = -0.1;
+
+ /*
+ Private accumulator class for counting false/true observations (hence Boolean).
+
+ Observations are counted at a top level and are also optionally classified into a set of bins (the
+ number of which is fixed upon construction). The bins themselves are also BooleanAccumulator objects,
+ resulting in a tree like multi-level accumulator.
+
+
+ GroundTruthScorer builds a four-level-deep accumulation tree, which can support observations of a
+ boolean event with 3-deep context (bin1,bin2,bin3).
+
+ Once accumulation is done, the instance is able to generate a suitable GATKReportTable for any given
+ bin depth (1, 2 or 3).
+
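+ Illustrative usage (values are hypothetical): with a tree created by
+ newReport(QUAL_VALUE_MAX + 1, HMER_VALUE_MAX + 1, deviationToBin(HMER_VALUE_MAX + 1), BASE_VALUE_MAX + 1),
+ a call such as report[qual].add(isError, hmer, deviationBin, baseIndex) records the observation
+ at the qual level and in the nested hmer, deviation and base bins in one pass.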
+ */
+ private static class BooleanAccumulator {
+ long falseCount;
+ long trueCount;
+ BooleanAccumulator[] bins;
+
+ // add an observation to this accumulator
+ void add(final boolean b) {
+ if (b) {
+ trueCount++;
+ } else {
+ falseCount++;
+ }
+ }
+
+ // add an observation to this accumulator and to one of the bins in the level under it (1 deep)
+ void add(final boolean b, final int bin) {
+ add(b);
+ if ( bins != null && bin >= 0 && bin < bins.length ) {
+ bins[bin].add(b);
+ } else {
+ logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped");
+ bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b);
+ }
+ }
+
+ // add an observation to this accumulator and to two levels of bins under it (2 deep)
+ void add(final boolean b, final int bin, final int bin2) {
+ add(b);
+ if ( bins != null && bin >= 0 && bin < bins.length ) {
+ bins[bin].add(b, bin2);
+ } else {
+ logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped");
+ bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2);
+ }
+ }
+
+ // add an observation to this accumulator and to three levels of bins under it (3 deep)
+ void add(final boolean b, final int bin, final int bin2, final int bin3) {
+ add(b);
+ if ( bins != null && bin >= 0 && bin < bins.length ) {
+ bins[bin].add(b, bin2, bin3);
+ } else {
+ logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped");
+ bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2, bin3);
+ }
+ }
+
+ // get observation count of this accumulator
+ long getCount() {
+ return falseCount + trueCount;
+ }
+
+ // get the false rate/ratio for this accumulator
+ double getFalseRate() {
+ return (getCount() == 0) ? 0.0 : ((double)falseCount / getCount());
+ }
+
+ // create a set of accumulators with 3-deep bin nesting
+ static BooleanAccumulator[] newReport(final int size, final int binCount, final int binCount2, final int binCount3) {
+ BooleanAccumulator[] report = new BooleanAccumulator[size];
+ for ( byte i = 0 ; i < report.length ; i++ ) {
+ report[i] = new BooleanAccumulator();
+ if ( binCount != 0 ) {
+ report[i].bins = new BooleanAccumulator[binCount];
+ for ( int j = 0 ; j < report[i].bins.length ; j++ ) {
+ report[i].bins[j] = new BooleanAccumulator();
+ if ( binCount2 != 0 ) {
+ report[i].bins[j].bins = new BooleanAccumulator[binCount2];
+ for ( int k = 0 ; k < report[i].bins[j].bins.length ; k++ ) {
+ report[i].bins[j].bins[k] = new BooleanAccumulator();
+ if (binCount3 != 0) {
+ report[i].bins[j].bins[k].bins = new BooleanAccumulator[binCount3];
+ for (int m = 0; m < report[i].bins[j].bins[k].bins.length; m++) {
+ report[i].bins[j].bins[k].bins[m] = new BooleanAccumulator();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return report;
+ }
+
+ // create a GATK report from a set of accumulators without nesting into their bins
+ static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name, final double probThreshold, final boolean omitZeros) {
+ final GATKReportTable table = new GATKReportTable(name + "Report", "error rate per " + name, 4);
+ table.addColumn(name, "%d");
+ table.addColumn("count", "%d");
+ table.addColumn("error", "%f");
+ table.addColumn("phred", "%d");
+ int rowIndex = 0;
+ for (int i = 0; i < report.length; i++) {
+ if ( omitZeros && i != 0 && report[i].getCount() == 0 )
+ continue;
+ else {
+ final double rate = report[i].getFalseRate();
+ final double phredRate = (rate == 0.0 && report[i].getCount() != 0 && probThreshold != 0.0) ? probThreshold : rate;
+
+ table.set(rowIndex, 0, i);
+ table.set(rowIndex, 1, report[i].getCount());
+ table.set(rowIndex, 2, rate);
+ table.set(rowIndex, 3, phredRate != 0 ? (int) Math.ceil(-10.0 * Math.log10(phredRate)) : 0);
+ rowIndex++;
+ }
+ }
+ return table;
+ }
+
+ // create a GATK report from a set of accumulators while nesting into one level of their bins (1 deep)
+ static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final boolean omitZeros) {
+ final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "Report", "error rate per " + name1 + " by " + name2, 4);
+ table.addColumn(name1, "%d");
+ table.addColumn(name2, "%d");
+ table.addColumn("count", "%d");
+ table.addColumn("error", "%f");
+ int rowIndex = 0;
+ for (int i = 0; i < report.length; i++) {
+ for ( int j = 0; j < report[i].bins.length ; j++ ) {
+ if ( omitZeros && (i != 0 || j != 0) && report[i].bins[j].getCount() == 0 )
+ continue;
+ else {
+ table.set(rowIndex, 0, i);
+ table.set(rowIndex, 1, j);
+ table.set(rowIndex, 2, report[i].bins[j].getCount());
+ table.set(rowIndex, 3, report[i].bins[j].getFalseRate());
+ rowIndex++;
+ }
+ }
+ }
+ return table;
+ }
+
+ // create a GATK report from a set of accumulators while nesting into two levels of their bins (2 deep)
+ static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final String name3, final String name4, final boolean omitZeros) {
+ final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "_" + name3 + "_" + name4 + "_Report", "error rate per " + name1 + " by " + name2 + " and " + name3, 5);
+ table.addColumn(name1, "%d");
+ table.addColumn(name2, "%d");
+ table.addColumn(name3, "%s");
+ table.addColumn(name4, "%s");
+ table.addColumn("count", "%d");
+ int rowIndex = 0;
+ for (int i = 0; i < report.length; i++) {
+ for ( int j = 0; j < report[i].bins.length ; j++ ) {
+ for ( int k = 0; k < report[i].bins[j].bins.length ; k++ ) {
+ for ( int m = 0; m < report[i].bins[j].bins[k].bins.length ; m++ ) {
+ if ( omitZeros && (i != 0 || j != 0 || k != 0 || m != 0) && report[i].bins[j].bins[k].bins[m].getCount() == 0 )
+ continue;
+ else {
+ table.set(rowIndex, 0, i);
+ table.set(rowIndex, 1, j);
+ table.set(rowIndex, 2, binToDeviation(k));
+ table.set(rowIndex, 3, String.format("%c", binToBase(m)));
+ table.set(rowIndex, 4, report[i].bins[j].bins[k].bins[m].getCount());
+ rowIndex++;
+ }
+ }
+ }
+ }
+ }
+ return table;
+ }
+ }
+
+ private static class PercentileReport extends SeriesStats {
+
+ static GATKReportTable newReportTable(final Vector<PercentileReport> report, String qualityPercentiles) {
+ String[] qp = qualityPercentiles.split(",");
+ final GATKReportTable table = new GATKReportTable("PhredBinAccumulator", "PhredBinAccumulator", 8 + qp.length);
+ table.addColumn("flow", "%d");
+ table.addColumn("count", "%d");
+ table.addColumn("min", "%f");
+ table.addColumn("max", "%f");
+ table.addColumn("mean", "%f");
+ table.addColumn("median", "%f");
+ table.addColumn("std", "%f");
+ for ( final String p : qp ) {
+ table.addColumn("p" + p, "%f");
+ }
+ int rowIndex = 0;
+ for ( final PercentileReport r : report ) {
+ int col = 0;
+ table.set(rowIndex, col++, rowIndex);
+ table.set(rowIndex, col++, r.getCount());
+ table.set(rowIndex, col++, r.getMin());
+ table.set(rowIndex, col++, r.getMax());
+ table.set(rowIndex, col++, r.getMean());
+ table.set(rowIndex, col++, r.getMedian());
+ table.set(rowIndex, col++, r.getStd());
+ for ( String p : qp ) {
+ table.set(rowIndex, col++, r.getPercentile(Double.parseDouble(p)));
+ }
+ rowIndex++;
+ }
+ return table;
+ }
+
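+ // Stores each error probability on the Phred scale (-10 * log10(p)), so the percentile
+ // columns in the per-flow report are Phred-scaled quality values.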
+ void addProb(double p) {
+ super.add(-10 * Math.log10(p));
+ }
+ }
+
+ @Argument(fullName = OUTPUT_CSV_LONG_NAME, doc="main CSV output file. supported file extensions: .csv, .csv.gz.")
+ public GATKPath outputCsvPath = null;
+
+ @Argument(fullName = REPORT_FILE_LONG_NAME, doc="report output file.", optional = true)
+ public GATKPath reportFilePath = null;
+
+ @ArgumentCollection
+ public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection();
+
+ @ArgumentCollection
+ public FlowBasedAlignmentArgumentCollection fbargs = new FlowBasedAlignmentArgumentCollection();
+
+ @Argument(fullName = USE_SOFTCLIPPED_BASES_LONG_NAME, doc="include soft-clipped bases when scoring; when unset, soft-clipped bases are hard-clipped away", optional = true)
+ public boolean useSoftclippedBases;
+
+ @Argument(fullName = GENOME_PRIOR_LONG_NAME, doc="CSV input file containing genome-prior (one line per base with hmer frequencies).", optional = true)
+ public GATKPath genomePriorPath;
+
+ @Argument(fullName = FEATURES_FILE_LONG_NAME, doc="A VCF file containing features to be used for filtering reads.", optional = true)
+ public FeatureDataSource<VariantContext> features;
+
+ @Argument(fullName = NORMALIZED_SCORE_THRESHOLD_LONG_NAME, doc="threshold for normalized score, below which reads are ignored", optional = true)
+ public double normalizedScoreThreshold = NORMALIZED_SCORE_THRESHOLD_DEFAULT;
+
+ @Argument(fullName = ADD_MEAN_CALL_LONG_NAME, doc="Add ReadMeanCall and ReadProbs columns to output", optional = true)
+ public boolean addMeanCalll;
+
+ @Argument(fullName = GT_NO_OUTPUT_LONG_NAME, doc = "do not generate output records", optional = true)
+ public boolean noOutput = false;
+
+ @Argument(fullName = OMIT_ZEROS_FROM_REPORT, doc = "omit zero values from output report", optional = true)
+ public boolean omitZerosFromReport = false;
+
+ @Argument(fullName = QUALITY_PERCENTILES, doc = "list of quality percentiles, defaults to 10,25,50,75,90", optional = true)
+ public String qualityPercentiles = "10,25,50,75,90";
+
+ @Argument(fullName = EXCLUDE_ZERO_FLOWS, doc = "should flows with a call of zero be included in the percentile report?", optional = true)
+ public boolean excludeZeroFlows = false;
+
+ // locals
+ private FlowBasedAlignmentLikelihoodEngine likelihoodCalculationEngine;
+ private PrintWriter outputCsv;
+ private DecimalFormat doubleFormat = new DecimalFormat("0.0#####");
+ private GenomePriorDB genomePriorDB;
+ private BooleanAccumulator[] qualReport;
+ private String[] csvFieldOrder;
+ private Vector<PercentileReport> percentileReports;
+
+ // static/const
+ static final private String[] CSV_FIELD_ORDER_BASIC = {
+ "ReadName", "ReadKey", "ReadIsReversed", "ReadMQ", "ReadRQ", "GroundTruthKey", "ReadSequence",
+ "Score", "NormalizedScore", "ErrorProbability",
+ "ReadKeyLength", "GroundTruthKeyLength", "CycleSkipStatus", "Cigar", "LowestQBaseTP"
+ };
+ static final private String[] CSV_FIELD_ORDER_MEAN_CALL = {
+ "ReadProbs", "ReadMeanCall"
+ };
+
+ @Override
+ public void onTraversalStart() {
+ super.onTraversalStart();
+
+ // establish csv fields
+ List<String> order = new LinkedList<>(Arrays.asList(CSV_FIELD_ORDER_BASIC));
+ if ( addMeanCalll ) {
+ order.addAll(Arrays.asList(CSV_FIELD_ORDER_MEAN_CALL));
+ }
+ csvFieldOrder = order.toArray(new String[0]);
+
+ // create likelihood engine
+ final ReadLikelihoodCalculationEngine engine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(likelihoodArgs, false);
+ if ( engine instanceof FlowBasedAlignmentLikelihoodEngine ) {
+ likelihoodCalculationEngine = (FlowBasedAlignmentLikelihoodEngine)engine;
+ } else {
+ throw new GATKException("must use a flow based likelihood calculation engine");
+ }
+
+ // open genome prior if provided
+ if ( genomePriorPath != null ) {
+ try {
+ genomePriorDB = new GenomePriorDB(genomePriorPath);
+ } catch (IOException e) {
+ throw new GATKException("failed to open genome-prior file: " + genomePriorPath);
+ }
+ }
+
+ // open output, write header
+ try {
+ if (outputCsvPath.toPath().toString().endsWith(".gz")) {
+ outputCsv = new PrintWriter(new GZIPOutputStream(outputCsvPath.getOutputStream()));
+ } else {
+ outputCsv = new PrintWriter(outputCsvPath.getOutputStream());
+ }
+ } catch (IOException e) {
+ throw new GATKException("failed to open csv output: " + outputCsvPath, e);
+ }
+ emitCsvHeaders();
+
+ // initialize reports
+ if ( reportFilePath != null ) {
+ qualReport = BooleanAccumulator.newReport(QUAL_VALUE_MAX + 1, HMER_VALUE_MAX + 1, deviationToBin(HMER_VALUE_MAX + 1), BASE_VALUE_MAX + 1);
+
+ // establish max hmer size of flow input
+ int maxClass = FlowBasedRead.MAX_CLASS;
+ final SAMFileHeader header = getHeaderForReads();
+ if ( header != null ) {
+ for ( SAMReadGroupRecord rg : header.getReadGroups() ) {
+ if ( rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG) != null ) {
+ maxClass = Math.max(maxClass, Integer.parseInt(rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG)));
+ }
+ }
+ }
+ percentileReports = new Vector<>();
+ }
+ }
+
+ @Override
+ public void closeTool() {
+
+ // close main output csv
+ if ( outputCsv != null ) {
+ outputCsv.close();
+ }
+
+ // write reports
+ if ( reportFilePath != null ) {
+ final GATKReport report = new GATKReport(
+ BooleanAccumulator.newReportTable(qualReport, "qual", fbargs.probabilityRatioThreshold, omitZerosFromReport),
+ BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport),
+ BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport),
+ PercentileReport.newReportTable(percentileReports, qualityPercentiles)
+ );
+ try ( final PrintStream ps = new PrintStream(reportFilePath.getOutputStream()) ) {
+ report.print(ps);
+ }
+ }
+
+ super.closeTool();
+ }
+
+ @Override
+ public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
+
+ // working with unmapped reads is really not practical, as we need the reference to work out ground truth
+ if ( read.isUnmapped() )
+ return;
+ if ( referenceContext.getWindow() == null )
+ return;
+
+ // handle read clipping
+ final GATKRead clippedRead;
+ if (isSoftClipped(read) ) {
+ if (useSoftclippedBases) {
+ referenceContext.setWindow(read.getStart() - read.getUnclippedStart(), read.getUnclippedEnd() - read.getEnd());
+ clippedRead = ReadClipper.revertSoftClippedBases(read);
+ } else {
+ clippedRead = ReadClipper.hardClipSoftClippedBases(read);
+ }
+ } else {
+ clippedRead = read;
+ }
+
+ // filter?
+ if ( (features != null) && !filter(clippedRead, referenceContext) ) {
+ return;
+ }
+
+ // create flow read/haplotype
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), clippedRead);
+ final FlowBasedRead flowRead = new FlowBasedRead(clippedRead, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+ final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true);
+ final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, rgInfo.flowOrder);
+
+ // is this really needed?
+ if ( !flowRead.isValid() ) {
+ return;
+ }
+
+ // compute score
+ final int hapKeyLength = flowHaplotype.getKeyLength();
+ final double score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false);
+ final double normalizedScore = score / flowRead.getKeyLength();
+ if ( normalizedScore < normalizedScoreThreshold )
+ return;
+
+ // compute error probability
+ final double[] errorProb = computeErrorProb(flowRead, genomePriorDB);
+
+ // cycle skip
+ final FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus = FlowBasedReadUtils.getCycleSkipStatus(flowRead, referenceContext);
+
+ // accumulate reports
+ if ( cycleSkipStatus != FlowBasedReadUtils.CycleSkipStatus.CS && qualReport != null ) {
+ addToQualReport(flowRead, referenceContext, errorProb);
+ }
+
+ // emit
+ try {
+ emit(flowRead, flowHaplotype, score, normalizedScore, errorProb, read, cycleSkipStatus);
+ } catch (IOException e) {
+ throw new GATKException("failed to write output record", e);
+ }
+ }
+
+ private boolean filter(final GATKRead read, final ReferenceContext referenceContext) {
+
+ // loop on features contained in the read, check that they are in agreement with read data
+ Iterator<VariantContext> iter = features.query(new SimpleInterval(read));
+ byte[] ref = referenceContext.getBases();
+ while ( iter.hasNext() ) {
+ final VariantContext vc = iter.next();
+ for ( int refCoord = vc.getStart() ; refCoord <= vc.getEnd() ; refCoord++ ) {
+
+ // get byte from read
+ Optional<Byte> readByte = ReadUtils.getReadBaseAtReferenceCoordinate(read, refCoord);
+ if ( !readByte.isPresent() ) {
+ return false;
+ }
+
+ // get byte from reference
+ byte refByte = ref[refCoord - referenceContext.getWindow().getStart()];
+
+ // compare
+ if ( refByte != readByte.get() ) {
+ return false;
+ }
+ }
+ }
+
+ // if here, no interference detected
+ return true;
+ }
+
+ /*
+ * compute error probability vector for a read
+ *
+ * The vector has one element per flow key, holding one minus the normalized probability of the called hmer,
+ * i.e. the probability that the call is wrong. When a genome-prior database is provided, the column is further
+ * scaled by the prior for each hmer length (per base). A small numeric sketch follows this file's diff.
+ */
+ private double[] computeErrorProb(final FlowBasedRead flowRead, final GenomePriorDB genomePriorDB) {
+
+ final int[] key = flowRead.getKey();
+ final byte[] flowOrder = flowRead.getFlowOrderArray();
+
+ final double[] probCol = new double[flowRead.getMaxHmer() + 1];
+ double[] result = new double[key.length];
+
+ for ( int i = 0 ; i < key.length ; i++ ) {
+
+ // step 1 - extract column & sum
+ double sum = 0;
+ for ( int j = 0 ; j < probCol.length ; j++ ) {
+ sum += (probCol[j] = flowRead.getProb(i, j));
+ }
+
+ // step 2 - normalize column
+ if ( sum != 0.0 ) {
+ for (int j = 0; j < probCol.length; j++) {
+ probCol[j] /= sum;
+ }
+ }
+
+ // step 3 - scale according to prior genome?
+ if ( genomePriorDB != null ) {
+
+ long[] prior = genomePriorDB.getPriorForBase(flowOrder[i]);
+ sum = 0;
+ if ( prior != null ) {
+ for (int j = 0; j < probCol.length; j++) {
+ sum += (probCol[j] *= prior[j]);
+ }
+ }
+
+ // assign normalized result
+ if ( sum != 0.0 ) {
+ result[i] = 1 - (probCol[Math.min(probCol.length - 1, Math.min(key[i], flowRead.getMaxHmer()))] / sum);
+ } else {
+ // revert to non-prior normalization
+ result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())];
+ }
+ } else {
+
+ // assign normalized result
+ result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())];
+ }
+
+ // accumulate error probabilities
+ if ( percentileReports != null ) {
+ if ( key[i] != 0 || !excludeZeroFlows ) {
+ while ( percentileReports.size() < (i + 1) ) {
+ percentileReports.add(new PercentileReport());
+ }
+ percentileReports.get(i).addProb(result[i]);
+ }
+ }
+ }
+
+ return result;
+ }
+
+ /*
+ * compute lowest-quality-base-tp-value vector for a read
+ *
+ * The vector has one element for each flow key, representing the value of the tp for the hmer base
+ * which has the lowest quality value
+ *
+ * example:
+ * Bases: TTTTT
+ * Qs: ABCBA
+ * Tp: -1 1 0 1 -1
+ * Output: -1
+ */
+ private byte[] computeLowestQBaseTP(final FlowBasedRead flowRead) {
+
+ final int[] key = flowRead.getKey();
+ byte[] result = new byte[key.length];
+ final byte[] tp = flowRead.getAttributeAsByteArray("tp");
+ final byte[] qual = flowRead.getBaseQualitiesNoCopy();
+
+ // loop on key
+ int seq_i = 0;
+ for ( int i = 0 ; i < key.length ; i++ ) {
+
+ // extract hmer length, zero is easy
+ int hmer = key[i];
+ if ( hmer == 0 ) {
+ result[i] = 0;
+ continue;
+ }
+
+ // scan qualities for the lowest value, start with first
+ // as qualities and tp are symmetric, we can scan up to the middle
+ // when finding the middle (offset) account for even/odd hmers
+ result[i] = tp[seq_i];
+ byte lowestQ = qual[seq_i];
+ int hmer_scan_length = (hmer + 1) / 2;
+ for ( int j = 1 ; j < hmer_scan_length ; j++ ) {
+ if ( qual[seq_i + j] < lowestQ ) {
+ result[i] = tp[seq_i + j];
+ lowestQ = qual[seq_i + j];
+ }
+ }
+
+ // advance
+ seq_i += hmer;
+ }
+
+ return result;
+ }
+
+ private void emitCsvHeaders() {
+
+ outputCsv.println(StringUtils.join(csvFieldOrder, ","));
+ }
+
+ private void emit(final FlowBasedRead flowRead, final FlowBasedHaplotype refHaplotype, double score, final double normalizedScore, final double[] errorProb,
+ GATKRead read,
+ FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus) throws IOException {
+
+ // build line columns
+ final Map<String, Object> cols = new LinkedHashMap<>();
+
+ // read info
+ cols.put("ReadName", flowRead.getName());
+ cols.put("ReadIsReversed", flowRead.isReverseStrand() ? 1 : 0);
+ cols.put("ReadMQ", flowRead.getMappingQuality());
+ cols.put("ReadRQ", flowRead.getAttributeAsFloat("rq"));
+ cols.put("CycleSkipStatus", cycleSkipStatus);
+ cols.put("Cigar", read.getCigar().toString());
+
+
+ // keys, seq, etc
+ cols.put("ReadKey", "\"" + StringUtils.join(flowRead.getKey(), ',') + "\"");
+ cols.put("GroundTruthKey", "\"" + StringUtils.join(refHaplotype.getKey(), ',') + "\"");
+ cols.put("ReadSequence", flowRead.getBasesString());
+ cols.put("ReadKeyLength", flowRead.getKeyLength());
+ cols.put("GroundTruthKeyLength", refHaplotype.getKeyLength());
+
+ // scores
+ cols.put("Score", score);
+ cols.put("NormalizedScore", normalizedScore);
+ cols.put("ErrorProbability", "\"" + StringUtils.join(
+ Arrays.stream(errorProb).mapToObj(v -> doubleFormat.format(v)).toArray(),
+ ',') + "\"");
+
+ // lowest q base tp
+ final byte[] lowestQBaseTP = computeLowestQBaseTP(flowRead);
+ cols.put("LowestQBaseTP", "\"" + StringUtils.join(lowestQBaseTP, ',') + "\"");
+
+ // add read probabilities
+ if ( addMeanCalll ) {
+ double[][] readProbsAndMeanCall = collectReadProbs(flowRead);
+ cols.put("ReadProbs", "\"" + StringUtils.join(
+ Arrays.stream(readProbsAndMeanCall[0]).mapToObj(v -> doubleFormat.format(v)).toArray(),
+ ',') + "\"");
+ cols.put("ReadMeanCall", "\"" + StringUtils.join(
+ Arrays.stream(readProbsAndMeanCall[1]).mapToObj(v -> doubleFormat.format(v)).toArray(),
+ ',') + "\"");
+ }
+
+ // construct line
+ StringBuilder sb = new StringBuilder();
+ int colIndex = 0;
+ for ( String field : csvFieldOrder ) {
+ if ( colIndex++ > 0 ) {
+ sb.append(',');
+ }
+ if ( !cols.containsKey(field) ) {
+ throw new GATKException("column missing from csv line: " + field);
+ }
+ sb.append(cols.get(field));
+ cols.remove(field);
+ }
+ if ( cols.size() > 0 ) {
+ throw new GATKException("invalid columns on csv line: " + cols.keySet());
+ }
+
+ // output line
+ if ( !noOutput ) {
+ outputCsv.println(sb);
+ }
+ }
+
+ private double[][] collectReadProbs(final FlowBasedRead read) {
+
+ final int keyLength = read.getKeyLength();
+ final int maxHmer = read.getMaxHmer();
+ final double[] probs = new double[keyLength * (maxHmer + 1)];
+ final double[] meanCall = new double[keyLength];
+
+ // retrieve probs
+ int pos = 0;
+ for ( int flow = 0 ; flow < keyLength ; flow++ ) {
+ double mc = 0;
+ int mc_sum = 0;
+ for ( int hmer = 0 ; hmer <= maxHmer ; hmer++ ) {
+ final double p = read.getProb(flow, hmer);
+ probs[pos++] = p;
+ mc += (p * hmer);
+ mc_sum += hmer;
+ }
+ meanCall[flow] = mc / mc_sum;
+ }
+
+ return new double[][] {probs, meanCall};
+ }
+
+ static private class GenomePriorDB {
+
+ final private Map<Byte, long[]> db = new LinkedHashMap<>();
+
+ GenomePriorDB(GATKPath path) throws IOException {
+
+ final CSVReader csvReader = new CSVReader(new InputStreamReader(path.getInputStream()));
+ String[] line;
+ while ( (line = csvReader.readNext()) != null ) {
+ long[] prior = new long[HMER_VALUE_MAX + 1];
+ Byte base = line[0].getBytes()[0];
+ for ( int i = 0 ; i < prior.length ; i++ ) {
+ if ( i == 0 ){
+ prior[i] = Long.parseLong(line[i+1]);
+ } else {
+ prior[i] = Long.parseLong(line[i]);
+ }
+ }
+ db.put(base, prior);
+ }
+ }
+
+ long[] getPriorForBase(byte base) {
+ return db.get(base);
+ }
+ }
+
+ private void addToQualReport(FlowBasedRead flowRead, ReferenceContext referenceContext, final double[] errorProb) {
+
+ // convert reference to key space
+ final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true);
+ final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, flowRead.getFlowOrder());
+
+ // access keys & flow order
+ final int[] readKey = flowRead.getKey();
+ final int[] hapKey = flowHaplotype.getKey();
+ final byte[] flowOrder = flowRead.getFlowOrder().getBytes();
+
+ // loop on key positions
+ if ( readKey.length != hapKey.length ) {
+ return;
+ }
+ for ( int flow = 0 ; flow < readKey.length ; flow++ ) {
+
+ // determine quality
+ final double prob = Precision.round(errorProb[flow], (QUAL_VALUE_MAX / 10) + 1);
+ final int qual = (int)Math.ceil(-10 * Math.log10(prob));
+
+ // determine if matches reference
+ final int deviation = readKey[flow] - hapKey[flow];
+ final boolean same = (deviation == 0);
+
+ // accumulate
+ if ( qual < qualReport.length ) {
+ int baseBin = baseToBin(flowOrder[flow % flowOrder.length], flowRead.isReverseStrand());
+ qualReport[qual].add(same, readKey[flow], deviationToBin(deviation), baseBin);
+ }
+ }
+ }
+
+ static private int baseToBin(byte base, boolean isReverseStrand) {
+ final byte trueBase = !isReverseStrand ? base : BaseUtils.simpleComplement(base);
+ return FlowBasedRead.DEFAULT_FLOW_ORDER.indexOf(trueBase);
+ }
+
+ static private byte binToBase(int bin) {
+ return (byte)FlowBasedRead.DEFAULT_FLOW_ORDER.charAt(bin);
+ }
+
+ // 0,-1,1,-2,2... -> 0,1,2,3,4...
+ static private int deviationToBin(final int deviation) {
+ if ( deviation >= 0 ) {
+ return deviation * 2;
+ } else {
+ return (-deviation * 2) - 1;
+ }
+ }
+
+ // 0,1,2,3,4... -> 0,-1,1,-2,2...
+ static private String binToDeviation(final int bin) {
+ if ( bin == 0 ) {
+ return "0";
+ } else if ( (bin % 2) == 0 ) {
+ return String.format("+%d", bin / 2);
+ } else {
+ return String.format("%d", -((bin+1) / 2));
+ }
+ }
+
+ static private boolean isSoftClipped( final GATKRead read ) {
+ if ( read.isUnmapped() )
+ return false;
+ if ( read.getCigar().getFirstCigarElement() == null )
+ return false;
+ final CigarOperator firstOperator = read.getCigar().getFirstCigarElement().getOperator();
+ final CigarOperator lastOperator = read.getCigar().getLastCigarElement().getOperator();
+ return (firstOperator == CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.SOFT_CLIP) ||
+ (firstOperator != CigarOperator.SOFT_CLIP && lastOperator == CigarOperator.SOFT_CLIP);
+ }
+}
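The error-probability and quality math above (computeErrorProb feeding addToQualReport) reduces to a short calculation once a flow's probability column is in hand. Below is a minimal standalone sketch with made-up numbers, skipping the optional genome-prior scaling; the class name and values are illustrative, not part of this change.

// One flow position, no genome-prior scaling; class name and probabilities are invented.
public final class FlowErrorProbSketch {
    public static void main(String[] args) {
        final double[] probCol = {0.05, 0.80, 0.10, 0.05}; // index = hmer length
        final int calledHmer = 1;                          // key[i] for this flow

        // normalize the column, then take the complement of the (normalized) call probability
        double sum = 0;
        for (double p : probCol) {
            sum += p;
        }
        final double errorProb = 1 - (probCol[calledHmer] / sum);

        // Phred-scaled quality, as in addToQualReport
        final int qual = (int) Math.ceil(-10 * Math.log10(errorProb));

        System.out.printf("errorProb=%.3f qual=%d%n", errorProb, qual); // errorProb=0.200 qual=7
    }
}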
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
new file mode 100644
index 00000000000..151e0ae4867
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
@@ -0,0 +1,112 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import org.apache.commons.collections.map.LazySortedMap;
+
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class SeriesStats {
+
+ // local state
+ private double last = Double.NaN;
+ private int count = 0;
+ private double sum = 0;
+ private double min = Double.NaN;
+ private double max = Double.NaN;
+ private SortedMap<Double, AtomicInteger> bins = new TreeMap<>();
+
+ void add(double v) {
+
+ // save in simple values
+ last = v;
+ sum += v;
+ if ( count > 0 ) {
+ min = Math.min(min, v);
+ max = Math.max(max, v);
+ } else {
+ min = max = v;
+ }
+ count++;
+
+ // save in bins
+ if ( bins.containsKey(v) ) {
+ bins.get(v).incrementAndGet();
+ } else {
+ bins.put(v, new AtomicInteger(1));
+ }
+ }
+
+ public double getLast() {
+ return last;
+ }
+
+ public int getCount() {
+ return count;
+ }
+
+ public double getMin() {
+ return (count != 0) ? min : Double.NaN;
+ }
+
+ public double getMax() {
+ return (count != 0) ? max : Double.NaN;
+ }
+
+ public int getUniq() {
+ return bins.size();
+ }
+
+ public double getMean() {
+ return (count != 0) ? (sum / count) : Double.NaN;
+ }
+
+ public double getMedian() {
+ return getPercentile(50);
+ }
+
+ public double getPercentile(double percentile) {
+ if ( count == 0 ) {
+ return Double.NaN;
+ } else if ( count == 1 ) {
+ return last;
+ } else {
+
+ int percentileIndex = (int)(count * percentile / 100);
+ int index = 0;
+ for (Map.Entry<Double, AtomicInteger> entry : bins.entrySet() ) {
+ int binSize = entry.getValue().get();
+ if ( percentileIndex >= index && (percentileIndex < (index + binSize)) ) {
+ return entry.getKey();
+ }
+ index += binSize;
+ }
+
+ // if here, we need the highest entry
+ return bins.lastKey();
+ }
+ }
+
+ public double getStd() {
+
+ if (count == 0) {
+ return Double.NaN;
+ }
+
+ // calculate mean
+ double mean = getMean();
+
+ // calculate sum of sqr(deviations)
+ double vsum = 0;
+ for (Map.Entry<Double, AtomicInteger> entry : bins.entrySet()) {
+ int binSize = entry.getValue().get();
+ vsum += (Math.pow(entry.getKey() - mean, 2) * binSize);
+ }
+
+ // calculate variance and std
+ double variance = vsum / count;
+ return Math.sqrt(variance);
+ }
+
+}
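A quick usage sketch of SeriesStats with made-up values, placed in the same package since add() is package-private; the commented results show how the sorted bin map drives the percentile and standard-deviation calculations. The demo class is illustrative only.

package org.broadinstitute.hellbender.tools.walkers.groundtruth;

// Illustrative only; this class and its values are not part of this change.
public class SeriesStatsDemo {
    public static void main(String[] args) {
        final SeriesStats stats = new SeriesStats();
        for (double v : new double[] {1, 2, 2, 3, 10}) {
            stats.add(v);
        }
        System.out.println(stats.getCount());        // 5
        System.out.println(stats.getMean());         // 3.6
        System.out.println(stats.getMedian());       // 2.0 (50th percentile, walking the sorted bins)
        System.out.println(stats.getPercentile(90)); // 10.0
        System.out.println(stats.getStd());          // population standard deviation, ~3.26
    }
}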
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
index 17728fa6035..ebc0f61d3f6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
@@ -340,7 +340,7 @@ private AlleleLikelihoods getAlleleLikelihoodMatrix(final Alle
.forEach(alleleHaplotypeMap.get(notAllele)::add);
final AlleleLikelihoods alleleLikelihoods = readLikelihoods.marginalize(alleleHaplotypeMap);
- logger.debug(() -> String.format("GALM: %s %d %d", allele.toString(), alleleHaplotypeMap.get(allele).size(), alleleHaplotypeMap.get(notAllele).size()));
+ logger.debug(() -> String.format("GALM: %s %d %d", allele, alleleHaplotypeMap.get(allele.altAllele()).size(), alleleHaplotypeMap.get(notAllele).size()));
return alleleLikelihoods;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
index f504d400b13..557e7f0add9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
@@ -127,6 +127,21 @@
* argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause
* performance challenges including excessive slowness. We are working on resolving these limitations.
*
+ * To use variable ploidy in different regions, such as making haploid calls outside the PAR on chrX or chrY,
+ * see the --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions
+ * should be a .bed or .interval_list whose "name" column contains the desired ploidy for that region when
+ * genotyping (see the sketch after this file's diff). Note that variants near a boundary may not get the matching
+ * ploidy, since the ploidy used is determined with the following precedence:
+ *
+ * - ploidy given in --ploidy-regions for all intervals overlapping the active region when calling your variant
+ * (with ties broken by using the largest ploidy); note that a ploidy interval need only overlap the active region
+ * to determine the ploidy of your variant, even if the end coordinate written for your variant lies outside the given region;
+ * - ploidy given via the global -ploidy flag;
+ * - ploidy determined by the default global built-in constant for humans (2).
+ *
+ *
+ * Coordinates for the PARs on GRCh38 can be found here.
+ *
* Additional Notes
*
* - When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).
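As a concrete illustration of the --ploidy-regions behavior described above, a hypothetical BED file and invocation might look like the following. The file names are placeholders and the chrX coordinates are only approximate; adjust them to the actual GRCh38 PAR boundaries before use.

    # ploidy_regions.bed -- the 4th ("name") column carries the ploidy for the region
    chrX    2781479    155701382    1

    gatk HaplotypeCaller \
        -R reference.fasta \
        -I XY_sample.bam \
        -ploidy 2 \
        --ploidy-regions ploidy_regions.bed \
        -ERC GVCF \
        -O XY_sample.g.vcf.gz

With this setup, active regions overlapping the listed interval are genotyped as haploid, while everything else (including the PARs) falls back to the -ploidy 2 default.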
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
index 7ab58d73264..f818659bb5e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
+import htsjdk.tribble.NamedFeature;
import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.arrow.util.VisibleForTesting;
import org.apache.commons.lang3.ArrayUtils;
+import org.apache.commons.lang3.SerializationUtils;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
@@ -9,6 +12,7 @@
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
+import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
@@ -23,9 +27,11 @@
/**
* Set of arguments for the {@link HaplotypeCallerEngine}
*/
-public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
+public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;
+ public static final String PLOIDY_REGIONS_NAME = "ploidy-regions";
+
public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String DO_NOT_CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "do-not-correct-overlapping-quality";
@@ -61,6 +67,9 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
return new HaplotypeCallerReadThreadingAssemblerArgumentCollection();
}
+ @Argument(fullName = PLOIDY_REGIONS_NAME, shortName = PLOIDY_REGIONS_NAME, doc = "Interval file with column specifying desired ploidy for genotyping models. Overrides default ploidy and user-provided --ploidy argument in specific regions.", optional = true)
+ public FeatureInput<NamedFeature> ploidyRegions = null;
+
/**
* You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This
* is especially useful if your samples are all in the same file but you need to run them individually through HC
@@ -312,6 +321,12 @@ boolean isFlowBasedCallingMode() {
return flowMode != FlowMode.NONE;
}
+ // Copy method used to create a new hcArgs with the same fields except for the sample ploidy
+ public HaplotypeCallerArgumentCollection copyWithNewPloidy(int ploidy) {
+ HaplotypeCallerArgumentCollection newArgsWithNewPloidy = SerializationUtils.clone(this);
+ newArgsWithNewPloidy.standardArgs.genotypeArgs.samplePloidy = ploidy;
+ return newArgsWithNewPloidy;
+ }
/**
* the different flow modes, in terms of their parameters and their values
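The copyWithNewPloidy helper above leans on SerializationUtils.clone to deep-copy a Serializable argument collection before overriding a single field. Here is a minimal sketch of the same pattern on a stand-in class; all names below are invented for illustration.

import org.apache.commons.lang3.SerializationUtils;

import java.io.Serializable;

public class CloneWithNewPloidySketch {

    // Stand-in for an argument collection; only needs to be Serializable.
    static class ToyArgs implements Serializable {
        private static final long serialVersionUID = 1L;
        int samplePloidy = 2;
        String sampleName = "sample1";
    }

    static ToyArgs copyWithNewPloidy(final ToyArgs original, final int ploidy) {
        final ToyArgs copy = SerializationUtils.clone(original); // deep copy via serialization
        copy.samplePloidy = ploidy;                              // only the ploidy changes
        return copy;
    }

    public static void main(String[] args) {
        final ToyArgs diploid = new ToyArgs();
        final ToyArgs haploid = copyWithNewPloidy(diploid, 1);
        System.out.println(diploid.samplePloidy + " " + haploid.samplePloidy); // prints "2 1"
    }
}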
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
index 71f6d64ede7..9e09eeba8a9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
@@ -3,7 +3,10 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.RuntimeIOException;
+import htsjdk.tribble.NamedFeature;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
@@ -20,12 +23,15 @@
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.MinimalGenotypingEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.OutputMode;
import org.broadinstitute.hellbender.tools.walkers.genotyper.StandardCallerArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.haplotype.Event;
import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles;
@@ -83,8 +89,31 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
protected final OutputStreamWriter assemblyDebugOutStream;
- // the genotyping engine for the isActive() determination
- private MinimalGenotypingEngine activeRegionEvaluationGenotyperEngine = null;
+ /**
+ * List of interval file entries including regions and custom ploidy values to apply in that region.
+ */
+ protected final List<SimpleCount> ploidyRegions = new ArrayList<>();
+
+ /**
+ * An OverlapDetector object for checking whether region overlaps given ploidyRegions.
+ */
+ protected final OverlapDetector<SimpleCount> ploidyRegionsOverlapDetector;
+
+ /**
+ * List of all custom ploidies provided by user
+ */
+ private final LinkedHashSet<Integer> allCustomPloidies;
+
+ /**
+ * The default genotyping engine for the isActive() determination
+ */
+ private MinimalGenotypingEngine defaultActiveRegionEvaluationGenotyperEngine = null;
+
+ /**
+ * Map of user-provided ploidy values to the corresponding active-region genotyper. Values are checked as valid
+ * Integers during initialization and are used directly as keys, so no re-parsing is needed at runtime.
+ */
+ private final HashMap<Integer, MinimalGenotypingEngine> ploidyToActiveEvaluationGenotyper = new HashMap<>();
protected ReadThreadingAssembler assemblyEngine = null;
@@ -93,7 +122,16 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
// If we are in PDHMM mode we need to hold onto two likelihoods engines for the fallback
private ReadLikelihoodCalculationEngine pdhmmLikelihoodCalculationEngine = null;
- protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
+ /**
+ * The default genotyping engine to use for actual variant calling and genotyping in an active region.
+ */
+ protected HaplotypeCallerGenotypingEngine defaultGenotypingEngine = null;
+
+ /**
+ * Map of user-provided ploidy values to the corresponding genotyper. Values are checked as valid Integers during
+ * initialization and are used directly as keys, so no re-parsing is needed at runtime.
+ */
+ protected final HashMap<Integer, HaplotypeCallerGenotypingEngine> ploidyToGenotyperMap = new HashMap<>();
private VariantAnnotatorEngine annotationEngine = null;
@@ -163,7 +201,7 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
/**
* Create and initialize a new HaplotypeCallerEngine given a collection of HaplotypeCaller arguments, a reads header,
* and a reference file
- * @param hcArgs command-line arguments for the HaplotypeCaller
+ * @param hcArgs command-line arguments for the HaplotypeCaller
* @param assemblyRegionArgs
* @param createBamOutIndex true to create an index file for the bamout
* @param createBamOutMD5 true to create an md5 file for the bamout
@@ -196,6 +234,21 @@ public HaplotypeCallerEngine(final HaplotypeCallerArgumentCollection hcArgs, Ass
HaplotypeCallerGenotypingDebugger.initialize(hcArgs.genotyperDebugOutStream);
}
+ // Parse the user provided custom ploidy regions into ploidyRegions object containing SimpleCounts
+ if (this.hcArgs.ploidyRegions != null) {
+ FeatureDataSource<NamedFeature> ploidyDataSource = new FeatureDataSource<>(this.hcArgs.ploidyRegions, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, NamedFeature.class);
+ ploidyDataSource.forEach(r -> this.ploidyRegions.add(new SimpleCount(r)));
+ }
+
+ for (SimpleCount region : this.ploidyRegions) {
+ if (!IntervalUtils.intervalIsOnDictionaryContig(region.getInterval(), readsHeader.getSequenceDictionary())) {
+ throw new UserException("Invalid region provided for --ploidy-regions at " + region.getContig() + ":" + region.getStart() + "-" + region.getEnd() + ". Contig name or endpoint doesn't match read sequence dictionary.");
+ }
+ }
+
+ this.ploidyRegionsOverlapDetector = OverlapDetector.create(this.ploidyRegions);
+ this.allCustomPloidies = this.ploidyRegions.stream().map(SimpleCount::getCount).collect(Collectors.toCollection(LinkedHashSet::new));
+
trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, readsHeader.getSequenceDictionary());
initialize(createBamOutIndex, createBamOutMD5);
}
@@ -242,8 +295,16 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5
initializeActiveRegionEvaluationGenotyperEngine();
- genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD);
- genotypingEngine.setAnnotationEngine(annotationEngine);
+ defaultGenotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD);
+ defaultGenotypingEngine.setAnnotationEngine(annotationEngine);
+
+ // Create other custom genotyping engines if user provided ploidyRegions
+ for (final int ploidy : this.allCustomPloidies) {
+ HaplotypeCallerArgumentCollection newPloidyHcArgs = hcArgs.copyWithNewPloidy(ploidy);
+ HaplotypeCallerGenotypingEngine newGenotypingEngine = new HaplotypeCallerGenotypingEngine(newPloidyHcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD);
+ newGenotypingEngine.setAnnotationEngine(annotationEngine);
+ this.ploidyToGenotyperMap.put(ploidy, newGenotypingEngine);
+ }
boolean isFlowBased = (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBased)
|| (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBasedHMM);
@@ -381,8 +442,18 @@ private void initializeActiveRegionEvaluationGenotyperEngine() {
// Seems that at least with some test data we can lose genuine haploid variation if we use ploidy == 1
activeRegionArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, hcArgs.standardArgs.genotypeArgs.samplePloidy);
- activeRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList);
- activeRegionEvaluationGenotyperEngine.setLogger(logger);
+ defaultActiveRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList);
+ defaultActiveRegionEvaluationGenotyperEngine.setLogger(logger);
+
+ // If custom ploidyRegions provided, create corresponding map for active region determination genotyper
+ for (final int ploidy : this.allCustomPloidies) {
+ StandardCallerArgumentCollection newPloidyActiveArgs = new StandardCallerArgumentCollection();
+ newPloidyActiveArgs.copyStandardCallerArgsFrom(activeRegionArgs);
+ newPloidyActiveArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, ploidy);
+ MinimalGenotypingEngine newActiveGenotypingEngine = new MinimalGenotypingEngine(newPloidyActiveArgs, samplesList);
+ newActiveGenotypingEngine.setLogger(logger);
+ this.ploidyToActiveEvaluationGenotyper.put(ploidy, newActiveGenotypingEngine);
+ }
}
/**
@@ -455,7 +526,7 @@ public VCFHeader makeVCFHeader( final SAMSequenceDictionary sequenceDictionary,
final Set headerInfo = new HashSet<>();
headerInfo.addAll(defaultToolHeaderLines);
- headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
+ headerInfo.addAll(defaultGenotypingEngine.getAppropriateVCFInfoHeaders());
// all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions(emitReferenceConfidence()));
// all callers need to add these standard annotation header lines
@@ -523,6 +594,57 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence
vcfWriter.writeHeader(makeVCFHeader(sequenceDictionary, defaultToolHeaderLines));
}
+ /**
+ * Determines the appropriate ploidy to use at a given site for the different genotyping engines
+ * (the selection logic is sketched after this file's diff).
+ * @param region Current region of interest
+ * @return Ploidy value to use here given user inputs, or -1 to fall back to the default
+ */
+ private int getPloidyToUseAtThisSite(Locatable region) {
+ Set<SimpleCount> overlaps = this.ploidyRegionsOverlapDetector.getOverlaps(region);
+ // Use the largest ploidy among the intervals overlapping this region
+ if (!overlaps.isEmpty()) {
+ Iterator<SimpleCount> intervals = overlaps.iterator();
+ int max = intervals.next().getCount();
+ while (intervals.hasNext()) {
+ int next = intervals.next().getCount();
+ if (next > max) {
+ max = next;
+ }
+ }
+ return max;
+ } else {
+ return -1; // Sentinel value to fall back to default genotyper
+ }
+ }
+
+ /**
+ * Selects appropriate active region genotyping engine for given region
+ * @param region Current region of interest
+ * @return Active genotyping engine with correct ploidy setting for given region
+ */
+ private MinimalGenotypingEngine getLocalActiveGenotyper(Locatable region) {
+ int currentPloidy = getPloidyToUseAtThisSite(region);
+ if (currentPloidy == -1) {
+ return this.defaultActiveRegionEvaluationGenotyperEngine;
+ } else {
+ return this.ploidyToActiveEvaluationGenotyper.get(currentPloidy);
+ }
+ }
+
+ /**
+ * Selects appropriate genotyping engine for given region.
+ * @param region Current region of interest, e.g. AssemblyRegion
+ * @return Genotyping engine with correct ploidy setting for given region
+ */
+ protected HaplotypeCallerGenotypingEngine getLocalGenotypingEngine(Locatable region) {
+ int currentPloidy = getPloidyToUseAtThisSite(region);
+ if (currentPloidy == -1) {
+ return this.defaultGenotypingEngine;
+ } else {
+ return this.ploidyToGenotyperMap.get(currentPloidy);
+ }
+ }
+
/**
* Given a pileup, returns an ActivityProfileState containing the probability (0.0 to 1.0) that it's an "active" site.
*
@@ -537,6 +659,8 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence
*/
@Override
public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext features) {
+ MinimalGenotypingEngine localActiveGenotypingEngine = getLocalActiveGenotyper(ref);
+
if (forceCallingAllelesPresent && features.getValues(hcArgs.alleles, ref).stream().anyMatch(vc -> hcArgs.forceCallFiltered || vc.isNotFiltered())) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
@@ -546,7 +670,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
return new ActivityProfileState(ref.getInterval(), 0.0);
}
- final int ploidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
+ final int ploidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy;
final List noCall = GATKVariantContextUtils.noCallAlleles(ploidy); // used to noCall all genotypes until the exact model is applied
final Map splitContexts;
@@ -565,7 +689,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
sample.getValue().getBasePileup().forEach(p -> PileupBasedAlleles.addMismatchPercentageToRead(p.getRead(), readsHeader, ref));
}
// The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
- final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
+ final int activeRegionDetectionHackishSamplePloidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy;
final double[] genotypeLikelihoods = ((RefVsAnyResult) referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(
activeRegionDetectionHackishSamplePloidy,
sample.getValue().getBasePileup(), ref.getBase(),
@@ -579,9 +703,9 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
if (genotypes.size() == 1) {
// Faster implementation using exact marginalization instead of iteration
- isActiveProb = activeRegionEvaluationGenotyperEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector());
+ isActiveProb = localActiveGenotypingEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector());
} else {
- final VariantContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make());
+ final VariantContext vcOut = localActiveGenotypingEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make());
isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb(vcOut.getPhredScaledQual());
}
@@ -607,6 +731,8 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
* @return List of variants discovered in the region (may be empty)
*/
public List callRegion(final AssemblyRegion region, final FeatureContext features, final ReferenceContext referenceContext) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region);
+
if ( hcArgs.justDetermineActiveRegions ) {
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return NO_CALLS;
@@ -799,7 +925,7 @@ public List callRegion(final AssemblyRegion region, final Featur
if (hcArgs.filterAlleles) {
logger.debug("Filtering alleles");
- AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine);
+ AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine);
//need to update haplotypes to find the alleles
EventMap.buildEventMapsForHaplotypes(readLikelihoods.alleles(),
assemblyResult.getFullReferenceWithPadding(),
@@ -849,7 +975,7 @@ public List callRegion(final AssemblyRegion region, final Featur
}
}
- final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods(
+ final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods(
haplotypes,
subsettedReadLikelihoodsFinal,
perSampleFilteredReadList,
@@ -890,7 +1016,7 @@ public List callRegion(final AssemblyRegion region, final Featur
result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
- subsettedReadLikelihoodsFinal, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
+ subsettedReadLikelihoodsFinal, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
VCpriors));
trimmingResult.nonVariantRightFlankRegion().ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, VCpriors)));
@@ -948,6 +1074,7 @@ protected boolean containsCalls(final CalledHaplotypes calledHaplotypes) {
* @return a list of variant contexts (can be empty) to emit for this ref region
*/
protected List referenceModelForNoVariation(final AssemblyRegion region, final boolean needsToBeFinalized, final List VCpriors) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region);
if ( emitReferenceConfidence() ) {
if ( needsToBeFinalized ) {
AssemblyBasedCallerUtils.finalizeRegion(region,
@@ -967,7 +1094,7 @@ protected List referenceModelForNoVariation(final AssemblyRegion
final List haplotypes = Collections.singletonList(refHaplotype);
return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
paddedLoc, region, AssemblyBasedCallerUtils.createDummyStratifiedReadMap(refHaplotype, samplesList, readsHeader, region),
- genotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors);
+ localGenotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors);
}
else {
return NO_CALLS;
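The per-region engine selection added above amounts to "largest ploidy among overlapping intervals, otherwise the default". The following self-contained sketch shows that lookup using htsjdk's OverlapDetector; the intervals, ploidies, and class names are invented for illustration.

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.OverlapDetector;

import java.util.List;
import java.util.Set;

public class PloidySelectionSketch {

    // Stand-in for the SimpleCount entries built from --ploidy-regions.
    record PloidyRegion(String contig, int start, int end, int ploidy) implements Locatable {
        @Override public String getContig() { return contig; }
        @Override public int getStart() { return start; }
        @Override public int getEnd() { return end; }
    }

    public static void main(String[] args) {
        final int defaultPloidy = 2;
        final OverlapDetector<PloidyRegion> detector = OverlapDetector.create(List.of(
                new PloidyRegion("chrX", 1, 10_000, 2),
                new PloidyRegion("chrX", 10_001, 2_000_000, 1)));

        final Locatable activeRegion = new Interval("chrX", 9_500, 10_500); // spans both entries

        final Set<PloidyRegion> overlaps = detector.getOverlaps(activeRegion);
        final int ploidy = overlaps.isEmpty()
                ? defaultPloidy
                : overlaps.stream().mapToInt(PloidyRegion::ploidy).max().getAsInt();

        System.out.println(ploidy); // 2 -- the largest ploidy among the overlapping intervals wins
    }
}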
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
index b5b5c2cfa88..e47245c9f91 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
@@ -460,6 +460,7 @@ private void uncollapse(final CallRegionContext context) {
}
private void filter(final CallRegionContext context) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region);
try {
// no need for this step?
@@ -479,7 +480,7 @@ private void filter(final CallRegionContext context) {
context.suspiciousLocations = new HashSet<>();
if (hcArgs.filterAlleles) {
logger.debug("Filtering alleles");
- AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine);
+ AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine);
//need to update haplotypes to find the alleles
EventMap.buildEventMapsForHaplotypes(context.readLikelihoods.alleles(),
context.assemblyResult.getFullReferenceWithPadding(),
@@ -520,6 +521,7 @@ private void filter(final CallRegionContext context) {
}
private void genotype(final CallRegionContext context) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region);
// no need for this step?
if ( context.regionVariants != null ) {
@@ -542,7 +544,7 @@ private void genotype(final CallRegionContext context) {
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
List haplotypes = context.readLikelihoods.alleles();
- final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods(
+ final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods(
haplotypes,
context.readLikelihoods,
perSampleFilteredReadList,
@@ -582,7 +584,7 @@ private void genotype(final CallRegionContext context) {
result.addAll(referenceConfidenceModel.calculateRefConfidence(context.assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), context.assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
- context.readLikelihoods, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
+ context.readLikelihoods, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
context.VCpriors));
context.nonVariantRightFlankRegion.ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, context.VCpriors)));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java
index 095e2dab8ea..dd02b8e8d4b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java
@@ -2,7 +2,7 @@
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
similarity index 53%
rename from src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java
rename to src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
index c870d9693ee..6a4d3f5c06a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
@@ -1,26 +1,22 @@
-package org.broadinstitute.hellbender.tools.walkers.annotator;
+package org.broadinstitute.hellbender.tools.walkers.mutect;
import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
-import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
-import org.broadinstitute.hellbender.engine.FeatureContext;
-import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality;
+import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
-import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
-import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
import java.util.stream.Collectors;
@@ -31,69 +27,23 @@
* [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is
* 1,2,3,4|5,6,7,8
*/
-@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
- summary="Featurized read sets for Mutect3 training data")
-public class FeaturizedReadSets implements JumboGenotypeAnnotation {
- public static final int DEFAULT_BASE_QUALITY = 25;
-
- private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE;
+public class FeaturizedReadSets {
+ private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class);
- public static final int FEATURES_PER_READ = 11;
+ public static final int DEFAULT_BASE_QUALITY = 25;
private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
- // downsample ref reads to this count if needed
- private final int maxRefCount;
-
- public FeaturizedReadSets(final int maxRefCount) {
- this.maxRefCount = maxRefCount;
- }
-
- public FeaturizedReadSets() {
- this(DEFAULT_MAX_REF_COUNT);
- }
-
- @Override
- public void annotate(final ReferenceContext ref,
- final FeatureContext features,
- final VariantContext vc,
- final Genotype g,
- final GenotypeBuilder gb,
- final AlleleLikelihoods likelihoods,
- final AlleleLikelihoods fragmentLikelihoods,
- final AlleleLikelihoods haplotypeLikelihoods) {
- Utils.nonNull(vc);
- Utils.nonNull(g);
- Utils.nonNull(gb);
-
- if ( likelihoods == null) {
- return;
- }
-
- final List>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()),
- likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE);
-
- // flatten twice: over all reads supporting each allele and over all alleles
- // we can partition by allele with the countsInAlleleOrder annotation
- // and by read using the constant feature vector size
- final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream()
- .flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream))
- .mapToInt(n -> n)
- .toArray();
-
- final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();
-
- gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder);
- gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder);
- }
+ private FeaturizedReadSets() { }
public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final Collection samples,
final AlleleLikelihoods likelihoods,
final AlleleLikelihoods haplotypeLikelihoods,
final int refDownsample,
- final int altDownsample) {
- return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap());
+ final int altDownsample,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
+ return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode);
}
// returns Lists (in allele order) of lists of read vectors supporting each allele
@@ -103,7 +53,8 @@ public static List>> getReadVectors(final VariantContext vc,
final AlleleLikelihoods haplotypeLikelihoods,
final int refDownsample,
final int altDownsample,
- final Map altDownsampleMap) {
+ final Map altDownsampleMap,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));
@@ -126,17 +77,14 @@ public static List>> getReadVectors(final VariantContext vc,
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));
return vc.getAlleles().stream()
- .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList()))
+ .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList()))
.collect(Collectors.toList());
}
- @Override
- public List getKeyNames() {
- return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY);
- }
-
- private static List featurize(final GATKRead read, final VariantContext vc, final Map bestHaplotypes) {
+ private static List<Integer> featurize(final GATKRead read, final VariantContext vc,
+ final Map<GATKRead, Haplotype> bestHaplotypes,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final List<Integer> result = new ArrayList<>();
result.add(read.getMappingQuality());
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
@@ -151,23 +99,41 @@ private static List featurize(final GATKRead read, final VariantContext
result.add(Math.abs(read.getFragmentLength()));
- // distances from ends of fragment
- final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
- final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
- result.add(vc.getStart() - fragmentStart);
- result.add(fragmentEnd - vc.getEnd());
+ if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) {
+ // distances from ends of fragment
+ final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
+ final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
+ result.add(vc.getStart() - fragmentStart);
+ result.add(fragmentEnd - vc.getEnd());
+ }
+
+ // Ultima-specific read tags
+ if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) {
+ result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s
+ result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float between 0 and 1, so we multiply by 1000 and round
+ }
// mismatches versus best haplotype
final Haplotype haplotype = bestHaplotypes.get(read);
- final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
- final GATKRead copy = read.copy();
- copy.setCigar(readToHaplotypeAlignment.getCigar());
- final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
- result.add(mismatchCount);
-
- final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
- result.add((int) indelsVsBestHaplotype);
- Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features");
+
+ // TODO: fix this
+ // I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina
+ if (!bestHaplotypes.containsKey(read)) {
+ logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(),
+ vc.getContig(), vc.getStart()));
+ result.add(3);
+ result.add(2);
+ } else {
+ final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
+ final GATKRead copy = read.copy();
+ copy.setCigar(readToHaplotypeAlignment.getCigar());
+ final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
+ result.add(mismatchCount);
+
+ final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
+ result.add((int) indelsVsBestHaplotype);
+ }
+ Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");
return result;
}
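The flattened-tensor layout described in the class comment at the top of this file (per-allele lists of per-read feature vectors, joined in allele order alongside per-allele counts) can be sketched independently of the GATK types. The numbers below are the toy vectors from that comment; the class name is invented.

import java.util.Arrays;
import java.util.List;

public class ReadSetFlattenSketch {
    public static void main(String[] args) {
        // two alleles, two featurized reads each (toy two-feature vectors from the class comment)
        final List<List<List<Integer>>> readVectorsByAllele = List.of(
                List.of(List.of(1, 2), List.of(3, 4)),   // allele 1
                List.of(List.of(5, 6), List.of(7, 8)));  // allele 2

        // flatten over reads and then over alleles, preserving allele order
        final int[] flattened = readVectorsByAllele.stream()
                .flatMap(reads -> reads.stream().flatMap(List::stream))
                .mapToInt(Integer::intValue)
                .toArray();                                                               // [1,2,3,4,5,6,7,8]

        // per-allele read counts let a consumer re-partition the flat tensor
        final int[] counts = readVectorsByAllele.stream().mapToInt(List::size).toArray(); // [2,2]

        System.out.println(Arrays.toString(flattened) + " | " + Arrays.toString(counts));
    }
}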
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
index d25c7a33fc5..0087441ae1a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
@@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";
+ public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode";
public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
- public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20;
+ public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1;
@Override
protected int getDefaultMaxMnpDistance() { return 1; }
@@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
public File mutect3Dataset;
+ @Advanced
+ @Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.")
+ public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;
+
+ public enum Mutect3DatasetMode {
+ ILLUMINA(11),
+ ULTIMA(11);
+
+ final private int numReadFeatures;
+
+ Mutect3DatasetMode(final int numReadFeatures) {
+ this.numReadFeatures = numReadFeatures;
+ }
+
+ public int getNumReadFeatures() {
+ return numReadFeatures;
+ }
+ }
+
/**
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
index e6a080cd14f..942c2de15b2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
@@ -4,18 +4,15 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.tuple.Triple;
-import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
-import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
-import org.broadinstitute.hellbender.tools.walkers.annotator.FeaturizedReadSets;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
-import org.broadinstitute.hellbender.utils.NaturalLogUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
@@ -46,9 +43,6 @@ private enum Label {
ARTIFACT, VARIANT, UNLABELED, IGNORE
}
- // number of features for each vectorized read
- private static final int FEATURES_PER_READ = FeaturizedReadSets.FEATURES_PER_READ;
-
// number of additional variant features for assembly complexity, reference context
private static final int NUM_EXTRA_FEATURES = 9;
@@ -108,7 +102,8 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode,
public void addData(final ReferenceContext ref, final VariantContext vc, Optional<List<VariantContext>> truthVCs,
final AlleleLikelihoods likelihoods,
final AlleleLikelihoods logFragmentLikelihoods,
- final AlleleLikelihoods logFragmentAlleleLikelihoods) {
+ final AlleleLikelihoods logFragmentAlleleLikelihoods,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final String refBases = ReferenceBases.annotate(ref, vc);
final String refAllele = vc.getReference().getBaseString();
final String contig = vc.getContig();
@@ -132,6 +127,20 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final int normalDepth = (int) MathUtils.sum(normalADs);
final boolean hasNormal = normalDepth > 0;
+ final List<Allele> allRefAlleles = new ArrayList<>();
+ allRefAlleles.add(vc.getReference());
+ truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference())));
+ final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get();
+
+ // skip(1) excludes the reference allele
+ final List<Allele> remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream()
+ .skip(1).toList();
+
+ final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
+ .filter(truthVC -> ! truthVC.isFiltered())
+ .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream())
+ .collect(Collectors.toSet());
+
final List