Skip to content

Commit

Permalink
Make VCFCodec the default for query streams from GenomicsDB (#6675)
Browse files Browse the repository at this point in the history
* Make VCFCodec the default for query streams from GenomicsDB and other minor changes

* Add GenotypeGVCFs integration test for both VCF and BCF2 codecs. Also add test to check that a  combined INFO field resulting in a 64-bit value is handled by VCFCodec, but not by BCF2Codec
  • Loading branch information
nalinigans authored Jun 26, 2020
1 parent 7f17b00 commit bc0994c
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,8 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final GATKPath path, final File reference, final GenomicsDBOptions genomicsDBOptions) {
final String workspace = IOUtils.getGenomicsDBAbsolutePath(path) ;
if (workspace == null) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from non-GenomicsDB inputpath " + path);
} else if (Files.notExists(IOUtils.getPath(workspace))) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from non-GenomicsDB input path " + path);
} else if (Files.notExists(IOUtils.getPath(workspace.endsWith("/") ? workspace : workspace + "/"))) {
throw new UserException("GenomicsDB workspace " + path + " does not exist");
}

Expand All @@ -401,10 +401,10 @@ protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final
try {
final GenomicsDBExportConfiguration.ExportConfiguration exportConfigurationBuilder =
createExportConfiguration(workspace, callsetJson, vidmapJson, vcfHeader, genomicsDBOptions);
if (genomicsDBOptions.useVCFCodec()) {
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new VCFCodec(), Optional.empty());
} else {
if (genomicsDBOptions.useBCFCodec()) {
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new BCF2Codec(), Optional.empty());
} else {
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new VCFCodec(), Optional.empty());
}
} catch (final IOException e) {
throw new UserException("Couldn't create GenomicsDBFeatureReader", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

public class GenomicsDBArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;
public static final String USE_VCF_CODEC_LONG_NAME = "genomicsdb-use-vcf-codec";
public static final String USE_BCF_CODEC_LONG_NAME = "genomicsdb-use-bcf-codec";
public static final String SHARED_POSIXFS_OPTIMIZATIONS = "genomicsdb-shared-posixfs-optimizations";

private static final boolean DEFAULT_CALL_GENOTYPES = false;
private static final boolean DEFAULT_USE_VCF_CODEC = false;
private static final boolean DEFAULT_USE_BCF_CODEC = false;
private static final boolean DEFAULT_SHARED_POSIXFS_OPTIMIZATIONS = false;

/**
Expand All @@ -21,18 +21,20 @@ public class GenomicsDBArgumentCollection implements Serializable {
public boolean callGenotypes = DEFAULT_CALL_GENOTYPES;
public int maxDiploidAltAllelesThatCanBeGenotyped = GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;


/**
* Currently there is no support for 64-bit fields in BCFCodec. Specifying this option will allow
* for 64-bit width positions and INFO fields and for computed annotation sizes to exceed the 32-bit
* integer space while encoding/decoding with GenomicsDB.
* Currently there is no support for 64-bit fields in BCF2Codec. The VCFCodec allows for 64-bit
* width positions and INFO fields and for computed annotation sizes to exceed the 32-bit
* integer space while encoding/decoding with GenomicsDB. Use the BCF2Codec option if and
* only if performance is an issue.
*/
@Advanced
@Argument(
fullName = USE_VCF_CODEC_LONG_NAME,
doc = "Use VCF Codec Streaming for data from GenomicsDB instead of the default BCF",
fullName = USE_BCF_CODEC_LONG_NAME,
doc =
"Use BCF Codec Streaming for data from GenomicsDB instead of the default VCFCodec. BCFCodec performs slightly better but currently does not support "
+ "64-bit width positions and INFO fields and for computed annotation sizes to exceed 32-bit integer space.",
optional = true)
public boolean useVCFCodec = DEFAULT_USE_VCF_CODEC;
public boolean useBCFCodec = DEFAULT_USE_BCF_CODEC;

@Argument(fullName = SHARED_POSIXFS_OPTIMIZATIONS,
doc = "Allow for optimizations to improve the usability and performance for shared Posix Filesystems(e.g. NFS, Lustre). " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,15 +203,15 @@ public final class GenomicsDBImport extends GATKTool {
public static final String SHARED_POSIXFS_OPTIMIZATIONS = GenomicsDBArgumentCollection.SHARED_POSIXFS_OPTIMIZATIONS;

@Argument(fullName = WORKSPACE_ARG_LONG_NAME,
doc = "Workspace for GenomicsDB. Must be a POSIX file system path, but can be a relative path. " +
doc = "Workspace for GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " +
"Use this argument when creating a new GenomicsDB workspace. " +
"Either this or "+INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" must be specified." +
" Must be an empty or non-existent directory.",
mutex = {INCREMENTAL_WORKSPACE_ARG_LONG_NAME,INTERVAL_LIST_LONG_NAME})
private String workspace;

@Argument(fullName = INCREMENTAL_WORKSPACE_ARG_LONG_NAME,
doc = "Workspace when updating GenomicsDB. Must be a POSIX file system path, but can be a relative path. " +
doc = "Workspace when updating GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " +
"Use this argument when adding new samples to an existing GenomicsDB workspace or "+
"when using the "+INTERVAL_LIST_LONG_NAME+" option. " +
"Either this or "+WORKSPACE_ARG_LONG_NAME+" must be specified. " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public final class GenomicsDBOptions {
final private boolean callGenotypes;
final private int maxDiploidAltAllelesThatCanBeGenotyped;
final private int maxGenotypeCount;
final private boolean useVCFCodec;
final private boolean useBCFCodec;
final private boolean sharedPosixFSOptimizations;

public GenomicsDBOptions() {
Expand All @@ -32,7 +32,7 @@ public GenomicsDBOptions(final Path reference, GenomicsDBArgumentCollection geno
this.callGenotypes = genomicsdbArgs.callGenotypes;
this.maxDiploidAltAllelesThatCanBeGenotyped = genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped;
this.maxGenotypeCount = genotypeCalcArgs.MAX_GENOTYPE_COUNT;
this.useVCFCodec = genomicsdbArgs.useVCFCodec;
this.useBCFCodec = genomicsdbArgs.useBCFCodec;
this.sharedPosixFSOptimizations = genomicsdbArgs.sharedPosixFSOptimizations;
}

Expand All @@ -52,8 +52,8 @@ public int getMaxGenotypeCount() {
return maxGenotypeCount;
}

public boolean useVCFCodec() {
return useVCFCodec;
public boolean useBCFCodec() {
return useBCFCodec;
}

public boolean sharedPosixFSOptimizations() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
private static final String INTERVAL_PICARD_STYLE_EXPECTED = toolsTestDir + "GenomicsDBImport/interval_expected.interval_list";
private static final String MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED =
toolsTestDir + "GenomicsDBImport/multiple_non_adjacent_intervals_combine_gvcfs_expected.interval_list";
private static final String TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE = GENOMICSDB_TEST_DIR + "/int64_test.tar.gz";
//Consider a gVCF with a REF block chr20:50-150. Importing this data into GenomicsDB using multiple intervals
//-L chr20:1-100 and -L chr20:101-200 will cause the REF block to be imported into both the arrays
//Now, when reading data from the workspace (assume full scan) - the data is split into 2 REF block intervals chr20:50-100
Expand Down Expand Up @@ -1073,6 +1074,27 @@ public void testWithMiscOptionsNoOverwrite() throws IOException {
basicWriteAndQueryWithOptions(workspace, options);
}

@Test
public void testQueryWithComputationsExceeding32BitsDefault() throws IOException {
final String folder = createTempDir("computations_exceed_32bits").getAbsolutePath();
IOUtils.extractTarGz(Paths.get(TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE), Paths.get(folder));
IOUtils.deleteOnExit(IOUtils.getPath(folder));
final String workspace = folder + "/bigint_genomicsdb_ws";
checkGenomicsDBAgainstExpected(workspace, new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1"))), folder+"/expected_combined_bigint.vcf",
folder+"/reference/chr1_10MB.fasta.gz", true, ATTRIBUTES_TO_IGNORE, false, false, true);
}

// The following test should fail with a Throwable because of limitations in BCF2Codec - see https://github.com/broadinstitute/gatk/issues/6548
@Test(expectedExceptions = Throwable.class)
public void testQueryWithComputationsExceeding32BitsBCFCodec() throws IOException {
final String folder = createTempDir("computations_exceed_32bits_bcf2codec").getAbsolutePath() + "/testQueryWithComputationsExceed32Bits";
IOUtils.extractTarGz(Paths.get(TEST_INT64_SUPPORT_GENOMICSDB_BUNDLE), Paths.get(folder));
IOUtils.deleteOnExit(IOUtils.getPath(folder));
final String workspace = folder + "/bigint_genomicsdb_ws";
checkGenomicsDBAgainstExpected(workspace, new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("1"))), folder+"/expected_combined_bigint.vcf",
folder+"/reference/chr1_10MB.fasta.gz", true, ATTRIBUTES_TO_IGNORE, false, false, false);
}

@Test(groups = {"bucket"})
public void testWriteToAndQueryFromGCS() throws IOException {
final String workspace = BucketUtils.randomRemotePath(getGCPTestStaging(), "", "") + "/";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.GenomicsDBTestUtils;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBImport;
import org.broadinstitute.hellbender.tools.walkers.annotator.RMSMappingQuality;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand Down Expand Up @@ -220,8 +221,6 @@ public void testGenotypesOnly(File input, File expected, List<String> extraArgs,
//this is different from the above data provider because we can currently only load a single interval into a genomics db in a sane way
//so we need to provide a list of intervals and then look at each one
public Object[][] getGVCFsForGenomicsDB(){


return new Object[][]{
{getTestFile(BASE_PAIR_GVCF), getTestFile(BASE_PAIR_EXPECTED), new SimpleInterval("20", 1, 11_000_000), b37_reference_20_21},
{CEUTRIO_20_21_GATK3_4_G_VCF, getTestFile("CEUTrio.20.gatk3.7_30_ga4f720357.expected.vcf"), new SimpleInterval("20", 1, 11_000_000), b37_reference_20_21},
Expand All @@ -244,10 +243,26 @@ public Object[][] getGVCFsForGenomicsDBOverMultipleIntervals() {

//this only tests single-sample
@Test(dataProvider = "getGVCFsForGenomicsDB", timeOut = 1000000)
public void assertMatchingGenotypesFromTileDB(File input, File expected, Locatable interval, String reference) throws IOException {
public void assertMatchingGenotypesFromGenomicsDB(File input, File expected, Locatable interval, String reference) throws IOException {
final File tempGenomicsDB = GenomicsDBTestUtils.createTempGenomicsDB(input, interval);
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGenomicsDB);
runGenotypeGVCFSAndAssertSomething(genomicsDBUri, expected, NO_EXTRA_ARGS, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes, reference);

// The default option with GenomicsDB input uses VCFCodec for decoding, test BCFCodec explicitly
final List<String> args = new ArrayList<String>();
args.add("--"+GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME);
runGenotypeGVCFSAndAssertSomething(genomicsDBUri, expected, args, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes, reference);
}

private void runAndCheckGenomicsDBOutput(final ArgumentsBuilder args, final File expected, final File output) {
Utils.resetRandomGenerator();
runCommandLine(args);

// Note that if this isn't working it will take *FOREVER*
// runs in 0.06 minutes with no input intervals specfied
final List<VariantContext> expectedVC = VariantContextTestUtils.getVariantContexts(expected);
final List<VariantContext> actualVC = VariantContextTestUtils.getVariantContexts(output);
assertForEachElementInLists(actualVC, expectedVC, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes);
}

@Test(dataProvider = "getGVCFsForGenomicsDBOverMultipleIntervals")
Expand All @@ -259,21 +274,17 @@ public void testGenotypeGVCFsMultiIntervalGDBQuery(File input, File expected, Li

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(reference))
.add("V", genomicsDBUri)
.addOutput(output);
.add("V", genomicsDBUri);
args.addOutput(output);
intervals.forEach(args::addInterval);
args.addRaw("--" + GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME);
args.addRaw("--only-output-calls-starting-in-intervals"); //note that this will restrict calls to just the specified intervals

Utils.resetRandomGenerator();
runCommandLine(args);

//Note that if this isn't working it will take *FOREVER*
// runs in 0.06 minutes with no input intervals specfied
final List<VariantContext> expectedVC = VariantContextTestUtils.getVariantContexts(expected);
final List<VariantContext> actualVC = VariantContextTestUtils.getVariantContexts(output);
assertForEachElementInLists(actualVC, expectedVC, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes);
runAndCheckGenomicsDBOutput(args, expected, output);

// The default option with GenomicsDB input uses VCFCodec for decoding, test BCFCodec explicitly
args.addRaw("--"+GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME);
runAndCheckGenomicsDBOutput(args, expected, output);
}

//this tests single-sample with new MQ format
Expand All @@ -285,6 +296,11 @@ public void assertMatchingAnnotationsFromGenomicsDB_newMQformat(File input, File
final VCFHeader header = VCFHeaderReader.readHeaderFrom(new SeekablePathStream(IOUtils.getPath(expected.getAbsolutePath())));
final List<String> attributesToFilter = Stream.concat(ATTRIBUTES_WITH_JITTER.stream(), ATTRIBUTES_TO_IGNORE.stream()).collect(Collectors.toList());
runGenotypeGVCFSAndAssertSomething(genomicsDBUri, expected, NO_EXTRA_ARGS, (a, e) -> VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, ATTRIBUTES_TO_IGNORE, ATTRIBUTES_WITH_JITTER, header), reference);

// The default option with GenomicsDB input uses VCFCodec for decoding, test BCFCodec explicitly
final List<String> args = new ArrayList<String>();
args.add("--"+GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME);
runGenotypeGVCFSAndAssertSomething(genomicsDBUri, expected, args, (a, e) -> VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, ATTRIBUTES_TO_IGNORE, ATTRIBUTES_WITH_JITTER, header), reference);
}

@Test(dataProvider = "gvcfsToGenotype")
Expand Down
Loading

0 comments on commit bc0994c

Please sign in to comment.