Example usage of htsjdk.variant.vcf.VCFFileReader in the broadinstitute/gatk project: class SortVcf, method sortInputs.
/**
* Merge the inputs and sort them by adding each input's content to a single SortingCollection.
* <p/>
* NB: It would be better to have a merging iterator as in MergeSamFiles, as this would perform better for pre-sorted inputs.
* Here, we are assuming inputs are unsorted, and so adding their VariantContexts iteratively is fine for now.
* MergeVcfs exists for simple merging of presorted inputs.
*
* @param readers - a list of VCFFileReaders, one for each input VCF
* @param outputHeader - The merged header whose information we intend to use in the final output file
*/
private SortingCollection<VariantContext> sortInputs(final List<VCFFileReader> readers, final VCFHeader outputHeader) {
    final ProgressLogger progress = new ProgressLogger(logger, 25000, "read", "records");
    // NB: The default MAX_RECORDS_IN_RAM may not be appropriate here. VariantContexts are smaller than SamRecords
    // We would have to play around empirically to find an appropriate value. We are not performing this optimization at this time.
    final SortingCollection<VariantContext> sortingCollection =
            SortingCollection.newInstance(
                    VariantContext.class,
                    new VCFRecordCodec(outputHeader),
                    outputHeader.getVCFRecordComparator(),
                    MAX_RECORDS_IN_RAM,
                    TMP_DIR);
    // Drain each input reader into the sorting collection, closing it once exhausted.
    for (int inputIndex = 0; inputIndex < readers.size(); inputIndex++) {
        final VCFFileReader reader = readers.get(inputIndex);
        logger.info("Reading entries from input file " + (inputIndex + 1));
        for (final VariantContext ctx : reader) {
            sortingCollection.add(ctx);
            progress.record(ctx.getContig(), ctx.getStart());
        }
        reader.close();
    }
    return sortingCollection;
}
Example usage of htsjdk.variant.vcf.VCFFileReader in the broadinstitute/gatk project: class SortVcf, method collectFileReadersAndHeaders.
/**
 * Opens a VCFFileReader for each file in INPUT, validates that all inputs share a usable
 * sequence dictionary and identical sample lists, and accumulates the readers and headers
 * into {@code inputReaders} / {@code inputHeaders}.
 *
 * @param sampleList            mutated in place: populated from the first input, then used to
 *                              check every subsequent input for matching sample names
 * @param samSequenceDictionary optional dictionary to impose on inputs that lack one; if null,
 *                              the first input's dictionary is adopted for subsequent checks
 *                              (note: the reassignment only affects this method's local copy)
 * @throws IllegalArgumentException if a dictionary is missing with no fallback, dictionaries
 *                                  disagree, or sample names differ between inputs
 */
private void collectFileReadersAndHeaders(final List<String> sampleList, SAMSequenceDictionary samSequenceDictionary) {
    for (final File input : INPUT) {
        final VCFFileReader in = new VCFFileReader(input, false);
        try {
            final VCFHeader header = in.getFileHeader();
            final SAMSequenceDictionary dict = header.getSequenceDictionary();
            if (dict == null || dict.isEmpty()) {
                if (null == samSequenceDictionary) {
                    throw new IllegalArgumentException("Sequence dictionary was missing or empty for the VCF: " + input.getAbsolutePath() + " Please add a sequence dictionary to this VCF or specify SEQUENCE_DICTIONARY.");
                }
                header.setSequenceDictionary(samSequenceDictionary);
            } else {
                if (null == samSequenceDictionary) {
                    samSequenceDictionary = dict;
                } else {
                    try {
                        samSequenceDictionary.assertSameDictionary(dict);
                    } catch (final AssertionError e) {
                        throw new IllegalArgumentException(e);
                    }
                }
            }
            if (sampleList.isEmpty()) {
                sampleList.addAll(header.getSampleNamesInOrder());
            } else {
                if (!sampleList.equals(header.getSampleNamesInOrder())) {
                    throw new IllegalArgumentException("Input file " + input.getAbsolutePath() + " has sample names that don't match the other files.");
                }
            }
            inputReaders.add(in);
            inputHeaders.add(header);
        } catch (final IllegalArgumentException e) {
            // BUG FIX: the reader was leaked on every validation-failure path; close it before rethrowing.
            in.close();
            throw e;
        }
    }
}
Example usage of htsjdk.variant.vcf.VCFFileReader in the broadinstitute/gatk project: class DbSnpBitSetUtil, method loadVcf.
/** Private helper method to read through the VCF and create one or more bit sets. */
/**
 * Private helper method to read through the VCF and create one or more bit sets.
 * For each variant, every (bitset, variant-type filter) pair in the map is consulted; positions
 * spanned by a matching variant are set to true in that bitset's per-contig BitSet.
 *
 * @param dbSnpFile              the dbSNP VCF to read
 * @param sequenceDictionary     used to size each contig's BitSet up front; if null, BitSets are
 *                               sized lazily from the first variant seen on the contig and will
 *                               grow as needed (BitSet auto-expands on set())
 * @param bitSetsToVariantTypes  map from target bitset to the variant types it should record;
 *                               an empty type set means "match every variant"
 */
private static void loadVcf(final File dbSnpFile, final SAMSequenceDictionary sequenceDictionary, final Map<DbSnpBitSetUtil, Set<DbSnpVariantType>> bitSetsToVariantTypes) {
    // BUG FIX: reader/iterator were closed only after a clean pass; try-with-resources
    // guarantees closure even if an exception is thrown mid-iteration.
    try (final VCFFileReader variantReader = new VCFFileReader(dbSnpFile);
         final CloseableIterator<VariantContext> variantIterator = variantReader.iterator()) {
        while (variantIterator.hasNext()) {
            final VariantContext kv = variantIterator.next();
            for (final Map.Entry<DbSnpBitSetUtil, Set<DbSnpVariantType>> tuple : bitSetsToVariantTypes.entrySet()) {
                final DbSnpBitSetUtil bitset = tuple.getKey();
                final Set<DbSnpVariantType> variantsToMatch = tuple.getValue();
                BitSet bits = bitset.sequenceToBitSet.get(kv.getContig());
                if (bits == null) {
                    final int nBits;
                    if (sequenceDictionary == null) {
                        nBits = kv.getEnd() + 1;
                    } else {
                        // NOTE(review): assumes every VCF contig is present in the dictionary;
                        // getSequence() would NPE otherwise — TODO confirm upstream validation.
                        nBits = sequenceDictionary.getSequence(kv.getContig()).getSequenceLength() + 1;
                    }
                    bits = new BitSet(nBits);
                    bitset.sequenceToBitSet.put(kv.getContig(), bits);
                }
                // An indel matches whether the filter asks for insertions or deletions.
                if (variantsToMatch.isEmpty()
                        || (kv.isSNP() && variantsToMatch.contains(DbSnpVariantType.SNP))
                        || (kv.isIndel() && variantsToMatch.contains(DbSnpVariantType.insertion))
                        || (kv.isIndel() && variantsToMatch.contains(DbSnpVariantType.deletion))) {
                    for (int i = kv.getStart(); i <= kv.getEnd(); i++) {
                        bits.set(i, true);
                    }
                }
            }
        }
    }
}
Example usage of htsjdk.variant.vcf.VCFFileReader in the broadinstitute/gatk project: class StructuralVariationDiscoveryPipelineSparkIntegrationTest, method svDiscoveryVCFEquivalenceTest.
/**
 * Asserts that the generated VCF is record-for-record equivalent to the expected VCF,
 * ignoring the given attributes. When {@code onHDFS} is true the generated VCF is first
 * copied to a local temp file so it can be read with VCFFileReader.
 *
 * @param generatedVCFPath   path (local or HDFS) of the VCF produced by the pipeline
 * @param expectedVCFPath    local path of the expected VCF
 * @param attributesToIgnore VariantContext attributes excluded from the comparison
 * @param onHDFS             whether {@code generatedVCFPath} lives on HDFS
 */
public static void svDiscoveryVCFEquivalenceTest(final String generatedVCFPath, final String expectedVCFPath, final List<String> attributesToIgnore, final boolean onHDFS) throws Exception {
    final File actualVCF;
    if (onHDFS) {
        final File tempLocalVCF = BaseTest.createTempFile("variants", "vcf");
        tempLocalVCF.deleteOnExit();
        BucketUtils.copyFile(generatedVCFPath, tempLocalVCF.getAbsolutePath());
        actualVCF = tempLocalVCF;
    } else {
        actualVCF = new File(generatedVCFPath);
    }
    final List<VariantContext> actualVcs = readAllVariants(actualVCF);
    final List<VariantContext> expectedVcs = readAllVariants(new File(expectedVCFPath));
    BaseTest.assertCondition(actualVcs, expectedVcs, (a, e) -> VariantContextTestUtils.assertVariantContextsAreEqual(a, e, attributesToIgnore));
}

/**
 * Reads every record of a VCF into a list.
 * BUG FIX: the original duplicated this logic and leaked the reader/iterator if reading threw;
 * try-with-resources guarantees both are closed.
 */
private static List<VariantContext> readAllVariants(final File vcf) {
    try (final VCFFileReader fileReader = new VCFFileReader(vcf, false);
         final CloseableIterator<VariantContext> iterator = fileReader.iterator()) {
        return Utils.stream(iterator).collect(Collectors.toList());
    }
}
Example usage of htsjdk.variant.vcf.VCFFileReader in the broadinstitute/gatk project: class CreateSomaticPanelOfNormals, method doWork.
/**
 * Merges the input VCFs position by position and writes the panel-of-normals VCF.
 * Records from all inputs are streamed through a MergingIterator in comparator order;
 * each run of variants sharing a start position is handed to processVariantsAtSamePosition.
 *
 * @return the string "SUCCESS" on completion
 */
public Object doWork() {
    final List<File> inputVcfs = new ArrayList<>(vcfs);
    final List<VCFFileReader> readers = new ArrayList<>(inputVcfs.size());
    final Collection<CloseableIterator<VariantContext>> iterators = new ArrayList<>(inputVcfs.size());
    final Collection<VCFHeader> headers = new HashSet<>(inputVcfs.size());
    // BUG FIX: the original opened an extra, never-closed reader just to fetch the first
    // header; open each file exactly once and reuse the first reader's header instead.
    for (final File vcf : inputVcfs) {
        readers.add(new VCFFileReader(vcf, false));
    }
    final VCFHeader headerOfFirstVcf = readers.get(0).getFileHeader();
    final SAMSequenceDictionary sequenceDictionary = headerOfFirstVcf.getSequenceDictionary();
    final VariantContextComparator comparator = headerOfFirstVcf.getVCFRecordComparator();
    for (int i = 0; i < readers.size(); i++) {
        final VCFFileReader reader = readers.get(i);
        final File vcf = inputVcfs.get(i);
        iterators.add(reader.iterator());
        final VCFHeader header = reader.getFileHeader();
        Utils.validateArg(comparator.isCompatible(header.getContigLines()), () -> vcf.getAbsolutePath() + " has incompatible contigs.");
        headers.add(header);
    }
    final VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(outputVcf, sequenceDictionary, false, Options.INDEX_ON_THE_FLY);
    writer.writeHeader(new VCFHeader(VCFUtils.smartMergeHeaders(headers, false)));
    final MergingIterator<VariantContext> mergingIterator = new MergingIterator<>(comparator, iterators);
    // Sentinel position that can never overlap a real variant, so the first record starts a new group.
    SimpleInterval currentPosition = new SimpleInterval("FAKE", 1, 1);
    final List<VariantContext> variantsAtThisPosition = new ArrayList<>(20);
    while (mergingIterator.hasNext()) {
        final VariantContext vc = mergingIterator.next();
        if (!currentPosition.overlaps(vc)) {
            processVariantsAtSamePosition(variantsAtThisPosition, writer);
            variantsAtThisPosition.clear();
            currentPosition = new SimpleInterval(vc.getContig(), vc.getStart(), vc.getStart());
        }
        variantsAtThisPosition.add(vc);
    }
    // BUG FIX: the last accumulated group was never flushed, silently dropping the final
    // position's variants from the output.
    processVariantsAtSamePosition(variantsAtThisPosition, writer);
    mergingIterator.close();
    writer.close();
    // BUG FIX: close the readers (MergingIterator.close only closes the iterators).
    readers.forEach(VCFFileReader::close);
    return "SUCCESS";
}
Aggregations