use of htsjdk.variant.vcf.VCFFileReader in project gatk by broadinstitute.
the class LiftOverVcf method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
IOUtil.assertFileIsReadable(CHAIN);
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(REJECT);
////////////////////////////////////////////////////////////////////////
// Setup the inputs
////////////////////////////////////////////////////////////////////////
final LiftOver liftOver = new LiftOver(CHAIN);
final VCFFileReader in = new VCFFileReader(INPUT, false);
logger.info("Loading up the target reference genome.");
final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final Map<String, byte[]> refSeqs = new HashMap<>();
for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
}
CloserUtil.close(walker);
////////////////////////////////////////////////////////////////////////
// Setup the outputs
////////////////////////////////////////////////////////////////////////
final VCFHeader inHeader = in.getFileHeader();
final VCFHeader outHeader = new VCFHeader(inHeader);
outHeader.setSequenceDictionary(walker.getSequenceDictionary());
final VariantContextWriter out = new VariantContextWriterBuilder().setOption(Options.INDEX_ON_THE_FLY).setOutputFile(OUTPUT).setReferenceDictionary(walker.getSequenceDictionary()).build();
out.writeHeader(outHeader);
final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT).unsetOption(Options.INDEX_ON_THE_FLY).build();
final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
rejects.writeHeader(rejectHeader);
////////////////////////////////////////////////////////////////////////
// Read the input VCF, lift the records over and write to the sorting
// collection.
////////////////////////////////////////////////////////////////////////
long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
logger.info("Lifting variants over and sorting.");
final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");
for (final VariantContext ctx : in) {
++total;
final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
final Interval target = liftOver.liftOver(source, 1.0);
if (target == null) {
rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
failedLiftover++;
} else {
// Fix the alleles if we went from positive to negative strand
final List<Allele> alleles = new ArrayList<>();
for (final Allele oldAllele : ctx.getAlleles()) {
if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
alleles.add(oldAllele);
} else {
alleles.add(Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()));
}
}
// Build the new variant context
final VariantContextBuilder builder = new VariantContextBuilder(ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
builder.id(ctx.getID());
builder.attributes(ctx.getAttributes());
builder.genotypes(ctx.getGenotypes());
builder.filters(ctx.getFilters());
builder.log10PError(ctx.getLog10PError());
// Check that the reference allele still agrees with the reference sequence
boolean mismatchesReference = false;
for (final Allele allele : builder.getAlleles()) {
if (allele.isReference()) {
final byte[] ref = refSeqs.get(target.getContig());
final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
if (!refString.equalsIgnoreCase(allele.getBaseString())) {
mismatchesReference = true;
}
break;
}
}
if (mismatchesReference) {
rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
failedAlleleCheck++;
} else {
sorter.add(builder.make());
}
}
progress.record(ctx.getContig(), ctx.getStart());
}
final NumberFormat pfmt = new DecimalFormat("0.0000%");
final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
logger.info("Processed ", total, " variants.");
logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
logger.info(Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over.");
logger.info(pct, " of variants were not successfully lifted over and written to the output.");
rejects.close();
in.close();
////////////////////////////////////////////////////////////////////////
// Write the sorted outputs to the final output file
////////////////////////////////////////////////////////////////////////
sorter.doneAdding();
progress = new ProgressLogger(logger, 1000000, "written");
logger.info("Writing out sorted records to final VCF.");
for (final VariantContext ctx : sorter) {
out.add(ctx);
progress.record(ctx.getContig(), ctx.getStart());
}
out.close();
sorter.cleanup();
return null;
}
use of htsjdk.variant.vcf.VCFFileReader in project gatk by broadinstitute.
the class SplitVcfs method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
final ProgressLogger progress = new ProgressLogger(logger, 10000);
final VCFFileReader fileReader = new VCFFileReader(INPUT);
final VCFHeader fileHeader = fileReader.getFileHeader();
final SAMSequenceDictionary sequenceDictionary = SEQUENCE_DICTIONARY != null ? SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY).getSequenceDictionary() : fileHeader.getSequenceDictionary();
if (CREATE_INDEX && sequenceDictionary == null) {
throw new UserException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
}
final VariantContextWriterBuilder builder = new VariantContextWriterBuilder().setReferenceDictionary(sequenceDictionary).clearOptions();
if (CREATE_INDEX)
builder.setOption(Options.INDEX_ON_THE_FLY);
try (final VariantContextWriter snpWriter = builder.setOutputFile(SNP_OUTPUT).build();
final VariantContextWriter indelWriter = builder.setOutputFile(INDEL_OUTPUT).build()) {
snpWriter.writeHeader(fileHeader);
indelWriter.writeHeader(fileHeader);
int incorrectVariantCount = 0;
final CloseableIterator<VariantContext> iterator = fileReader.iterator();
while (iterator.hasNext()) {
final VariantContext context = iterator.next();
if (context.isIndel())
indelWriter.add(context);
else if (context.isSNP())
snpWriter.add(context);
else {
if (STRICT)
throw new IllegalStateException("Found a record with type " + context.getType().name());
else
incorrectVariantCount++;
}
progress.record(context.getContig(), context.getStart());
}
if (incorrectVariantCount > 0) {
logger.debug("Found " + incorrectVariantCount + " records that didn't match SNP or INDEL");
}
CloserUtil.close(iterator);
CloserUtil.close(fileReader);
}
return null;
}
use of htsjdk.variant.vcf.VCFFileReader in project gatk by broadinstitute.
the class ConvertGSVariantsToSegmentsIntegrationTest method composeExpectedSegments.
private List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> composeExpectedSegments(final File vcf, final TargetCollection<Target> targets) throws IOException {
final VCFFileReader reader = new VCFFileReader(vcf, false);
final List<HiddenStateSegmentRecord<CopyNumberTriState, Target>> result = new ArrayList<>();
reader.iterator().forEachRemaining(vc -> {
final int targetCount = targets.indexRange(vc).size();
for (final Genotype genotype : vc.getGenotypes()) {
final int cn = Integer.parseInt(genotype.getExtendedAttribute("CN").toString());
final double[] cnp = Stream.of(genotype.getExtendedAttribute("CNP").toString().replaceAll("\\[\\]", "").split(",")).mapToDouble(Double::parseDouble).toArray();
final double cnpSum = MathUtils.approximateLog10SumLog10(cnp);
final CopyNumberTriState call = expectedCall(cn);
final double exactLog10Prob = expectedExactLog10(call, cnp);
final HiddenStateSegment<CopyNumberTriState, Target> expectedSegment = new HiddenStateSegment<>(new SimpleInterval(vc), targetCount, Double.parseDouble(genotype.getExtendedAttribute("CNF").toString()), 0.000, call, -10.0 * exactLog10Prob, Double.NaN, Double.NaN, Double.NaN, -10.0 * (cnp[ConvertGSVariantsToSegments.NEUTRAL_COPY_NUMBER_DEFAULT] - cnpSum));
result.add(new HiddenStateSegmentRecord<>(genotype.getSampleName(), expectedSegment));
}
});
return result;
}
Aggregations