Use of htsjdk.variant.vcf.VCFContigHeaderLine in project gatk by broadinstitute.
The class VcfUtilsUnitTest, method testUpdateContigsReferenceNameOnly.
@Test(dataProvider = "testData")
public void testUpdateContigsReferenceNameOnly(final Set<VCFHeaderLine> inHeaderLines, final SAMSequenceDictionary seqDict, final File referenceFile, final boolean refNameOnly, final String expectedRefName) {
Set<VCFHeaderLine> resultLines = VcfUtils.updateHeaderContigLines(inHeaderLines, referenceFile, seqDict, refNameOnly);
Assert.assertEquals(resultLines.size(), referenceFile == null ? 2 : 3);
for (VCFHeaderLine resultLine : resultLines) {
if (resultLine instanceof VCFContigHeaderLine) {
VCFContigHeaderLine headerLine = (VCFContigHeaderLine) resultLine;
SAMSequenceRecord samSeqRec = headerLine.getSAMSequenceRecord();
if (samSeqRec.getSequenceName().equals("contig1")) {
Assert.assertEquals(samSeqRec.getSequenceLength(), 100);
} else if (samSeqRec.getSequenceName().equals("contig2")) {
Assert.assertEquals(samSeqRec.getSequenceLength(), 200);
} else {
Assert.fail("Bad sequence name in header lines");
}
} else {
if (referenceFile != null) {
Assert.assertEquals(resultLine.getValue(), expectedRefName);
} else {
Assert.fail("Unexpected reference name in header lines");
}
}
}
}
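For context, here is a minimal sketch of what one row of the "testData" provider could look like, inferred only from the assertions above (contig1/contig2 with lengths 100 and 200, and a result size of 2 or 3 depending on whether a reference file is supplied). Only the provider name is confirmed by the test; the dictionary contents, file path, and expected reference names are assumptions, and the sketch reuses the createHeaderLines() helper shown just below.

// Hypothetical data provider; values are inferred from the assertions above,
// not copied from the actual GATK test sources.
// Requires: org.testng.annotations.DataProvider, htsjdk.samtools.SAMSequenceDictionary,
// htsjdk.samtools.SAMSequenceRecord, java.io.File, java.util.Arrays
@DataProvider(name = "testData")
public Object[][] testData() {
    final SAMSequenceDictionary seqDict = new SAMSequenceDictionary(Arrays.asList(
            new SAMSequenceRecord("contig1", 100),
            new SAMSequenceRecord("contig2", 200)));
    final File fasta = new File("reference.fasta"); // placeholder reference file
    return new Object[][] {
        // reference present, name only: expect ##reference=reference
        { createHeaderLines(), seqDict, fasta, true, "reference" },
        // reference present, full file:// URI form
        { createHeaderLines(), seqDict, fasta, false, "file://" + fasta.getAbsolutePath() },
        // no reference file: only the two contig lines survive
        { createHeaderLines(), seqDict, null, false, null }
    };
}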
Use of htsjdk.variant.vcf.VCFContigHeaderLine in project gatk by broadinstitute.
The class VcfUtilsUnitTest, method createHeaderLines.
private Set<VCFHeaderLine> createHeaderLines() {
Set<VCFHeaderLine> headerLines = new HashSet<>(2);
headerLines.add(new VCFContigHeaderLine("##contig=<ID=1,length=249250621,assembly=b37>", vcfHeaderVersion, "", 0));
headerLines.add(new VCFContigHeaderLine("##contig=<ID=2,length=249250622,assembly=b37>", vcfHeaderVersion, "", 0));
return headerLines;
}
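The raw-line constructor used above needs a full ##contig=... string plus a VCFHeaderVersion. As a point of comparison, here is a minimal standalone sketch (not part of the GATK test) that uses the Map-based constructor, which the MsaToVcf example further down also relies on, and then reads the line back.

// Self-contained sketch of the Map-based VCFContigHeaderLine constructor.
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import java.util.LinkedHashMap;
import java.util.Map;

public class ContigLineDemo {
    public static void main(final String[] args) {
        final Map<String, String> fields = new LinkedHashMap<>();
        fields.put("ID", "1");
        fields.put("length", "249250621");
        fields.put("assembly", "b37");
        // second argument is the 0-based contig index used for ordering
        final VCFContigHeaderLine contigLine = new VCFContigHeaderLine(fields, 0);

        final SAMSequenceRecord rec = contigLine.getSAMSequenceRecord();
        System.out.println(contigLine.getID());          // 1
        System.out.println(rec.getSequenceLength());     // 249250621
        System.out.println(contigLine.getContigIndex()); // 0
    }
}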
Use of htsjdk.variant.vcf.VCFContigHeaderLine in project gatk by broadinstitute.
The class VcfUtils, method updateHeaderContigLines.
//TODO: these should be refactored/consolidated as part of
// https://github.com/broadinstitute/gatk/issues/121 and
// https://github.com/broadinstitute/gatk/issues/1116
/**
* Given a set of VCF header lines, replace any existing contig lines and
* reference line with contig lines derived from the provided sequence dictionary,
* optionally adding a new reference line.
* @param oldLines original header lines to carry over (old contig and reference lines are dropped)
* @param referenceFile reference file used to build the new ##reference line; may be null
* @param refDict sequence dictionary supplying the new contig lines
* @param referenceNameOnly if true, use only the reference file's base name rather than a file:// URI
* @return Updated set of VCF header lines.
*/
public static Set<VCFHeaderLine> updateHeaderContigLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict, final boolean referenceNameOnly) {
final Set<VCFHeaderLine> lines = new LinkedHashSet<>(oldLines.size());
for (final VCFHeaderLine line : oldLines) {
if (line instanceof VCFContigHeaderLine) {
// skip old contig lines
continue;
}
if (line.getKey().equals(VCFHeader.REFERENCE_KEY)) {
// skip the old reference key
continue;
}
lines.add(line);
}
lines.addAll(makeContigHeaderLines(refDict, referenceFile).stream().collect(Collectors.toList()));
if (referenceFile != null) {
final String referenceValue;
if (referenceNameOnly) {
final int extensionStart = referenceFile.getName().lastIndexOf(".");
referenceValue = extensionStart == -1 ? referenceFile.getName() : referenceFile.getName().substring(0, extensionStart);
} else {
referenceValue = "file://" + referenceFile.getAbsolutePath();
}
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, referenceValue));
}
return lines;
}
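A short usage sketch for the method above. The VcfUtils import path is an assumption (recent GATK versions keep it under org.broadinstitute.hellbender.utils.variant; adjust for your version), and the contig names, lengths, and reference path are placeholders.

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
// assumed location of the utility shown above; check your GATK version
import org.broadinstitute.hellbender.utils.variant.VcfUtils;

import java.io.File;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;

public final class UpdateContigsDemo {
    /** Rebuild a header whose ##contig and ##reference lines match the given dictionary. */
    static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFasta) {
        final SAMSequenceDictionary dict = new SAMSequenceDictionary(Arrays.asList(
                new SAMSequenceRecord("contig1", 100),
                new SAMSequenceRecord("contig2", 200)));
        final Set<VCFHeaderLine> updated = VcfUtils.updateHeaderContigLines(
                new LinkedHashSet<>(oldHeader.getMetaDataInInputOrder()),
                referenceFasta, dict, /* referenceNameOnly= */ true);
        return new VCFHeader(updated, new LinkedHashSet<>(oldHeader.getSampleNamesInOrder()));
    }
}

With referenceNameOnly set to true the ##reference line carries just the FASTA base name; with false it becomes a file:// URI, matching the two branches of the method above.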
Use of htsjdk.variant.vcf.VCFContigHeaderLine in project jvarkit by lindenb.
The class VcfGroupByPopulation, method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator vcfIn, VariantContextWriter out) {
try {
Reader r = IOUtils.openFileForBufferedReading(this.mappingFile);
parsePopulationMapping(r);
r.close();
VCFHeader header = vcfIn.getHeader();
Set<String> samplesInVcf = new HashSet<>(header.getSampleNamesInOrder());
this.sample2population.keySet().retainAll(samplesInVcf);
Map<String, Set<String>> population2samples = new HashMap<>();
for (String sample : this.sample2population.keySet()) {
String pop = this.sample2population.get(sample);
Set<String> samples = population2samples.get(pop);
if (samples == null) {
samples = new HashSet<>();
population2samples.put(pop, samples);
}
samples.add(sample);
}
for (String sample : header.getSampleNamesInOrder()) {
if (!this.sample2population.containsKey(sample)) {
throw new IOException("Sample " + sample + " not assigned to a population");
}
}
Set<VCFHeaderLine> metaData = new LinkedHashSet<>();
for (VCFHeaderLine vhl : header.getMetaDataInInputOrder()) {
if (!(vhl instanceof VCFContigHeaderLine))
continue;
metaData.add(vhl);
}
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
/* FORMAT */
metaData.add(new VCFFormatHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
metaData.add(new VCFFormatHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
metaData.add(new VCFFormatHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
metaData.add(new VCFFormatHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
metaData.add(new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
/* INFO */
metaData.add(new VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
metaData.add(new VCFInfoHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
metaData.add(new VCFInfoHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
metaData.add(new VCFInfoHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
metaData.add(new VCFInfoHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
metaData.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
VCFHeader h2 = new VCFHeader(metaData, population2samples.keySet());
out.writeHeader(h2);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfIn.getHeader());
while (vcfIn.hasNext()) {
VariantContext ctx = progress.watch(vcfIn.next());
VariantContextBuilder vcb = new VariantContextBuilder(ctx.getSource(), ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (ctx.hasID())
vcb.id(ctx.getID());
GCount count_ctx = new GCount();
List<Genotype> genotypes = new ArrayList<>(population2samples.size());
for (String pop : population2samples.keySet()) {
GCount count = new GCount();
Set<String> samples = population2samples.get(pop);
for (String sample : samples) {
Genotype g = ctx.getGenotype(sample);
count.watch(g);
}
GenotypeBuilder gb = new GenotypeBuilder(pop);
gb.attribute("NS", samples.size());
gb.attribute("R", count.R);
gb.attribute("A", count.A);
gb.attribute("UNC", count.uncalled);
if (count.R + count.A > 0) {
gb.attribute("F", (float) count.A / (float) (count.R + count.A));
}
if (count.dp >= 0) {
gb.attribute("DP", count.dp);
if (count_ctx.dp == -1)
count_ctx.dp = 0;
}
genotypes.add(gb.make());
count_ctx.R += count.R;
count_ctx.A += count.A;
count_ctx.uncalled += count.uncalled;
count_ctx.dp += count.dp;
}
vcb.attribute("R", count_ctx.R);
vcb.attribute("A", count_ctx.A);
vcb.attribute("UNC", count_ctx.uncalled);
if (count_ctx.R + count_ctx.A > 0) {
vcb.attribute("F", (float) count_ctx.A / (float) (count_ctx.R + count_ctx.A));
}
if (count_ctx.dp >= 0) {
vcb.attribute("DP", count_ctx.dp);
}
vcb.attribute("NS", this.sample2population.keySet().size());
vcb.genotypes(genotypes);
out.add(vcb.make());
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
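The header-handling portion of the method above follows a common pattern: keep only the input header's ##contig lines, then add the tool's own INFO/FORMAT declarations before writing. Below is a condensed standalone sketch of that pattern; it is not jvarkit code, and the class, method, and header-line names are illustrative.

import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import java.util.LinkedHashSet;
import java.util.Set;

public class ContigOnlyHeaderDemo {
    static VCFHeader contigOnlyHeader(final VCFHeader in, final Set<String> newSampleNames) {
        final Set<VCFHeaderLine> metaData = new LinkedHashSet<>();
        for (final VCFHeaderLine line : in.getMetaDataInInputOrder()) {
            // keep contig lines so downstream tools still see the original dictionary
            if (line instanceof VCFContigHeaderLine) {
                metaData.add(line);
            }
        }
        metaData.add(new VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of samples"));
        metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1,
                VCFHeaderLineType.String, "Genotype"));
        return new VCFHeader(metaData, newSampleNames);
    }
}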
Use of htsjdk.variant.vcf.VCFContigHeaderLine in project jvarkit by lindenb.
The class MsaToVcf, method doWork.
@Override
public int doWork(final List<String> args) {
VariantContextWriter w = null;
LineIterator r = null;
try {
final String inputName = oneFileOrNull(args);
if (inputName == null) {
LOG.info("Reading from stdin");
r = IOUtils.openStreamForLineIterator(stdin());
} else {
LOG.info("Reading from " + inputName);
r = IOUtils.openURIForLineIterator(inputName);
}
Format format = Format.None;
/**
* try to guess format
*/
while (r.hasNext() && format == Format.None) {
final String line = r.peek();
if (line.trim().isEmpty()) {
r.next();
continue;
}
if (line.startsWith("CLUSTAL")) {
format = Format.Clustal;
// consume
r.next();
break;
} else if (line.startsWith(">")) {
format = Format.Fasta;
break;
} else {
LOG.error("MSA format not recognized in " + line);
return -1;
}
}
LOG.info("format : " + format);
/**
* parse lines as FASTA
*/
if (Format.Fasta.equals(format)) {
this.consensus = new FastaConsensus();
AlignSequence curr = null;
while (r.hasNext()) {
String line = r.next();
if (line.startsWith(">")) {
curr = new AlignSequence();
curr.name = line.substring(1).trim();
if (sample2sequence.containsKey(curr.name)) {
LOG.error("Sequence ID " + curr.name + " defined twice");
return -1;
}
sample2sequence.put(curr.name, curr);
} else if (curr != null) {
curr.seq.append(line.trim());
this.align_length = Math.max(this.align_length, curr.seq.length());
}
}
/*
//remove heading & trailing '-'
for(final String sample:this.sample2sequence.keySet())
{
final AlignSequence seq = this.sample2sequence.get(sample);
int i=0;
while(i<this.align_length && seq.at(i)==DELETION)
{
seq.seq.setCharAt(i, CLIPPING);
++i;
}
i= this.align_length-1;
while(i>=0 && seq.at(i)==DELETION)
{
seq.seq.setCharAt(i, CLIPPING);
--i;
}
}*/
} else /* parse lines as CLUSTAL */ if (Format.Clustal.equals(format)) {
ClustalConsensus clustalconsensus = new ClustalConsensus();
this.consensus = clustalconsensus;
AlignSequence curr = null;
int columnStart = -1;
while (r.hasNext()) {
String line = r.next();
if (line.trim().isEmpty() || line.startsWith("CLUSTAL W")) {
columnStart = -1;
continue;
}
if (line.charAt(0) == ' ') {
if (columnStart == -1) {
LOG.error("illegal consensus line for " + line);
return -1;
}
/* if consensus doesn't exist in the first rows */
while (clustalconsensus.seq.length() < (this.align_length - (line.length() - columnStart))) {
clustalconsensus.seq.append(" ");
}
clustalconsensus.seq.append(line.substring(columnStart));
} else {
if (columnStart == -1) {
columnStart = line.indexOf(' ');
if (columnStart == -1) {
LOG.error("no whithespace in " + line);
return -1;
}
while (columnStart < line.length() && line.charAt(columnStart) == ' ') {
columnStart++;
}
}
String seqname = line.substring(0, columnStart).trim();
curr = this.sample2sequence.get(seqname);
if (curr == null) {
curr = new AlignSequence();
curr.name = seqname;
this.sample2sequence.put(curr.name, curr);
}
int columnEnd = line.length();
// remove blanks and digits at the end
while (columnEnd - 1 > columnStart && (line.charAt(columnEnd - 1) == ' ' || Character.isDigit(line.charAt(columnEnd - 1)))) {
columnEnd--;
}
curr.seq.append(line.substring(columnStart, columnEnd));
this.align_length = Math.max(align_length, curr.seq.length());
}
}
} else {
LOG.error("Undefined input format");
return -1;
}
CloserUtil.close(r);
/* a consensus reference name was set */
if (consensusRefName != null) {
AlignSequence namedSequence = null;
if ((namedSequence = sample2sequence.get(consensusRefName)) == null) {
LOG.error("Cannot find consensus sequence \"" + consensusRefName + "\" in list of sequences: " + this.sample2sequence.keySet().toString());
return -1;
}
this.consensus = new NamedConsensus(namedSequence);
}
/**
* we're done, print VCF
*/
/**
* first, print header
*/
Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
// super.addMetaData(vcfHeaderLines);
Map<String, String> mapping = new HashMap<String, String>();
mapping.put("ID", REF);
mapping.put("length", String.valueOf(this.align_length));
vcfHeaderLines.add(new VCFContigHeaderLine(mapping, 1));
Set<String> samples = new TreeSet<String>(this.sample2sequence.keySet());
VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
w = super.openVariantContextWriter(this.outputFile);
w.writeHeader(vcfHeader);
/**
* loop over data, print header
*/
int pos1 = 0;
while (pos1 < align_length) {
// is it a real variation or print all sites
boolean is_variation;
if (consensus.at(pos1) == MATCH) {
if (this.printAllSites) {
is_variation = false;
} else {
++pos1;
continue;
}
} else {
is_variation = true;
}
int pos2 = pos1 + 1;
// don't extend if no variation and printAllSites
while (is_variation && pos2 < align_length && consensus.at(pos2) != MATCH) {
++pos2;
}
boolean is_subsitution = (pos1 + 1 == pos2);
// need pos1>0 because ALT contains prev base.
if (is_subsitution && pos1 != 0 && is_variation) {
for (Sequence seq : this.sample2sequence.values()) {
if (seq.at(pos1) == DELETION) {
is_subsitution = false;
break;
}
}
}
Set<Allele> alleles = new HashSet<Allele>();
VariantContextBuilder vcb = new VariantContextBuilder();
List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
/* longest variant */
String longest = null;
Counter<String> countAlleles = new Counter<String>();
Map<String, String> sample2genotype = new HashMap<String, String>(samples.size());
String namedConsensusRefAllele = "N";
/* loop over the sequences of each sample */
for (String sample : samples) {
Sequence seq = this.sample2sequence.get(sample);
String al = null;
if (is_subsitution) {
if (seq.at(pos1) == CLIPPING)
continue;
al = String.valueOf(seq.at(pos1));
} else {
StringBuilder sb = new StringBuilder(pos2 - pos1);
// start at pos1-1 (yes, -1) so the allele includes the previous base
for (int i = Math.max(0, pos1 - 1); i < pos2; ++i) {
if (seq.at(i) == CLIPPING)
continue;
if (seq.at(i) == DELETION)
continue;
sb.append(seq.at(i));
}
if (sb.length() == 0)
continue;
al = sb.toString();
}
/* did we find the longest allele? */
if (longest == null || longest.length() < al.length()) {
// reset count of most frequent, we'll use the longest indel or subst
countAlleles = new Counter<String>();
longest = al;
}
countAlleles.incr(al);
sample2genotype.put(sample, al);
/* if consensus sequence name was defined, record this allele for future use */
if (consensusRefName != null && sample.equals(consensusRefName)) {
namedConsensusRefAllele = al;
}
}
if (countAlleles.isEmpty()) {
if (!printAllSites) {
continue;
}
/* not a real variation, just add a dummy 'N' */
countAlleles.incr("N");
}
String refAllStr;
if (consensusRefName == null) {
refAllStr = countAlleles.getMostFrequent();
} else {
refAllStr = namedConsensusRefAllele;
}
final Allele refAllele = Allele.create(refAllStr.replaceAll("[^ATGCatgc]", "N"), true);
alleles.add(refAllele);
/* loop over samples, build each genotype */
for (String sample : sample2genotype.keySet()) {
Allele al = null;
if (!sample2genotype.containsKey(sample)) {
// nothing
} else if (sample2genotype.get(sample).equals(refAllStr)) {
al = refAllele;
} else {
al = Allele.create(sample2genotype.get(sample).replaceAll("[^ATGCatgc]", "N"), false);
alleles.add(al);
}
if (al != null) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
final List<Allele> sampleAlleles = new ArrayList<Allele>(2);
sampleAlleles.add(al);
if (!haploid)
sampleAlleles.add(al);
gb.alleles(sampleAlleles);
gb.DP(1);
genotypes.add(gb.make());
} else {
genotypes.add(GenotypeBuilder.createMissing(sample, haploid ? 1 : 2));
}
}
// go to 1-based ref if substitution; for an indel we use pos(base)-1
final int start = pos1 + (is_subsitution ? 1 : 0);
vcb.start(start);
vcb.stop(start + (refAllStr.length() - 1));
vcb.chr(REF);
HashMap<String, Object> atts = new HashMap<String, Object>();
atts.put(VCFConstants.DEPTH_KEY, genotypes.size());
vcb.attributes(atts);
vcb.alleles(alleles);
vcb.genotypes(genotypes);
w.add(vcb.make());
pos1 = pos2;
}
w.close();
if (outFasta != null) {
final PrintWriter fasta = super.openFileOrStdoutAsPrintWriter(outFasta);
for (final String sample : samples) {
fasta.println(">" + sample);
final Sequence seq = this.sample2sequence.get(sample);
for (int i = 0; i < align_length; ++i) {
fasta.print(seq.at(i));
}
fasta.println();
}
fasta.println(">CONSENSUS");
for (int i = 0; i < align_length; ++i) {
fasta.print(consensus.at(i));
}
fasta.println();
fasta.flush();
fasta.close();
}
LOG.info("Done");
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(w);
}
}
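MsaToVcf above declares a single synthetic ##contig whose length equals the alignment length (the Map-based VCFContigHeaderLine in its header-building section). Here is a self-contained sketch of that idea, reduced to writing just a header; the contig name, alignment length, sample names, and output path are made up for illustration.

import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineType;
import java.io.File;
import java.util.*;

public class SyntheticContigDemo {
    public static void main(final String[] args) {
        final int alignLength = 1200; // e.g. the MSA alignment length
        final Map<String, String> contig = new LinkedHashMap<>();
        contig.put("ID", "chrUn_msa"); // hypothetical contig name standing in for REF
        contig.put("length", String.valueOf(alignLength));

        final Set<VCFHeaderLine> lines = new LinkedHashSet<>();
        lines.add(new VCFContigHeaderLine(contig, 0));
        lines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1,
                VCFHeaderLineType.String, "Genotype"));
        final VCFHeader header = new VCFHeader(lines, new TreeSet<>(Arrays.asList("S1", "S2")));

        final VariantContextWriter w = new VariantContextWriterBuilder()
                .setOutputFile(new File("msa.vcf"))
                .unsetOption(Options.INDEX_ON_THE_FLY) // no dictionary-backed index for this sketch
                .build();
        w.writeHeader(header);
        w.close();
    }
}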