Use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb: class MiniCaller, method doWork.
@Override
public int doWork(final List<String> args) {
    /*
     * Naive variant caller entry point: streams SAM records from the inputs,
     * tallies per-sample allele observations (SNV, insertion, deletion) against
     * the reference genome, buffers them per position, and prints them as VCF.
     * Returns RETURN_OK on success, -1 on any error.
     */
    ConcatSam.ConcatSamIterator iter = null;
    try {
        if (this.fastaFile == null) {
            LOG.error("no REF");
            return -1;
        }
        /* load reference genome and its sequence dictionary */
        final ReferenceGenomeFactory referenceGenomeFactory = new ReferenceGenomeFactory();
        this.referenceGenome = referenceGenomeFactory.openFastaFile(this.fastaFile);
        this.dictionary = this.referenceGenome.getDictionary();
        if (this.dictionary == null) {
            LOG.error(JvarkitException.FastaDictionaryMissing.getMessage(this.fastaFile.getPath()));
            // BUGFIX: this return was missing; without it execution fell through
            // and areSequenceDictionariesEqual() below was called with a null dictionary.
            return -1;
        }
        /* create sam record iterator over all inputs, restricted to the region if any */
        iter = new ConcatSam.Factory().addInterval(this.rgnStr).setEnableUnrollList(true).open(args);
        final SAMFileHeader samFileheader = iter.getFileHeader();
        final SAMSequenceDictionary dict = samFileheader.getSequenceDictionary();
        if (dict == null) {
            LOG.error(JvarkitException.BamDictionaryMissing.getMessage(String.join(", ", args)));
            return -1;
        }
        /* BAM and reference must agree on contigs */
        if (!SequenceUtil.areSequenceDictionariesEqual(dict, this.dictionary)) {
            LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict, this.dictionary));
            return -1;
        }
        final List<SAMReadGroupRecord> groups = samFileheader.getReadGroups();
        if (groups == null || groups.isEmpty()) {
            LOG.error("No group defined in input");
            return -1;
        }
        /* one VCF sample per read-group partition (sample/library/... depending on option) */
        final Set<String> sampleSet = groups.stream()
                .map(srgr -> this.samRecordPartition.apply(srgr, samRecordPartition.name()))
                .collect(Collectors.toSet());
        /* create VCF metadata */
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
        metaData.add(new VCFFormatHeaderLine("DPG", // one value for each genotype
                VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Depth for each allele"));
        metaData.add(new VCFFormatHeaderLine("DP4", 4, VCFHeaderLineType.Integer, "Depth ReforAlt|Strand : RF,RR,AF,AR"));
        metaData.add(new VCFInfoHeaderLine("INDEL", 1, VCFHeaderLineType.Flag, "Variant is indel"));
        final VCFHeader vcfHeader = new VCFHeader(metaData, sampleSet);
        vcfHeader.setSequenceDictionary(this.dictionary);
        /* create variant context writer */
        this.variantContextWriter = super.openVariantContextWriter(outputFile);
        this.variantContextWriter.writeHeader(vcfHeader);
        ReferenceContig genomicSeq = null;
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dictionary);
        for (; ; ) {
            SAMRecord rec = null;
            if (iter.hasNext()) {
                rec = progress.watch(iter.next());
                if (rec.getReadUnmappedFlag())
                    continue;
                if (this.readFilter.filterOut(rec))
                    continue;
                /* flush buffered positions strictly before the current read */
                while (!this.buffer.isEmpty()
                        && (this.buffer.get(0).tid < rec.getReferenceIndex()
                            || (this.buffer.get(0).tid == rec.getReferenceIndex()
                                && (this.buffer.get(0).getEnd()) < rec.getAlignmentStart()))) {
                    this.buffer.remove(0).print();
                }
                /* (re)load the reference contig if the read is on a new contig */
                if (genomicSeq == null || !genomicSeq.getContig().equals(rec.getContig())) {
                    genomicSeq = this.referenceGenome.getContig(rec.getContig());
                }
                final Cigar cigar = rec.getCigar();
                if (cigar == null)
                    continue;
                int readPos = 0;
                // 0-based reference position
                int refPos0 = rec.getAlignmentStart() - 1;
                final byte[] bases = rec.getReadBases();
                final byte[] quals = rec.getBaseQualities();
                final String sampleName = this.samRecordPartition.getPartion(rec, samRecordPartition.name());
                /* walk the CIGAR, recording one allele observation per aligned base/indel */
                for (final CigarElement ce : cigar.getCigarElements()) {
                    final CigarOperator op = ce.getOperator();
                    switch(op) {
                        case P:
                            break;
                        case H:
                            break;
                        case S:
                            readPos += ce.getLength();
                            break;
                        case N:
                        case D:
                            {
                                // deletion: represented VCF-style, anchored on the base BEFORE the deletion
                                if (refPos0 > 0) {
                                    // NOTE(review): unlike the insertion branch, this base is not
                                    // upper-cased; soft-masked (lowercase) reference could yield
                                    // mismatched alleles — confirm against ReferenceContig behavior.
                                    char refBase = genomicSeq.charAt(refPos0 - 1);
                                    /* REF allele = anchor base + deleted bases */
                                    final StringBuilder sb = new StringBuilder(ce.getLength());
                                    sb.append(refBase);
                                    for (int i = 0; i < ce.getLength(); ++i) {
                                        sb.append(genomicSeq.charAt(refPos0 + i));
                                    }
                                    findContext(rec.getReferenceIndex(),
                                        refPos0 - 1, /* anchored one base before the deletion */
                                        Allele.create(sb.toString(), true))
                                        .getSample(sampleName)
                                        .getAllele(Allele.create(String.valueOf(refBase), false))
                                        .incr(rec.getReadNegativeStrandFlag());
                                }
                                refPos0 += ce.getLength();
                                break;
                            }
                        case I:
                            {
                                // insertion: ALT allele = anchor base + inserted read bases
                                if (refPos0 > 0) {
                                    char refBase = Character.toUpperCase(genomicSeq.charAt(refPos0 - 1));
                                    final StringBuilder sb = new StringBuilder(1 + ce.getLength());
                                    sb.append(refBase);
                                    for (int i = 0; i < ce.getLength(); ++i) {
                                        sb.append((char) bases[readPos + i]);
                                    }
                                    findContext(rec.getReferenceIndex(),
                                        refPos0 - 1, /* anchored one base before the insertion */
                                        Allele.create(String.valueOf(refBase), true))
                                        .getSample(sampleName)
                                        .getAllele(Allele.create(sb.toString().toUpperCase(), false))
                                        .incr(rec.getReadNegativeStrandFlag());
                                }
                                readPos += ce.getLength();
                                break;
                            }
                        case EQ:
                        case M:
                        case X:
                            {
                                // aligned bases: one single-base observation per position
                                for (int i = 0; i < ce.getLength(); ++i) {
                                    findContext(rec.getReferenceIndex(), refPos0 + i,
                                        Allele.create(String.valueOf(genomicSeq.charAt(refPos0 + i)), true))
                                        .getSample(sampleName)
                                        .getAllele(Allele.create(String.valueOf((char) bases[readPos + i]), false))
                                        .incr(rec.getReadNegativeStrandFlag());
                                }
                                readPos += ce.getLength();
                                refPos0 += ce.getLength();
                                break;
                            }
                        default:
                            throw new IllegalStateException("Case statement didn't deal with cigar op: " + op);
                    }
                }
            } else {
                break;
            }
        }
        /* flush whatever remains in the buffer */
        while (!buffer.isEmpty()) buffer.remove(0).print();
        progress.finish();
        iter.close();
        iter = null;
        this.variantContextWriter.close();
        this.variantContextWriter = null;
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(this.referenceGenome);
        CloserUtil.close(this.variantContextWriter);
    }
}
Use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb: class VcfRegistryCGI, method doWork.
// Renders an HTML table of all genotypes found at a single position across the
// VCF files of a group: one table row per (variant, sample, file). Samples
// declared in a header but never genotyped at the position are listed at the end.
private void doWork(XMLStreamWriter w, final GroupFile gf) throws XMLStreamException {
// position parsed from the CGI request; nothing to render if absent/invalid
Position pos = parsePosition();
if (pos == null)
return;
w.writeStartElement("div");
w.writeStartElement("h2");
w.writeCharacters(pos.chrom + ":" + pos.pos);
w.writeEndElement();
w.writeStartElement("table");
w.writeStartElement("thead");
w.writeStartElement("tr");
for (String header : new String[] { "CHROM", "POS", "ID", "REF", "QUAL", "Sample", "Alleles", "DP", "GQ", "File" }) {
w.writeStartElement("th");
w.writeCharacters(header);
// th
w.writeEndElement();
}
// tr
w.writeEndElement();
// thead
w.writeEndElement();
w.writeStartElement("tbody");
// samples that produced at least one row vs. all samples seen in any header
Set<String> samplesWithGenotypes = new HashSet<String>();
Set<String> allSamples = new HashSet<String>();
for (VcfFile f : getVcfFiles(gf)) {
TabixReader tabixReader = null;
TabixReader.Iterator iter = null;
BlockCompressedInputStream bgzin = null;
VCFHeader header = null;
AbstractVCFCodec vcfCodec = VCFUtils.createDefaultVCFCodec();
LineIterator lineIterator = null;
// two passes per file: i==0 reads the header from the bgzipped VCF,
// i==1 re-opens it through tabix and queries the position
for (int i = 0; i < 2; i++) {
try {
if (i == 0) {
bgzin = new BlockCompressedInputStream(f.file);
lineIterator = new LineIteratorImpl(new SynchronousLineReader(bgzin));
header = (VCFHeader) vcfCodec.readActualHeader(lineIterator);
allSamples.addAll(header.getGenotypeSamples());
} else {
tabixReader = new TabixReader(f.file.getPath());
String line;
// parseReg returns {tid, beg, end}; tid==-1 means the chromosome
// is unknown to this tabix index
int[] x = tabixReader.parseReg(pos.chrom + ":" + pos.pos + "-" + (pos.pos));
if (x != null && x.length > 2 && x[0] != -1) {
iter = tabixReader.query(x[0], x[1], x[2]);
} else {
}
while (iter != null && (line = iter.next()) != null) {
VariantContext var = vcfCodec.decode(line);
// one table row per called genotype of this variant
for (String sample : header.getSampleNamesInOrder()) {
final Genotype genotype = var.getGenotype(sample);
if (genotype == null || !genotype.isCalled())
continue;
if (!genotype.isAvailable())
continue;
samplesWithGenotypes.add(sample);
w.writeStartElement("tr");
w.writeStartElement("td");
w.writeCharacters(var.getContig());
w.writeEndElement();
w.writeStartElement("td");
w.writeCharacters(String.valueOf(var.getStart()));
w.writeEndElement();
if (var.hasID()) {
w.writeStartElement("td");
// rs### IDs become links to NCBI dbSNP
if (var.getID().matches("rs[0-9]+")) {
w.writeStartElement("a");
w.writeAttribute("href", "http://www.ncbi.nlm.nih.gov/snp/" + var.getID().substring(2));
w.writeCharacters(var.getID());
// a
w.writeEndElement();
} else {
w.writeCharacters(var.getID());
}
// td
w.writeEndElement();
} else {
w.writeEmptyElement("td");
}
if (var.getReference() != null) {
w.writeStartElement("td");
w.writeCharacters(var.getReference().getBaseString());
w.writeEndElement();
} else {
w.writeEmptyElement("td");
}
if (var.hasLog10PError()) {
w.writeStartElement("td");
w.writeCharacters(String.valueOf((int) var.getPhredScaledQual()));
w.writeEndElement();
} else {
w.writeEmptyElement("td");
}
w.writeStartElement("td");
w.writeCharacters(sample);
w.writeEndElement();
List<Allele> alleles = genotype.getAlleles();
w.writeStartElement("td");
w.writeStartElement("span");
// color-code genotype: green hom-ref, red hom-var, blue het
if (genotype.isHomRef()) {
w.writeAttribute("style", "color:green;");
} else if (genotype.isHomVar()) {
w.writeAttribute("style", "color:red;");
} else if (genotype.isHet()) {
w.writeAttribute("style", "color:blue;");
}
for (int j = 0; j < alleles.size(); ++j) {
if (j > 0)
w.writeCharacters(genotype.isPhased() ? "|" : "/");
w.writeCharacters(alleles.get(j).getBaseString());
}
// span
w.writeEndElement();
w.writeEndElement();
if (genotype.hasDP()) {
w.writeStartElement("td");
w.writeCharacters(String.valueOf(genotype.getDP()));
w.writeEndElement();
} else {
w.writeEmptyElement("td");
}
if (genotype.hasGQ()) {
w.writeStartElement("td");
w.writeCharacters(String.valueOf(genotype.getGQ()));
w.writeEndElement();
} else {
w.writeEmptyElement("td");
}
w.writeStartElement("td");
w.writeCharacters(f.file.getName());
w.writeEndElement();
// tr
w.writeEndElement();
w.flush();
}
}
}
} catch (Exception err) {
// render the failure inline as an HTML comment and keep going with the next file
w.writeComment("BOUM " + err);
header = null;
lastException = err;
} finally {
// resources are re-opened per pass; close whatever this pass created
CloserUtil.close(lineIterator);
CloserUtil.close(bgzin);
CloserUtil.close(tabixReader);
CloserUtil.close(iter);
}
// header pass failed: skip the tabix pass for this file
if (i == 0 && header == null)
break;
}
w.flush();
}
// tbody
w.writeEndElement();
// table
w.writeEndElement();
// remaining = samples never genotyped at this position
allSamples.removeAll(samplesWithGenotypes);
if (!allSamples.isEmpty()) {
w.writeStartElement("h3");
w.writeCharacters("Samples not found");
w.writeEndElement();
w.writeStartElement("ol");
for (String sample : new TreeSet<String>(allSamples)) {
w.writeStartElement("li");
w.writeCharacters(sample);
w.writeEndElement();
}
w.writeEndElement();
}
writeHTMLException(w, this.lastException);
// div
w.writeEndElement();
}
Use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb: class CaseControlPlot, method doWork.
@Override
public int doWork(final List<String> args) {
    /*
     * Reads a VCF, splits its samples into cases and controls using a pedigree
     * (explicit file or embedded in the VCF header), and feeds every variant to
     * each configured extractor, which writes its output into an archive
     * (directory or zip). Optionally tees the input VCF to stdout.
     */
    ArchiveFactory archive = null;
    VcfIterator vcfIn = null;
    VariantContextWriter teeWriter = null;
    final List<CaseControlExtractor> extractors = new ArrayList<>();
    try {
        vcfIn = super.openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = vcfIn.getHeader();
        extractors.addAll(parseConfigFile(header));
        // prefer an explicit pedigree file; otherwise parse it from the header
        final Pedigree pedigree = (this.pedigreeFile != null)
                ? Pedigree.newParser().parse(this.pedigreeFile)
                : Pedigree.newParser().parse(header);
        if (pedigree == null || pedigree.isEmpty()) {
            LOG.error("No pedigree defined , or it is empty");
            return -1;
        }
        // keep only pedigree members that are present in the VCF header
        final Set<Pedigree.Person> cases = pedigree.getPersons()
                .stream()
                .filter(P -> P.isAffected() && header.getSampleNamesInOrder().indexOf(P.getId()) != -1)
                .collect(Collectors.toSet());
        if (cases.isEmpty()) {
            LOG.error("No Affected individuals in pedigree/header");
            return -1;
        }
        final Set<Pedigree.Person> controls = pedigree.getPersons()
                .stream()
                .filter(P -> P.isUnaffected() && header.getSampleNamesInOrder().indexOf(P.getId()) != -1)
                .collect(Collectors.toSet());
        if (controls.isEmpty()) {
            LOG.error("No Unaffected individuals in pedigree/header");
            return -1;
        }
        if (this.teeToStdout) {
            teeWriter = super.openVariantContextWriter(null);
            teeWriter.writeHeader(header);
        }
        archive = ArchiveFactory.open(this.outputDirOrZip);
        // one output stream per extractor inside the archive
        for (final CaseControlExtractor extractor : extractors) {
            extractor.pw = archive.openWriter(extractor.name);
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        while (vcfIn.hasNext()) {
            final VariantContext ctx = progress.watch(vcfIn.next());
            if (teeWriter != null)
                teeWriter.add(ctx);
            for (final CaseControlExtractor extractor : extractors) {
                extractor.visit(ctx, cases, controls);
            }
        }
        for (final CaseControlExtractor extractor : extractors) {
            extractor.close();
        }
        progress.finish();
        // close in success order, nulling so the finally block is a no-op
        if (teeWriter != null) {
            teeWriter.close();
            teeWriter = null;
        }
        vcfIn.close();
        vcfIn = null;
        archive.close();
        archive = null;
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(vcfIn);
        CloserUtil.close(archive);
        CloserUtil.close(teeWriter);
        for (final CaseControlExtractor extractor : extractors) {
            extractor.close();
        }
    }
}
Use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb: class VcfBurdenFilterGenes, method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    /*
     * Keeps (or FILTER-tags) variants whose VEP or SnpEff/ANN predictions hit a
     * gene in this.geneNames. When a filter tag is configured, non-matching
     * variants are written with that FILTER; otherwise they are dropped and the
     * prediction INFO fields of kept variants are rewritten to the matching
     * predictions only.
     */
    final VCFHeader header = in.getHeader();
    try {
        final VCFHeader h2 = addMetaData(new VCFHeader(header));
        final VCFFilterHeaderLine filterControlsHeader;
        if (!StringUtil.isBlank(this.filterTag)) {
            filterControlsHeader = new VCFFilterHeaderLine(this.filterTag.trim(), "Genes not in list " + this.geneFile);
            h2.addMetaDataLine(filterControlsHeader);
        } else {
            filterControlsHeader = null;
        }
        // VEP attributes that may carry a gene identifier
        final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
        final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
        final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary()).logger(LOG);
        out.writeHeader(h2);
        while (in.hasNext() && !out.checkError()) {
            final VariantContext ctx = progess.watch(in.next());
            boolean keep = false;
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            // drop-mode only: remove prediction attributes, re-added below if any match
            if (filterControlsHeader == null) {
                vcb.rmAttribute(vepParser.getTag());
                vcb.rmAttribute(annParser.getTag());
            }
            // collect every VEP prediction whose gene columns hit the gene list
            final List<String> newVepList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
                for (final String col : lookColumns) {
                    final String token = pred.getByCol(col);
                    if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                        newVepList.add(predStr);
                        keep = true;
                        break; // this prediction matched; check the next prediction
                    }
                }
            }
            // collect every ANN prediction whose gene name hits the gene list
            final List<String> newEffList = new ArrayList<>();
            for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
                final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
                final String token = pred.getGeneName();
                if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
                    newEffList.add(predStr);
                    keep = true;
                    // BUGFIX: a 'break' here exited the whole prediction loop, so only
                    // the FIRST matching ANN prediction was kept while later matches
                    // were silently dropped — inconsistent with the VEP loop above.
                }
            }
            // drop-mode only: write back the filtered prediction lists
            if (filterControlsHeader == null) {
                if (!newVepList.isEmpty())
                    vcb.attribute(vepParser.getTag(), newVepList);
                if (!newEffList.isEmpty())
                    vcb.attribute(annParser.getTag(), newEffList);
            }
            if (filterControlsHeader != null) {
                // filter-mode: always emit; tag non-matching, PASS unfiltered matches
                if (!keep) {
                    vcb.filter(filterControlsHeader.getID());
                } else if (!ctx.isFiltered()) {
                    vcb.passFilters();
                }
                out.add(vcb.make());
            } else {
                // drop-mode: emit matching variants only
                if (keep)
                    out.add(vcb.make());
            }
        }
        progess.finish();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(in);
    }
}
Use of htsjdk.variant.vcf.VCFHeader in project jvarkit by lindenb: class EvsToVcf, method doWork.
@Override
public int doWork(List<String> args) {
    /*
     * Converts an NHLBI Exome Variant Server (EVS) XML stream (stdin) to VCF
     * (stdout). Builds a fixed GRCh37 sequence dictionary, unmarshals each
     * <snpList> element with JAXB, and emits one VariantContext per record.
     * Returns 0 on success, -1 on error.
     */
    VariantContextWriter out = null;
    try {
        if (!args.isEmpty()) {
            LOG.error("Illegal number of arguments");
            return -1;
        }
        JAXBContext jc = JAXBContext.newInstance(SnpData.class);
        Unmarshaller unmarshaller = jc.createUnmarshaller();
        out = VCFUtils.createVariantContextWriterToStdout();
        // hard-coded GRCh37/hg19 chromosome lengths
        SAMSequenceDictionary dict = new SAMSequenceDictionary();
        _fillDict(dict, "1", 249250621);
        _fillDict(dict, "2", 243199373);
        _fillDict(dict, "3", 198022430);
        _fillDict(dict, "4", 191154276);
        _fillDict(dict, "5", 180915260);
        _fillDict(dict, "6", 171115067);
        _fillDict(dict, "7", 159138663);
        _fillDict(dict, "8", 146364022);
        _fillDict(dict, "9", 141213431);
        _fillDict(dict, "10", 135534747);
        _fillDict(dict, "11", 135006516);
        _fillDict(dict, "12", 133851895);
        _fillDict(dict, "13", 115169878);
        _fillDict(dict, "14", 107349540);
        _fillDict(dict, "15", 102531392);
        _fillDict(dict, "16", 90354753);
        _fillDict(dict, "17", 81195210);
        _fillDict(dict, "18", 78077248);
        _fillDict(dict, "19", 59128983);
        _fillDict(dict, "20", 63025520);
        _fillDict(dict, "21", 48129895);
        _fillDict(dict, "22", 51304566);
        _fillDict(dict, "X", 155270560);
        _fillDict(dict, "Y", 59373566);
        _fillDict(dict, "MT", 16569);
        VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(dict);
        header.addMetaDataLine(new VCFInfoHeaderLine("CONS", VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Float, "conservationScore"));
        header.addMetaDataLine(new VCFInfoHeaderLine("GERP", VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Float, "conservationScoreGERP"));
        // BUGFIX: the four descriptions below all read "conservationScoreGERP"
        // (copy-paste); replaced with the actual EVS field meanings.
        header.addMetaDataLine(new VCFInfoHeaderLine("uaMAF", VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Float, "European-American minor allele frequency"));
        header.addMetaDataLine(new VCFInfoHeaderLine("aaMAF", VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Float, "African-American minor allele frequency"));
        header.addMetaDataLine(new VCFInfoHeaderLine("totalMAF", VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Float, "Total (all populations) minor allele frequency"));
        header.addMetaDataLine(new VCFInfoHeaderLine("DP", VCFHeaderLineCount.INTEGER, VCFHeaderLineType.Integer, "Average sample read depth"));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
        out.writeHeader(header);
        Pattern comma = Pattern.compile("[,]");
        XMLInputFactory xif = XMLInputFactory.newFactory();
        xif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
        XMLEventReader xmlr = xif.createXMLEventReader(System.in);
        // scan the stream for <snpList> elements; skip everything else
        while (xmlr.hasNext() && !System.out.checkError()) {
            XMLEvent evt = xmlr.peek();
            if (!evt.isStartElement() || !evt.asStartElement().getName().getLocalPart().equals("snpList")) {
                xmlr.nextEvent();
                continue;
            }
            SnpData snpData = unmarshaller.unmarshal(xmlr, SnpData.class).getValue();
            VariantContextBuilder vcb = new VariantContextBuilder();
            Set<Allele> alleles = new HashSet<Allele>();
            alleles.add(Allele.create(snpData.getRefAllele(), true));
            for (String s : comma.split(snpData.getAltAlleles())) {
                if (isEmpty(s))
                    continue;
                alleles.add(Allele.create(s, false));
            }
            vcb.chr(snpData.getChromosome());
            vcb.start(snpData.getChrPosition());
            // VCF end = start + REF length - 1
            vcb.stop(snpData.getChrPosition() + snpData.getRefAllele().length() - 1);
            if (!isEmpty(snpData.getRsIds()) && !snpData.getRsIds().equals("none")) {
                vcb.id(snpData.getRsIds());
            }
            vcb.alleles(alleles);
            // conservation scores are optional in the EVS record
            Float d = parseDouble(snpData.getConservationScore());
            if (d != null) {
                vcb.attribute("CONS", d);
            }
            d = parseDouble(snpData.getConservationScoreGERP());
            if (d != null) {
                vcb.attribute("GERP", d);
            }
            vcb.attribute("uaMAF", (float) snpData.getUaMAF());
            vcb.attribute("aaMAF", (float) snpData.getAaMAF());
            vcb.attribute("totalMAF", (float) snpData.getTotalMAF());
            vcb.attribute("DP", snpData.getAvgSampleReadDepth());
            out.add(vcb.make());
        }
        xmlr.close();
        out.close();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
Aggregations