use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class HaloplexParasite method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
SamReader samReader = null;
final List<Mutation> mutations = new ArrayList<>();
try {
final Set<File> bamFiles = Files.lines(this.bamList.toPath()).filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
final VCFHeader header = new VCFHeader();
header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
super.addMetaData(header);
out.writeHeader(header);
header.addMetaDataLine(filter);
header.addMetaDataLine(infoWord);
while (in.hasNext()) {
final VariantContext ctx = in.next();
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (!(ctx.isIndel() || ctx.isMixed())) {
out.add(vcb.make());
continue;
}
if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
out.add(vcb.make());
continue;
}
final Mutation mut = new Mutation(ctx);
mutations.add(mut);
}
final Counter<String> words = new Counter<>();
for (final File bamFile : bamFiles) {
LOG.info("Opening " + bamFile);
samReader = createSamReaderFactory().open(bamFile);
for (final Mutation mut : mutations) {
// words seen in that mutation : don't use a Counter
final Set<String> mutWords = new HashSet<>();
/* loop over reads overlapping that mutation */
final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
while (sri.hasNext()) {
final SAMRecord rec = sri.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar.numCigarElements() == 1)
continue;
final byte[] bases = rec.getReadBases();
int refpos = rec.getUnclippedStart();
int readpos = 0;
/* scan cigar overlapping that mutation */
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
/* check clip is large enough */
if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
/* check clip overlap mutation */
if (!(ref_end < mut.start || refpos > mut.end)) {
/* break read of soft clip into words */
for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
final String substr = new String(bases, readpos + x, this.minClipSize);
if (!substr.contains("N")) {
final String revcomp = AcidNucleics.reverseComplement(substr);
mutWords.add(substr);
if (!revcomp.equals(substr))
mutWords.add(revcomp);
}
}
}
}
refpos = ref_end;
readpos = read_end;
}
}
sri.close();
for (final String w : mutWords) {
words.incr(w);
}
}
// end of for(mutations)
samReader.close();
samReader = null;
}
LOG.info("mutations:" + mutations.size());
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
for (final Mutation mut : mutations) {
progress.watch(mut.contig, mut.start);
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
String worstString = null;
Double worstFraction = null;
final double totalWords = words.getTotal();
for (final Allele a : mut.alleles) {
if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
continue;
for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
final String substr = new String(a.getBases(), x, this.minClipSize);
final long count = words.count(substr);
final double fraction = count / totalWords;
if (worstFraction == null || worstFraction < fraction) {
worstString = substr + "|" + count + "|" + fraction;
worstFraction = fraction;
}
}
}
if (worstString != null) {
vcb.attribute(infoWord.getID(), worstString);
}
if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
vcb.filter(filter.getID());
}
out.add(vcb.make());
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(samReader);
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class VcfDoest method run.
private void run(final LineIterator lr, final PrintWriter pw) throws IOException {
SortingCollection<TranscriptInfo> sorting = null;
CloseableIterator<TranscriptInfo> iter2 = null;
try {
while (lr.hasNext()) {
VcfIterator in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
final VCFHeader header = in.getHeader();
final Pedigree pedigree = Pedigree.newParser().parse(header);
if (pedigree.isEmpty()) {
throw new IOException("No pedigree found in header VCF header. use VcfInjectPedigree to add it");
}
final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
for (final Pedigree.Person individual : pedigree.getPersons()) {
if (individual.isAffected() || individual.isUnaffected()) {
individuals.add(individual);
}
}
boolean first = true;
pw.println("# samples ( 0: unaffected 1:affected)");
pw.print("population <- data.frame(family=c(");
first = true;
for (final Pedigree.Person person : individuals) {
if (!first)
pw.print(",");
pw.print("\"" + person.getFamily().getId() + "\"");
first = false;
}
pw.print("),name=c(");
first = true;
for (final Pedigree.Person person : individuals) {
if (!first)
pw.print(",");
pw.print("\"" + person.getId() + "\"");
first = false;
}
pw.print("),status=c(");
first = true;
for (final Pedigree.Person person : individuals) {
if (!first)
pw.print(",");
pw.print(person.isUnaffected() ? 0 : 1);
first = false;
}
pw.println("))");
sorting = SortingCollection.newInstance(TranscriptInfo.class, new TranscriptInfoCodec(), new TranscriptInfoCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
/**
* loop over variants
*/
while (in.hasNext() && !pw.checkError()) {
final VariantContext ctx = progess.watch(in.next());
if (ctx.isFiltered())
continue;
if (ctx.getAlternateAlleles().isEmpty())
continue;
final Allele altAllele = ctx.getAltAlleleWithHighestAlleleCount();
final MafCalculator mafCalculator = new MafCalculator(altAllele, ctx.getContig());
boolean genotyped = false;
for (final Pedigree.Person p : pedigree.getPersons()) {
if (!(p.isAffected() || p.isUnaffected()))
continue;
final Genotype g = ctx.getGenotype(p.getId());
if (g == null)
throw new IOException("Strange I cannot find individual " + p + " in the pedigree. Aborting.");
if (g.isCalled()) {
mafCalculator.add(g, p.isMale());
}
if (g.isHet() || g.isHomVar()) {
if (!g.getAlleles().contains(altAllele))
continue;
genotyped = true;
break;
}
}
if (!genotyped)
continue;
final Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
final List<KnownGene> genes = this.overlap(interval);
if (genes.isEmpty())
continue;
for (final KnownGene kg : genes) {
final TranscriptInfo trInfo = new TranscriptInfo();
trInfo.contig = kg.getContig();
trInfo.txStart = kg.getTxStart();
trInfo.txEnd = kg.getTxEnd();
trInfo.transcriptName = kg.getName();
trInfo.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
trInfo.exonCount = kg.getExonCount();
trInfo.transcriptLength = kg.getTranscriptLength();
trInfo.ctxStart = ctx.getStart();
trInfo.ref = ctx.getReference();
trInfo.alt = altAllele;
trInfo.maf = mafCalculator.getMaf();
trInfo.genotypes = new byte[individuals.size()];
int idx = 0;
for (final Pedigree.Person individual : individuals) {
final Genotype genotype = ctx.getGenotype(individual.getId());
final byte b;
if (genotype.isHomRef()) {
b = 0;
} else if (genotype.isHomVar() && genotype.getAlleles().contains(altAllele)) {
b = 2;
} else if (genotype.isHet() && genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
b = 1;
} else /* we treat 0/2 has hom-ref */
if (genotype.isHet() && !genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
b = 0;
} else /* we treat 2/2 has hom-ref */
if (genotype.isHomVar() && !genotype.getAlleles().contains(altAllele)) {
LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
b = 0;
} else {
b = -9;
}
trInfo.genotypes[idx] = b;
++idx;
}
KnownGene archetype = kg;
/* find gene archetype = longest overlapping */
for (final KnownGene kg2 : genes) {
if (kg2 == kg)
continue;
if (archetype.getStrand().equals(kg2.getStrand()) && archetype.getTranscriptLength() < kg2.getTranscriptLength()) {
archetype = kg2;
}
}
trInfo.archetypeName = archetype.getName();
trInfo.archetypeLength = archetype.getTranscriptLength();
boolean ctxWasFoundInExon = false;
final int ctxPos0 = ctx.getStart() - 1;
int indexInTranscript0 = 0;
for (final KnownGene.Exon exon : kg.getExons()) {
// variant in exon ?
if (!(exon.getStart() > (ctx.getEnd() - 1) || (ctx.getStart() - 1) >= exon.getEnd())) {
ctxWasFoundInExon = true;
indexInTranscript0 += (ctxPos0 - exon.getStart());
if (kg.isNegativeStrand()) {
indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
}
trInfo.indexInTranscript0 = indexInTranscript0;
trInfo.overlapName = exon.getName();
sorting.add(trInfo);
break;
} else {
indexInTranscript0 += (exon.getEnd() - exon.getStart());
}
}
if (ctxWasFoundInExon) {
continue;
}
indexInTranscript0 = 0;
// search closest intron/exon junction
for (int ex = 0; ex + 1 < kg.getExonCount(); ++ex) {
final KnownGene.Exon exon1 = kg.getExon(ex);
indexInTranscript0 += (exon1.getEnd() - exon1.getStart());
final KnownGene.Exon exon2 = kg.getExon(ex + 1);
if (exon1.getEnd() <= ctxPos0 && ctxPos0 < exon2.getStart()) {
final int dist_to_exon1 = ctxPos0 - exon1.getEnd();
final int dist_to_exon2 = exon2.getStart() - ctxPos0;
if (dist_to_exon2 < dist_to_exon1) {
indexInTranscript0++;
}
if (kg.isNegativeStrand()) {
indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
}
trInfo.indexInTranscript0 = indexInTranscript0;
trInfo.overlapName = exon1.getNextIntron().getName();
sorting.add(trInfo);
break;
}
}
}
// end loop over genes
}
// end while loop over variants
progess.finish();
sorting.doneAdding();
LOG.info("done adding");
iter2 = sorting.iterator();
final EqualRangeIterator<TranscriptInfo> eqiter = new EqualRangeIterator<TranscriptInfo>(iter2, new Comparator<TranscriptInfo>() {
@Override
public int compare(final TranscriptInfo o1, final TranscriptInfo o2) {
int i = o1.contig.compareTo(o2.contig);
if (i != 0)
return i;
i = o1.transcriptName.compareTo(o2.transcriptName);
return i;
}
});
while (eqiter.hasNext()) {
final List<TranscriptInfo> list = eqiter.next();
final TranscriptInfo front = list.get(0);
pw.println("# BEGIN TRANSCRIPT " + front.transcriptName + " ##########################################");
pw.println("transcript.chrom <- \"" + front.contig + "\"");
pw.println("transcript.txStart0 <- " + front.txStart + "");
pw.println("transcript.txEnd0 <- " + front.txEnd + "");
pw.println("transcript.name <- \"" + front.transcriptName + "\"");
pw.println("transcript.strand <- \"" + ((char) front.strand) + "\"");
pw.println("transcript.length <- " + front.transcriptLength + "");
pw.println("transcript.exonCount <- " + front.exonCount + "");
pw.println("archetype.name <- \"" + front.archetypeName + "\"");
pw.println("archetype.length <- " + front.archetypeLength + "");
pw.print("variants <- data.frame(chrom=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.contig + "\"");
first = false;
}
pw.print("),chromStart=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.ctxStart);
first = false;
}
pw.print("),chromEnd=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.ctxStart + v.ref.length() - 1);
first = false;
}
pw.print("),refAllele=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.ref.getDisplayString() + "\"");
first = false;
}
pw.print("),altAllele=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.alt.getDisplayString() + "\"");
first = false;
}
pw.print("),positionInTranscript1=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.indexInTranscript0 + 1);
first = false;
}
pw.print("),maf=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print(v.maf);
first = false;
}
pw.print("),overlapName=c(");
first = true;
for (final TranscriptInfo v : list) {
if (!first)
pw.print(",");
pw.print("\"" + v.overlapName + "\"");
first = false;
}
pw.println("))");
pw.println("# genotypes as a list. Should be a multiple of length(samples).");
pw.println("# 0 is homref (0/0), 1 is het (0/1), 2 is homvar (1/1)");
pw.println("# if the variant contains another ALT allele: (0/2) and (2/2) are considered 0 (homref)");
pw.print("genotypes <- c(");
first = true;
for (final TranscriptInfo tr : list) {
for (byte g : tr.genotypes) {
if (!first)
pw.print(",");
first = false;
pw.print((int) g);
}
}
pw.println(")");
pw.println("stopifnot(NROW(variants) * NROW(population) == length(genotypes) )");
if (this.userDefinedFunName == null || this.userDefinedFunName.trim().isEmpty()) {
pw.println("## WARNING not user-defined R function was defined");
} else {
pw.println("# consumme data with user-defined R function ");
pw.println(this.userDefinedFunName + "()");
}
pw.println("# END TRANSCRIPT " + front.transcriptName + " ##########################################");
}
// end while eqiter
eqiter.close();
iter2.close();
iter2 = null;
sorting.cleanup();
sorting = null;
}
} finally {
CloserUtil.close(iter2);
if (sorting != null)
sorting.cleanup();
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class CaseControlPlot method doWork.
@Override
public int doWork(final List<String> args) {
ArchiveFactory archiveFactory = null;
VcfIterator in = null;
VariantContextWriter teeVariantWriter = null;
final List<CaseControlExtractor> excractors = new ArrayList<>();
try {
in = super.openVcfIterator(oneFileOrNull(args));
final VCFHeader header = in.getHeader();
excractors.addAll(parseConfigFile(header));
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = Pedigree.newParser().parse(this.pedigreeFile);
} else {
pedigree = Pedigree.newParser().parse(header);
}
if (pedigree == null || pedigree.isEmpty()) {
LOG.error("No pedigree defined , or it is empty");
return -1;
}
final Set<Pedigree.Person> casepersons = pedigree.getPersons().stream().filter(F -> F.isAffected() && header.getSampleNamesInOrder().indexOf(F.getId()) != -1).collect(Collectors.toSet());
if (casepersons.isEmpty()) {
LOG.error("No Affected individuals in pedigree/header");
return -1;
}
final Set<Pedigree.Person> controlpersons = pedigree.getPersons().stream().filter(F -> F.isUnaffected() && header.getSampleNamesInOrder().indexOf(F.getId()) != -1).collect(Collectors.toSet());
if (controlpersons.isEmpty()) {
LOG.error("No Unaffected individuals in pedigree/header");
return -1;
}
if (this.teeToStdout) {
teeVariantWriter = super.openVariantContextWriter(null);
teeVariantWriter.writeHeader(header);
}
archiveFactory = ArchiveFactory.open(this.outputDirOrZip);
for (final CaseControlExtractor extractor : excractors) {
extractor.pw = archiveFactory.openWriter(extractor.name);
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (in.hasNext()) {
final VariantContext ctx = progress.watch(in.next());
if (teeVariantWriter != null)
teeVariantWriter.add(ctx);
for (final CaseControlExtractor handler : excractors) {
handler.visit(ctx, casepersons, controlpersons);
}
}
for (final CaseControlExtractor handler : excractors) {
handler.close();
}
progress.finish();
if (teeVariantWriter != null) {
teeVariantWriter.close();
teeVariantWriter = null;
}
in.close();
in = null;
archiveFactory.close();
archiveFactory = null;
return RETURN_OK;
} catch (final Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(in);
CloserUtil.close(archiveFactory);
CloserUtil.close(teeVariantWriter);
for (final CaseControlExtractor handler : excractors) {
handler.close();
}
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class VcfBurdenFilterGenes method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
final VCFHeader header = in.getHeader();
try {
final VCFHeader h2 = addMetaData(new VCFHeader(header));
final VCFFilterHeaderLine filterControlsHeader;
if (!StringUtil.isBlank(this.filterTag)) {
filterControlsHeader = new VCFFilterHeaderLine(this.filterTag.trim(), "Genes not in list " + this.geneFile);
h2.addMetaDataLine(filterControlsHeader);
} else {
filterControlsHeader = null;
}
final List<String> lookColumns = Arrays.asList("CCDS", "Feature", "ENSP", "Gene", "HGNC", "HGNC_ID", "SYMBOL", "RefSeq");
final VepPredictionParser vepParser = new VepPredictionParserFactory(header).get();
final AnnPredictionParser annParser = new AnnPredictionParserFactory(header).get();
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary()).logger(LOG);
out.writeHeader(h2);
while (in.hasNext() && !out.checkError()) {
final VariantContext ctx = progess.watch(in.next());
boolean keep = false;
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
// not just set FILTER ?
if (filterControlsHeader == null) {
vcb.rmAttribute(vepParser.getTag());
vcb.rmAttribute(annParser.getTag());
}
final List<String> newVepList = new ArrayList<>();
for (final String predStr : ctx.getAttributeAsList(vepParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final VepPredictionParser.VepPrediction pred = vepParser.parseOnePrediction(ctx, predStr);
for (final String col : lookColumns) {
final String token = pred.getByCol(col);
if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
newVepList.add(predStr);
keep = true;
break;
}
}
}
final List<String> newEffList = new ArrayList<>();
for (final String predStr : ctx.getAttributeAsList(annParser.getTag()).stream().map(O -> String.class.cast(O)).collect(Collectors.toList())) {
final AnnPredictionParser.AnnPrediction pred = annParser.parseOnePrediction(predStr);
final String token = pred.getGeneName();
if (!StringUtil.isBlank(token) && this.geneNames.contains(token)) {
newEffList.add(predStr);
keep = true;
break;
}
}
// not just set FILTER ?
if (filterControlsHeader == null) {
if (!newVepList.isEmpty())
vcb.attribute(vepParser.getTag(), newVepList);
if (!newEffList.isEmpty())
vcb.attribute(annParser.getTag(), newEffList);
}
if (filterControlsHeader != null) {
if (!keep) {
vcb.filter(filterControlsHeader.getID());
} else if (!ctx.isFiltered()) {
vcb.passFilters();
}
out.add(vcb.make());
} else {
if (keep)
out.add(vcb.make());
}
}
progess.finish();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(in);
}
}
use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.
the class VcfToSql method read.
private void read(File filename) throws IOException {
/* insert ATGC */
this.alleleTable.insert(outputWriter, null, "A");
this.alleleTable.insert(outputWriter, null, "C");
this.alleleTable.insert(outputWriter, null, "G");
this.alleleTable.insert(outputWriter, null, "T");
/* insert this sample */
this.vcfFileTable.insert(outputWriter, null, filename);
final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
final VCFHeader header = r.getHeader();
/* parse samples */
for (final String sampleName : header.getSampleNamesInOrder()) {
this.sampleTable.insert(outputWriter, null, sampleName);
SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
sample2sampleid.put(sampleName, sample_id);
this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
}
/* parse filters */
for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
}
filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
final SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
throw new RuntimeException("dictionary missing in VCF");
}
/* parse sequence dict */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
}
VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
int nVariants = 0;
while (r.hasNext()) {
if (this.outputWriter.checkError())
break;
VariantContext var = progress.watch(r.next());
++nVariants;
/* insert ref allele */
this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
/* insert variant */
this.variantTable.insert(outputWriter, null, vcffile_id, nVariants, chrom2chromId.get(var.getContig()), var.getStart(), (var.hasID() ? var.getID() : null), new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()), (var.hasLog10PError() ? var.getPhredScaledQual() : null));
SelectStmt variant_id = new SelectStmt(variantTable);
/* insert alternate alleles */
for (Allele alt : var.getAlternateAlleles()) {
/* insert alt allele */
this.alleleTable.insert(outputWriter, null, alt.getBaseString());
this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
}
/* insert filters */
for (final String filter : var.getFilters()) {
if (filter2filterid.get(filter) == null) {
throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
}
this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
}
if (!this.ignore_info) {
for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
/*
vepPrediction.insert(
outputWriter,
null,
variant_id,
pred.getEnsemblGene(),
pred.getEnsemblTranscript(),
pred.getEnsemblProtein(),
pred.getSymbol()
);
SelectStmt pred_id = new SelectStmt(vepPrediction);
for(SequenceOntologyTree.Term t: pred.getSOTerms())
{
String term=t.getAcn().replace(':', '_');
soTermTable.insert(
outputWriter,
null,
term,
t.getAcn()
);//for bioportal compatibility
SelectStmt term_id = new SelectStmt(soTermTable,"acn",term);
vepPrediction2so.insert(
outputWriter,
null,
pred_id,
term_id
);
}
*/
}
}
/* insert genotypes */
for (final String sampleName : sample2sampleid.keySet()) {
final Genotype g = var.getGenotype(sampleName);
if (!g.isAvailable() || g.isNoCall())
continue;
genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName), g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null, g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null, g.hasDP() ? g.getDP() : null, g.hasGQ() ? g.getGQ() : null);
}
}
r.close();
}
Aggregations