use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfGtfSplitter method testTranscript.
private boolean testTranscript(final Transcript transcript, final VariantContext ctx) {
if (!transcript.overlaps(ctx)) {
if (this.use_upstream) {
final SimplePosition pos = new SimplePosition(transcript.getContig(), transcript.isPositiveStrand() ? transcript.getStart() : transcript.getEnd());
if (ctx.withinDistanceOf(pos, this.xxxxstream_length))
return true;
}
if (this.use_downstream) {
final SimplePosition pos = new SimplePosition(transcript.getContig(), transcript.isPositiveStrand() ? transcript.getEnd() : transcript.getStart());
if (ctx.withinDistanceOf(pos, this.xxxxstream_length))
return true;
}
return false;
}
if (this.use_exon && transcript.hasExon() && transcript.getExons().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_intron && transcript.hasIntron() && transcript.getIntrons().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_cds && transcript.hasCDS() && transcript.getAllCds().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_utr5) {
if (transcript.isPositiveStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().overlaps(ctx))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().overlaps(ctx))
return true;
}
if (this.use_utr3) {
if (transcript.isPositiveStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().overlaps(ctx))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().overlaps(ctx))
return true;
}
if (this.use_cds_utr5) {
if (transcript.isPositiveStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
}
if (this.use_cds_utr3) {
if (transcript.isPositiveStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
}
if (this.use_stop && transcript.hasCodonStopDefined() && transcript.getCodonStop().get().getBlocks().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_start && transcript.hasCodonStartDefined() && transcript.getCodonStart().get().getBlocks().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_splice && transcript.hasIntron()) {
for (final Intron intron : transcript.getIntrons()) {
final SimpleInterval splice1 = new SimpleInterval(intron.getContig(), intron.getStart() - 1, intron.getStart());
if (ctx.withinDistanceOf(splice1, this.split_length))
return true;
final SimpleInterval splice2 = new SimpleInterval(intron.getContig(), intron.getEnd(), intron.getEnd() + 1);
if (ctx.withinDistanceOf(splice2, this.split_length))
return true;
}
}
return false;
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfGtfSplitter method split.
private void split(final AbstractSplitter splitter, final VCFReader vcfFileReader, final VCFHeader header, final SAMSequenceDictionary dict, final ArchiveFactory archiveFactory, final Path tmpVcf, final PrintWriter manifest) throws IOException {
final Locatable interval = splitter.getInterval();
final CloseableIterator<VariantContext> iter;
if ((this.use_downstream || this.use_upstream) && this.xxxxstream_length > 0) {
iter = vcfFileReader.query(new SimpleInterval(interval).extend(this.xxxxstream_length));
} else {
iter = vcfFileReader.query(interval);
}
if (!this.enable_empty_vcf && !iter.hasNext()) {
iter.close();
return;
}
final VariantContextWriterBuilder vcwb = new VariantContextWriterBuilder();
vcwb.setCreateMD5(false);
vcwb.setReferenceDictionary(null);
final TabixIndexCreator tabixIndexCreator;
final Path tbiPath;
if (this.index_vcf) {
if (dict == null) {
tabixIndexCreator = new TabixIndexCreator(TabixFormat.VCF);
} else {
tabixIndexCreator = new TabixIndexCreator(dict, TabixFormat.VCF);
}
vcwb.setIndexCreator(tabixIndexCreator);
vcwb.setOption(Options.INDEX_ON_THE_FLY);
tbiPath = tmpVcf.getParent().resolve(tmpVcf.getFileName().toString() + FileExtensions.TABIX_INDEX);
} else {
tabixIndexCreator = null;
tbiPath = null;
}
vcwb.setOutputPath(tmpVcf);
final VariantContextWriter out = vcwb.build();
final VCFHeader header2 = new VCFHeader(this.attCleaner.cleanHeader(header));
super.addMetaData(header2);
splitter.addMetadata(header2);
out.writeHeader(header2);
int count_ctx = 0;
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
if (this.ignoreFiltered && ctx.isFiltered())
continue;
if (!splitter.accept(ctx))
continue;
count_ctx++;
out.add(this.attCleaner.apply(ctx));
}
iter.close();
out.close();
if (!this.enable_empty_vcf && count_ctx == 0) {
Files.delete(tmpVcf);
if (tbiPath != null)
Files.delete(tbiPath);
return;
}
final String md5 = StringUtils.md5(interval.getContig() + ":" + splitter.getId());
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + splitter.getId().replaceAll("[/\\:]", "_") + (this.use_bcf ? FileExtensions.BCF : FileExtensions.COMPRESSED_VCF);
OutputStream os = archiveFactory.openOuputStream(filename);
IOUtils.copyTo(tmpVcf, os);
os.flush();
os.close();
if (tbiPath != null) {
os = archiveFactory.openOuputStream(filename + FileExtensions.TABIX_INDEX);
IOUtils.copyTo(tmpVcf, os);
os.flush();
os.close();
Files.delete(tbiPath);
}
Files.delete(tmpVcf);
manifest.print(interval.getContig());
manifest.print('\t');
manifest.print(interval.getStart() - 1);
manifest.print('\t');
manifest.print(interval.getEnd());
manifest.print('\t');
splitter.printManifest(manifest);
manifest.print('\t');
manifest.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputFile.toString() + File.separator) + filename);
manifest.print('\t');
manifest.println(count_ctx);
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfRegulomeDB method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VCFIterator in, VariantContextWriter out) {
VCFHeader header = in.getHeader();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
header.addMetaDataLine(new VCFInfoHeaderLine(this.infoTag, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Format: Position|Distance|Rank"));
out.writeHeader(header);
while (in.hasNext()) {
List<String> regDataList = new ArrayList<String>();
VariantContext ctx = in.next();
progress.watch(ctx.getContig(), ctx.getStart());
int start = Math.max(0, ctx.getStart() - this.extend);
int end = ctx.getEnd() + this.extend;
for (Iterator<RegData> iter = this.regDataTabixFileReader.iterator(new SimpleInterval(ctx.getContig(), start, end)); iter.hasNext(); ) {
RegData curr = iter.next();
if (this.acceptRegex != null && !this.acceptRegex.matcher(curr.rank).matches()) {
continue;
}
String str = String.valueOf(curr.chromSart) + "|" + String.valueOf(Math.abs(curr.chromSart - (ctx.getStart() - 1))) + "|" + curr.rank;
regDataList.add(str);
}
if (regDataList.isEmpty()) {
out.add(ctx);
continue;
}
VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.attribute(this.infoTag, regDataList.toArray());
out.add(vcb.make());
}
progress.finish();
return 0;
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
try {
this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
loadGtf(dict);
final VCFHeader header = r.getHeader();
final VCFHeader h2 = new VCFHeader(header);
JVarkitVersion.getInstance().addMetaData(this, h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
while (r.hasNext()) {
final VariantContext ctx = r.next();
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(normalizedContig)) {
w.add(ctx);
continue;
}
final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
final List<Annotation> all_annotations = new ArrayList<>();
final List<Allele> alternateAlleles;
if (ctx.getNAlleles() <= 1) {
// not a variant, just REF
alternateAlleles = Arrays.asList(Allele.NO_CALL);
} else {
alternateAlleles = ctx.getAlternateAlleles();
}
for (final Allele altAllele : alternateAlleles) {
if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
continue;
/* intergenic ====================================================== */
if (transcripts.isEmpty()) {
Transcript leftGene = null;
String leftId = "";
String leftName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
leftGene = t;
leftId = t.getGene().getId();
leftName = t.getGene().getGeneName();
}
}
Transcript rightGene = null;
String rightId = "";
String rightName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (rightGene == null || t.getStart() < rightGene.getStart()) {
rightGene = t;
rightId = t.getGene().getId();
rightName = t.getGene().getGeneName();
}
}
// intergenic
final Annotation annot = new Annotation(altAllele);
annot.seqont.add("intergenic");
annot.geneId = leftId + "-" + rightId;
annot.geneName = leftName + "-" + rightName;
all_annotations.add(annot);
} else {
for (final Transcript transcript : transcripts) {
final Annotation annotation = new Annotation(altAllele, transcript);
all_annotations.add(annotation);
if (!transcript.overlaps(ctx)) {
if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
if (ctx.withinDistanceOf(transcript, 500)) {
annotation.seqont.add("500B_downstream_variant");
} else if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_downstream_variant");
}
} else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_upstream_variant");
} else if (ctx.withinDistanceOf(transcript, 5_000)) {
annotation.seqont.add("5KB_upstream_variant");
}
}
continue;
}
if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
// TODO can be inversion ,etc...
annotation.seqont.add("transcript_ablation");
continue;
}
for (int side = 0; side < 2; ++side) {
final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
if (!opt_utr.isPresent())
continue;
final UTR utr = opt_utr.get();
if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
}
}
for (int side = 0; side < 2; ++side) {
final Optional<? extends ExonOrIntron> opt_ex;
if (side == 0) {
opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
} else {
opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
}
if (!opt_ex.isPresent())
continue;
final ExonOrIntron ei = opt_ex.get();
if (side == 0) {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_exon_variant");
} else {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_intron_variant");
annotation.seqont.add("intron");
}
if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
if (ei.isSplicingAcceptor(ctx.getStart())) {
// SPLICING_ACCEPTOR
annotation.seqont.add("splice_acceptor");
} else if (ei.isSplicingDonor(ctx.getStart())) {
// SPLICING_DONOR
annotation.seqont.add("splice_donor");
} else // ??
{
annotation.seqont.add("splicing_variant");
}
}
}
final StructuralVariantType svType = ctx.getStructuralVariantType();
if (svType != null) {
continue;
}
if (transcript.isNonCoding()) {
// TODO
annotation.seqont.add("non_coding_transcript_variant");
continue;
}
RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
if (cDNA == null) {
cDNA = rnaSeqFactory.getCodingRNA(transcript);
this.transcriptId2cdna.put(transcript.getId(), cDNA);
}
final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
if (!opt_pos_cdna0.isPresent())
continue;
final int pos_cdna0 = opt_pos_cdna0.getAsInt();
final int pos_aa = pos_cdna0 / 3;
final GeneticCode geneticCode = GeneticCode.getStandard();
if (AcidNucleics.isATGC(altAllele.getBaseString())) {
String bases = altAllele.getBaseString().toUpperCase();
if (transcript.isNegativeStrand()) {
bases = AcidNucleics.reverseComplement(bases);
}
final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
final int mod = pos_cdna0 % 3;
annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
annotation.position_protein = (pos_aa + 1);
annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_lost");
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_gained");
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotation.seqont.add("synonymous");
} else {
annotation.seqont.add("missense_variant");
}
}
}
}
}
final Set<String> info = new HashSet<String>(all_annotations.size());
for (final Annotation a : all_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(r);
CloserUtil.close(this.referenceGenome);
}
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VCFBedSetFilter method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator r, final VariantContextWriter w) {
final Set<String> contigs_not_found = new HashSet<>();
final VCFHeader h2 = new VCFHeader(r.getHeader());
final VCFFilterHeaderLine filter;
if (!StringUtil.isBlank(this.filterName)) {
filter = new VCFFilterHeaderLine(this.filterName, "Variant " + (this.tabixBlackFile != null ? " overlapping any bed record of " : " that do not overlap any bed record of ") + tabixFile);
h2.addMetaDataLine(filter);
} else {
filter = null;
}
JVarkitVersion.getInstance().addMetaData(this, h2);
final ContigNameConverter ctgNameConverter;
if (this.bedReader != null) {
ctgNameConverter = ContigNameConverter.fromContigSet(this.bedReader.getContigs());
} else {
ctgNameConverter = ContigNameConverter.fromIntervalTreeMap(this.intervalTreeMap);
}
final SAMSequenceDictionary dict = r.getHeader().getSequenceDictionary();
w.writeHeader(h2);
while (r.hasNext()) {
final VariantContext ctx = r.next();
boolean set_filter = false;
final String convert_contig = ctgNameConverter.apply(ctx.getContig());
final int ctx_start = Math.max(1, ctx.getStart() - this.extend_bases);
final int ctx_end;
if (dict == null) {
ctx_end = ctx.getEnd() + this.extend_bases;
} else {
final SAMSequenceRecord ssr = dict.getSequence(ctx.getContig());
if (ssr == null)
throw new JvarkitException.ContigNotFoundInDictionary(ctx.getContig(), dict);
ctx_end = Math.min(ssr.getSequenceLength(), ctx.getEnd() + this.extend_bases);
}
if (StringUtil.isBlank(convert_contig)) {
if (debug)
LOG.warn("Cannot convert contig " + ctx.getContig());
if (contigs_not_found.size() < 100) {
if (contigs_not_found.add(ctx.getContig())) {
LOG.warn("Cannot convert variant contig " + ctx.getContig() + " to bed file. (Contig is not in BED file)");
}
}
set_filter = false;
} else if (this.intervalTreeMap != null) {
if (this.intervalTreeMap.getOverlapping(new SimpleInterval(convert_contig, ctx_start, ctx_end)).stream().anyMatch(LOC -> testOverlap(ctx_start, ctx_end, LOC))) {
if (debug)
LOG.warn("treemap.overlap=true set Filter=true for " + ctx.getContig() + ":" + ctx.getStart());
set_filter = true;
}
} else {
try (CloseableIterator<BedLine> iter = this.bedReader.iterator(convert_contig, ctx_start - 1, ctx_end + 1)) {
while (iter.hasNext()) {
final BedLine bed = iter.next();
if (bed == null)
continue;
if (!testOverlap(ctx_start, ctx_end, bed))
continue;
if (debug)
LOG.warn("tabix=true set Filter=true for " + ctx.getContig() + ":" + ctx.getStart());
set_filter = true;
break;
}
} catch (IOException err) {
throw new RuntimeIOException(err);
}
}
/* variant is in whitelist, we want to keep it */
if (this.tabixWhiteFile != null) {
set_filter = !set_filter;
if (debug)
LOG.warn("inverse. Now Filter=" + set_filter + " for " + ctx.getContig() + ":" + ctx.getStart());
}
if (!set_filter) {
if (debug)
LOG.warn("no filter. Writing " + ctx.getContig() + ":" + ctx.getStart());
w.add(ctx);
continue;
}
if (filter != null) {
if (debug)
LOG.warn("adding filter. Writing " + ctx.getContig() + ":" + ctx.getStart());
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.filter(filter.getID());
w.add(vcb.make());
} else {
if (debug)
LOG.warn("Ignoring " + ctx.getContig() + ":" + ctx.getStart());
}
}
if (!contigs_not_found.isEmpty()) {
LOG.warn("The following contigs were not found: " + String.join(" ", contigs_not_found) + "...");
}
return 0;
}
Aggregations