use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class MantaMerger method doWork.
@Override
public int doWork(final List<String> args) {
VariantContextWriter out = null;
try {
final Map<String, VcfInput> sample2inputs = new TreeMap<>();
SAMSequenceDictionary dict = null;
final List<String> lines;
if (args.size() == 1 && args.get(0).endsWith(".list")) {
lines = Files.lines(Paths.get(args.get(0))).filter(L -> !(StringUtils.isBlank(L) || L.startsWith("#"))).collect(Collectors.toList());
} else {
lines = args;
}
for (final String line : lines) {
final String[] tokens = CharSplitter.TAB.split(line);
final VcfInput vcfInput = new VcfInput();
vcfInput.vcfPath = Paths.get(tokens[0]);
IOUtil.assertFileIsReadable(vcfInput.vcfPath);
final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(vcfInput.vcfPath);
if (dict == null) {
dict = dict1;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict1)) {
throw new JvarkitException.DictionariesAreNotTheSame(dict1, dict);
}
if (tokens.length < 2 || StringUtils.isBlank(tokens[1])) {
try (VCFReader r = VCFReaderFactory.makeDefault().open(vcfInput.vcfPath, false)) {
List<String> snl = r.getHeader().getSampleNamesInOrder();
if (snl.size() == 1) {
vcfInput.sample = snl.get(0);
} else {
vcfInput.sample = vcfInput.vcfPath.toString();
}
}
} else {
vcfInput.sample = tokens[1];
}
if (sample2inputs.containsKey(vcfInput.sample)) {
LOG.error("duplicate sample " + vcfInput.sample);
return -1;
}
sample2inputs.put(vcfInput.sample, vcfInput);
}
if (sample2inputs.isEmpty()) {
LOG.error("no input found");
return -1;
}
if (!StringUtils.isBlank(this.limitContig) && dict.getSequence(this.limitContig) == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(this.limitContig, dict));
return -1;
}
final Predicate<VariantContext> bedPredicate;
if (this.excludeBedPath != null) {
final BedLineCodec codec = new BedLineCodec();
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
final IntervalTreeMap<Boolean> map = new IntervalTreeMap<>();
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> codec.decode(L)).filter(L -> L != null).filter(B -> !StringUtils.isBlank(converter.apply(B.getContig()))).map(B -> new Interval(converter.apply(B.getContig()), B.getStart(), B.getEnd())).forEach(R -> map.put(R, true));
}
bedPredicate = V -> !map.containsOverlapping(V);
} else {
bedPredicate = V -> true;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation length");
metaData.add(infoSvLen);
final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples");
metaData.add(infoNSamples);
final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "amples");
metaData.add(infoSamples);
final VCFFilterHeaderLine filterSameNext = new VCFFilterHeaderLine("NEXT", "next variant in VCF is the same.");
metaData.add(filterSameNext);
final VCFFilterHeaderLine filterSamePrev = new VCFFilterHeaderLine("PREV", "next variant in VCF is the same.");
metaData.add(filterSamePrev);
final VCFHeader header = new VCFHeader(metaData, sample2inputs.keySet());
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
out = VCFUtils.createVariantContextWriterToPath(this.outputFile);
out.writeHeader(header);
final Decoy decoy = Decoy.getDefaultInstance();
for (final SAMSequenceRecord ssr : dict.getSequences()) {
if (!StringUtils.isBlank(this.limitContig)) {
if (!ssr.getSequenceName().equals(this.limitContig))
continue;
}
LOG.info("contig " + ssr.getSequenceName());
if (decoy.isDecoy(ssr.getSequenceName()))
continue;
final Map<SVKey, Set<String>> variants2samples = new HashMap<>();
for (final VcfInput vcfinput : sample2inputs.values()) {
// reset count for this contig
vcfinput.contigCount = 0;
try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength()).stream().filter(V -> discard_bnd == false || !V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND")).filter(bedPredicate).map(V -> new VariantContextBuilder(V).unfiltered().noID().noGenotypes().rmAttribute("EVENT").rmAttribute("HOMSEQ").rmAttribute("HOMLEN").rmAttribute("SVINSSEQ").rmAttribute("SVINSLEN").rmAttribute("MATEID").rmAttribute("LEFT_SVINSSEQ").rmAttribute("RIGHT_SVINSSEQ").rmAttribute("BND_DEPTH").rmAttribute("MATE_BND_DEPTH").rmAttribute("JUNCTION_QUAL").rmAttribute("CIGAR").make()).forEach(V -> {
final SVKey key1 = new SVKey(V);
if (!svComparator.test(V, V))
throw new RuntimeException("compare to self failed ! " + V);
variants2samples.put(key1, new HashSet<>());
vcfinput.contigCount++;
});
}
}
if (variants2samples.isEmpty())
continue;
// build an interval tree for a faster access
final IntervalTree<SVKey> intervalTree = new IntervalTree<>();
for (final SVKey key : variants2samples.keySet()) {
final SimpleInterval r = new SimpleInterval(key.archetype).extend(this.svComparator.getBndDistance() + 1);
intervalTree.put(r.getStart(), r.getEnd(), key);
// paranoidcheck interval is ok to find current archetype
boolean found = false;
final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
while (nodeIter.hasNext()) {
final SVKey key1 = nodeIter.next().getValue();
if (this.svComparator.test(key1.archetype, key.archetype)) {
found = true;
break;
}
}
if (!found) {
out.close();
throw new RuntimeException("cannot find self " + key.archetype + " in " + r);
}
}
for (final VcfInput vcfinput : sample2inputs.values()) {
try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
final CloseableIterator<VariantContext> iter = vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength());
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
if (this.discard_bnd && ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
continue;
if (!bedPredicate.test(ctx))
continue;
final SimpleInterval r = new SimpleInterval(ctx).extend(this.svComparator.getBndDistance() + 1);
final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
while (nodeIter.hasNext()) {
final SVKey key1 = nodeIter.next().getValue();
if (!this.svComparator.test(key1.archetype, ctx))
continue;
final Set<String> samples = variants2samples.get(key1);
samples.add(vcfinput.sample);
}
}
iter.close();
}
}
final Comparator<VariantContext> sorter = new ContigDictComparator(dict).createLocatableComparator();
final List<SVKey> orderedKeys = variants2samples.keySet().stream().filter(// no samples for this key ??!
K -> !variants2samples.get(K).isEmpty()).sorted((A, B) -> sorter.compare(A.archetype, B.archetype)).collect(Collectors.toCollection(ArrayList::new));
// remove duplicates
int i = 0;
while (i + 1 < orderedKeys.size()) {
final SVKey key1 = orderedKeys.get(i);
final SVKey key2 = orderedKeys.get(i + 1);
if (svComparator.test(key1.archetype, key2.archetype) && // same samples
variants2samples.get(key1).equals(variants2samples.get(key2))) {
orderedKeys.remove(i + 1);
} else {
i++;
}
}
for (int key_index = 0; key_index < orderedKeys.size(); key_index++) {
final SVKey key = orderedKeys.get(key_index);
final Set<String> samples = variants2samples.get(key);
final Allele refAllele = key.archetype.getReference();
final Allele altAllele = Allele.create("<SV>", false);
final Object svType = key.archetype.getAttribute(VCFConstants.SVTYPE, ".");
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(key.archetype.getContig());
vcb.start(key.archetype.getStart());
vcb.stop(key.archetype.getEnd());
vcb.log10PError(key.archetype.getLog10PError());
vcb.alleles(Arrays.asList(refAllele, altAllele));
vcb.attribute(VCFConstants.END_KEY, key.archetype.getEnd());
vcb.attribute(VCFConstants.SVTYPE, svType);
vcb.attribute(infoSvLen.getID(), (svType.equals("DEL") ? -1 : 1) * key.archetype.getLengthOnReference());
vcb.attribute(infoNSamples.getID(), samples.size());
vcb.attribute(infoSamples.getID(), samples.stream().sorted().collect(Collectors.toList()));
int ac = 0;
final List<Genotype> genotypes = new ArrayList<>(sample2inputs.size());
for (final String sn : sample2inputs.keySet()) {
List<Allele> gta;
if (samples.contains(sn)) {
ac++;
gta = Arrays.asList(refAllele, altAllele);
} else {
gta = Arrays.asList(refAllele, refAllele);
}
genotypes.add(new GenotypeBuilder(sn, gta).make());
}
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, ac);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sample2inputs.size() * 2);
if (ac > 0) {
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, ac / (float) sample2inputs.size() * 2);
}
if (key_index > 0 && svComparator.test(key.archetype, orderedKeys.get(key_index - 1).archetype)) {
vcb.filter(filterSamePrev.getID());
}
if (key_index + 1 < orderedKeys.size() && svComparator.test(key.archetype, orderedKeys.get(key_index + 1).archetype)) {
System.err.println("SAME\n" + key.archetype + "\n" + orderedKeys.get(key_index + 1).archetype);
vcb.filter(filterSameNext.getID());
}
vcb.genotypes(genotypes);
out.add(vcb.make());
}
}
out.close();
out = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class VcfScanUpstreamOrf method beforeVcf.
@Override
protected int beforeVcf() {
try {
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(this.refCtgNameConverter);
// tmp IntervalTreeMap for gene, will be used to remove uORF overlapping alternate transcript with CDS */
final IntervalTreeMap<List<Transcript>> tmpTreeMap = new IntervalTreeMap<>();
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasCodonStartDefined() && T.hasCodonStopDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).filter(T -> (!this.plus_strand_only || T.isPositiveStrand())).forEach(T -> {
final Interval interval = new Interval(T);
List<Transcript> L = tmpTreeMap.get(interval);
if (L == null) {
L = new ArrayList<>();
tmpTreeMap.put(interval, L);
}
L.add(T);
});
for (final Transcript transcript : tmpTreeMap.values().stream().flatMap(L -> L.stream()).collect(Collectors.toList())) {
final Interval interval = transcript.getTranscriptUTR5().get().toInterval();
if (this.exclude_cds_overlaping_alternative) {
if (tmpTreeMap.getOverlapping(interval).stream().flatMap(L -> L.stream()).filter(// same object in memory
G -> G != transcript).anyMatch(K -> overlapCDS(transcript, K))) {
continue;
}
}
List<Transcript> L = this.transcriptMap.get(interval);
if (L == null) {
L = new ArrayList<>();
this.transcriptMap.put(interval, L);
}
if (this.canonical_utr) {
if (L.stream().map(K -> K.getTranscriptUTR5().get().toInterval()).anyMatch(R -> R.isNegativeStrand() == interval.isNegativeStrand() && R.equals(interval)))
continue;
}
L.add(transcript);
}
LOG.info("number of transcripts :" + this.transcriptMap.values().stream().flatMap(L -> L.stream()).count());
}
if (this.transcriptMap.isEmpty()) {
LOG.error("no transcripts found in " + this.gtfPath);
return -1;
}
if (this.archivePath != null) {
try (final ArchiveFactory archive = ArchiveFactory.open(this.archivePath)) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
final ContigDictComparator ctgCmp = new ContigDictComparator(dict);
final Comparator<Locatable> locCmp1 = (A, B) -> {
int i = ctgCmp.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
};
final PrintWriter pw1 = archive.openWriter("uorf-dna.fa");
final PrintWriter pw2 = archive.openWriter("uorf-pep.fa");
final PrintWriter pw3 = archive.openWriter("uorf.bed");
pw3.println("track name=\"uORF\" description=\"uORF for " + this.gtfPath + "\"");
pw3.println("#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tblockCount\tblockSizes\tblockStarts");
final ProgressFactory.Watcher<Locatable> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
for (final Transcript kg : this.transcriptMap.values().stream().flatMap(L -> L.stream()).sorted(locCmp1).collect(Collectors.toList())) {
/* new reference sequence */
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(kg.getContig())) {
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, kg.getContig());
}
final TranscriptRNA mRNA = new TranscriptRNA(kg);
final UpstreamORF uorf = new UpstreamORF(mRNA);
progress.apply(new Interval(uorf.getContig(), uorf.getChromStart() + 1, uorf.getChromEnd()));
final OpenReadingFrame orf = uorf.getBestOpenReadingFrame();
if (orf == null)
continue;
orf.printFastaDNA(pw1);
orf.printFastaPep(pw2);
orf.printBed(pw3);
}
;
pw1.flush();
pw1.close();
pw2.flush();
pw2.close();
pw3.flush();
pw3.close();
progress.close();
}
}
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class MergeStructuralVariants method doWork.
@Override
public int doWork(final List<String> args) {
if (this.max_distance < 0) {
LOG.error("bad max_distance :" + this.max_distance);
return -1;
}
final List<VcfInput> vcf_readers = new ArrayList<>();
try {
SAMSequenceDictionary dict = null;
final List<Path> inputPaths = (IOUtils.unrollPaths(args));
if (inputPaths.isEmpty()) {
LOG.error("input is empty");
return -1;
}
final Set<VCFHeaderLine> metadata = new HashSet<>();
for (int i = 0; i < inputPaths.size(); i++) {
final VcfInput vcfInput = new VcfInput(inputPaths.get(i));
vcf_readers.add(vcfInput);
if (dict == null) {
dict = vcfInput.dict;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, vcfInput.dict)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(vcfInput.dict, dict));
return -1;
}
}
final Comparator<String> contigComparator = new ContigDictComparator(dict);
final Comparator<CnvCall> comparator = (A, B) -> {
int i = contigComparator.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
i = Integer.compare(A.getEnd(), B.getEnd());
if (i != 0)
return i;
i = A.getType().compareTo(B.getType());
return i;
};
VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.END_KEY);
metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
metadata.add(new VCFInfoHeaderLine("CIPOS", 2, VCFHeaderLineType.Integer, "Confidence interval around POS for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("CIEND", 2, VCFHeaderLineType.Integer, "Confidence interval around END for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
/*metadata.add(new VCFFormatHeaderLine(
"OV",1,
VCFHeaderLineType.Integer,
"Number calls (with different sample) overlapping this genotype"
));*/
metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "SV type"));
final VCFHeader header = new VCFHeader(metadata, (Set<String>) vcf_readers.stream().map(F -> F.sample).collect(Collectors.toCollection(TreeSet::new)));
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
try (VariantContextWriter out = this.writingVariantsDelegate.open(this.outputFile)) {
out.writeHeader(header);
for (int tid = 0; tid < dict.size(); tid++) {
final SAMSequenceRecord ssr = dict.getSequence(tid);
for (VcfInput vin : vcf_readers) {
vin.loadVariants(ssr);
}
final List<CnvCall> intervals_list = vcf_readers.stream().flatMap(F -> F.callMap.values().stream()).collect(Collectors.toSet()).stream().sorted(comparator).collect(Collectors.toList());
for (final CnvCall baseCall : intervals_list) {
if (baseCall.echoed_flag)
continue;
final Allele ref = Allele.create("N", true);
final Allele alt = Allele.create("<" + baseCall.svType + ">", false);
final List<Allele> alleles = Arrays.asList(ref, alt);
final List<CnvCall> callsToPrint = vcf_readers.stream().flatMap(F -> F.callMap.getOverlapping(baseCall).stream().filter(C -> C.getType().equals(baseCall.getType())).filter(C -> !C.echoed_flag).filter(C -> testOverlapping(C, baseCall))).collect(Collectors.toList());
if (callsToPrint.isEmpty()) {
throw new IllegalStateException();
}
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(baseCall.getContig());
vcb.start(baseCall.getStart());
vcb.stop(baseCall.getEnd());
vcb.attribute(VCFConstants.END_KEY, baseCall.getEnd());
vcb.attribute(VCFConstants.SVTYPE, baseCall.getType());
vcb.attribute("SVLEN", CoordMath.getLength(baseCall.getStart(), baseCall.getEnd()));
for (int side = 0; side < 2; side++) {
final Function<CnvCall, Integer> coordExtractor;
if (side == 0) {
coordExtractor = C -> C.getStart();
} else {
coordExtractor = C -> C.getEnd();
}
final List<Integer> list = Arrays.asList(callsToPrint.stream().mapToInt(C -> coordExtractor.apply(C) - coordExtractor.apply(baseCall)).min().orElse(0), callsToPrint.stream().mapToInt(C -> coordExtractor.apply(C) - coordExtractor.apply(baseCall)).max().orElse(0));
vcb.attribute(side == 0 ? "CIPOS" : "CIEND", list);
}
vcb.attribute("IMPRECISE", true);
final Map<String, Genotype> sample2gt = new HashMap<>(callsToPrint.size());
for (final CnvCall call : callsToPrint) {
if (sample2gt.containsKey(call.getSample())) {
LOG.warn("Sample " + call.getSample() + " exits twice at the same loc " + call + " could be two small SV overlapping a big one.");
continue;
}
call.echoed_flag = true;
// final List<Allele> alleles=Arrays.asList(ref,alt);
final GenotypeBuilder gb = new GenotypeBuilder(call.sample, alleles);
sample2gt.put(call.getSample(), gb.make());
}
vcb.attribute("SAMPLES", new ArrayList<>(sample2gt.keySet()));
vcb.attribute("NSAMPLES", sample2gt.size());
vcb.genotypes(sample2gt.values());
vcb.alleles(alleles);
out.add(vcb.make());
}
// end of loop interval
}
/*
all_calls.values().stream().
flatMap(C->C.stream()).
filter(C->!C.echoed_flag).
forEach(C->LOG.warn("Bug: Not printed "+C));
*/
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
for (VcfInput vi : vcf_readers) try {
vi.close();
} catch (IOException err) {
}
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class MergeCnvNator method doWork.
@Override
public int doWork(final List<String> args) {
if (this.region_are_same_ratio <= 0 || this.region_are_same_ratio > 1) {
LOG.error("bad region_are_same_ratio :" + this.region_are_same_ratio);
return -1;
}
if (this.treshold < 0 || this.treshold >= 0.5) {
LOG.error("Bad treshold 0< " + this.treshold + " < 0.5");
return -1;
}
try {
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input is empty");
return -1;
}
final SAMSequenceDictionary dict;
final ContigNameConverter contigNameConverter;
if (this.dictRefFile != null) {
dict = SAMSequenceDictionaryExtractor.extractDictionary(this.dictRefFile);
contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
} else {
contigNameConverter = null;
dict = null;
}
final IntervalTreeMap<Boolean> limitBedIntervalTreeMap;
if (this.includeBed != null) {
limitBedIntervalTreeMap = new IntervalTreeMap<>();
final BedLineCodec codec = new BedLineCodec();
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.includeBed)) {
br.lines().filter(S -> !StringUtils.isBlank(S) && !BedLine.isBedHeader(S)).map(L -> codec.decode(L)).filter(B -> B != null).forEach(B -> {
final String ctg = contigNameConverter == null ? B.getContig() : contigNameConverter.apply(B.getContig());
if (StringUtils.isBlank(ctg))
return;
limitBedIntervalTreeMap.put(new Interval(ctg, B.getStart(), B.getEnd()), Boolean.TRUE);
});
}
} else {
limitBedIntervalTreeMap = null;
}
final Set<CNVNatorInterval> intervals_set = new HashSet<>();
final IntervalTreeMap<List<CnvNatorCall>> all_calls = new IntervalTreeMap<>();
final Set<String> all_samples = new TreeSet<>();
for (final Path input : inputs) {
final String fileSample;
LOG.info("Reading " + input);
if (this.inputType.equals(InputType.cnvnator)) {
fileSample = IOUtils.getFilenameWithoutCommonSuffixes(input);
all_samples.add(fileSample);
} else {
fileSample = null;
}
final BufferedReader br = IOUtils.openPathForBufferedReading(input);
String line;
while ((line = br.readLine()) != null) {
if (StringUtil.isBlank(line) || line.startsWith("#"))
continue;
final String[] tokens = CharSplitter.TAB.split(line);
final CnvNatorCall call;
if (this.inputType.equals(InputType.cnvnator)) {
call = new CnvNatorCall(fileSample, Arrays.asList(tokens));
} else {
call = new CnvNatorCall(null, Arrays.asList(tokens));
all_samples.add(call.sample);
}
if (limitBedIntervalTreeMap != null && !limitBedIntervalTreeMap.containsOverlapping(call)) {
continue;
}
if (call.size > this.max_cnv_size) {
continue;
}
if (dict != null && dict.getSequence(call.getContig()) == null) {
LOG.warn("skipping " + line + " because contig " + call.getContig() + " is not defined in dictionary");
continue;
}
intervals_set.add(call.interval);
final Interval key = new Interval(call);
List<CnvNatorCall> callList = all_calls.get(key);
if (callList == null) {
callList = new ArrayList<>();
all_calls.put(key, callList);
}
callList.add(call);
}
br.close();
}
// contig comparator
final Comparator<String> contigComparator;
if (dict != null) {
contigComparator = new ContigDictComparator(dict);
} else {
final SmartComparator smartComparator = new SmartComparator();
contigComparator = (A, B) -> smartComparator.compare(A, B);
}
// call comparator
final Comparator<CNVNatorInterval> comparator = (A, B) -> {
int i = contigComparator.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
i = Integer.compare(A.getEnd(), B.getEnd());
if (i != 0)
return i;
if (one_cnv_type) {
i = A.type.compareTo(B.type);
}
return i;
};
final List<CNVNatorInterval> intervals_list = intervals_set.stream().sorted(comparator).collect(Collectors.toList());
LOG.info("Identified " + intervals_list.size() + " intervals");
final Set<VCFHeaderLine> metadata = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Difference in length between REF and ALT alleles"));
metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Structural variation type"));
metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the CNV"));
metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the CNV"));
metadata.add(new VCFInfoHeaderLine("OV", 1, VCFHeaderLineType.Integer, "Number calls overlapping this genotype region, whatever their size ( with at most fraction :" + min_overlapping_ratio + ")."));
metadata.add(new VCFFormatHeaderLine("WARN", 1, VCFHeaderLineType.String, "WARNINGS"));
metadata.add(new VCFFormatHeaderLine("RD", 1, VCFHeaderLineType.Float, "Normalized RD"));
metadata.add(new VCFFormatHeaderLine("P1", 1, VCFHeaderLineType.Float, "e-val by t-test"));
metadata.add(new VCFFormatHeaderLine("P2", 1, VCFHeaderLineType.Float, "e-val by Gaussian tail"));
metadata.add(new VCFFormatHeaderLine("P3", 1, VCFHeaderLineType.Float, "e-val by t-test (middle)"));
metadata.add(new VCFFormatHeaderLine("P4", 1, VCFHeaderLineType.Float, "e-val by Gaussian tail (middle)"));
metadata.add(new VCFFormatHeaderLine("Q0", 1, VCFHeaderLineType.Float, "Fraction of reads with 0 mapping quality"));
metadata.add(new VCFFormatHeaderLine("CN", 1, VCFHeaderLineType.Integer, "Copy number genotype for imprecise events"));
metadata.add(new VCFFormatHeaderLine("PE", 1, VCFHeaderLineType.Integer, "Number of paired-ends that support the event"));
metadata.add(new VCFFormatHeaderLine("OV", 1, VCFHeaderLineType.Integer, "Number calls (with different sample) overlapping this genotype"));
final VCFHeader header = new VCFHeader(metadata, all_samples);
JVarkitVersion.getInstance().addMetaData(this, header);
if (dict != null)
header.setSequenceDictionary(dict);
try (VariantContextWriter out = this.writingVariants.dictionary(dict).open(this.outputFile)) {
long id_generator = 0L;
out.writeHeader(header);
final ProgressFactory.Watcher<CNVNatorInterval> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
String prevContig = null;
for (final CNVNatorInterval interval : intervals_list) {
final Set<String> warnings = new HashSet<>();
progress.apply(interval);
if (!interval.getContig().equals(prevContig)) {
// cleanup memory, increase speed ?
final String finalPrevCtg = prevContig;
all_calls.keySet().removeIf(R -> R.getContig().equals(finalPrevCtg));
prevContig = interval.getContig();
}
// find all call overlapping this call
final List<CnvNatorCall> overlappingList = all_calls.getOverlapping(interval).stream().flatMap(L -> L.stream()).filter(C -> !one_cnv_type || C.interval.type.equals(interval.type)).filter(C -> !uniq_use_of_one_cnv || !C.echoed_flag).collect(Collectors.toList());
if (overlappingList.isEmpty())
continue;
// main call for this interval
final CnvNatorCall baseCall = overlappingList.stream().filter(C -> C.interval.equals(interval)).findFirst().orElse(null);
if (baseCall == null) {
continue;
}
// filter calls matching the overlapping with baseCall
final List<CnvNatorCall> callsToPrint = overlappingList.stream().filter(C -> testOverlapping(C, baseCall)).collect(Collectors.toList());
if (callsToPrint.isEmpty()) {
throw new IllegalStateException();
}
final VariantContextBuilder vcb = new VariantContextBuilder();
final Set<Allele> altAlleles = new HashSet<>();
vcb.chr(baseCall.getContig());
vcb.start(baseCall.getStart());
vcb.stop(baseCall.getEnd());
vcb.attribute(VCFConstants.END_KEY, baseCall.getEnd());
vcb.attribute("SVLEN", baseCall.size);
vcb.attribute("IMPRECISE", true);
final Map<String, Genotype> sample2gt = new HashMap<>(callsToPrint.size());
for (final CnvNatorCall call : callsToPrint) {
if (sample2gt.containsKey(call.sample)) {
warnings.add("SAMPLE_" + call.sample + "_MULTIPLE");
LOG.warn("Sample " + call.sample + " exits twice at the same loc " + call + " could be two small SV overlapping a big one.");
continue;
}
if (uniq_use_of_one_cnv)
call.echoed_flag = true;
final GenotypeBuilder gb = new GenotypeBuilder(call.sample);
if (call.normalized_RD != null)
gb.attribute("RD", call.normalized_RD);
if (call.e_val_1 != null)
gb.attribute("P1", call.e_val_1);
if (call.e_val_2 != null)
gb.attribute("P2", call.e_val_2);
if (call.e_val_3 != null)
gb.attribute("P3", call.e_val_3);
if (call.e_val_4 != null)
gb.attribute("P4", call.e_val_4);
if (call.q0 != null)
gb.attribute("Q0", call.q0);
gb.attribute("OV", all_calls.getOverlapping(call).stream().flatMap(col -> col.stream()).filter(C -> !C.sample.equals(call.sample)).count());
if (call.interval.type == CnvType.deletion && call.normalized_RD != null && call.normalized_RD < this.treshold) {
gb.alleles(Arrays.asList(DEL_ALLELE, DEL_ALLELE));
altAlleles.add(DEL_ALLELE);
gb.attribute("CN", 0);
} else if (call.interval.type == CnvType.deletion && call.normalized_RD != null && call.normalized_RD >= this.treshold) {
gb.alleles(Arrays.asList(REF_ALLELE, DEL_ALLELE));
altAlleles.add(DEL_ALLELE);
gb.attribute("CN", 1);
} else if (call.interval.type == CnvType.duplication && call.normalized_RD != null && call.normalized_RD <= (1.5 + this.treshold)) {
gb.alleles(Arrays.asList(REF_ALLELE, DUP_ALLELE));
altAlleles.add(DUP_ALLELE);
gb.attribute("CN", 2);
} else if (call.interval.type == CnvType.duplication && call.normalized_RD != null && call.normalized_RD > (1.5 + this.treshold)) {
gb.alleles(Arrays.asList(DUP_ALLELE, DUP_ALLELE));
gb.attribute("CN", 9999);
altAlleles.add(DUP_ALLELE);
} else if (hom_ref_instead_of_nocall) {
gb.alleles(Arrays.asList(REF_ALLELE, REF_ALLELE));
} else {
gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
}
sample2gt.put(call.sample, gb.make());
}
if (altAlleles.isEmpty()) {
vcb.attribute(VCFConstants.SVTYPE, "UNDEFINED");
}
if (altAlleles.size() == 1 && altAlleles.contains(DEL_ALLELE)) {
vcb.attribute(VCFConstants.SVTYPE, "DEL");
} else if (altAlleles.size() == 1 && altAlleles.contains(DUP_ALLELE)) {
vcb.attribute(VCFConstants.SVTYPE, "DUP");
} else {
vcb.attribute(VCFConstants.SVTYPE, "MIXED");
}
final List<Allele> alleles = new ArrayList<>();
alleles.add(REF_ALLELE);
alleles.addAll(altAlleles);
vcb.id(String.format("cnv%05d", (++id_generator)));
vcb.alleles(alleles);
vcb.attribute("SAMPLES", new ArrayList<>(sample2gt.keySet()));
vcb.attribute("NSAMPLES", sample2gt.size());
if (!warnings.isEmpty()) {
vcb.attribute("WARN", new ArrayList<>(warnings));
}
vcb.attribute("OV", all_calls.getOverlapping(baseCall).stream().flatMap(L -> L.stream()).filter(L -> {
final Interval r1 = new Interval(L);
final Interval r2 = new Interval(baseCall);
if (!r1.overlaps(r2))
return false;
final double L1 = r1.getIntersectionLength(r2);
if ((L1 / r1.length()) < min_overlapping_ratio)
return false;
return true;
}).count());
vcb.genotypes(sample2gt.values());
out.add(vcb.make());
}
progress.close();
}
if (uniq_use_of_one_cnv) {
all_calls.values().stream().flatMap(C -> C.stream()).filter(C -> !C.echoed_flag).forEach(C -> LOG.warn("Bug: Not printed " + C));
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class BamXtremDepth method doWork.
@Override
public int doWork(final List<String> args) {
if (this.min_coverage >= this.max_coverage) {
LOG.equals("min coverage >= max_coverage");
return -1;
}
try {
final SamReaderFactory srf = SamReaderFactory.make().validationStringency(this.validationStringency).referenceSequence(this.faidxPath);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(faidxPath);
final List<Path> bamPaths = IOUtils.unrollPaths(args);
if (bamPaths.isEmpty()) {
LOG.error("No bam was defined");
return -1;
}
try (PrintWriter out = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
final boolean out_is_intervallist = (this.output_format_interval_list || (this.outputFile != null && (this.outputFile.toString().endsWith(FileExtensions.COMPRESSED_INTERVAL_LIST) || this.outputFile.toString().endsWith(FileExtensions.INTERVAL_LIST))));
if (out_is_intervallist) {
final SAMFileHeader outHeader = new SAMFileHeader(dict);
JVarkitVersion.getInstance().addMetaData(this, outHeader);
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
codec.encode(out, outHeader);
}
for (final SAMSequenceRecord ssr : dict.getSequences()) {
if (!this.limit_to_chromosomes.isEmpty() && !this.limit_to_chromosomes.contains(ssr.getSequenceName()))
continue;
LOG.info("Scanning " + ssr.getSequenceName());
final BitSet highList = new BitSet(ssr.getLengthOnReference());
highList.set(0, ssr.getSequenceLength());
final BitSet lowList = new BitSet(ssr.getLengthOnReference());
lowList.set(0, ssr.getSequenceLength());
runLoop(bamPaths, srf, ssr, dict, lowList, highList);
final List<Interval> rgn = new ArrayList<>(highList.size() + lowList.size());
bitSetToLocatables(ssr, lowList).stream().map(R -> new Interval(R.getContig(), R.getStart(), R.getEnd(), false, "LE." + this.min_coverage)).forEach(R -> rgn.add(R));
bitSetToLocatables(ssr, highList).stream().map(R -> new Interval(R.getContig(), R.getStart(), R.getEnd(), false, "GE." + this.max_coverage)).forEach(R -> rgn.add(R));
rgn.sort(new ContigDictComparator(dict).createLocatableComparator());
for (final Interval R : rgn) {
out.print(R.getContig());
out.print("\t");
out.print(R.getStart() - (out_is_intervallist ? 0 : 1));
out.print("\t");
out.print(R.getEnd());
out.print("\t");
if (out_is_intervallist)
out.print("+\t");
out.print(R.getName());
out.println();
}
}
out.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
Aggregations