use of com.github.lindenb.jvarkit.samtools.Decoy in project jvarkit by lindenb.
the class MantaMerger method doWork.
@Override
public int doWork(final List<String> args) {
VariantContextWriter out = null;
try {
final Map<String, VcfInput> sample2inputs = new TreeMap<>();
SAMSequenceDictionary dict = null;
final List<String> lines;
if (args.size() == 1 && args.get(0).endsWith(".list")) {
lines = Files.lines(Paths.get(args.get(0))).filter(L -> !(StringUtils.isBlank(L) || L.startsWith("#"))).collect(Collectors.toList());
} else {
lines = args;
}
for (final String line : lines) {
final String[] tokens = CharSplitter.TAB.split(line);
final VcfInput vcfInput = new VcfInput();
vcfInput.vcfPath = Paths.get(tokens[0]);
IOUtil.assertFileIsReadable(vcfInput.vcfPath);
final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(vcfInput.vcfPath);
if (dict == null) {
dict = dict1;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict1)) {
throw new JvarkitException.DictionariesAreNotTheSame(dict1, dict);
}
if (tokens.length < 2 || StringUtils.isBlank(tokens[1])) {
try (VCFReader r = VCFReaderFactory.makeDefault().open(vcfInput.vcfPath, false)) {
List<String> snl = r.getHeader().getSampleNamesInOrder();
if (snl.size() == 1) {
vcfInput.sample = snl.get(0);
} else {
vcfInput.sample = vcfInput.vcfPath.toString();
}
}
} else {
vcfInput.sample = tokens[1];
}
if (sample2inputs.containsKey(vcfInput.sample)) {
LOG.error("duplicate sample " + vcfInput.sample);
return -1;
}
sample2inputs.put(vcfInput.sample, vcfInput);
}
if (sample2inputs.isEmpty()) {
LOG.error("no input found");
return -1;
}
if (!StringUtils.isBlank(this.limitContig) && dict.getSequence(this.limitContig) == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(this.limitContig, dict));
return -1;
}
final Predicate<VariantContext> bedPredicate;
if (this.excludeBedPath != null) {
final BedLineCodec codec = new BedLineCodec();
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
final IntervalTreeMap<Boolean> map = new IntervalTreeMap<>();
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> codec.decode(L)).filter(L -> L != null).filter(B -> !StringUtils.isBlank(converter.apply(B.getContig()))).map(B -> new Interval(converter.apply(B.getContig()), B.getStart(), B.getEnd())).forEach(R -> map.put(R, true));
}
bedPredicate = V -> !map.containsOverlapping(V);
} else {
bedPredicate = V -> true;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation length");
metaData.add(infoSvLen);
final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples");
metaData.add(infoNSamples);
final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "amples");
metaData.add(infoSamples);
final VCFFilterHeaderLine filterSameNext = new VCFFilterHeaderLine("NEXT", "next variant in VCF is the same.");
metaData.add(filterSameNext);
final VCFFilterHeaderLine filterSamePrev = new VCFFilterHeaderLine("PREV", "next variant in VCF is the same.");
metaData.add(filterSamePrev);
final VCFHeader header = new VCFHeader(metaData, sample2inputs.keySet());
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
out = VCFUtils.createVariantContextWriterToPath(this.outputFile);
out.writeHeader(header);
final Decoy decoy = Decoy.getDefaultInstance();
for (final SAMSequenceRecord ssr : dict.getSequences()) {
if (!StringUtils.isBlank(this.limitContig)) {
if (!ssr.getSequenceName().equals(this.limitContig))
continue;
}
LOG.info("contig " + ssr.getSequenceName());
if (decoy.isDecoy(ssr.getSequenceName()))
continue;
final Map<SVKey, Set<String>> variants2samples = new HashMap<>();
for (final VcfInput vcfinput : sample2inputs.values()) {
// reset count for this contig
vcfinput.contigCount = 0;
try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength()).stream().filter(V -> discard_bnd == false || !V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND")).filter(bedPredicate).map(V -> new VariantContextBuilder(V).unfiltered().noID().noGenotypes().rmAttribute("EVENT").rmAttribute("HOMSEQ").rmAttribute("HOMLEN").rmAttribute("SVINSSEQ").rmAttribute("SVINSLEN").rmAttribute("MATEID").rmAttribute("LEFT_SVINSSEQ").rmAttribute("RIGHT_SVINSSEQ").rmAttribute("BND_DEPTH").rmAttribute("MATE_BND_DEPTH").rmAttribute("JUNCTION_QUAL").rmAttribute("CIGAR").make()).forEach(V -> {
final SVKey key1 = new SVKey(V);
if (!svComparator.test(V, V))
throw new RuntimeException("compare to self failed ! " + V);
variants2samples.put(key1, new HashSet<>());
vcfinput.contigCount++;
});
}
}
if (variants2samples.isEmpty())
continue;
// build an interval tree for a faster access
final IntervalTree<SVKey> intervalTree = new IntervalTree<>();
for (final SVKey key : variants2samples.keySet()) {
final SimpleInterval r = new SimpleInterval(key.archetype).extend(this.svComparator.getBndDistance() + 1);
intervalTree.put(r.getStart(), r.getEnd(), key);
// paranoidcheck interval is ok to find current archetype
boolean found = false;
final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
while (nodeIter.hasNext()) {
final SVKey key1 = nodeIter.next().getValue();
if (this.svComparator.test(key1.archetype, key.archetype)) {
found = true;
break;
}
}
if (!found) {
out.close();
throw new RuntimeException("cannot find self " + key.archetype + " in " + r);
}
}
for (final VcfInput vcfinput : sample2inputs.values()) {
try (VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfinput.vcfPath, true)) {
final CloseableIterator<VariantContext> iter = vcfFileReader.query(ssr.getSequenceName(), 1, ssr.getSequenceLength());
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
if (this.discard_bnd && ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
continue;
if (!bedPredicate.test(ctx))
continue;
final SimpleInterval r = new SimpleInterval(ctx).extend(this.svComparator.getBndDistance() + 1);
final Iterator<IntervalTree.Node<SVKey>> nodeIter = intervalTree.overlappers(r.getStart(), r.getEnd());
while (nodeIter.hasNext()) {
final SVKey key1 = nodeIter.next().getValue();
if (!this.svComparator.test(key1.archetype, ctx))
continue;
final Set<String> samples = variants2samples.get(key1);
samples.add(vcfinput.sample);
}
}
iter.close();
}
}
final Comparator<VariantContext> sorter = new ContigDictComparator(dict).createLocatableComparator();
final List<SVKey> orderedKeys = variants2samples.keySet().stream().filter(// no samples for this key ??!
K -> !variants2samples.get(K).isEmpty()).sorted((A, B) -> sorter.compare(A.archetype, B.archetype)).collect(Collectors.toCollection(ArrayList::new));
// remove duplicates
int i = 0;
while (i + 1 < orderedKeys.size()) {
final SVKey key1 = orderedKeys.get(i);
final SVKey key2 = orderedKeys.get(i + 1);
if (svComparator.test(key1.archetype, key2.archetype) && // same samples
variants2samples.get(key1).equals(variants2samples.get(key2))) {
orderedKeys.remove(i + 1);
} else {
i++;
}
}
for (int key_index = 0; key_index < orderedKeys.size(); key_index++) {
final SVKey key = orderedKeys.get(key_index);
final Set<String> samples = variants2samples.get(key);
final Allele refAllele = key.archetype.getReference();
final Allele altAllele = Allele.create("<SV>", false);
final Object svType = key.archetype.getAttribute(VCFConstants.SVTYPE, ".");
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(key.archetype.getContig());
vcb.start(key.archetype.getStart());
vcb.stop(key.archetype.getEnd());
vcb.log10PError(key.archetype.getLog10PError());
vcb.alleles(Arrays.asList(refAllele, altAllele));
vcb.attribute(VCFConstants.END_KEY, key.archetype.getEnd());
vcb.attribute(VCFConstants.SVTYPE, svType);
vcb.attribute(infoSvLen.getID(), (svType.equals("DEL") ? -1 : 1) * key.archetype.getLengthOnReference());
vcb.attribute(infoNSamples.getID(), samples.size());
vcb.attribute(infoSamples.getID(), samples.stream().sorted().collect(Collectors.toList()));
int ac = 0;
final List<Genotype> genotypes = new ArrayList<>(sample2inputs.size());
for (final String sn : sample2inputs.keySet()) {
List<Allele> gta;
if (samples.contains(sn)) {
ac++;
gta = Arrays.asList(refAllele, altAllele);
} else {
gta = Arrays.asList(refAllele, refAllele);
}
genotypes.add(new GenotypeBuilder(sn, gta).make());
}
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, ac);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sample2inputs.size() * 2);
if (ac > 0) {
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, ac / (float) sample2inputs.size() * 2);
}
if (key_index > 0 && svComparator.test(key.archetype, orderedKeys.get(key_index - 1).archetype)) {
vcb.filter(filterSamePrev.getID());
}
if (key_index + 1 < orderedKeys.size() && svComparator.test(key.archetype, orderedKeys.get(key_index + 1).archetype)) {
System.err.println("SAME\n" + key.archetype + "\n" + orderedKeys.get(key_index + 1).archetype);
vcb.filter(filterSameNext.getID());
}
vcb.genotypes(genotypes);
out.add(vcb.make());
}
}
out.close();
out = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.samtools.Decoy in project jvarkit by lindenb.
the class ScanStructuralVariants method doWork.
@Override
public int doWork(final List<String> args) {
final List<VCFReader> casesFiles = new ArrayList<>();
if (this.svComparator.getBndDistance() < 0) {
LOG.error("bad max_distance :" + this.svComparator.getBndDistance());
return -1;
}
VariantContextWriter out = null;
try {
final List<Path> casesPaths = (IOUtils.unrollPaths(args));
if (casesPaths.isEmpty()) {
LOG.error("cases list is empty");
return -1;
}
if (!print_all_ctx && casesPaths.size() == 1) {
LOG.warning("One case: switching to --all");
print_all_ctx = true;
}
if (this.controlsPath.size() == 1 && this.controlsPath.get(0).toString().endsWith(".list")) {
this.controlsPath = Files.lines(this.controlsPath.get(0)).filter(L -> !(L.startsWith("#") || StringUtils.isBlank(L))).map(L -> Paths.get(L)).collect(Collectors.toList());
}
SAMSequenceDictionary dict = null;
final Set<VCFHeaderLine> metadata = new HashSet<>();
for (int side = 0; side < 2; side++) {
for (final Path input : (side == 0 ? casesPaths : this.controlsPath)) {
final VCFReader vcfInput = VCFReaderFactory.makeDefault().open(input);
final VCFHeader header = vcfInput.getHeader();
if (side == 0) {
casesFiles.add(vcfInput);
} else {
vcfInput.close();
}
final SAMSequenceDictionary dict2 = SequenceDictionaryUtils.extractRequired(header);
if (dict == null) {
dict = dict2;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict2)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict2, dict));
return -1;
}
}
}
final IntervalTreeMap<Boolean> intervalTreeMap;
if (intervalListProvider != null) {
intervalTreeMap = new IntervalTreeMap<>();
intervalListProvider.dictionary(dict).stream().forEach(R -> intervalTreeMap.put(new Interval(R), true));
} else {
intervalTreeMap = null;
}
casesFiles.stream().flatMap(F -> F.getHeader().getMetaDataInInputOrder().stream()).forEach(H -> metadata.add(H));
VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.END_KEY);
metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
metadata.add(new VCFInfoHeaderLine("CIPOS", 2, VCFHeaderLineType.Integer, "Confidence interval around POS for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("CIEND", 2, VCFHeaderLineType.Integer, "Confidence interval around END for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
metadata.add(new VCFInfoHeaderLine(ATT_FILENAME, 1, VCFHeaderLineType.String, "Source of variant"));
metadata.add(new VCFInfoHeaderLine(ATT_CLUSTER, 1, VCFHeaderLineType.String, "Variant cluster"));
/*metadata.add(new VCFFormatHeaderLine(
"OV",1,
VCFHeaderLineType.Integer,
"Number calls (with different sample) overlapping this genotype"
));*/
metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "SV type"));
metadata.add(new VCFFilterHeaderLine(ATT_CONTROL, "Variant is found in controls (max MAF=" + this.max_maf + ")"));
final VCFHeader header = new VCFHeader(metadata);
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final List<ShadowedVcfReader> controlShadowReaders = new ArrayList<>(this.controlsPath.size());
for (int i = 0; i < this.controlsPath.size(); i++) {
boolean large_flag = this.max_control_large_flag < 0 || i >= this.max_control_large_flag;
controlShadowReaders.add(new ShadowedVcfReader(this.controlsPath.get(i), large_flag));
}
out = super.openVariantContextWriter(this.outputFile);
out.writeHeader(header);
final CloseableIterator<VariantContext> iter = casesFiles.get(0).iterator();
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
final Decoy decoy = Decoy.getDefaultInstance();
VariantContext prevCtx = null;
while (iter.hasNext()) {
final VariantContext ctx = progress.apply(iter.next());
if (decoy.isDecoy(ctx.getContig()))
continue;
if (Breakend.parse(ctx).stream().anyMatch(B -> decoy.isDecoy(B.getContig())))
continue;
if (intervalTreeMap != null && !intervalTreeMap.containsOverlapping(ctx))
continue;
// in manta, I see the same variant multiple times in the same vcf
if (prevCtx != null && ctx.getContig().equals(prevCtx.getContig()) && ctx.getStart() == prevCtx.getStart() && ctx.getEnd() == prevCtx.getEnd())
continue;
prevCtx = ctx;
final List<VariantContext> candidate = new ArrayList<>(casesFiles.size());
candidate.add(ctx);
recursive(ctx, candidate, casesFiles, controlShadowReaders, out);
}
iter.close();
progress.close();
out.close();
out = null;
casesFiles.stream().forEach(F -> {
try {
F.close();
} catch (Exception err) {
}
});
controlShadowReaders.stream().forEach(F -> F.realClose());
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
Aggregations