Search in sources :

Example 16 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

The class BamToFastq, method doWork.

@Override
public int doWork(List<String> args) {
    // Converts a BAM/SAM input back to FASTQ. Records are first spilled into a
    // disk-backed SortingCollection (grouped by read name by MappedFastqComparator
    // — presumably name-then-side; the pairing loop below relies on it), then
    // written out as single-end or paired-end FASTQ depending on the read flags.
    // Returns RETURN_OK on success, -1 on any error.
    SamReader sfr = null;
    SortingCollection<MappedFastq> fastqCollection = null;
    try {
        // detect (and forbid) a mix of paired and unpaired reads
        boolean found_single = false;
        boolean found_paired = false;
        // number of secondary/supplementary alignments skipped (name has a typo: "alignmaned")
        long non_primary_alignmaned_flag = 0L;
        final String input = oneFileOrNull(args);
        final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.faidx);
        // read from stdin when no file argument was given
        sfr = (input == null ? srf.open(SamInputResource.of(stdin())) : srf.open(SamInputResource.of(input)));
        fastqCollection = SortingCollection.newInstance(MappedFastq.class, new MappedFastqCodec(), new MappedFastqComparator(), this.maxRecordsInRam, this.tmpDir.toPath());
        // destructive iteration: the collection may discard records as they are read back
        fastqCollection.setDestructiveIteration(true);
        SAMRecordIterator iter = sfr.iterator();
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader().getSequenceDictionary());
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.isSecondaryOrSupplementary()) {
                // warn only on the first skipped record.
                // NOTE(review): the message always reads "1 ALIGNMENTS"; the final
                // skipped count accumulated below is never reported.
                if (non_primary_alignmaned_flag == 0) {
                    LOG.warn("SKIPPING NON-PRIMARY " + (non_primary_alignmaned_flag + 1) + " ALIGNMENTS");
                }
                non_primary_alignmaned_flag++;
                continue;
            }
            MappedFastq m = new MappedFastq();
            m.name = rec.getReadName();
            if (m.name == null)
                m.name = "";
            // normalize htsjdk's "no sequence"/"no quality" sentinels to empty strings
            m.seq = rec.getReadString();
            if (m.seq.equals(SAMRecord.NULL_SEQUENCE_STRING))
                m.seq = "";
            m.qual = rec.getBaseQualityString();
            if (m.qual.equals(SAMRecord.NULL_QUALS_STRING))
                m.qual = "";
            // reads mapped on the reverse strand are stored reverse-complemented in
            // the BAM; restore the original sequencing orientation for FASTQ output
            if (!rec.getReadUnmappedFlag() && rec.getReadNegativeStrandFlag()) {
                m.seq = AcidNucleics.reverseComplement(m.seq);
                m.qual = new StringBuilder(m.qual).reverse().toString();
            }
            if (m.seq.length() != m.qual.length()) {
                LOG.error("length(seq)!=length(qual) in " + m.name);
                continue;
            }
            // empty read: emit a one-base placeholder so the FASTQ record stays valid
            if (m.seq.isEmpty() && m.qual.isEmpty()) {
                m.seq = "N";
                m.qual = "#";
            }
            if (rec.getReadPairedFlag()) {
                found_paired = true;
                if (found_single) {
                    sfr.close();
                    throw new RuntimeException("input is a mix of paired/singled reads");
                }
                // side: 1 = first of pair, 2 = second of pair
                m.side = (byte) (rec.getSecondOfPairFlag() ? 2 : 1);
            } else {
                found_single = true;
                if (found_paired) {
                    sfr.close();
                    throw new RuntimeException("input is a mix of paired/singled reads");
                }
                // side 0 = unpaired read
                m.side = (byte) 0;
            }
            fastqCollection.add(m);
        }
        iter.close();
        CloserUtil.close(iter);
        CloserUtil.close(sfr);
        progress.finish();
        fastqCollection.doneAdding();
        LOG.info("Done reading.");
        if (found_paired) {
            // paired-end mode: R1 goes to fqw1, R2 to fqw2 (or interleaved into fqw1)
            FastqWriter fqw1 = null;
            FastqWriter fqw2 = null;
            if (forwardFile != null) {
                LOG.info("Writing to " + forwardFile);
                fqw1 = new BasicFastqWriter(FastqUtils.validateFastqFilename(forwardFile));
            } else {
                LOG.info("Writing to stdout");
                fqw1 = new BasicFastqWriter(new PrintStream(stdout()));
            }
            if (reverseFile != null) {
                LOG.info("Writing to " + reverseFile);
                fqw2 = new BasicFastqWriter(FastqUtils.validateFastqFilename(reverseFile));
            } else {
                // no reverse file: interleave both mates into the forward writer
                LOG.info("Writing to interlaced stdout");
                fqw2 = fqw1;
            }
            // 'row' accumulates consecutive records sharing one read name; the
            // grouping relies on the sorting collection ordering records by name
            List<MappedFastq> row = new ArrayList<MappedFastq>();
            CloseableIterator<MappedFastq> r = fastqCollection.iterator();
            for (; ; ) {
                MappedFastq curr = null;
                if (r.hasNext())
                    curr = r.next();
                // flush the row at end-of-input, or when the read name changes
                if (curr == null || (!row.isEmpty() && !row.get(0).name.equals(curr.name))) {
                    if (!row.isEmpty()) {
                        // more than two primary records for one name should not happen
                        if (row.size() > 2) {
                            LOG.warn("WTF :" + row);
                        }
                        boolean found_F = false;
                        boolean found_R = false;
                        for (MappedFastq m : row) {
                            switch((int) m.side) {
                                case 1:
                                    if (found_F)
                                        throw new RuntimeException("two forward reads found for " + row.get(0).name);
                                    found_F = true;
                                    echo(fqw1, m);
                                    break;
                                case 2:
                                    if (found_R)
                                        throw new RuntimeException("two reverse reads found for " + row.get(0).name);
                                    found_R = true;
                                    echo(fqw2, m);
                                    break;
                                default:
                                    // side==0 (unpaired) is impossible here: this branch only
                                    // runs when the whole input was paired
                                    throw new IllegalStateException("uh???");
                            }
                        }
                        // missing mate: either pad with a placeholder read, or fail hard
                        if (!found_F) {
                            if (this.repair_missing_read) {
                                LOG.warn("forward not found for " + row.get(0));
                                MappedFastq pad = new MappedFastq();
                                pad.side = (byte) 1;
                                pad.name = row.get(0).name;
                                pad.seq = "N";
                                pad.qual = "#";
                                echo(fqw1, pad);
                            } else {
                                throw new RuntimeException("forward not found for " + row);
                            }
                        }
                        if (!found_R) {
                            if (repair_missing_read) {
                                LOG.warn("reverse not found for " + row.get(0));
                                MappedFastq pad = new MappedFastq();
                                pad.side = (byte) 2;
                                pad.name = row.get(0).name;
                                pad.seq = "N";
                                pad.qual = "#";
                                echo(fqw2, pad);
                            } else {
                                throw new RuntimeException("reverse not found for " + row);
                            }
                        }
                    }
                    if (curr == null)
                        break;
                    row.clear();
                }
                row.add(curr);
            }
            r.close();
            // NOTE(review): when fqw2 == fqw1 (interleaved output) the same writer
            // is closed twice; also the writers leak if an exception is thrown above.
            fqw1.close();
            fqw2.close();
        } else if (found_single) {
            // single-end mode: every record goes to one writer
            FastqWriter fqw1 = null;
            if (forwardFile != null) {
                LOG.info("Writing to " + forwardFile);
                fqw1 = new BasicFastqWriter(forwardFile);
            } else {
                LOG.info("Writing to stdout");
                fqw1 = new BasicFastqWriter(new PrintStream(stdout()));
            }
            final CloseableIterator<MappedFastq> r = fastqCollection.iterator();
            while (r.hasNext()) {
                echo(fqw1, r.next());
            }
            r.close();
            fqw1.close();
        }
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        // always remove the sorting collection's temporary spill files
        if (fastqCollection != null)
            fastqCollection.cleanup();
    }
}
Also used : PrintStream(java.io.PrintStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) IOException(java.io.IOException) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) BasicFastqWriter(htsjdk.samtools.fastq.BasicFastqWriter) FastqWriter(htsjdk.samtools.fastq.FastqWriter) BasicFastqWriter(htsjdk.samtools.fastq.BasicFastqWriter)

Example 17 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

The class SamTranslocations, method makeIterator.

private CloseableIterator<SAMRecord> makeIterator(final SamReader sr) {
    // Builds an iterator over candidate translocation-supporting reads of a
    // coordinate-sorted BAM: primary, mapped, non-duplicate reads with a sample
    // read-group and MAPQ strictly greater than min_mapq, whose mate maps to a
    // *different* allowed contig.
    // Throws JvarkitException.BamBadSortOrder if the input is not coordinate-sorted,
    // and RuntimeIOException on I/O errors or when a BED is given for an unindexed BAM.
    try {
        CloseableIterator<SAMRecord> iter1;
        final SAMFileHeader header = sr.getFileHeader();
        if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
            throw new JvarkitException.BamBadSortOrder(SAMFileHeader.SortOrder.coordinate, header.getSortOrder());
        }
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
        // optional user regex restricting which contigs may carry a breakpoint
        final Pattern pat = StringUtils.isBlank(this.chromRegex) ? null : Pattern.compile(this.chromRegex);
        // allowedContigs[tid] == true when contig tid passes the regex (default: all)
        final boolean[] allowedContigs = new boolean[dict.size()];
        Arrays.fill(allowedContigs, true);
        if (pat != null) {
            dict.getSequences().stream().filter(SR -> !pat.matcher(SR.getSequenceName()).matches()).forEach(SR -> allowedContigs[SR.getSequenceIndex()] = false);
        }
        if (this.bedFile == null) {
            // no BED restriction: scan the whole file
            iter1 = sr.iterator();
        } else {
            // region-restricted scan requires a BAM index
            if (!sr.hasIndex()) {
                throw new RuntimeIOException("SamReader " + sr.getResourceDescription() + " is not indexed");
            }
            final BedLineCodec bedCodec = new BedLineCodec();
            // maps BED contig names (e.g. "chr1" vs "1") onto this dictionary's names
            final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
            final List<QueryInterval> queryIntervals = new ArrayList<>();
            // convert every usable BED line (non-header, non-empty, positive length,
            // known and allowed contig) into a QueryInterval
            try (BufferedReader br = com.github.lindenb.jvarkit.io.IOUtils.openPathForBufferedReading(this.bedFile)) {
                br.lines().filter(line -> !(line.startsWith("#") || com.github.lindenb.jvarkit.util.bio.bed.BedLine.isBedHeader(line) || line.isEmpty())).map(line -> bedCodec.decode(line)).filter(B -> B != null).map(B -> B.toInterval()).filter(L -> L.getStart() < L.getEnd()).forEach(B -> {
                    final String c = contigNameConverter.apply(B.getContig());
                    if (StringUtils.isBlank(c))
                        return;
                    final int tid = dict.getSequenceIndex(c);
                    if (tid < 0)
                        return;
                    if (!allowedContigs[tid])
                        return;
                    final QueryInterval qi = new QueryInterval(tid, B.getStart(), B.getEnd());
                    queryIntervals.add(qi);
                });
            }
            // sort/merge intervals as required by queryOverlapping()
            final QueryInterval[] array = QueryInterval.optimizeIntervals(queryIntervals.toArray(new QueryInterval[queryIntervals.size()]));
            iter1 = sr.queryOverlapping(array);
        }
        final short SA = SAMTag.SA.getBinaryTag();
        // keep only reads that look like one side of an inter-contig pair
        final FilterIterator<SAMRecord> fsi = new FilterIterator<>(iter1, (SR) -> {
            if (SR.getReadUnmappedFlag())
                return false;
            if (SR.getDuplicateReadFlag())
                return false;
            if (SR.isSecondaryOrSupplementary())
                return false;
            // note: strictly greater — a read at exactly min_mapq is rejected
            if (SR.getMappingQuality() <= min_mapq)
                return false;
            final SAMReadGroupRecord rg = SR.getReadGroup();
            if (rg == null)
                return false;
            final String sn = rg.getSample();
            if (StringUtils.isBlank(sn))
                return false;
            // too many supplementary ('SA') alignments on allowed contigs -> noisy read
            if (max_sa_attribute > 0 && SR.getAttribute(SA) != null && SAMUtils.getOtherCanonicalAlignments(SR).stream().filter(R -> allowedContigs[R.getReferenceIndex()]).count() > max_sa_attribute) {
                return false;
            }
            // accept: paired, mate mapped, read and mate on different allowed contigs
            if (SR.getReadPairedFlag() && !SR.getMateUnmappedFlag() && allowedContigs[SR.getReferenceIndex()] && !SR.getReferenceIndex().equals(SR.getMateReferenceIndex()) && allowedContigs[SR.getMateReferenceIndex()]) {
                return true;
            }
            return false;
        });
        return fsi;
    } catch (final IOException err) {
        throw new RuntimeIOException(err);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SortOrder(htsjdk.samtools.SAMFileHeader.SortOrder) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) SamFileHeaderMerger(htsjdk.samtools.SamFileHeaderMerger) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) StructuralVariantType(htsjdk.variant.variantcontext.StructuralVariantType) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Pattern(java.util.regex.Pattern) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) MergingIterator(htsjdk.samtools.util.MergingIterator) SAMUtils(htsjdk.samtools.SAMUtils) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) 
Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) SAMTag(htsjdk.samtools.SAMTag) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) LinkedList(java.util.LinkedList) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) FilterIterator(com.github.lindenb.jvarkit.util.iterator.FilterIterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) QueryInterval(htsjdk.samtools.QueryInterval) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) Collections(java.util.Collections) Pattern(java.util.regex.Pattern) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) QueryInterval(htsjdk.samtools.QueryInterval) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) FilterIterator(com.github.lindenb.jvarkit.util.iterator.FilterIterator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)

Example 18 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

The class ScanStructuralVariants, method recursive.

/**
 * Recursively checks that the structural variant {@code ctx} is present in every
 * input VCF, then (terminal case) screens it against the control VCFs and writes
 * either a merged imprecise SV record or, when {@code print_all_ctx} is set, all
 * matching candidates tagged with a shared cluster id.
 *
 * @param ctx           the structural variant being evaluated
 * @param candidates    matching variants accumulated so far, one per already-visited
 *                      input VCF (control matches are appended in the terminal case)
 * @param vcfFilesInput one reader per case VCF; index {@code candidates.size()} is
 *                      the next file to query
 * @param shadowControls control-population readers used for the frequency screen
 * @param out           destination writer for accepted variants
 * @return 0 when the variant was written, -1 when it was rejected (absent from one
 *         input VCF, or matched by more than {@code max_maf} of the controls)
 */
private int recursive(final VariantContext ctx, final List<VariantContext> candidates, final List<VCFReader> vcfFilesInput, final List<ShadowedVcfReader> shadowControls, final VariantContextWriter out) {
    if (candidates.size() == vcfFilesInput.size()) {
        // Terminal case: ctx was found in every input VCF.
        // Reject if more than max_maf of the controls also carry it.
        final int max_controls = (int) (shadowControls.size() * max_maf);
        int count_matching_controls = 0;
        for (final ShadowedVcfReader vcfReader : shadowControls) {
            // query the control around ctx, padded by the comparator's BND distance
            final CloseableIterator<VariantContext> iter = vcfReader.query(ctx.getContig(), Math.max(1, ctx.getStart() - this.svComparator.getBndDistance()), ctx.getEnd() + this.svComparator.getBndDistance());
            while (iter.hasNext()) {
                final VariantContext ctx3 = iter.next();
                if (this.svComparator.test(ctx3, ctx)) {
                    count_matching_controls++;
                    // record the matching control variant, filtered/flagged as a control
                    candidates.add(new VariantContextBuilder(ctx3).noGenotypes().filter(ATT_CONTROL).attribute(ATT_FILENAME, vcfReader.vcfPath.toString()).make());
                    break;
                }
            }
            iter.close();
            // NOTE(review): the shadow reader is closed here; assumes ShadowedVcfReader
            // reopens lazily (or is not needed again) on later calls — confirm.
            vcfReader.close();
            if (count_matching_controls > max_controls)
                return -1;
        }
        if (this.print_all_ctx) {
            // emit every candidate individually, tagged with a shared cluster id
            final String cluster = "CTX" + (++ID_GENERATOR);
            for (int x = 0; x < candidates.size(); ++x) {
                out.add(new VariantContextBuilder(candidates.get(x)).noGenotypes().attribute(ATT_CLUSTER, cluster).make());
            }
            return 0;
        }
        // Otherwise build one merged, imprecise SV record spanning ctx.
        final VariantContextBuilder vcb = new VariantContextBuilder();
        vcb.chr(ctx.getContig());
        vcb.start(ctx.getStart());
        vcb.stop(ctx.getEnd());
        vcb.attribute(VCFConstants.END_KEY, ctx.getEnd());
        vcb.attribute("SVLEN", ctx.getLengthOnReference());
        final String svType = ctx.getAttributeAsString(VCFConstants.SVTYPE, ".");
        vcb.attribute(VCFConstants.SVTYPE, svType);
        vcb.attribute("IMPRECISE", true);
        // CIPOS/CIEND: min/max offset of the candidates' start (side 0) and
        // end (side 1) coordinates relative to ctx
        for (int side = 0; side < 2; side++) {
            final Function<VariantContext, Integer> coordExtractor;
            if (side == 0) {
                coordExtractor = C -> C.getStart();
            } else {
                coordExtractor = C -> C.getEnd();
            }
            final List<Integer> list = Arrays.asList(candidates.stream().mapToInt(C -> coordExtractor.apply(C) - coordExtractor.apply(ctx)).min().orElse(0), candidates.stream().mapToInt(C -> coordExtractor.apply(C) - coordExtractor.apply(ctx)).max().orElse(0));
            vcb.attribute(side == 0 ? "CIPOS" : "CIEND", list);
        }
        // symbolic alleles: N ref + <SVTYPE> alt
        final Allele ref = Allele.create("N", true);
        final Allele alt = Allele.create("<" + svType + ">", false);
        vcb.alleles(Arrays.asList(ref, alt));
        out.add(vcb.make());
        return 0;
    }
    // Recursive case: look for ctx in the next input VCF (index candidates.size()).
    VariantContext ctx2 = null;
    final CloseableIterator<VariantContext> iter = vcfFilesInput.get(candidates.size()).query(ctx.getContig(), Math.max(1, ctx.getStart() - this.svComparator.getBndDistance()), ctx.getEnd() + this.svComparator.getBndDistance());
    while (iter.hasNext()) {
        final VariantContext ctx3 = iter.next();
        if (this.svComparator.test(ctx3, ctx)) {
            ctx2 = ctx3;
            break;
        }
    }
    iter.close();
    // absent from this VCF: the variant is not shared by all cases
    if (ctx2 == null)
        return -1;
    candidates.add(ctx2);
    return recursive(ctx, candidates, vcfFilesInput, shadowControls, out);
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) htsjdk.samtools.util(htsjdk.samtools.util) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Breakend(com.github.lindenb.jvarkit.variant.variantcontext.Breakend) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Files(java.nio.file.Files) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) StructuralVariantComparator(com.github.lindenb.jvarkit.variant.sv.StructuralVariantComparator) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) 
VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VariantContext(htsjdk.variant.variantcontext.VariantContext)

Example 19 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

The class ValidateCnv, method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.extendFactor <= 0) {
        LOG.error("bad extend factor " + this.extendFactor);
        return -1;
    }
    if (this.treshold < 0 || this.treshold >= 0.25) {
        LOG.error("Bad treshold 0 < " + this.treshold + " >=0.25 ");
        return -1;
    }
    final Map<String, BamInfo> sample2bam = new HashMap<>();
    VariantContextWriter out = null;
    Iterator<? extends Locatable> iterIn = null;
    try {
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.rererencePath);
        final CRAMReferenceSource cramReferenceSource = new ReferenceSource(this.rererencePath);
        final List<Path> bamPaths = IOUtils.unrollPaths(this.bamFiles);
        final String input = oneFileOrNull(args);
        if (input == null) {
            iterIn = IntervalListProvider.empty().dictionary(dict).skipUnknownContigs().fromInputStream(stdin(), "bed").iterator();
        } else {
            final IntervalListProvider ilp = IntervalListProvider.from(input).setVariantPredicate(CTX -> {
                if (CTX.isSNP())
                    return false;
                final String svType = CTX.getAttributeAsString(VCFConstants.SVTYPE, "");
                if (svType != null && (svType.equals("INV") || svType.equals("BND")))
                    return false;
                return true;
            }).dictionary(dict).skipUnknownContigs();
            iterIn = ilp.stream().iterator();
        }
        /* register each bam */
        for (final Path p2 : bamPaths) {
            final BamInfo bi = new BamInfo(p2, cramReferenceSource);
            if (sample2bam.containsKey(bi.sampleName)) {
                LOG.error("sample " + bi.sampleName + " specified twice.");
                bi.close();
                return -1;
            }
            sample2bam.put(bi.sampleName, bi);
        }
        if (sample2bam.isEmpty()) {
            LOG.error("no bam was defined");
            return -1;
        }
        final Set<VCFHeaderLine> metadata = new HashSet<>();
        final VCFInfoHeaderLine infoSVSamples = new VCFInfoHeaderLine("N_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples that could carry a SV");
        metadata.add(infoSVSamples);
        final VCFInfoHeaderLine infoSvLen = new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length");
        metadata.add(infoSvLen);
        final BiFunction<String, String, VCFFormatHeaderLine> makeFmt = (TAG, DESC) -> new VCFFormatHeaderLine(TAG, 1, VCFHeaderLineType.Integer, DESC);
        final VCFFormatHeaderLine formatCN = new VCFFormatHeaderLine("CN", 1, VCFHeaderLineType.Float, "normalized copy-number. Treshold was " + this.treshold);
        metadata.add(formatCN);
        final VCFFormatHeaderLine nReadsSupportingSv = makeFmt.apply("RSD", "number of split reads supporting SV.");
        metadata.add(nReadsSupportingSv);
        final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
        metadata.add(filterAllDel);
        final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples  greater than  1 and all are duplication");
        metadata.add(filterAllDup);
        final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
        metadata.add(filterNoSV);
        final VCFFilterHeaderLine filterHomDel = new VCFFilterHeaderLine("HOM_DEL", "There is one Homozygous deletion.");
        metadata.add(filterHomDel);
        final VCFFilterHeaderLine filterHomDup = new VCFFilterHeaderLine("HOM_DUP", "There is one Homozygous duplication.");
        metadata.add(filterHomDup);
        VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY, VCFConstants.GENOTYPE_QUALITY_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY, VCFConstants.ALLELE_NUMBER_KEY);
        final VCFHeader header = new VCFHeader(metadata, sample2bam.keySet());
        if (dict != null)
            header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
        out = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        out.writeHeader(header);
        final Allele DUP_ALLELE = Allele.create("<DUP>", false);
        final Allele DEL_ALLELE = Allele.create("<DEL>", false);
        final Allele REF_ALLELE = Allele.create("N", true);
        while (iterIn.hasNext()) {
            final Locatable ctx = iterIn.next();
            if (ctx == null)
                continue;
            final SAMSequenceRecord ssr = dict.getSequence(ctx.getContig());
            if (ssr == null || ctx.getStart() >= ssr.getSequenceLength())
                continue;
            final int svLen = ctx.getLengthOnReference();
            if (svLen < this.min_abs_sv_size)
                continue;
            if (svLen > this.max_abs_sv_size)
                continue;
            int n_samples_with_cnv = 0;
            final SimplePosition breakPointLeft = new SimplePosition(ctx.getContig(), ctx.getStart());
            final SimplePosition breakPointRight = new SimplePosition(ctx.getContig(), ctx.getEnd());
            final int extend = 1 + (int) (svLen * this.extendFactor);
            final int leftPos = Math.max(1, breakPointLeft.getPosition() - extend);
            final int array_mid_start = breakPointLeft.getPosition() - leftPos;
            final int array_mid_end = breakPointRight.getPosition() - leftPos;
            final int rightPos = Math.min(breakPointRight.getPosition() + extend, ssr.getSequenceLength());
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(ctx.getContig());
            vcb.start(ctx.getStart());
            vcb.stop(ctx.getEnd());
            vcb.attribute(VCFConstants.END_KEY, ctx.getEnd());
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(REF_ALLELE);
            int count_dup = 0;
            int count_del = 0;
            int an = 0;
            final Counter<Allele> countAlleles = new Counter<>();
            final List<Genotype> genotypes = new ArrayList<>(sample2bam.size());
            Double badestGQ = null;
            final double[] raw_coverage = new double[CoordMath.getLength(leftPos, rightPos)];
            for (final String sampleName : sample2bam.keySet()) {
                final BamInfo bi = sample2bam.get(sampleName);
                Arrays.fill(raw_coverage, 0.0);
                int n_reads_supporting_sv = 0;
                try (CloseableIterator<SAMRecord> iter2 = bi.samReader.queryOverlapping(ctx.getContig(), leftPos, rightPos)) {
                    while (iter2.hasNext()) {
                        final SAMRecord rec = iter2.next();
                        if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null || cigar.isEmpty())
                            continue;
                        // any clip supporting deletion ?
                        boolean read_supports_cnv = false;
                        final int breakpoint_distance = 10;
                        // any clip on left ?
                        if (cigar.isLeftClipped() && rec.getUnclippedStart() < rec.getAlignmentStart() && new SimpleInterval(ctx.getContig(), rec.getUnclippedStart(), rec.getAlignmentStart()).withinDistanceOf(breakPointLeft, breakpoint_distance)) {
                            read_supports_cnv = true;
                        }
                        // any clip on right ?
                        if (!read_supports_cnv && cigar.isRightClipped() && rec.getAlignmentEnd() < rec.getUnclippedEnd() && new SimpleInterval(ctx.getContig(), rec.getAlignmentEnd(), rec.getUnclippedEnd()).withinDistanceOf(breakPointRight, breakpoint_distance)) {
                            read_supports_cnv = true;
                        }
                        if (read_supports_cnv) {
                            n_reads_supporting_sv++;
                        }
                        int ref = rec.getStart();
                        for (final CigarElement ce : cigar) {
                            final CigarOperator op = ce.getOperator();
                            if (op.consumesReferenceBases()) {
                                if (op.consumesReadBases()) {
                                    for (int x = 0; x < ce.getLength() && ref + x - leftPos < raw_coverage.length; ++x) {
                                        final int p = ref + x - leftPos;
                                        if (p < 0 || p >= raw_coverage.length)
                                            continue;
                                        raw_coverage[p]++;
                                    }
                                }
                                ref += ce.getLength();
                            }
                        }
                    }
                // end while iter record
                }
                // end try query for iterator
                // test for great difference between DP left and DP right
                final OptionalDouble medianDepthLeft = Percentile.median().evaluate(raw_coverage, 0, array_mid_start);
                final OptionalDouble medianDepthRight = Percentile.median().evaluate(raw_coverage, array_mid_end, raw_coverage.length - array_mid_end);
                // any is just too low
                if (!medianDepthLeft.isPresent() || medianDepthLeft.getAsDouble() < this.min_depth || !medianDepthRight.isPresent() || medianDepthRight.getAsDouble() < this.min_depth) {
                    final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("LowDp").make();
                    genotypes.add(gt2);
                    continue;
                }
                final double difference_factor = 2.0;
                // even if a value is divided , it remains greater than the other size
                if (medianDepthLeft.getAsDouble() / difference_factor > medianDepthRight.getAsDouble() || medianDepthRight.getAsDouble() / difference_factor > medianDepthLeft.getAsDouble()) {
                    final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("DiffLR").make();
                    genotypes.add(gt2);
                    continue;
                }
                // run median to smooth spline
                final double[] smoothed_cov = new RunMedian(RunMedian.getTurlachSize(raw_coverage.length)).apply(raw_coverage);
                final double[] bounds_cov = IntStream.concat(IntStream.range(0, array_mid_start), IntStream.range(array_mid_end, smoothed_cov.length)).mapToDouble(IDX -> raw_coverage[IDX]).toArray();
                final OptionalDouble optMedianBound = Percentile.median().evaluate(bounds_cov);
                if (!optMedianBound.isPresent() || optMedianBound.getAsDouble() == 0) {
                    final Genotype gt2 = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("MedZero").make();
                    genotypes.add(gt2);
                    continue;
                }
                final double medianBound = optMedianBound.getAsDouble();
                // divide coverage per medianBound
                final double[] normalized_mid_coverage = new double[array_mid_end - array_mid_start];
                for (int i = 0; i < normalized_mid_coverage.length; ++i) {
                    normalized_mid_coverage[i] = smoothed_cov[array_mid_start + i] / medianBound;
                }
                final double normDepth = Percentile.median().evaluate(normalized_mid_coverage).getAsDouble();
                final boolean is_sv;
                final boolean is_hom_deletion = Math.abs(normDepth - 0.0) <= this.treshold;
                final boolean is_het_deletion = Math.abs(normDepth - 0.5) <= this.treshold || (!is_hom_deletion && normDepth <= 0.5);
                final boolean is_hom_dup = Math.abs(normDepth - 2.0) <= this.treshold || normDepth > 2.0;
                final boolean is_het_dup = Math.abs(normDepth - 1.5) <= this.treshold || (!is_hom_dup && normDepth >= 1.5);
                final boolean is_ref = Math.abs(normDepth - 1.0) <= this.treshold;
                final double theoritical_depth;
                final GenotypeBuilder gb;
                if (is_ref) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, REF_ALLELE));
                    is_sv = false;
                    theoritical_depth = 1.0;
                    an += 2;
                } else if (is_het_deletion) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DEL_ALLELE));
                    alleles.add(DEL_ALLELE);
                    is_sv = true;
                    theoritical_depth = 0.5;
                    count_del++;
                    an += 2;
                    countAlleles.incr(DEL_ALLELE);
                } else if (is_hom_deletion) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(DEL_ALLELE, DEL_ALLELE));
                    alleles.add(DEL_ALLELE);
                    vcb.filter(filterHomDel.getID());
                    is_sv = true;
                    theoritical_depth = 0.0;
                    count_del++;
                    an += 2;
                    countAlleles.incr(DEL_ALLELE, 2);
                } else if (is_het_dup) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DUP_ALLELE));
                    alleles.add(DUP_ALLELE);
                    is_sv = true;
                    theoritical_depth = 1.5;
                    count_dup++;
                    an += 2;
                    countAlleles.incr(DUP_ALLELE);
                } else if (is_hom_dup) {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(DUP_ALLELE, DUP_ALLELE));
                    alleles.add(DUP_ALLELE);
                    vcb.filter(filterHomDup.getID());
                    is_sv = true;
                    theoritical_depth = 2.0;
                    count_dup++;
                    an += 2;
                    countAlleles.incr(DUP_ALLELE, 2);
                } else {
                    gb = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).filter("Ambigous");
                    is_sv = false;
                    theoritical_depth = 1.0;
                }
                if (is_sv) {
                    n_samples_with_cnv++;
                }
                double gq = Math.abs(theoritical_depth - normDepth);
                gq = Math.min(0.5, gq);
                gq = gq * gq;
                gq = gq / 0.25;
                gq = 99 * (1.0 - gq);
                gb.GQ((int) gq);
                if (badestGQ == null || badestGQ.compareTo(gq) > 0) {
                    badestGQ = gq;
                }
                gb.attribute(formatCN.getID(), normDepth);
                gb.attribute(nReadsSupportingSv.getID(), n_reads_supporting_sv);
                genotypes.add(gb.make());
            }
            vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
            final List<Allele> orderedAlleles = new ArrayList<>(alleles);
            Collections.sort(orderedAlleles);
            if (orderedAlleles.size() > 1) {
                final List<Integer> acL = new ArrayList<>();
                final List<Double> afL = new ArrayList<>();
                for (int i = 1; i < orderedAlleles.size(); i++) {
                    final Allele a = orderedAlleles.get(i);
                    final int c = (int) countAlleles.count(a);
                    acL.add(c);
                    if (an > 0)
                        afL.add(c / (double) an);
                }
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, acL);
                if (an > 0)
                    vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, afL);
            }
            // if(alleles.size()<=1) continue;
            vcb.alleles(orderedAlleles);
            vcb.noID();
            vcb.genotypes(genotypes);
            vcb.attribute(infoSVSamples.getID(), n_samples_with_cnv);
            vcb.attribute(infoSvLen.getID(), svLen);
            if (count_dup == sample2bam.size() && sample2bam.size() != 1) {
                vcb.filter(filterAllDup.getID());
            }
            if (count_del == sample2bam.size() && sample2bam.size() != 1) {
                vcb.filter(filterAllDel.getID());
            }
            if (n_samples_with_cnv == 0) {
                vcb.filter(filterNoSV.getID());
            }
            if (badestGQ != null) {
                vcb.log10PError(badestGQ / -10.0);
            }
            out.add(vcb.make());
        }
        progress.close();
        out.close();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iterIn);
        CloserUtil.close(out);
        sample2bam.values().forEach(F -> CloserUtil.close(F));
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) CRAMReferenceSource(htsjdk.samtools.cram.ref.CRAMReferenceSource) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Percentile(com.github.lindenb.jvarkit.math.stats.Percentile) SAMRecord(htsjdk.samtools.SAMRecord) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) IntStream(java.util.stream.IntStream) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Parameter(com.beust.jcommander.Parameter) OptionalDouble(java.util.OptionalDouble) 
HashMap(java.util.HashMap) ValidationStringency(htsjdk.samtools.ValidationStringency) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ReferenceSource(htsjdk.samtools.cram.ref.ReferenceSource) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) SAMRecordDefaultFilter(com.github.lindenb.jvarkit.samtools.SAMRecordDefaultFilter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) Locatable(htsjdk.samtools.util.Locatable) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Iterator(java.util.Iterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) Closeable(java.io.Closeable) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) RunMedian(com.github.lindenb.jvarkit.math.RunMedian) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) CRAMReferenceSource(htsjdk.samtools.cram.ref.CRAMReferenceSource) ReferenceSource(htsjdk.samtools.cram.ref.ReferenceSource) Counter(com.github.lindenb.jvarkit.util.Counter) CRAMReferenceSource(htsjdk.samtools.cram.ref.CRAMReferenceSource) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) HashSet(java.util.HashSet) Genotype(htsjdk.variant.variantcontext.Genotype) 
CigarOperator(htsjdk.samtools.CigarOperator) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) OptionalDouble(java.util.OptionalDouble) CigarElement(htsjdk.samtools.CigarElement) RunMedian(com.github.lindenb.jvarkit.math.RunMedian) SAMRecord(htsjdk.samtools.SAMRecord) Locatable(htsjdk.samtools.util.Locatable) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Path(java.nio.file.Path) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) OptionalDouble(java.util.OptionalDouble) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 20 with CloseableIterator

use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.

the class VcfPostProcessSV method doVcfToVcf.

/**
 * Reads a VCF containing BND (breakend) structural variants, pairs mate records via a
 * user-supplied INFO key (--keys, e.g. MATEID), and rewrites each intra-chromosomal mate
 * pair as a single record carrying the symbolic allele {@code <newAlternateSymbol>}
 * (with SVTYPE/END/SVLEN set accordingly). Inter-chromosomal pairs are re-tagged
 * SVTYPE=TRANSLOC; non-BND records, unpaired mates and malformed pairs are passed
 * through unchanged. Output is re-sorted with the header's record comparator.
 *
 * @param inputName input name (unused here, kept for the framework contract)
 * @param iterin    source VCF iterator
 * @param out       destination writer
 * @return 0 on success, -1 on error
 */
@Override
protected int doVcfToVcf(String inputName, VCFIterator iterin, VariantContextWriter out) {
    SortingCollection<VariantContext> sorter1 = null;
    SortingCollection<VariantContext> sorter2 = null;
    try {
        final Counter<String> counter = new Counter<>();
        if (StringUtils.isBlank(this.keys)) {
            LOG.error("empty --keys");
            return -1;
        }
        if (StringUtils.isBlank(this.newAlternateSymbol)) {
            LOG.error("empty --allele");
            return -1;
        }
        final VCFHeader header = iterin.getHeader();
        // make sure the INFO lines we are going to write are declared in the header
        if (header.getInfoHeaderLine(VCFConstants.SVTYPE) == null) {
            header.addMetaDataLine(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Type of structural variant"));
        }
        if (header.getInfoHeaderLine("SVLEN") == null) {
            header.addMetaDataLine(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Difference in length between REF and ALT alleles"));
        }
        if (header.getInfoHeaderLine(VCFConstants.END_KEY) == null) {
            header.addMetaDataLine(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "End position of the variant described in this record"));
        }
        // first declared String-typed INFO key among --keys is used to pair mates
        final String mateID = Arrays.stream(CharSplitter.COMMA.split(this.keys)).map(S -> S.trim()).filter(S -> !StringUtils.isBlank(S)).map(S -> header.getInfoHeaderLine(S)).filter(H -> H != null && H.getType().equals(VCFHeaderLineType.String)).map(H -> H.getID()).findFirst().orElse(null);
        if (StringUtils.isBlank(mateID)) {
            LOG.error("Cannot find INFO for mate-id (was " + this.keys + ")");
            return -1;
        }
        LOG.info("Mate key is '" + mateID + "'. Reading input.");
        // sorter1 groups candidate BND records by their mate-id so mates become adjacent
        final Comparator<VariantContext> comparator = (A, B) -> A.getAttributeAsString(mateID, "").compareTo(B.getAttributeAsString(mateID, ""));
        sorter1 = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(header, true), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        // sorter2 collects the final records in genomic order for output
        sorter2 = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(header, true), header.getVCFRecordComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        while (iterin.hasNext()) {
            final VariantContext ctx = iterin.next();
            // candidates: has the mate key, and SVTYPE is either absent or "BND"
            if (ctx.hasAttribute(mateID) && (!ctx.hasAttribute(VCFConstants.SVTYPE) || ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))) {
                sorter1.add(ctx);
            } else {
                counter.incr("Not a BND variant");
                sorter2.add(ctx);
            }
        }
        sorter1.doneAdding();
        sorter1.setDestructiveIteration(true);
        CloseableIterator<VariantContext> iter2 = sorter1.iterator();
        PeekIterator<VariantContext> peek = new PeekIterator<>(iter2);
        while (peek.hasNext()) {
            final VariantContext first = peek.next();
            // gather the run of records that mate with 'first' (via ID<->mate-id in either direction,
            // or identical mate-id values); a well-formed pair yields exactly two records
            final List<VariantContext> variants = new ArrayList<>();
            variants.add(first);
            while (peek.hasNext()) {
                final VariantContext second = peek.peek();
                if (first.hasID() && first.getID().equals(second.getAttributeAsString(mateID, ""))) {
                    variants.add(peek.next());
                } else if (second.hasID() && second.getID().equals(first.getAttributeAsString(mateID, ""))) {
                    variants.add(peek.next());
                } else if (first.getAttributeAsString(mateID, "").equals(second.getAttributeAsString(mateID, ""))) {
                    variants.add(peek.next());
                } else {
                    break;
                }
            }
            if (variants.size() != 2) {
                counter.incr("Not a pair of Mate (" + variants.size() + ")", variants.size());
                for (final VariantContext ctx : variants) {
                    sorter2.add(ctx);
                }
                continue;
            }
            Collections.sort(variants, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
            final VariantContext ctx1 = variants.get(0);
            final VariantContext ctx2 = variants.get(1);
            // FIX: second operand used to re-test ctx1; both mates must be bi-allelic
            if (!(ctx1.getNAlleles() == 2 && ctx2.getNAlleles() == 2)) {
                counter.incr("expected two alleles.", 2L);
                sorter2.add(ctx1);
                sorter2.add(ctx2);
                continue;
            }
            final Breakend brk1 = Breakend.parse(ctx1.getAlleles().get(1)).orElse(null);
            final Breakend brk2 = Breakend.parse(ctx2.getAlleles().get(1)).orElse(null);
            if (brk1 == null || brk2 == null) {
                counter.incr("ALT is not breakend.", 2L);
                sorter2.add(ctx1);
                sorter2.add(ctx2);
                // FIX: missing 'continue' let the pair fall through and be emitted a second
                // time as a converted record, duplicating output
                continue;
            }
            /* should be the same breakend 
				difference can be large use CIPOS / CIEND ?
				if( !ctx1.getContig().equals(brk2.getContig()) ||
					!ctx2.getContig().equals(brk1.getContig()) ||
					Math.abs(ctx1.getStart()-brk2.getStart())>1  ||
					Math.abs(ctx2.getStart()-brk1.getStart())>1) {
					counter.incr("mate is not the same position.",2L);
					sorter2.add(ctx1);
					sorter2.add(ctx2);
					continue;
					}
				*/
            final VariantContextBuilder vcb1 = new VariantContextBuilder(ctx1);
            final VariantContextBuilder vcb2 = new VariantContextBuilder(ctx2);
            // mates on different contigs: keep both records, re-typed as translocation
            if (!ctx1.contigsMatch(ctx2)) {
                vcb1.attribute(VCFConstants.SVTYPE, "TRANSLOC");
                vcb2.attribute(VCFConstants.SVTYPE, "TRANSLOC");
                sorter2.add(vcb1.make());
                sorter2.add(vcb2.make());
                counter.incr("translocation.", 2L);
                continue;
            }
            // same contig: collapse the two BND records into one symbolic-allele record
            final Allele ctx1_alt = ctx1.getAlleles().get(1);
            final List<Allele> alleles = Arrays.asList(ctx1.getReference(), Allele.create("<" + this.newAlternateSymbol.trim() + ">", false));
            vcb1.attribute(VCFConstants.SVTYPE, this.newAlternateSymbol.trim());
            vcb1.alleles(alleles);
            final int ctx_end = Math.max(ctx1.getEnd(), ctx2.getEnd());
            vcb1.stop(ctx_end);
            vcb1.attribute("END", ctx_end);
            vcb1.attribute("SVLEN", CoordMath.getLength(ctx1.getStart(), ctx_end));
            vcb1.rmAttribute(mateID);
            // remap each genotype's old BND allele to the new symbolic allele
            vcb1.genotypes(ctx1.getGenotypes().stream().map(GT -> new GenotypeBuilder(GT).alleles(GT.getAlleles().stream().map(A -> A.equals(ctx1_alt) ? alleles.get(1) : A).collect(Collectors.toList())).make()).collect(Collectors.toList()));
            sorter2.add(vcb1.make());
            counter.incr("Two BND variants converted.", 2L);
        }
        iter2.close();
        sorter1.cleanup();
        sorter1 = null;
        sorter2.doneAdding();
        sorter2.setDestructiveIteration(true);
        iter2 = sorter2.iterator();
        final VCFHeader header2 = new VCFHeader(header);
        JVarkitVersion.getInstance().addMetaData(this, header2);
        out.writeHeader(header2);
        while (iter2.hasNext()) {
            out.add(iter2.next());
        }
        iter2.close();
        sorter2.cleanup();
        sorter2 = null;
        if (!disable_summary) {
            LOG.info("SUMMARY COUNT");
            for (final String key : counter.keySet()) {
                LOG.info("\t" + key + " : " + counter.count(key));
            }
        }
        return 0;
    } catch (final Throwable err) {
        err.printStackTrace();
        LOG.error(err);
        return -1;
    } finally {
        // sorters still non-null here mean we bailed out early: release their temp files
        if (sorter1 != null)
            sorter1.cleanup();
        if (sorter2 != null)
            sorter2.cleanup();
    }
}
Also used : CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) Breakend(com.github.lindenb.jvarkit.variant.variantcontext.Breakend) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) OnePassVcfLauncher(com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher) Comparator(java.util.Comparator) PeekIterator(htsjdk.samtools.util.PeekIterator) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) PeekIterator(htsjdk.samtools.util.PeekIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Breakend(com.github.lindenb.jvarkit.variant.variantcontext.Breakend) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) 
Allele(htsjdk.variant.variantcontext.Allele) Counter(com.github.lindenb.jvarkit.util.Counter) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Aggregations

CloseableIterator (htsjdk.samtools.util.CloseableIterator)103 List (java.util.List)86 Logger (com.github.lindenb.jvarkit.util.log.Logger)85 Parameter (com.beust.jcommander.Parameter)82 Program (com.github.lindenb.jvarkit.util.jcommander.Program)78 ArrayList (java.util.ArrayList)73 Collectors (java.util.stream.Collectors)71 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)69 Path (java.nio.file.Path)69 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)66 CloserUtil (htsjdk.samtools.util.CloserUtil)64 Set (java.util.Set)64 VCFHeader (htsjdk.variant.vcf.VCFHeader)59 VariantContext (htsjdk.variant.variantcontext.VariantContext)54 IOException (java.io.IOException)53 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)51 SAMRecord (htsjdk.samtools.SAMRecord)51 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)51 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)50 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)49