Search in sources :

Example 86 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class VCFCombineTwoSnvs method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, File saveAs) {
    BufferedReader bufferedReader = null;
    htsjdk.variant.variantcontext.writer.VariantContextWriter w = null;
    SortingCollection<CombinedMutation> mutations = null;
    CloseableIterator<Variant> varIter = null;
    CloseableIterator<CombinedMutation> mutIter = null;
    Map<String, SamReader> sample2samReader = new HashMap<>();
    try {
        bufferedReader = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(bufferedReader);
        /* get VCF header */
        final VCFHeader header = cah.header;
        final Set<String> sampleNamesInOrder = new HashSet<>(header.getSampleNamesInOrder());
        LOG.info("opening REF:" + referenceFile);
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.referenceFile);
        final SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null)
            throw new IOException("dictionary missing");
        if (this.bamIn != null) {
            /**
             * unroll and open bam file
             */
            for (final File bamFile : IOUtils.unrollFileCollection(Collections.singletonList(this.bamIn))) {
                LOG.info("opening BAM :" + this.bamIn);
                final SamReader samReader = SamReaderFactory.makeDefault().referenceSequence(this.referenceFile).validationStringency(ValidationStringency.LENIENT).open(this.bamIn);
                if (!samReader.hasIndex()) {
                    throw new IOException("Sam file is NOT indexed: " + bamFile);
                }
                final SAMFileHeader samHeader = samReader.getFileHeader();
                if (samHeader.getSequenceDictionary() == null || !SequenceUtil.areSequenceDictionariesEqual(dict, samReader.getFileHeader().getSequenceDictionary())) {
                    throw new IOException(bamFile + " and REF don't have the same Sequence Dictionary.");
                }
                /* get sample name */
                String sampleName = null;
                for (final SAMReadGroupRecord rg : samHeader.getReadGroups()) {
                    if (rg.getSample() == null)
                        continue;
                    if (sampleName != null && !sampleName.equals(rg.getSample())) {
                        samReader.close();
                        throw new IOException(bamFile + " Contains two samples " + sampleName + " " + rg.getSample());
                    }
                    sampleName = rg.getSample();
                }
                if (sampleName == null) {
                    samReader.close();
                    LOG.warn("no sample in " + bamFile);
                    continue;
                }
                if (!sampleNamesInOrder.contains(sampleName)) {
                    samReader.close();
                    LOG.warn("no sample " + sampleName + " in vcf");
                    continue;
                }
                sample2samReader.put(sampleName, samReader);
            }
        }
        loadKnownGenesFromUri();
        this.variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), new VariantComparator(dict), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        this.variants.setDestructiveIteration(true);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        String vcfLine = null;
        while ((vcfLine = bufferedReader.readLine()) != null) {
            final VariantContext ctx = progress.watch(cah.codec.decode(vcfLine));
            /* discard non SNV variant */
            if (!ctx.isVariant() || ctx.isIndel()) {
                continue;
            }
            /* find the overlapping genes : extend the interval of the variant to include the stop codon */
            final Collection<KnownGene> genes = new ArrayList<>();
            for (List<KnownGene> lkg : this.knownGenes.getOverlapping(new Interval(ctx.getContig(), Math.max(1, ctx.getStart() - 3), ctx.getEnd() + 3))) {
                genes.addAll(lkg);
            }
            final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
            /* loop over overlapping genes */
            for (final KnownGene kg : genes) {
                /* loop over available alleles */
                for (int allele_idx = 0; allele_idx < alternateAlleles.size(); ++allele_idx) {
                    final Allele alt = alternateAlleles.get(allele_idx);
                    challenge(ctx, alt, kg, vcfLine);
                }
            }
        }
        progress.finish();
        this.variants.doneAdding();
        mutations = SortingCollection.newInstance(CombinedMutation.class, new MutationCodec(), new MutationComparator(dict), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        mutations.setDestructiveIteration(true);
        final VCFFilterHeaderLine vcfFilterHeaderLine = new VCFFilterHeaderLine("TwoHaplotypes", "(number of reads carrying both mutation) < (reads carrying variant 1 + reads carrying variant 2) ");
        varIter = this.variants.iterator();
        progress = new SAMSequenceDictionaryProgress(header);
        final ArrayList<Variant> buffer = new ArrayList<>();
        for (; ; ) {
            Variant variant = null;
            if (varIter.hasNext()) {
                variant = varIter.next();
                progress.watch(variant.contig, variant.genomicPosition1);
            }
            if (variant == null || !(!buffer.isEmpty() && buffer.get(0).contig.equals(variant.contig) && buffer.get(0).transcriptName.equals(variant.transcriptName))) {
                if (!buffer.isEmpty()) {
                    for (int i = 0; i < buffer.size(); ++i) {
                        final Variant v1 = buffer.get(i);
                        for (int j = i + 1; j < buffer.size(); ++j) {
                            final Variant v2 = buffer.get(j);
                            if (v1.codonStart() != v2.codonStart())
                                continue;
                            if (v1.positionInCodon() == v2.positionInCodon())
                                continue;
                            if (!v1.wildCodon.equals(v2.wildCodon)) {
                                throw new IllegalStateException();
                            }
                            final StringBuilder combinedCodon = new StringBuilder(v1.wildCodon);
                            combinedCodon.setCharAt(v1.positionInCodon(), v1.mutCodon.charAt(v1.positionInCodon()));
                            combinedCodon.setCharAt(v2.positionInCodon(), v2.mutCodon.charAt(v2.positionInCodon()));
                            final String pwild = new ProteinCharSequence(v1.wildCodon).getString();
                            final String p1 = new ProteinCharSequence(v1.mutCodon).getString();
                            final String p2 = new ProteinCharSequence(v2.mutCodon).getString();
                            final String pCombined = new ProteinCharSequence(combinedCodon).getString();
                            final String combinedSO;
                            final String combinedType;
                            /* both AA are synonymous, while combined is not */
                            if (!pCombined.equals(pwild) && p1.equals(pwild) && p2.equals(pwild)) {
                                combinedType = "combined_is_nonsynonymous";
                                if (pCombined.equals("*")) {
                                    /* http://www.sequenceontology.org/browser/current_svn/term/SO:0001587 */
                                    combinedSO = "stop_gained";
                                } else if (pwild.equals("*")) {
                                    /* http://www.sequenceontology.org/browser/current_svn/term/SO:0002012 */
                                    combinedSO = "stop_lost";
                                } else {
                                    /* http://www.sequenceontology.org/miso/current_svn/term/SO:0001992 */
                                    combinedSO = "nonsynonymous_variant";
                                }
                            } else if (!pCombined.equals(p1) && !pCombined.equals(p2) && !pCombined.equals(pwild)) {
                                combinedType = "combined_is_new";
                                if (pCombined.equals("*")) {
                                    /* http://www.sequenceontology.org/browser/current_svn/term/SO:0001587 */
                                    combinedSO = "stop_gained";
                                } else {
                                    /* http://www.sequenceontology.org/miso/current_svn/term/SO:0001992 */
                                    combinedSO = "nonsynonymous_variant";
                                }
                            } else {
                                combinedType = null;
                                combinedSO = null;
                            }
                            /**
                             * ok, there is something interesting here ,
                             * create two new Mutations carrying the
                             * two variants
                             */
                            if (combinedSO != null) {
                                /**
                                 * grantham score is max found combined vs (p1/p2/wild)
                                 */
                                int grantham_score = GranthamScore.score(pCombined.charAt(0), pwild.charAt(0));
                                grantham_score = Math.max(grantham_score, GranthamScore.score(pCombined.charAt(0), p1.charAt(0)));
                                grantham_score = Math.max(grantham_score, GranthamScore.score(pCombined.charAt(0), p2.charAt(0)));
                                /**
                                 * info that will be displayed in the vcf
                                 */
                                final Map<String, Object> info1 = v1.getInfo(v2);
                                final Map<String, Object> info2 = v2.getInfo(v1);
                                // filter for this combined: default it fails the filter
                                String filter = vcfFilterHeaderLine.getID();
                                final Map<String, Object> combinedMap = new LinkedHashMap<>();
                                combinedMap.put("CombinedCodon", combinedCodon);
                                combinedMap.put("CombinedAA", pCombined);
                                combinedMap.put("CombinedSO", combinedSO);
                                combinedMap.put("CombinedType", combinedType);
                                combinedMap.put("GranthamScore", grantham_score);
                                info1.putAll(combinedMap);
                                info2.putAll(combinedMap);
                                final Map<String, CoverageInfo> sample2coverageInfo = new HashMap<>(sample2samReader.size());
                                final int chromStart = Math.min(v1.genomicPosition1, v2.genomicPosition1);
                                final int chromEnd = Math.max(v1.genomicPosition1, v2.genomicPosition1);
                                /* get phasing info for each sample*/
                                for (final String sampleName : sample2samReader.keySet()) {
                                    final SamReader samReader = sample2samReader.get(sampleName);
                                    final CoverageInfo covInfo = new CoverageInfo();
                                    sample2coverageInfo.put(sampleName, covInfo);
                                    SAMRecordIterator iter = null;
                                    try {
                                        iter = samReader.query(v1.contig, chromStart, chromEnd, false);
                                        while (iter.hasNext()) {
                                            final SAMRecord rec = iter.next();
                                            if (rec.getReadUnmappedFlag())
                                                continue;
                                            if (rec.isSecondaryOrSupplementary())
                                                continue;
                                            if (rec.getDuplicateReadFlag())
                                                continue;
                                            if (rec.getReadFailsVendorQualityCheckFlag())
                                                continue;
                                            // get DEPTh for variant 1
                                            if (rec.getAlignmentStart() <= v1.genomicPosition1 && v1.genomicPosition1 <= rec.getAlignmentEnd()) {
                                                covInfo.depth1++;
                                            }
                                            // get DEPTh for variant 2
                                            if (rec.getAlignmentStart() <= v2.genomicPosition1 && v2.genomicPosition1 <= rec.getAlignmentEnd()) {
                                                covInfo.depth2++;
                                            }
                                            if (rec.getAlignmentEnd() < chromEnd)
                                                continue;
                                            if (rec.getAlignmentStart() > chromStart)
                                                continue;
                                            final Cigar cigar = rec.getCigar();
                                            if (cigar == null)
                                                continue;
                                            final byte[] bases = rec.getReadBases();
                                            if (bases == null)
                                                continue;
                                            int refpos1 = rec.getAlignmentStart();
                                            int readpos = 0;
                                            boolean found_variant1_on_this_read = false;
                                            boolean found_variant2_on_this_read = false;
                                            /**
                                             * loop over cigar
                                             */
                                            for (final CigarElement ce : cigar.getCigarElements()) {
                                                final CigarOperator op = ce.getOperator();
                                                switch(op) {
                                                    case P:
                                                        continue;
                                                    case S:
                                                    case I:
                                                        readpos += ce.getLength();
                                                        break;
                                                    case D:
                                                    case N:
                                                        refpos1 += ce.getLength();
                                                        break;
                                                    case H:
                                                        continue;
                                                    case EQ:
                                                    case M:
                                                    case X:
                                                        for (int x = 0; x < ce.getLength(); ++x) {
                                                            if (refpos1 == v1.genomicPosition1 && same(bases[readpos], v1.altAllele)) {
                                                                found_variant1_on_this_read = true;
                                                            } else if (refpos1 == v2.genomicPosition1 && same(bases[readpos], v2.altAllele)) {
                                                                found_variant2_on_this_read = true;
                                                            }
                                                            refpos1++;
                                                            readpos++;
                                                        }
                                                        break;
                                                    default:
                                                        throw new IllegalStateException(op.name());
                                                }
                                                /* skip remaining bases after last variant */
                                                if (refpos1 > chromEnd)
                                                    break;
                                            }
                                            /* sum-up what we found */
                                            if (found_variant1_on_this_read && found_variant2_on_this_read) {
                                                covInfo.count_reads_having_both_variants++;
                                            } else if (!found_variant1_on_this_read && !found_variant2_on_this_read) {
                                                covInfo.count_reads_having_no_variants++;
                                            } else if (found_variant1_on_this_read) {
                                                covInfo.count_reads_having_variant1++;
                                            } else if (found_variant2_on_this_read) {
                                                covInfo.count_reads_having_variant2++;
                                            }
                                        }
                                    /* end of loop over reads */
                                    } finally {
                                        iter.close();
                                        iter = null;
                                    }
                                    info1.put("N_READS_BOTH_VARIANTS_" + sampleName, covInfo.count_reads_having_both_variants);
                                    info2.put("N_READS_BOTH_VARIANTS_" + sampleName, covInfo.count_reads_having_both_variants);
                                    info1.put("N_READS_NO_VARIANTS_" + sampleName, covInfo.count_reads_having_no_variants);
                                    info2.put("N_READS_NO_VARIANTS_" + sampleName, covInfo.count_reads_having_no_variants);
                                    info1.put("N_READS_TOTAL_" + sampleName, covInfo.count_reads_having_both_variants + covInfo.count_reads_having_no_variants + covInfo.count_reads_having_variant1 + covInfo.count_reads_having_variant2);
                                    info2.put("N_READS_TOTAL_" + sampleName, covInfo.count_reads_having_both_variants + covInfo.count_reads_having_no_variants + covInfo.count_reads_having_variant1 + covInfo.count_reads_having_variant2);
                                    // count for variant 1
                                    info1.put("N_READS_ONLY_1_" + sampleName, covInfo.count_reads_having_variant1);
                                    info1.put("N_READS_ONLY_2_" + sampleName, covInfo.count_reads_having_variant2);
                                    info1.put("DEPTH_1_" + sampleName, covInfo.depth1);
                                    // inverse previous count
                                    info2.put("N_READS_ONLY_1_" + sampleName, covInfo.count_reads_having_variant2);
                                    info2.put("N_READS_ONLY_2_" + sampleName, covInfo.count_reads_having_variant1);
                                    info2.put("DEPTH_2_" + sampleName, covInfo.depth2);
                                    /* number of reads with both variant is greater than
									 * reads carrying only one variant: reset the filter 
									 */
                                    if (2 * covInfo.count_reads_having_both_variants > (covInfo.count_reads_having_variant1 + covInfo.count_reads_having_variant2)) {
                                        /* reset filter */
                                        filter = VCFConstants.UNFILTERED;
                                        info1.put("FILTER_1_" + sampleName, ".");
                                        info2.put("FILTER_2_" + sampleName, ".");
                                    } else {
                                        info1.put("FILTER_1_" + sampleName, vcfFilterHeaderLine.getID());
                                        info2.put("FILTER_2_" + sampleName, vcfFilterHeaderLine.getID());
                                    }
                                }
                                /* end of loop over bams */
                                final CombinedMutation m1 = new CombinedMutation();
                                m1.contig = v1.contig;
                                m1.genomicPosition1 = v1.genomicPosition1;
                                m1.id = v1.id;
                                m1.refAllele = v1.refAllele;
                                m1.altAllele = v1.altAllele;
                                m1.vcfLine = v1.vcfLine;
                                m1.info = mapToString(info1);
                                m1.filter = filter;
                                m1.grantham_score = grantham_score;
                                m1.sorting_id = ID_GENERATOR++;
                                mutations.add(m1);
                                final CombinedMutation m2 = new CombinedMutation();
                                m2.contig = v2.contig;
                                m2.genomicPosition1 = v2.genomicPosition1;
                                m2.id = v2.id;
                                m2.refAllele = v2.refAllele;
                                m2.altAllele = v2.altAllele;
                                m2.vcfLine = v2.vcfLine;
                                m2.info = mapToString(info2);
                                m2.filter = filter;
                                m2.grantham_score = grantham_score;
                                m2.sorting_id = ID_GENERATOR++;
                                mutations.add(m2);
                            }
                        }
                    }
                }
                buffer.clear();
                if (variant == null)
                    break;
            }
            buffer.add(variant);
        }
        progress.finish();
        mutations.doneAdding();
        varIter.close();
        varIter = null;
        variants.cleanup();
        variants = null;
        final ArrayList<CombinedMutation> mBuffer = new ArrayList<>();
        final VCFHeader header2 = new VCFHeader(header);
        header2.addMetaDataLine(new VCFHeaderLine(getProgramName() + "AboutQUAL", "QUAL is filled with Grantham Score  http://www.ncbi.nlm.nih.gov/pubmed/4843792"));
        final StringBuilder infoDesc = new StringBuilder("Variant affected by two distinct mutation. Format is defined in the INFO column. ");
        final VCFInfoHeaderLine infoHeaderLine = new VCFInfoHeaderLine("CodonVariant", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, infoDesc.toString());
        super.addMetaData(header2);
        header2.addMetaDataLine(infoHeaderLine);
        if (!sample2samReader.isEmpty()) {
            header2.addMetaDataLine(vcfFilterHeaderLine);
        }
        w = super.openVariantContextWriter(saveAs);
        w.writeHeader(header2);
        progress = new SAMSequenceDictionaryProgress(header);
        mutIter = mutations.iterator();
        for (; ; ) {
            CombinedMutation mutation = null;
            if (mutIter.hasNext()) {
                mutation = mutIter.next();
                progress.watch(mutation.contig, mutation.genomicPosition1);
            }
            if (mutation == null || !(!mBuffer.isEmpty() && mBuffer.get(0).contig.equals(mutation.contig) && mBuffer.get(0).genomicPosition1 == mutation.genomicPosition1 && mBuffer.get(0).refAllele.equals(mutation.refAllele))) {
                if (!mBuffer.isEmpty()) {
                    // default grantham score used in QUAL
                    int grantham_score = -1;
                    // default filter fails
                    String filter = vcfFilterHeaderLine.getID();
                    final CombinedMutation first = mBuffer.get(0);
                    final Set<String> info = new HashSet<>();
                    final VariantContext ctx = cah.codec.decode(first.vcfLine);
                    final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                    vcb.chr(first.contig);
                    vcb.start(first.genomicPosition1);
                    vcb.stop(first.genomicPosition1 + first.refAllele.length() - 1);
                    if (!first.id.equals(VCFConstants.EMPTY_ID_FIELD))
                        vcb.id(first.id);
                    for (final CombinedMutation m : mBuffer) {
                        info.add(m.info);
                        grantham_score = Math.max(grantham_score, m.grantham_score);
                        if (VCFConstants.UNFILTERED.equals(m.filter)) {
                            // at least one SNP is ok one this line
                            filter = null;
                        }
                    }
                    vcb.unfiltered();
                    if (filter != null && !sample2samReader.isEmpty()) {
                        vcb.filter(filter);
                    } else {
                        vcb.passFilters();
                    }
                    vcb.attribute(infoHeaderLine.getID(), new ArrayList<String>(info));
                    if (grantham_score > 0) {
                        vcb.log10PError(grantham_score / -10.0);
                    } else {
                        vcb.log10PError(VariantContext.NO_LOG10_PERROR);
                    }
                    w.add(vcb.make());
                }
                mBuffer.clear();
                if (mutation == null)
                    break;
            }
            mBuffer.add(mutation);
        }
        progress.finish();
        mutIter.close();
        mutations.cleanup();
        mutations = null;
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(mutIter);
        CloserUtil.close(varIter);
        if (this.variants != null)
            this.variants.cleanup();
        if (mutations != null)
            mutations.cleanup();
        this.variants = null;
        for (SamReader r : sample2samReader.values()) CloserUtil.close(r);
        CloserUtil.close(w);
        CloserUtil.close(bufferedReader);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) LinkedHashMap(java.util.LinkedHashMap) HashSet(java.util.HashSet) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) SAMRecord(htsjdk.samtools.SAMRecord) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) SAMFileHeader(htsjdk.samtools.SAMFileHeader) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) File(java.io.File) Interval(htsjdk.samtools.util.Interval) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Example 87 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class SamExtractClip method run.

private void run(final SamReader r, final FastqWriter out) {
    int[] startend = new int[2];
    final SAMFileHeader header = r.getFileHeader();
    // w=swf.make(header, System.out);
    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
    SAMRecordIterator it = r.iterator();
    while (it.hasNext()) {
        final SAMRecord rec = progress.watch(it.next());
        if (rec.getReadUnmappedFlag())
            continue;
        if (rec.isSecondaryOrSupplementary())
            continue;
        if (rec.getDuplicateReadFlag())
            continue;
        final Cigar cigar = rec.getCigar();
        if (cigar == null || cigar.isEmpty())
            continue;
        String suffix = "";
        if (rec.getReadPairedFlag()) {
            suffix = (rec.getFirstOfPairFlag() ? "/1" : "/2");
        }
        startend[0] = 0;
        startend[1] = rec.getReadLength();
        boolean found = false;
        for (int side = 0; side < 2; ++side) {
            final CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
            if (!ce.getOperator().equals(CigarOperator.S))
                continue;
            if (ce.getLength() < min_clip_length)
                continue;
            found = true;
            final String clippedSeq;
            final String clippedQual;
            if (side == 0) {
                startend[0] = ce.getLength();
                clippedSeq = rec.getReadString().substring(0, startend[0]);
                clippedQual = rec.getBaseQualityString().substring(0, startend[0]);
            } else {
                startend[1] = rec.getReadLength() - ce.getLength();
                clippedSeq = rec.getReadString().substring(startend[1]);
                clippedQual = rec.getBaseQualityString().substring(startend[1]);
            }
            out.write(new FastqRecord(rec.getReadName() + suffix + ";" + side + ";" + rec.getReferenceName() + ";" + rec.getAlignmentStart() + ";" + rec.getFlags() + ";" + rec.getCigarString() + ";" + (side == 0 ? "5'" : "3'"), clippedSeq, "", clippedQual));
        }
        if (!found)
            continue;
        String bases = rec.getReadString();
        String qual = rec.getBaseQualityString();
        if (rec.getReadNegativeStrandFlag()) {
            bases = AcidNucleics.reverseComplement(bases);
            qual = new StringBuilder(qual).reverse().toString();
        }
        if (print_original_read) {
            out.write(new FastqRecord(rec.getReadName() + suffix, bases, "", qual));
        }
        if (print_clipped_read) {
            out.write(new FastqRecord(rec.getReadName() + suffix + ":clipped", bases.substring(startend[0], startend[1]), "", qual.substring(startend[0], startend[1])));
        }
    }
    it.close();
    progress.finish();
}
Also used : Cigar(htsjdk.samtools.Cigar) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMRecord(htsjdk.samtools.SAMRecord) FastqRecord(htsjdk.samtools.fastq.FastqRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) CigarElement(htsjdk.samtools.CigarElement)

Example 88 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class SplitByTile method doWork.

@Override
public int doWork(final List<String> args) {
    if (OUTPUT == null || !OUTPUT.contains(TILEWORD) || !OUTPUT.endsWith(".bam")) {
        LOG.error("Bad OUPUT name " + OUTPUT + ". must contain " + TILEWORD + " and ends with .bam");
        return -1;
    }
    SamReader samReader = null;
    final Map<Integer, SAMFileWriter> tile2writer = new HashMap<Integer, SAMFileWriter>();
    final Pattern colon = Pattern.compile("[\\:]");
    SAMRecordIterator iter = null;
    try {
        samReader = super.openSamReader(oneFileOrNull(args));
        final SAMFileHeader header = samReader.getFileHeader();
        iter = samReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            final String[] tokens = colon.split(rec.getReadName(), 6);
            if (tokens.length < 5) {
                LOG.error("Cannot get the 6th field in " + rec.getReadName());
                samReader.close();
                samReader = null;
                return -1;
            }
            int tile = -1;
            try {
                tile = Integer.parseInt(tokens[4]);
            } catch (Exception e) {
                tile = -1;
            }
            if (tile < 0) {
                LOG.error("Bad tile in read: " + rec.getReadName());
                samReader.close();
                samReader = null;
                return -1;
            }
            SAMFileWriter sfw = tile2writer.get(tile);
            if (sfw == null) {
                final File outFile = new File(OUTPUT.replaceAll(TILEWORD, String.valueOf(tile)));
                LOG.info("create output for " + outFile);
                if (outFile.getParentFile() != null) {
                    outFile.getParentFile().mkdirs();
                }
                sfw = this.writingBamArgs.openSAMFileWriter(outFile, header, true);
                tile2writer.put(tile, sfw);
            }
            sfw.addAlignment(rec);
        }
        for (final SAMFileWriter sfw : tile2writer.values()) {
            sfw.close();
        }
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        if (iter != null)
            iter.close();
        CloserUtil.close(samReader);
    }
    return 0;
}
Also used : Pattern(java.util.regex.Pattern) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)

Example 89 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class SamColorTag method doWork.

@Override
public int doWork(final List<String> args) {
    SAMRecordIterator iter = null;
    SamReader samFileReader = null;
    SAMFileWriter sw = null;
    final ColorUtils colorUtils = new ColorUtils();
    try {
        final CompiledScript script = super.compileJavascript(this.jsExpression, this.jsFile);
        samFileReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader srcheader = samFileReader.getFileHeader();
        final SAMFileHeader header = srcheader.clone();
        header.addComment(ColorUtils.YC_TAG + " attribute added with " + getProgramName() + " " + getProgramCommandLine());
        sw = this.writingBamArgs.openSAMFileWriter(outputFile, header, true);
        final Bindings bindings = script.getEngine().createBindings();
        bindings.put("header", samFileReader.getFileHeader());
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        iter = samFileReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord record = progress.watch(iter.next());
            bindings.put("record", record);
            final Color color;
            Object result;
            try {
                result = script.eval(bindings);
            } catch (final Exception err) {
                if (!ignoreErrors) {
                    LOG.error(err);
                    return -1;
                }
                result = null;
            }
            if (result == null) {
                color = null;
            } else if (result instanceof Color) {
                color = Color.class.cast(result);
            } else if (result instanceof String) {
                final String s = (String) result;
                Color c2 = null;
                try {
                    if (s.trim().isEmpty()) {
                        c2 = null;
                    } else {
                        c2 = colorUtils.parse(s);
                    }
                } catch (final Exception err) {
                    if (!ignoreErrors) {
                        LOG.error(err);
                        return -1;
                    }
                    c2 = null;
                }
                color = c2;
            } else {
                if (!ignoreErrors) {
                    LOG.error("Cannot cast to color a " + result.getClass());
                    return -1;
                }
                color = null;
            }
            if (color != null) {
                record.setAttribute(ColorUtils.YC_TAG, ColorUtils.colorToSamAttribute(color));
            } else {
                // clear attribute
                record.setAttribute(ColorUtils.YC_TAG, null);
            }
            sw.addAlignment(record);
        }
        sw.close();
        sw = null;
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(samFileReader);
        CloserUtil.close(sw);
    }
}
Also used : CompiledScript(javax.script.CompiledScript) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Color(java.awt.Color) ColorUtils(com.github.lindenb.jvarkit.util.swing.ColorUtils) Bindings(javax.script.Bindings) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 90 with SAMRecordIterator

use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.

the class SamJdk method doWork.

@Override
public int doWork(final List<String> args) {
    SAMRecordIterator iter = null;
    SamReader samFileReader = null;
    SAMFileWriter sw = null;
    try {
        final String code;
        if (this.scriptFile != null) {
            code = IOUtil.slurp(this.scriptFile);
        } else if (!StringUtil.isBlank(this.scriptExpr)) {
            code = this.scriptExpr;
        } else {
            LOG.error("Option -e or -f are required. The content of those empty mut be not empty");
            return -1;
        }
        final Random rand = new Random(System.currentTimeMillis());
        final String javaClassName = SamJdk.class.getSimpleName() + "Custom" + Math.abs(rand.nextInt());
        final StringWriter codeWriter = new StringWriter();
        final PrintWriter pw = new PrintWriter(codeWriter);
        pw.println("import java.util.*;");
        pw.println("import java.util.stream.*;");
        pw.println("import java.util.function.*;");
        pw.println("import htsjdk.samtools.*;");
        pw.println("import htsjdk.samtools.util.*;");
        pw.println("import javax.annotation.Generated;");
        pw.println("@Generated(value=\"" + SamJdk.class.getSimpleName() + "\",date=\"" + new Iso8601Date(new Date()) + "\")");
        pw.println("public class " + javaClassName + " extends " + (this.pair_mode ? AbstractListFilter.class : AbstractFilter.class).getName().replace('$', '.') + " {");
        pw.println("  public " + javaClassName + "(final SAMFileHeader header) {");
        pw.println("  super(header);");
        pw.println("  }");
        if (user_code_is_body) {
            pw.println("   //user's code starts here");
            pw.println(code);
            pw.println("   // user's code ends here");
        } else {
            pw.println("  @Override");
            pw.println("  public Object apply(final " + (this.pair_mode ? "List<SAMRecord> records" : "SAMRecord record") + ") {");
            pw.println("   /** user's code starts here */");
            pw.println(code);
            pw.println("/** user's code ends here */");
            pw.println("   }");
        }
        pw.println("}");
        pw.flush();
        if (!hideGeneratedCode) {
            LOG.debug(" Compiling :\n" + InMemoryCompiler.beautifyCode(codeWriter.toString()));
        }
        if (this.saveCodeInDir != null) {
            PrintWriter cw = null;
            try {
                IOUtil.assertDirectoryIsWritable(this.saveCodeInDir);
                cw = new PrintWriter(new File(this.saveCodeInDir, javaClassName + ".java"));
                cw.write(codeWriter.toString());
                cw.flush();
                cw.close();
                cw = null;
                LOG.info("saved " + javaClassName + ".java in " + this.saveCodeInDir);
            } catch (final Exception err) {
                LOG.error(err);
                return -1;
            } finally {
                CloserUtil.close(cw);
            }
        }
        final InMemoryCompiler inMemoryCompiler = new InMemoryCompiler();
        final Class<?> compiledClass = inMemoryCompiler.compileClass(javaClassName, codeWriter.toString());
        final Constructor<?> ctor = compiledClass.getDeclaredConstructor(SAMFileHeader.class);
        samFileReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header = samFileReader.getFileHeader();
        if (this.pair_mode) {
            final SAMFileHeader.SortOrder order = header.getSortOrder();
            if (order == null || order.equals(SAMFileHeader.SortOrder.unsorted)) {
                LOG.warning("In `--pair` mode , the input BAM is expected to be sorted on queryname but current sort-order is " + order + " ... Continue...");
            } else if (!order.equals(SAMFileHeader.SortOrder.queryname)) {
                LOG.error("In `--pair` mode , the input BAM is expected to be sorted on queryname but I've got \"" + order + "\". " + "Use picard SortSam (not `samtools sort` https://github.com/samtools/hts-specs/issues/5 )");
                return -1;
            }
        }
        long count = 0L;
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        sw = this.writingBamArgs.openSAMFileWriter(this.outputFile, header, true);
        iter = samFileReader.iterator();
        if (this.pair_mode) {
            SAMRecord prev = null;
            final AbstractListFilter filter = (AbstractListFilter) ctor.newInstance(header);
            final List<SAMRecord> buffer = new ArrayList<>();
            for (; ; ) {
                int numWarnings = 100;
                final Comparator<SAMRecord> nameComparator = ReadNameSortMethod.picard.get();
                final SAMRecord record = (iter.hasNext() ? progress.watch(iter.next()) : null);
                if (prev != null && record != null && numWarnings > 0 && nameComparator.compare(prev, record) > 0) {
                    LOG.warn("SamRecord doesn't look sorted on query name using a picard/htsjdk method. Got " + record + " affter " + prev + ". " + "In '--pair'  mode, reads should be sorted on query name using **picard/htsjdk**. (samtools != picard) see https://github.com/samtools/hts-specs/issues/5");
                    --numWarnings;
                }
                prev = record;
                if (record == null || (!buffer.isEmpty() && !buffer.get(0).getReadName().equals(record.getReadName()))) {
                    if (!buffer.isEmpty()) {
                        final Object result = filter.apply(buffer);
                        // result is an array of a collection of reads
                        if (result != null && (result.getClass().isArray() || (result instanceof Collection))) {
                            final Collection<?> col;
                            if (result.getClass().isArray()) {
                                final Object[] array = (Object[]) result;
                                col = Arrays.asList(array);
                            } else {
                                col = (Collection<?>) result;
                            }
                            // write all of reads
                            for (final Object item : col) {
                                if (item == null)
                                    throw new JvarkitException.UserError("item in array is null");
                                if (!(item instanceof SAMRecord))
                                    throw new JvarkitException.UserError("item in array is not a SAMRecord " + item.getClass());
                                ++count;
                                sw.addAlignment(SAMRecord.class.cast(item));
                                if (this.LIMIT > 0L && count >= this.LIMIT)
                                    break;
                            }
                        } else // result is a SAMRecord
                        if (result != null && (result instanceof SAMRecord)) {
                            ++count;
                            sw.addAlignment(SAMRecord.class.cast(result));
                        } else {
                            boolean accept = true;
                            if (result == null) {
                                accept = false;
                            } else if (result instanceof Boolean) {
                                if (Boolean.FALSE.equals(result))
                                    accept = false;
                            } else if (result instanceof Number) {
                                if (((Number) result).intValue() != 1)
                                    accept = false;
                            } else {
                                LOG.warn("Script returned something that is not a boolean or a number:" + result.getClass());
                                accept = false;
                            }
                            if (!accept) {
                                for (final SAMRecord item : buffer) {
                                    failing(item, header);
                                }
                            } else {
                                for (final SAMRecord item : buffer) {
                                    ++count;
                                    sw.addAlignment(item);
                                }
                            }
                        }
                    }
                    // end of if !buffer.isEmpty()
                    if (record == null)
                        break;
                    buffer.clear();
                }
                // end flush flush
                if (this.LIMIT > 0L && count >= this.LIMIT)
                    break;
                buffer.add(record);
            }
        // infinite loop
        } else {
            final AbstractFilter filter = (AbstractFilter) ctor.newInstance(header);
            while (iter.hasNext()) {
                final SAMRecord record = progress.watch(iter.next());
                final Object result = filter.apply(record);
                // result is an array of a collection of reads
                if (result != null && (result.getClass().isArray() || (result instanceof Collection))) {
                    final Collection<?> col;
                    if (result.getClass().isArray()) {
                        final Object[] array = (Object[]) result;
                        col = Arrays.asList(array);
                    } else {
                        col = (Collection<?>) result;
                    }
                    // write all of reads
                    for (final Object item : col) {
                        if (item == null)
                            throw new JvarkitException.UserError("item in array is null");
                        if (!(item instanceof SAMRecord))
                            throw new JvarkitException.UserError("item in array is not a SAMRecord " + item.getClass());
                        ++count;
                        sw.addAlignment(SAMRecord.class.cast(item));
                    }
                } else // result is a SAMRecord
                if (result != null && (result instanceof SAMRecord)) {
                    ++count;
                    sw.addAlignment(SAMRecord.class.cast(result));
                } else {
                    boolean accept = true;
                    if (result == null) {
                        accept = false;
                    } else if (result instanceof Boolean) {
                        if (Boolean.FALSE.equals(result))
                            accept = false;
                    } else if (result instanceof Number) {
                        if (((Number) result).intValue() != 1)
                            accept = false;
                    } else {
                        LOG.warn("Script returned something that is not a boolean or a number:" + result.getClass());
                        accept = false;
                    }
                    if (!accept) {
                        failing(record, header);
                    } else {
                        ++count;
                        sw.addAlignment(record);
                    }
                }
                if (this.LIMIT > 0L && count >= this.LIMIT)
                    break;
            }
        }
        sw.close();
        /* create empty if never called */
        openFailing(header);
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(samFileReader);
        CloserUtil.close(sw);
        CloserUtil.close(this.failingReadsWriter);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) SamReader(htsjdk.samtools.SamReader) Random(java.util.Random) StringWriter(java.io.StringWriter) InMemoryCompiler(com.github.lindenb.jvarkit.lang.InMemoryCompiler) Iso8601Date(htsjdk.samtools.util.Iso8601Date) PrintWriter(java.io.PrintWriter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) Date(java.util.Date) Iso8601Date(htsjdk.samtools.util.Iso8601Date) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) Collection(java.util.Collection) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)

Aggregations

SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)107 SAMRecord (htsjdk.samtools.SAMRecord)92 SamReader (htsjdk.samtools.SamReader)83 SAMFileHeader (htsjdk.samtools.SAMFileHeader)49 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)47 File (java.io.File)47 SAMFileWriter (htsjdk.samtools.SAMFileWriter)45 IOException (java.io.IOException)41 ArrayList (java.util.ArrayList)34 CigarElement (htsjdk.samtools.CigarElement)30 Cigar (htsjdk.samtools.Cigar)26 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)24 SamReaderFactory (htsjdk.samtools.SamReaderFactory)21 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)18 CigarOperator (htsjdk.samtools.CigarOperator)16 Interval (htsjdk.samtools.util.Interval)16 PrintWriter (java.io.PrintWriter)15 HashMap (java.util.HashMap)15 SAMFileWriterFactory (htsjdk.samtools.SAMFileWriterFactory)14 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)14