Search in sources :

Example 11 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class VcfSimulator method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.indexedFastaSequenceFile == null) {
        LOG.error("Reference is undefined");
        return -1;
    }
    if (!args.isEmpty()) {
        LOG.error("too many arguments");
        return -1;
    }
    if (this.randomSeed == -1L) {
        this.random = new Random(System.currentTimeMillis());
    } else {
        this.random = new Random(this.randomSeed);
    }
    if (this.numSamples < 0) {
        this.numSamples = 1 + this.random.nextInt(10);
    }
    while (this.samples.size() < numSamples) this.samples.add("SAMPLE" + (1 + this.samples.size()));
    VariantContextWriter writer = null;
    PrintStream pw = null;
    try {
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT", "DP");
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "AF", "AN", "AC", "DP");
        VariantAttributesRecalculator calc = new VariantAttributesRecalculator();
        final VCFHeader header = new VCFHeader(metaData, this.samples);
        header.setSequenceDictionary(this.indexedFastaSequenceFile.getSequenceDictionary());
        calc.setHeader(header);
        pw = super.openFileOrStdoutAsPrintStream(this.outputFile);
        writer = VCFUtils.createVariantContextWriterToOutputStream(pw);
        writer.writeHeader(header);
        long countVariantsSoFar = 0;
        for (; ; ) {
            if (pw.checkError())
                break;
            if (this.numberOfVariants >= 0 && countVariantsSoFar >= this.numberOfVariants)
                break;
            for (final SAMSequenceRecord ssr : this.indexedFastaSequenceFile.getSequenceDictionary().getSequences()) {
                if (pw.checkError())
                    break;
                final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
                for (int pos = 1; pos <= ssr.getSequenceLength(); ++pos) {
                    if (pw.checkError())
                        break;
                    char REF = Character.toUpperCase(genomic.charAt(pos - 1));
                    if (REF == 'N')
                        continue;
                    char ALT = 'N';
                    switch(REF) {
                        case 'A':
                            ALT = "TGC".charAt(random.nextInt(3));
                            break;
                        case 'T':
                            ALT = "AGC".charAt(random.nextInt(3));
                            break;
                        case 'G':
                            ALT = "TAC".charAt(random.nextInt(3));
                            break;
                        case 'C':
                            ALT = "TGA".charAt(random.nextInt(3));
                            break;
                        default:
                            ALT = 'N';
                    }
                    if (ALT == 'N')
                        continue;
                    final Allele refAllele = Allele.create((byte) REF, true);
                    Allele altAllele = Allele.create((byte) ALT, false);
                    final VariantContextBuilder cb = new VariantContextBuilder();
                    cb.chr(genomic.getChrom());
                    cb.start(pos);
                    cb.stop(pos);
                    List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
                    for (String sample : samples) {
                        final Allele a1 = (random.nextBoolean() ? refAllele : altAllele);
                        final Allele a2 = (random.nextBoolean() ? refAllele : altAllele);
                        GenotypeBuilder gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
                        if (random.nextBoolean()) {
                            gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
                            gb.DP(1 + random.nextInt(50));
                        }
                        genotypes.add(gb.make());
                    }
                    cb.genotypes(genotypes);
                    cb.alleles(Arrays.asList(refAllele, altAllele));
                    writer.add(calc.apply(cb.make()));
                    countVariantsSoFar++;
                }
            }
        }
        writer.close();
        writer = null;
        pw.flush();
        pw.close();
        pw = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(pw);
        CloserUtil.close(writer);
    }
}
Also used : PrintStream(java.io.PrintStream) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ArrayList(java.util.ArrayList) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Allele(htsjdk.variant.variantcontext.Allele) Random(java.util.Random) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VariantAttributesRecalculator(com.github.lindenb.jvarkit.util.vcf.VariantAttributesRecalculator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 12 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class LocalRealignReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidxFile == null) {
        LOG.error("REFerence file missing;");
        return -1;
    }
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samReader = null;
    SAMFileWriter w = null;
    SAMRecordIterator iter = null;
    GenomicSequence genomicSequence = null;
    LongestCommonSequence matrix = new LongestCommonSequence();
    try {
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
        samReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header1 = samReader.getFileHeader();
        final SAMFileHeader header2 = header1.clone();
        header2.setSortOrder(SortOrder.unsorted);
        w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = samReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
                w.addAlignment(rec);
                continue;
            }
            final Cigar cigar = rec.getCigar();
            final int nCigarElement = cigar.numCigarElements();
            if (nCigarElement < 2) {
                w.addAlignment(rec);
                continue;
            }
            final CigarElement ce5 = cigar.getCigarElement(0);
            final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
            if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
                w.addAlignment(rec);
                continue;
            }
            if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
            }
            final CharSequence readseq = rec.getReadString();
            /**
             * short invert
             */
            if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
                }
            /*
					
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}
					*/
            }
            if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
                }
            /*
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}*/
            }
            /* other */
            List<LongestCommonSequence.Hit> hits = new ArrayList<>();
            align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
            if (hits.size() < 2000)
                continue;
            for (LongestCommonSequence.Hit hit : hits) {
                int readstart = 0;
                boolean in_M = false;
                for (int i = 0; i < nCigarElement; ++i) {
                    final CigarElement ce = cigar.getCigarElement(i);
                    if (ce.getOperator().consumesReadBases()) {
                        int readend = readstart + ce.getLength();
                        if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
                            if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
                                in_M = true;
                                break;
                            }
                        }
                        readstart = readend;
                    }
                }
                if (in_M)
                    continue;
                int align_size = hit.size();
                System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
                System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
                System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
                System.err.print("REF  :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(genomicSequence.charAt(hit.getStartX() + i));
                }
                System.err.println();
                System.err.print("READ :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(readseq.charAt(hit.getStartY() + i));
                }
                System.err.println();
            }
            System.err.println();
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        return wrapException(e);
    } finally {
        genomicSequence = null;
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
        CloserUtil.close(w);
        CloserUtil.close(indexedFastaSequenceFile);
        matrix = null;
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) ArrayList(java.util.ArrayList) LongestCommonSequence(com.github.lindenb.jvarkit.util.align.LongestCommonSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) SAMRecord(htsjdk.samtools.SAMRecord) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 13 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class MapUniProtFeatures method doWork.

@Override
public int doWork(List<String> args) {
    PrintWriter pw = null;
    try {
        JAXBContext jc = JAXBContext.newInstance("org.uniprot");
        Unmarshaller uniprotUnmarshaller = jc.createUnmarshaller();
        Marshaller uniprotMarshaller = jc.createMarshaller();
        pw = super.openFileOrStdoutAsPrintWriter(OUT);
        LOG.info("read " + REF);
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(REF);
        if (this.indexedFastaSequenceFile.getSequenceDictionary() == null) {
            LOG.error("fasta sequence is not indexed");
            return -1;
        }
        LOG.info("readubf " + kgUri);
        String line;
        Pattern tab = Pattern.compile("[\t]");
        BufferedReader r = IOUtils.openURIForBufferedReading(kgUri);
        while ((line = r.readLine()) != null) {
            String[] tokens = tab.split(line);
            KnownGene kg = new KnownGene();
            kg.setName(tokens[0]);
            kg.setChrom(tokens[1]);
            kg.setStrand(tokens[2].charAt(0));
            kg.setTxStart(Integer.parseInt(tokens[3]));
            kg.setTxEnd(Integer.parseInt(tokens[4]));
            kg.setCdsStart(Integer.parseInt(tokens[5]));
            kg.setCdsEnd(Integer.parseInt(tokens[6]));
            kg.setExonBounds(Integer.parseInt(tokens[7]), tokens[8], tokens[9]);
            List<KnownGene> L = prot2genes.get(tokens[10]);
            if (L == null) {
                L = new ArrayList<KnownGene>();
                prot2genes.put(tokens[10], L);
            }
            if (indexedFastaSequenceFile.getSequenceDictionary().getSequence(kg.getContig()) == null) {
                LOG.info("ignoring " + line);
                continue;
            }
            L.add(kg);
        }
        r.close();
        if (OUT != null) {
            LOG.info("opening " + OUT);
            pw = new PrintWriter(OUT);
        }
        LOG.info("knownGenes: " + this.prot2genes.size());
        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlInputFactory.setXMLResolver(new XMLResolver() {

            @Override
            public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
                LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
                return null;
            }
        });
        // SortingCollection<UBed> mappedFeatures=SortingCollection.newInstance(UBed.class, new UBedCodec(),new UBedCmp(),super.MAX_RECORDS_IN_RAM);
        // mappedFeatures.setDestructiveIteration(true);
        LOG.info("Scanning " + UNIPROT);
        Reader fr = IOUtils.openURIForBufferedReading(UNIPROT);
        XMLEventReader rx = xmlInputFactory.createXMLEventReader(fr);
        final QName uEntry = new QName(UNIPROT_NS, "entry");
        GenomicSequence genomic = null;
        while (rx.hasNext()) {
            XMLEvent evt = rx.peek();
            if (!(evt.isStartElement() && evt.asStartElement().getName().equals(uEntry))) {
                rx.next();
                continue;
            }
            JAXBElement<Entry> jaxbElement = uniprotUnmarshaller.unmarshal(rx, Entry.class);
            Entry entry = jaxbElement.getValue();
            if (entry.getFeature().isEmpty())
                continue;
            List<KnownGene> genes = null;
            for (String acn : entry.getAccession()) {
                genes = prot2genes.get(acn);
                if (genes != null)
                    break;
            }
            if (genes == null)
                continue;
            for (KnownGene kg : genes) {
                if (genomic == null || !genomic.getChrom().equals(kg.getChromosome())) {
                    genomic = new GenomicSequence(this.indexedFastaSequenceFile, kg.getChromosome());
                }
                KnownGene.Peptide pep = kg.getCodingRNA(genomic).getPeptide();
                int sameSequenceLength = 0;
                while (sameSequenceLength < entry.getSequence().getValue().length() && sameSequenceLength < pep.length()) {
                    if (Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength)) != Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength))) {
                        break;
                    }
                    sameSequenceLength++;
                }
                if (sameSequenceLength != pep.length()) {
                    System.err.println("Not Same sequence " + kg.getName() + " strand " + kg.getStrand() + " ok-up-to-" + sameSequenceLength);
                    System.err.println("P:" + pep.toString() + " " + pep.length());
                    System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
                    if (pep.toString().contains("?")) {
                        System.err.println("RNA:" + pep.getCodingRNA().toString());
                    }
                }
                if (sameSequenceLength == 0)
                    continue;
                for (FeatureType feat : entry.getFeature()) {
                    if (feat.getType() == null || feat.getType().isEmpty())
                        continue;
                    LocationType locType = feat.getLocation();
                    if (locType == null)
                        continue;
                    int pepStart, pepEnd;
                    if (locType.getPosition() != null && locType.getPosition().getPosition() != null) {
                        pepEnd = locType.getPosition().getPosition().intValue();
                        pepStart = pepEnd - 1;
                    } else if (locType.getBegin() != null && locType.getEnd() != null && locType.getBegin().getPosition() != null && locType.getEnd().getPosition() != null) {
                        pepStart = locType.getBegin().getPosition().intValue() - 1;
                        pepEnd = locType.getEnd().getPosition().intValue();
                    } else {
                        continue;
                    }
                    if (pepEnd >= sameSequenceLength)
                        continue;
                    List<Integer> genomicPos = new ArrayList<Integer>();
                    while (pepStart < pepEnd) {
                        if (pepStart >= pep.length()) {
                            System.err.println();
                            System.err.println("P:" + pep.toString() + " " + pep.length() + " " + kg.getStrand());
                            System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
                            uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
                        }
                        int[] codon = pep.convertToGenomicCoordinates(pepStart);
                        pepStart++;
                        for (int gP : codon) {
                            if (gP == -1) {
                                uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
                                LOG.error("error in genomoc for (" + pepStart + "/" + pepEnd + "):" + entry.getName());
                                System.exit(-1);
                            }
                            genomicPos.add(gP);
                        }
                    }
                    Collections.sort(genomicPos);
                    UBed ubed = new UBed();
                    ubed.tid = this.indexedFastaSequenceFile.getSequenceDictionary().getSequenceIndex(genomic.getChrom());
                    int k0 = 0;
                    while (k0 < genomicPos.size()) {
                        Range range = new Range(genomicPos.get(k0), genomicPos.get(k0) + 1);
                        k0++;
                        while (k0 < genomicPos.size() && range.end == genomicPos.get(k0)) {
                            range = new Range(range.start, range.end + 1);
                            ++k0;
                        }
                        ubed.positions.add(range);
                    }
                    ubed.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    ubed.name = feat.getType();
                    ubed.print(pw);
                }
            }
        }
        rx.close();
        fr.close();
        LOG.info("End scan uniprot");
    /*
			mappedFeatures.doneAdding();
			
			
			CloseableIterator<UBed> iter=mappedFeatures.iterator();
			while(iter.hasNext())
				{
				UBed ubed=iter.next();
				ubed.print();
				}
			mappedFeatures.cleanup();
			*/
    } catch (Exception err) {
        err.printStackTrace();
        if (OUT != null)
            OUT.deleteOnExit();
        return -1;
    } finally {
        pw.flush();
        pw.close();
        CloserUtil.close(this.indexedFastaSequenceFile);
    }
    return 0;
}
Also used : FeatureType(org.uniprot.FeatureType) ArrayList(java.util.ArrayList) XMLEventReader(javax.xml.stream.XMLEventReader) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader) JAXBContext(javax.xml.bind.JAXBContext) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Entry(org.uniprot.Entry) XMLEventReader(javax.xml.stream.XMLEventReader) XMLResolver(javax.xml.stream.XMLResolver) Unmarshaller(javax.xml.bind.Unmarshaller) PrintWriter(java.io.PrintWriter) Pattern(java.util.regex.Pattern) Marshaller(javax.xml.bind.Marshaller) QName(javax.xml.namespace.QName) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) XMLStreamException(javax.xml.stream.XMLStreamException) XMLStreamException(javax.xml.stream.XMLStreamException) BufferedReader(java.io.BufferedReader) XMLEvent(javax.xml.stream.events.XMLEvent) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) LocationType(org.uniprot.LocationType) XMLInputFactory(javax.xml.stream.XMLInputFactory)

Example 14 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class VcfToBam method run.

private void run(VcfIterator vcfIterator) throws IOException {
    long id_generator = 0L;
    SAMFileWriter samFileWriter = null;
    final VCFHeader header = vcfIterator.getHeader();
    SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null)
        throw new IOException("Sequence Dictionary missing in VCF");
    if (!SequenceUtil.areSequenceDictionariesEqual(dict, indexedFastaSequenceFile.getSequenceDictionary())) {
        LOG.warning("REF/VCF: not the same Sequence dictonary " + dict + " " + indexedFastaSequenceFile.getSequenceDictionary());
    }
    final SAMFileHeader samHeader = new SAMFileHeader();
    samHeader.setSequenceDictionary(dict);
    for (final String sample : new TreeSet<>(header.getSampleNamesInOrder())) {
        final SAMReadGroupRecord rg = new SAMReadGroupRecord(sample);
        rg.setSample(sample);
        rg.setLibrary(sample);
        rg.setDescription(sample);
        rg.setLibrary("illumina");
        samHeader.addReadGroup(rg);
    }
    samHeader.addComment("Generated with " + getProgramCommandLine());
    samHeader.setSortOrder(SortOrder.unsorted);
    samFileWriter = this.writingBamArgs.setReferenceFile(this.faidx).openSAMFileWriter(this.outputFile, samHeader, true);
    /* looping over sequences */
    for (final SAMSequenceRecord ssr : dict.getSequences()) {
        final LinkedList<VariantContext> variantBuffer = new LinkedList<>();
        GenomicSequence genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
        int x = 1;
        while (x + this.fragmentSize <= ssr.getSequenceLength()) {
            // pop on left
            while (!variantBuffer.isEmpty() && (variantBuffer.getFirst().getStart() + this.fragmentSize * 2) < x) {
                variantBuffer.removeFirst();
            }
            // refill buffer
            while (vcfIterator.hasNext()) {
                // buffer is already by enough
                if (!variantBuffer.isEmpty() && variantBuffer.getLast().getStart() > x + 2 * fragmentSize) {
                    break;
                }
                final VariantContext ctx = vcfIterator.peek();
                if (ctx == null)
                    break;
                if (ctx.isIndel() || !ctx.isVariant()) {
                    LOG.warning("Cannot handle " + ctx.getReference() + "/" + ctx.getAlternateAlleles());
                    // consumme
                    vcfIterator.next();
                    continue;
                }
                final SAMSequenceRecord variantContig = dict.getSequence(ctx.getContig());
                if (variantContig == null)
                    throw new IOException("Unknown contig " + ctx.getContig());
                if (variantContig.getSequenceIndex() < ssr.getSequenceIndex()) {
                    // just consumme !
                    // https://github.com/lindenb/jvarkit/issues/86#issuecomment-326986654
                    // consumme
                    vcfIterator.next();
                    continue;
                } else if (variantContig.getSequenceIndex() > ssr.getSequenceIndex()) {
                    break;
                } else {
                    variantBuffer.add(vcfIterator.next());
                }
            }
            if (variantBuffer.isEmpty()) {
                LOG.info("no variant ?");
                // no variant on this chromosome
                break;
            }
            if (x < variantBuffer.getFirst().getStart() - 2 * fragmentSize) {
                x = variantBuffer.getFirst().getStart() - 2 * fragmentSize;
            }
            for (int depth = 0; depth < 1; ++depth) {
                for (String sample : header.getSampleNamesInOrder()) {
                    // loop over genomic strand
                    for (int g_strand = 0; g_strand < 2; ++g_strand) {
                        SAMRecord[] records = { new SAMRecord(samHeader), new SAMRecord(samHeader) };
                        String readName = String.format("%010d", ++id_generator);
                        for (int R = 0; R < 2; ++R) {
                            records[R].setReferenceName(ssr.getSequenceName());
                            records[R].setReadPairedFlag(true);
                            records[R].setReadName(readName);
                            records[R].setMappingQuality(60);
                            records[R].setProperPairFlag(true);
                            records[R].setMateReferenceName(ssr.getSequenceName());
                            records[R].setMateUnmappedFlag(false);
                            records[R].setReadUnmappedFlag(false);
                            records[R].setAttribute("RG", sample);
                            records[R].setReadNegativeStrandFlag(R == 1);
                            StringBuilder sequence = new StringBuilder(this.readSize);
                            int readLen = 0;
                            int refPos1 = (R == 0 ? x : x + this.fragmentSize - this.readSize);
                            records[R].setAlignmentStart(refPos1);
                            List<CigarElement> cigarElements = new ArrayList<>(this.readSize);
                            int NM = 0;
                            while (readLen < this.readSize) {
                                String base = null;
                                VariantContext overlap = null;
                                for (VariantContext vc : variantBuffer) {
                                    int d = vc.getStart() - (refPos1 + readLen);
                                    if (d == 0) {
                                        overlap = vc;
                                        break;
                                    }
                                    if (d > 0)
                                        break;
                                }
                                if (overlap != null) {
                                    Genotype genotype = overlap.getGenotype(sample);
                                    if (genotype.isCalled() && !genotype.isMixed()) {
                                        List<Allele> alleles = genotype.getAlleles();
                                        if (alleles.size() != 2)
                                            throw new RuntimeException("Not a diploid organism.");
                                        Allele allele = null;
                                        if (genotype.isPhased()) {
                                            allele = alleles.get(g_strand);
                                        } else {
                                            allele = alleles.get(Math.random() < 0.5 ? 0 : 1);
                                        }
                                        if (allele.isSymbolic())
                                            throw new RuntimeException("Cannot handle symbolic alleles");
                                        if (allele.isReference()) {
                                            cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.EQ));
                                        } else if (allele.getBaseString().length() < overlap.getReference().getBaseString().length()) {
                                            cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.D));
                                            NM++;
                                        } else if (allele.getBaseString().length() > overlap.getReference().getBaseString().length()) {
                                            cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.I));
                                            NM++;
                                        } else // same size
                                        {
                                            cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.X));
                                            NM++;
                                        }
                                        base = allele.getBaseString().toLowerCase();
                                    }
                                }
                                if (base == null) {
                                    base = String.valueOf(genomicSequence.charAt(refPos1 - 1 + readLen));
                                    cigarElements.add(new CigarElement(1, CigarOperator.EQ));
                                }
                                sequence.append(base);
                                readLen += base.length();
                            }
                            while (sequence.length() > this.readSize) sequence.deleteCharAt(sequence.length() - 1);
                            records[R].setReadString(sequence.toString());
                            StringBuilder qual = new StringBuilder(sequence.length());
                            for (int i = 0; i < sequence.length(); ++i) qual.append("I");
                            records[R].setBaseQualityString(qual.toString());
                            // cigar
                            int c = 0;
                            while (c + 1 < cigarElements.size()) {
                                CigarElement c0 = cigarElements.get(c);
                                CigarElement c1 = cigarElements.get(c + 1);
                                if (c0.getOperator().equals(c1.getOperator())) {
                                    cigarElements.set(c, new CigarElement(c0.getLength() + c1.getLength(), c0.getOperator()));
                                    cigarElements.remove(c + 1);
                                } else {
                                    ++c;
                                }
                            }
                            records[R].setCigar(new Cigar(cigarElements));
                            records[R].setAttribute("NM", NM);
                        }
                        if (Math.random() < 0.5) {
                            records[0].setFirstOfPairFlag(true);
                            records[0].setSecondOfPairFlag(false);
                            records[1].setFirstOfPairFlag(false);
                            records[1].setSecondOfPairFlag(true);
                        } else {
                            records[0].setFirstOfPairFlag(false);
                            records[0].setSecondOfPairFlag(true);
                            records[1].setFirstOfPairFlag(true);
                            records[1].setSecondOfPairFlag(false);
                        }
                        records[0].setMateAlignmentStart(records[1].getAlignmentStart());
                        records[1].setMateAlignmentStart(records[0].getAlignmentStart());
                        records[0].setInferredInsertSize(records[1].getAlignmentStart() - records[0].getAlignmentStart());
                        records[1].setInferredInsertSize(records[0].getAlignmentStart() - records[1].getAlignmentStart());
                        samFileWriter.addAlignment(records[0]);
                        samFileWriter.addAlignment(records[1]);
                    }
                }
            }
            ++x;
        }
    }
    samFileWriter.close();
}
Also used : SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) TreeSet(java.util.TreeSet) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) Genotype(htsjdk.variant.variantcontext.Genotype) IOException(java.io.IOException) CigarElement(htsjdk.samtools.CigarElement) LinkedList(java.util.LinkedList) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 15 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class NoZeroVariationVCF method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    final VCFHeader header = in.getHeader();
    if (in.hasNext()) {
        LOG.info("found a variant in VCF.");
        VCFUtils.copyHeaderAndVariantsTo(in, out);
    } else {
        LOG.info("no a variant in VCF. Creating a fake Variant");
        header.addMetaDataLine(new VCFFilterHeaderLine("FAKESNP", "Fake SNP created because vcf input was empty."));
        VCFFormatHeaderLine gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY);
        if (gtHeaderLine == null) {
            LOG.info("Adding GT to header");
            header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        }
        gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY);
        if (gtHeaderLine == null) {
            LOG.info("Adding GQ to header");
            header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality"));
        }
        out.writeHeader(header);
        SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        // choose random chrom, best is 'random' , but not 1...23,X,Y, etc...
        String chrom = dict.getSequence(0).getSequenceName();
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            String ssn = ssr.getSequenceName();
            if (ssn.contains("_")) {
                chrom = ssn;
                break;
            }
        }
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            String ssn = ssr.getSequenceName();
            if (ssn.toLowerCase().contains("random")) {
                chrom = ssn;
                break;
            }
            if (ssn.toLowerCase().contains("gl")) {
                chrom = ssn;
                break;
            }
        }
        GenomicSequence gseq = new GenomicSequence(this.indexedFastaSequenceFile, chrom);
        char ref = 'N';
        char alt = 'N';
        int POS = 0;
        for (POS = 0; POS < gseq.length(); ++POS) {
            ref = Character.toUpperCase(gseq.charAt(POS));
            if (ref == 'N')
                continue;
            switch(ref) {
                case 'A':
                    alt = 'T';
                    break;
                case 'T':
                    alt = 'G';
                    break;
                case 'G':
                    alt = 'C';
                    break;
                case 'C':
                    alt = 'A';
                    break;
                default:
                    break;
            }
            if (alt == 'N')
                continue;
            break;
        }
        if (alt == 'N')
            throw new RuntimeException("found only N");
        VariantContextBuilder vcb = new VariantContextBuilder();
        Allele a1 = Allele.create((byte) ref, true);
        Allele a2 = Allele.create((byte) alt, false);
        List<Allele> la1a2 = new ArrayList<Allele>(2);
        List<Genotype> genotypes = new ArrayList<Genotype>(header.getSampleNamesInOrder().size());
        la1a2.add(a1);
        la1a2.add(a2);
        vcb.chr(gseq.getChrom());
        vcb.start(POS + 1);
        vcb.stop(POS + 1);
        vcb.filter("FAKESNP");
        vcb.alleles(la1a2);
        vcb.log10PError(-0.1);
        for (String sample : header.getSampleNamesInOrder()) {
            final GenotypeBuilder gb = new GenotypeBuilder(sample);
            if (genotypes.isEmpty()) {
                gb.DP(1);
                gb.GQ(1);
                gb.alleles(la1a2);
                gb.noAD();
                gb.noPL();
                genotypes.add(gb.make());
            } else {
                genotypes.add(GenotypeBuilder.createMissing(sample, 2));
            }
        }
        vcb.genotypes(genotypes);
        vcb.noID();
        out.add(vcb.make());
    }
    return 0;
}
Also used : GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ArrayList(java.util.ArrayList) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine)

Aggregations

GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)24 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)15 ArrayList (java.util.ArrayList)12 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)10 CigarElement (htsjdk.samtools.CigarElement)10 SAMRecord (htsjdk.samtools.SAMRecord)10 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)10 SamReader (htsjdk.samtools.SamReader)10 VCFHeader (htsjdk.variant.vcf.VCFHeader)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)9 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)8 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)8 SAMFileHeader (htsjdk.samtools.SAMFileHeader)7 Allele (htsjdk.variant.variantcontext.Allele)7 Cigar (htsjdk.samtools.Cigar)6 VariantContext (htsjdk.variant.variantcontext.VariantContext)6 HashSet (java.util.HashSet)6 CigarOperator (htsjdk.samtools.CigarOperator)5 Genotype (htsjdk.variant.variantcontext.Genotype)5 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)5