Examples with CharSplitter - com.github.lindenb.jvarkit.lang.CharSplitter

Example 26 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class VcfRenameSamples method parseNames.

private void parseNames(final Path f) throws IOException {
    final CharSplitter tab = CharSplitter.TAB;
    final BufferedReader in = IOUtils.openPathForBufferedReading(f);
    String line;
    while ((line = in.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        if (StringUtil.isBlank(line))
            continue;
        final String[] tokens = tab.split(line);
        if (tokens.length != 2)
            throw new IOException("Expected two columns in \"" + line + "\" : " + f);
        tokens[0] = tokens[0].trim();
        tokens[1] = tokens[1].trim();
        if (StringUtil.isBlank(tokens[0]))
            throw new IOException("Empty src-name in \"" + line + "\" : " + f);
        if (StringUtil.isBlank(tokens[1]))
            throw new IOException("Empty dest-name in \"" + line + "\" : " + f);
        if (tokens[0].equals(tokens[1])) {
            LOG.warning("Same src/dest in in \"" + line + "\" : " + f);
            continue;
        }
        if (this.oldNameToNewName.containsKey(tokens[0])) {
            final String dest = this.oldNameToNewName.get(tokens[0]);
            if (dest.equals(tokens[1])) {
                LOG.warning(tokens[0] + " -> " + tokens[1] + " defined twice");
                continue;
            } else {
                throw new IOException("Empty src-name was first defined as " + dest + " and now as " + tokens[1] + " in \"" + line + "\" : " + f);
            }
        }
        this.oldNameToNewName.put(tokens[0], tokens[1]);
    }
    in.close();
}

Also used : CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException)

Example 27 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class BackLocate method run.

private void run(final PrintStream out, final BufferedReader in) throws IOException {
    final CharSplitter tab = CharSplitter.TAB;
    String line;
    while ((line = in.readLine()) != null) {
        if (line.startsWith("#") || StringUtils.isBlank(line))
            continue;
        final String[] tokens = JvarkitException.TokenErrors.atLeast(2, tab.split(line, 3));
        final String geneName = tokens[0].trim();
        if (StringUtils.isBlank(geneName))
            throw new IOException("Bad line. No gene in " + geneName);
        final String mut = tokens[1].trim();
        int x0 = 0, x1 = 0;
        while (x1 < mut.length() && Character.isLetter(mut.charAt(x1))) {
            ++x1;
        }
        String substr = mut.substring(x0, x1);
        final AminoAcids.AminoAcid aa1 = substr.length() == 1 ? AminoAcids.getAminoAcidFromOneLetterCode(substr.charAt(0)) : AminoAcids.getAminoAcidFromThreeLettersCode(substr);
        if (aa1 == null)
            throw new JvarkitException.UserError("Bad mutation " + mut + " (cannot parse left AA)");
        x0 = x1;
        while (x1 < mut.length() && Character.isDigit(mut.charAt(x1))) {
            ++x1;
        }
        substr = mut.substring(x0, x1);
        if (!StringUtils.isInteger(substr))
            throw new JvarkitException.UserError("Bad mutation " + mut + " (cannot parse position)");
        final int position1 = Integer.parseInt(substr);
        if (position1 == 0)
            throw new IOException("Bad position in protein (" + substr + ") in " + line);
        substr = mut.substring(x1);
        final AminoAcids.AminoAcid aa2 = substr.length() == 1 ? AminoAcids.getAminoAcidFromOneLetterCode(substr.charAt(0)) : AminoAcids.getAminoAcidFromThreeLettersCode(substr);
        if (aa2 == null)
            throw new JvarkitException.UserError("Bad mutation " + mut + " (cannot parse right AA)");
        final List<Transcript> transcripts = this.name2transcripts.get(geneName.toUpperCase());
        if (transcripts == null || transcripts.isEmpty()) {
            LOG.warn("no transcript found for " + geneName);
            continue;
        }
        for (final Transcript transcript : transcripts) {
            backLocate(out, transcript, geneName, aa1, aa2, position1, tokens.length > 2 ? tokens[2] : ".");
        }
    }
}

Also used : JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) AminoAcid(com.github.lindenb.jvarkit.util.bio.AminoAcids.AminoAcid) AminoAcids(com.github.lindenb.jvarkit.util.bio.AminoAcids) IOException(java.io.IOException)

Example 28 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

the class Biostar214299 method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.positionFile == null) {
        LOG.error("position File is not defined.");
        return -1;
    }
    final String UNAFFECTED_SAMPLE = "UNAFFECTED";
    final String AMBIGOUS_SAMPLE = "AMBIGOUS";
    final String UNMAPPED = "UNMAPPED";
    SamReader sfr = null;
    SAMFileWriter sfw = null;
    final IntervalTreeMap<Position> positionsTreeMap = new IntervalTreeMap<>();
    final Set<String> samples = new HashSet<>();
    try {
        sfr = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header = sfr.getFileHeader();
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
        try (BufferedReader br = IOUtils.openPathForBufferedReading(this.positionFile)) {
            String line;
            final CharSplitter tab = CharSplitter.TAB;
            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line) || line.startsWith("#"))
                    continue;
                final String[] tokens = tab.split(line);
                if (tokens.length < 4) {
                    LOG.error("Not enough columns in " + line);
                    return -1;
                }
                final String contig = tokens[0];
                if (dict.getSequence(contig) == null) {
                    LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(contig, dict));
                    return -1;
                }
                final int refpos = Integer.parseInt(tokens[1]);
                final Interval interval = new Interval(contig, refpos, refpos);
                Position position = positionsTreeMap.get(interval);
                if (position == null) {
                    position = new Position();
                    // position.contig = contig;
                    position.refpos = refpos;
                    positionsTreeMap.put(interval, position);
                }
                final String bases = tokens[2].toUpperCase();
                if (bases.length() != 1 || !bases.matches("[ATGC]")) {
                    LOG.error("in " + line + " bases should be one letter and ATGC");
                    return -1;
                }
                if (position.base2sample.containsKey(bases.charAt(0))) {
                    LOG.error("in " + line + " bases already defined for this position");
                    return -1;
                }
                final String sampleName = tokens[3].trim();
                if (sampleName.isEmpty()) {
                    LOG.error("sample name cannot be empty");
                    return -1;
                }
                samples.add(sampleName);
                position.base2sample.put(bases.charAt(0), sampleName);
            }
        } catch (final IOException err) {
            LOG.error(err);
            return -1;
        }
        if (samples.contains(UNAFFECTED_SAMPLE)) {
            LOG.error("Sample cannot be named " + UNAFFECTED_SAMPLE);
            return -1;
        }
        if (samples.contains(AMBIGOUS_SAMPLE)) {
            LOG.error("Sample cannot be named " + AMBIGOUS_SAMPLE);
            return -1;
        }
        if (samples.contains(UNMAPPED)) {
            LOG.error("Sample cannot be named " + UNMAPPED);
            return -1;
        }
        samples.add(UNAFFECTED_SAMPLE);
        samples.add(AMBIGOUS_SAMPLE);
        samples.add(UNMAPPED);
        final SAMFileHeader newHeader = new SAMFileHeader();
        newHeader.setSortOrder(header.getSortOrder());
        newHeader.setSequenceDictionary(dict);
        newHeader.addComment("generated with " + getProgramName() + " " + getVersion() + " Pierre Lindenbaum : " + getProgramCommandLine());
        /* create groups */
        for (final String sample : samples) {
            final SAMReadGroupRecord rg = new SAMReadGroupRecord(sample);
            rg.setSample(sample);
            rg.setLibrary(sample);
            newHeader.addReadGroup(rg);
        }
        sfw = this.writingBamArgs.setReferencePath(this.reference).openSamWriter(this.outputFile, newHeader, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        final SAMRecordIterator iter = sfr.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            rec.setAttribute("RG", null);
            if (rec.getReadUnmappedFlag()) {
                rec.setAttribute("RG", UNMAPPED);
                sfw.addAlignment(rec);
                continue;
            }
            final Cigar cigar = rec.getCigar();
            final Collection<Position> snps = positionsTreeMap.getContained(new Interval(rec.getContig(), rec.getUnclippedStart(), rec.getUnclippedEnd()));
            if (snps == null || snps.isEmpty()) {
                rec.setAttribute("RG", UNAFFECTED_SAMPLE);
                sfw.addAlignment(rec);
                continue;
            }
            final Map<Integer, Position> index2pos = snps.stream().collect(Collectors.toMap(P -> P.refpos, P -> P));
            final Set<String> selectedSamples = new HashSet<>();
            final byte[] bases = rec.getReadBases();
            if (bases == null || bases.equals(SAMRecord.NULL_SEQUENCE)) {
                LOG.error("Bases missing in read " + rec);
                return -1;
            }
            int refPos1 = rec.getUnclippedStart();
            int readPos0 = 0;
            for (final CigarElement ce : cigar.getCigarElements()) {
                final CigarOperator op = ce.getOperator();
                final boolean consummeReadBaseOrSoftClip = op.consumesReadBases() || op.equals(CigarOperator.S);
                if (op.consumesReferenceBases() && consummeReadBaseOrSoftClip) {
                    for (int i = 0; i < ce.getLength(); ++i) {
                        final int nowRefPos1 = (refPos1 + i);
                        final int nowReadPos0 = (readPos0 + i);
                        final Position position = index2pos.get(nowRefPos1);
                        if (position == null)
                            continue;
                        if (nowReadPos0 >= bases.length)
                            continue;
                        final char base = (char) Character.toUpperCase(bases[nowReadPos0]);
                        final String sample = position.base2sample.get(base);
                        if (sample == null)
                            continue;
                        selectedSamples.add(sample);
                        index2pos.remove(nowRefPos1);
                        if (index2pos.isEmpty())
                            break;
                    }
                }
                if (op.consumesReferenceBases() || op.isClipping()) {
                    refPos1 += ce.getLength();
                }
                if (consummeReadBaseOrSoftClip) {
                    readPos0 += ce.getLength();
                }
            }
            if (selectedSamples.isEmpty()) {
                rec.setAttribute("RG", UNAFFECTED_SAMPLE);
            } else if (selectedSamples.size() == 1) {
                rec.setAttribute("RG", selectedSamples.iterator().next());
            } else {
                rec.setAttribute("RG", AMBIGOUS_SAMPLE);
            }
            sfw.addAlignment(rec);
        }
        progress.finish();
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sfr);
        CloserUtil.close(sfw);
    }
}

Also used : Cigar(htsjdk.samtools.Cigar) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) HashMap(java.util.HashMap) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ParametersDelegate(com.beust.jcommander.ParametersDelegate) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) IOException(java.io.IOException) SAMFileWriter(htsjdk.samtools.SAMFileWriter) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) BufferedReader(java.io.BufferedReader) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) HashSet(java.util.HashSet) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) IOException(java.io.IOException) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) SAMFileHeader(htsjdk.samtools.SAMFileHeader) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Interval(htsjdk.samtools.util.Interval)

Aggregations

CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter)28 BufferedReader (java.io.BufferedReader)19 IOException (java.io.IOException)12 List (java.util.List)11 Parameter (com.beust.jcommander.Parameter)10 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)10 Program (com.github.lindenb.jvarkit.util.jcommander.Program)10 Logger (com.github.lindenb.jvarkit.util.log.Logger)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)10 Set (java.util.Set)10 Path (java.nio.file.Path)9 ArrayList (java.util.ArrayList)9 HashSet (java.util.HashSet)9 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)8 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)7 CloserUtil (htsjdk.samtools.util.CloserUtil)7 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)7 VCFHeader (htsjdk.variant.vcf.VCFHeader)7 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)7 Map (java.util.Map)7