Search in sources :

Example 16 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class ContigNameConverter, method fromPath.

/**
 * Builds a {@link ContigNameConverter} from a two-column, tab-delimited mapping file
 * (source-contig TAB target-contig; blank lines and lines starting with '#' are ignored).
 *
 * @param mappingFile readable path to the mapping file
 * @return a map-based converter named after the file
 * @throws RuntimeIOException wrapping any I/O error or malformed/duplicated mapping line
 */
public static ContigNameConverter fromPath(final Path mappingFile) {
    IOUtil.assertFileIsReadable(mappingFile);
    final MapBasedContigNameConverter mapper = new MapBasedContigNameConverter();
    mapper.name = mappingFile.getFileName().toString();
    final CharSplitter tab = CharSplitter.TAB;
    // try-with-resources closes the reader on every path, replacing the
    // original's manual close()/null bookkeeping before each throw.
    try (BufferedReader in = IOUtils.openPathForBufferedReading(mappingFile)) {
        String line;
        while ((line = in.readLine()) != null) {
            if (StringUtils.isBlank(line) || line.startsWith("#"))
                continue;
            final String[] tokens = tab.split(line);
            if (tokens.length != 2 || StringUtils.isBlank(tokens[0]) || StringUtils.isBlank(tokens[1])) {
                throw new IOException("Bad mapping line: \"" + line + "\" in " + mappingFile);
            }
            tokens[0] = tokens[0].trim();
            tokens[1] = tokens[1].trim();
            if (mapper.map.containsKey(tokens[0])) {
                throw new IOException("Mapping defined twice for: \"" + tokens[0] + "\"");
            }
            mapper.map.put(tokens[0], tokens[1]);
        }
        return mapper;
    } catch (final IOException err) {
        // callers expect an unchecked exception here
        throw new RuntimeIOException(err);
    }
}
Also used : RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) BufferedReader(java.io.BufferedReader) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException)

Example 17 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class ConvertBamChromosomes, method readMappingFile.

/**
 * Reads {@code this.mappingFile} (source-contig TAB target-contig per line) and returns an
 * insertion-ordered src→dest map. Lines whose source is absent from {@code dictIn}, or whose
 * destination is absent from {@code dictOut} (when non-null), are skipped with a warning.
 *
 * @param dictIn  input sequence dictionary; sources must exist here
 * @param dictOut output sequence dictionary, or null to skip destination validation
 * @return mapping of input contig name to output contig name, in file order
 * @throws IOException on malformed lines, '*' contig names, or duplicate src/dest entries
 */
private Map<String, String> readMappingFile(final SAMSequenceDictionary dictIn, final SAMSequenceDictionary dictOut) throws IOException {
    final Map<String, String> mapper = new LinkedHashMap<>();
    final CharSplitter tab = CharSplitter.TAB;
    // try-with-resources: the original closed 'in' by hand on some throw paths
    // but not all (the duplicate-dest branch); this makes every path uniform.
    try (BufferedReader in = IOUtils.openPathForBufferedReading(this.mappingFile)) {
        String line;
        while ((line = in.readLine()) != null) {
            if (StringUtil.isBlank(line) || line.startsWith("#"))
                continue;
            final String[] tokens = tab.split(line);
            if (tokens.length != 2 || StringUtil.isBlank(tokens[0]) || StringUtil.isBlank(tokens[1])) {
                throw new IOException("Bad mapping line: \"" + line + "\"");
            }
            tokens[0] = tokens[0].trim();
            tokens[1] = tokens[1].trim();
            // '*' is the SAM unmapped-reference placeholder; never a valid contig name
            if (tokens[0].equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) || tokens[1].equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
                throw new IOException("Bad name for: " + line);
            }
            if (dictIn.getSequence(tokens[0]) == null) {
                LOG.warn("Input dictionary doesn't contain " + tokens[0] + ", skipping");
                continue;
            }
            if (dictOut != null && dictOut.getSequence(tokens[1]) == null) {
                LOG.warn("Output dictionary doesn't contain " + tokens[1] + ", skipping");
                continue;
            }
            if (mapper.containsKey(tokens[0])) {
                throw new IOException("Mapping defined twice for src \"" + tokens[0] + "\"");
            }
            // containsValue instead of values().contains — same semantics, clearer intent
            if (mapper.containsValue(tokens[1])) {
                throw new IOException("Mapping defined twice for dest \"" + tokens[1] + "\"");
            }
            mapper.put(tokens[0], tokens[1]);
        }
        return mapper;
    }
}
Also used : CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) LinkedHashMap(java.util.LinkedHashMap)

Example 18 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class ConvertBamChromosomes, method doWork.

/**
 * Converts contig names in a BAM/SAM stream (reference, mate reference and SA tag)
 * according to either a mapping file, a target dictionary, or both.
 * Reads from the single input argument or stdin; writes via writingBamArgs.
 *
 * @param args zero or one input file
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    final CharSplitter SEMICOLON_PAT = CharSplitter.SEMICOLON;
    final CharSplitter COMMA_PAT = CharSplitter.COMMA;
    if (this.dictSource == null && this.mappingFile == null) {
        LOG.error("dict source and mapping file are both undefined");
        return -1;
    }
    SamReader sr = null;
    SAMFileWriter sfw = null;
    try {
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        final String input = oneFileOrNull(args);
        if (input == null) {
            sr = srf.open(SamInputResource.of(stdin()));
        } else {
            sr = srf.open(SamInputResource.of(input));
        }
        final SAMFileHeader inHeader = sr.getFileHeader();
        if (inHeader == null) {
            LOG.error("File header missing");
            return -1;
        }
        final SAMSequenceDictionary dictIn = SequenceDictionaryUtils.extractRequired(inHeader);
        if (dictIn.isEmpty()) {
            LOG.error("input Sequence dict is empty");
            return -1;
        }
        final SAMFileHeader outHeader = inHeader.clone();
        // create new sequence dict
        SAMSequenceDictionary dictOut = null;
        Map<String, String> contigMap = null;
        if (this.dictSource != null) {
            dictOut = SAMSequenceDictionaryExtractor.extractDictionary(this.dictSource);
        }
        if (this.mappingFile != null) {
            contigMap = this.readMappingFile(dictIn, dictOut);
        }
        if (dictOut == null && contigMap != null) {
            // FIX: the original re-invoked readMappingFile(dictIn, dictOut) here,
            // re-reading the mapping file although contigMap was loaded just above
            // with the same arguments. Build the output dictionary from that map.
            final List<SAMSequenceRecord> ssrs = new ArrayList<SAMSequenceRecord>(dictIn.size());
            for (final String ci : contigMap.keySet()) {
                final SAMSequenceRecord ssr1 = dictIn.getSequence(ci);
                if (ssr1 == null)
                    continue;
                final SAMSequenceRecord ssr2 = new SAMSequenceRecord(contigMap.get(ci), ssr1.getSequenceLength());
                for (final Map.Entry<String, String> atts : ssr1.getAttributes()) {
                    ssr2.setAttribute(atts.getKey(), atts.getValue());
                }
                ssrs.add(ssr2);
            }
            dictOut = new SAMSequenceDictionary(ssrs);
            this.mapper = ContigNameConverter.fromMap(contigMap);
        } else if (dictOut != null && contigMap == null) {
            this.mapper = ContigNameConverter.fromDictionaries(dictIn, dictOut);
        }
        if (this.mapper == null || dictOut == null) {
            LOG.error("Illegal state mapper==null or dict-out=null");
            return -1;
        }
        outHeader.setSequenceDictionary(dictOut);
        // check order of dictionaries, do we need to set the sort order ?
        int prev_out_index = -1;
        boolean output_is_sorted = true;
        for (int i = 0; i < dictIn.size(); i++) {
            final String si = dictIn.getSequence(i).getSequenceName();
            final String so = this.mapper.apply(si);
            if (so == null)
                continue;
            final int o_index = dictOut.getSequenceIndex(so);
            if (o_index < prev_out_index) {
                output_is_sorted = false;
                LOG.info("output will NOT be sorted");
                break;
            }
            prev_out_index = o_index;
        }
        if (!output_is_sorted) {
            outHeader.setSortOrder(SortOrder.unsorted);
        }
        sfw = this.writingBamArgs.openSamWriter(this.output, outHeader, true);
        long num_ignored = 0L;
        // try-with-resources: the original never closed the iterator
        try (SAMRecordIterator iter = sr.iterator()) {
            while (iter.hasNext()) {
                final SAMRecord rec1 = iter.next();
                String newName1 = null;
                String newName2 = null;
                // translate names BEFORE swapping the header, while the old dict is still attached
                if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                    newName1 = convert(rec1.getReferenceName(), dictOut);
                }
                if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                    newName2 = convert(rec1.getMateReferenceName(), dictOut);
                }
                rec1.setHeader(outHeader);
                if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                    if (newName1 == null) {
                        // contig has no mapping: drop the read
                        ++num_ignored;
                        continue;
                    } else {
                        rec1.setReferenceName(newName1);
                    }
                }
                if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                    if (newName2 == null) {
                        ++num_ignored;
                        continue;
                    }
                    rec1.setMateReferenceName(newName2);
                }
                // rewrite contigs inside the SA (supplementary alignment) tag as well
                if (rec1.hasAttribute(SAMTag.SA.name())) {
                    final Object sa = rec1.getStringAttribute(SAMTag.SA.name());
                    if (sa != null && (sa instanceof String)) {
                        final String[] tokens = SEMICOLON_PAT.split(String.class.cast(sa));
                        final List<String> L = new ArrayList<>(tokens.length);
                        for (final String semiColonStr : tokens) {
                            final String[] commaStrs = COMMA_PAT.split(semiColonStr);
                            final String newctg = convert(commaStrs[0], dictOut);
                            if (newctg == null)
                                continue;
                            commaStrs[0] = newctg;
                            L.add(String.join(",", commaStrs));
                        }
                        if (L.isEmpty()) {
                            rec1.setAttribute(SAMTag.SA.name(), null);
                        } else {
                            rec1.setAttribute(SAMTag.SA.name(), String.join(";", L));
                        }
                    }
                }
                sfw.addAlignment(rec1);
            }
        }
        if (!this.unmappedChromosomes.isEmpty()) {
            LOG.warning("Unmapped chromosomes: " + unmappedChromosomes);
        }
        LOG.warning("num ignored read:" + num_ignored);
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sr);
        CloserUtil.close(sfw);
    }
}
Also used : CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMFileWriter(htsjdk.samtools.SAMFileWriter) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 19 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class ConvertBedChromosomes, method doWork.

/**
 * Streams a delimited text file line by line, rewriting chromosome names in the
 * configured columns via {@code customMapping}. Lines matching the ignore pattern
 * at position 0 are copied through unchanged. Behavior on unconvertible lines is
 * controlled by {@code onNotFound} (raise / skip / emit original values).
 *
 * @param in  source reader (not closed here — hence the resource suppression)
 * @param out destination stream, flushed before returning
 * @return always 0
 * @throws IOException on read error or RAISE_EXCEPTION policy
 */
@SuppressWarnings("resource")
protected int doWork(final BufferedReader in, PrintStream out) throws IOException {
    final CharSplitter splitter = CharSplitter.of(this.delimStr);
    final Pattern headerRegex = Pattern.compile(this.ignoreLinesPattern);
    for (String row = in.readLine(); row != null; row = in.readLine()) {
        final Matcher matcher = headerRegex.matcher(row);
        // pass-through for header / ignored lines (pattern must match at column 0)
        if (matcher.find() && matcher.start() == 0) {
            out.println(row);
            continue;
        }
        final String[] fields = splitter.split(row);
        boolean allConverted = true;
        for (final int col : this.chromColumns0) {
            // column index outside this row: mark the row as not fully converted
            if (col < 0 || col >= fields.length) {
                allConverted = false;
                continue;
            }
            final String mapped = this.customMapping.apply(fields[col]);
            if (StringUtils.isBlank(mapped)) {
                // warn only the first time each unknown chromosome is seen
                if (this.unmappedChromosomes.add(fields[col])) {
                    LOG.warn("cannot convert " + fields[col]);
                }
                allConverted = false;
                continue;
            }
            fields[col] = mapped;
        }
        if (!allConverted) {
            switch(this.onNotFound) {
                case RAISE_EXCEPTION:
                    throw new IOException("Cannot convert chrom in : " + row);
                case SKIP:
                    continue;
                case RETURN_ORIGINAL:
                    // fall through and print the (partially converted) fields
                    break;
            }
        }
        out.println(String.join(String.valueOf(splitter.getDelimiter()), fields));
    }
    out.flush();
    return 0;
}
Also used : Pattern(java.util.regex.Pattern) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Matcher(java.util.regex.Matcher) IOException(java.io.IOException)

Example 20 with CharSplitter

use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.

The class BimToVcf, method doWork.

/**
 * Converts a PLINK .bim file (6 tab-separated columns: contig, id, centimorgan,
 * position, allele1, allele2) into VCF, resolving REF alleles from the indexed
 * reference FASTA. Numeric PLINK contigs 23/24/26 are translated to X/Y/MT.
 *
 * @param args zero or one input .bim file (stdin if absent)
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    BufferedReader r = null;
    ReferenceSequenceFile faidx = null;
    GenomicSequence genomic = null;
    long number_non_snvs = 0L;
    try {
        faidx = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(faidx);
        this.writingVariants.dictionary(dict);
        r = super.openBufferedReader(oneFileOrNull(args));
        final Set<VCFHeaderLine> headerLines = new HashSet<>();
        final VCFInfoHeaderLine morgan = new VCFInfoHeaderLine("MORGAN", 1, VCFHeaderLineType.Float, "Centimorgan");
        final VCFInfoHeaderLine svtype = new VCFInfoHeaderLine("SVTYPE", 1, VCFHeaderLineType.String, "Variation type");
        VCFStandardHeaderLines.addStandardInfoLines(headerLines, false, "");
        headerLines.add(morgan);
        headerLines.add(svtype);
        final List<String> genotypeSampleNames = Collections.emptyList();
        final VCFHeader header = new VCFHeader(headerLines, genotypeSampleNames);
        header.setSequenceDictionary(dict);
        w = this.writingVariants.open(this.outputFile);
        w.writeHeader(header);
        final CharSplitter tab = CharSplitter.TAB;
        String line;
        final Predicate<String> iupacATGC = S -> AcidNucleics.isATGC(S);
        // flat (plink-name, dictionary-name) pairs for PLINK's numeric sex/mito contigs
        final List<String> convertCtg = Arrays.asList("23", "X", "23", "chrX", "24", "Y", "24", "chrY", "26", "MT", "26", "chrM");
        while ((line = r.readLine()) != null) {
            final String[] tokens = tab.split(line);
            if (tokens.length != 6) {
                throw new JvarkitException.TokenErrors(6, tokens);
            }
            Allele a1 = null;
            Allele a2 = null;
            Allele ref = null;
            final String contig = tokens[0];
            SAMSequenceRecord ssr = dict.getSequence(contig);
            for (int i = 0; ssr == null && i + 1 < convertCtg.size(); i += 2) {
                if (!convertCtg.get(i).equals(contig))
                    continue;
                ssr = dict.getSequence(convertCtg.get(i + 1));
            }
            // "25" is PLINK's pseudo-autosomal region, not a real contig: skip it
            if (ssr == null && contig.equals("25")) {
                LOG.warn("ignoring " + line);
                ++number_non_snvs;
                continue;
            }
            if (ssr == null) {
                throw new JvarkitException.ContigNotFoundInDictionary(contig, dict);
            }
            // lazily (re)load the reference sequence when the contig changes
            if (genomic == null || !ssr.getSequenceName().equals(genomic.getChrom())) {
                genomic = new GenomicSequence(faidx, ssr.getSequenceName());
            }
            int pos1 = Integer.parseInt(tokens[3]);
            // "0" is PLINK's missing-allele marker: substitute the other allele
            if (tokens[4].equals("0"))
                tokens[4] = tokens[5];
            if (tokens[5].equals("0"))
                tokens[5] = tokens[4];
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(ssr.getSequenceName());
            vcb.attribute(morgan.getID(), Float.parseFloat(tokens[2]));
            if (iupacATGC.test(tokens[4]) && iupacATGC.test(tokens[5])) {
                // FIX: original tested tokens[5].length() != -1 (always true),
                // which made this branch skip EVERY line; the intent is "both
                // alleles must be single bases".
                if (tokens[4].length() != 1 || tokens[5].length() != 1) {
                    LOG.warn("Skipping " + line + " because I only handle SNVs.");
                    number_non_snvs++;
                    continue;
                }
                final String refBase = String.valueOf(genomic.charAt(pos1 - 1));
                ref = Allele.create(refBase, true);
                a1 = refBase.equalsIgnoreCase(tokens[4]) ? ref : Allele.create(tokens[4], false);
                a2 = refBase.equalsIgnoreCase(tokens[5]) ? ref : Allele.create(tokens[5], false);
                final String type;
                if (a1.isReference() && a2.isReference()) {
                    type = "NOVARIATION";
                } else if (tokens[4].length() < tokens[5].length()) {
                    type = "INS";
                } else if (tokens[4].length() > tokens[5].length()) {
                    type = "DEL";
                } else {
                    type = "SNV";
                }
                vcb.attribute(svtype.getID(), type);
            } else if ((tokens[4].equals("-") && iupacATGC.test(tokens[5])) || (tokens[5].equals("-") && iupacATGC.test(tokens[4]))) {
                // deletion encoded with "-": shift left and anchor on the previous base
                pos1--;
                String refBase = String.valueOf(genomic.charAt(pos1 - 1));
                a1 = Allele.create(refBase, false);
                ref = Allele.create(refBase + tokens[tokens[4].equals("-") ? 5 : 4], true);
                a2 = a1;
                vcb.attribute(svtype.getID(), "DEL");
            } else if (tokens[4].equals("-") && tokens[5].equals("-")) {
                // both alleles missing: single-base deletion anchored on previous base
                pos1--;
                String refBase = String.valueOf(genomic.charAt(pos1 - 1));
                a1 = Allele.create(refBase, false);
                ref = Allele.create(refBase + genomic.charAt(pos1), true);
                a2 = a1;
                vcb.attribute(svtype.getID(), "DEL");
            } else {
                LOG.warn("Skipping " + line + ".");
                number_non_snvs++;
                continue;
            }
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(ref);
            alleles.add(a1);
            alleles.add(a2);
            vcb.start(pos1);
            vcb.stop(pos1 + ref.length() - 1);
            if (!tokens[1].isEmpty())
                vcb.id(tokens[1]);
            vcb.alleles(alleles);
            w.add(vcb.make());
        }
        r.close();
        r = null;
        w.close();
        w = null;
        if (number_non_snvs > 0)
            LOG.warn("Number lines skipped:" + number_non_snvs);
        return 0;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(faidx);
        CloserUtil.close(w);
        CloserUtil.close(r);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) HashSet(java.util.HashSet) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Predicate(java.util.function.Predicate) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) 
VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Aggregations

CharSplitter (com.github.lindenb.jvarkit.lang.CharSplitter)28 BufferedReader (java.io.BufferedReader)19 IOException (java.io.IOException)12 List (java.util.List)11 Parameter (com.beust.jcommander.Parameter)10 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)10 Program (com.github.lindenb.jvarkit.util.jcommander.Program)10 Logger (com.github.lindenb.jvarkit.util.log.Logger)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)10 Set (java.util.Set)10 Path (java.nio.file.Path)9 ArrayList (java.util.ArrayList)9 HashSet (java.util.HashSet)9 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)8 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)7 CloserUtil (htsjdk.samtools.util.CloserUtil)7 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)7 VCFHeader (htsjdk.variant.vcf.VCFHeader)7 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)7 Map (java.util.Map)7