Search in sources :

Example 1 with StringWithMapping

use of io.repseq.util.StringWithMapping in project repseqio by repseqio.

the class FromFastaActionAbstract method go.

@Override
public void go(ActionHelper helper) throws Exception {
    Pattern functionalityRegexp = params.getFunctionalityRegexp();
    GeneType geneType = params.getGeneType();
    Map<String, VDJCGeneData> genes = new HashMap<>();
    Path libraryPath = Paths.get(params.getOutputJSON()).toAbsolutePath();
    // Parsing -P or --gene-feature parameters
    Map<ReferencePoint, Integer> points = new HashMap<>();
    if (params.geneFeature != null) {
        GeneFeature gf = params.getGeneFeature();
        points.put(gf.getFirstPoint(), 0);
        points.put(gf.getLastPoint(), -1);
    } else
        for (Map.Entry<String, String> p : params.points.entrySet()) {
            ReferencePoint anchorPoint = ReferencePoint.getPointByName(p.getKey());
            if (anchorPoint == null)
                throw new IllegalArgumentException("Unknown anchor point: " + p.getKey());
            int position = Integer.decode(p.getValue());
            points.put(anchorPoint, position);
        }
    // Check
    for (Map.Entry<ReferencePoint, Integer> entry : points.entrySet()) if (entry.getKey().getGeneType() != null && entry.getKey().getGeneType() != geneType)
        throw new IllegalArgumentException("Incompatible anchor point and gene type: " + entry.getKey() + " / " + geneType);
    try (FastaReader reader = new FastaReader<>(params.getInput(), null);
        SequenceStorage storage = params.doEmbedSequences() ? new EmbeddedWriter() : params.getOutputFasta() == null ? new ExistingFileWriter(libraryPath, Paths.get(params.getInput()).toAbsolutePath()) : new FastaSequenceStorage(libraryPath, Paths.get(params.getOutputFasta()).toAbsolutePath())) {
        for (FastaReader.RawFastaRecord record : CUtils.it((OutputPortCloseable<FastaReader.RawFastaRecord>) reader.asRawRecordsPort())) {
            StringWithMapping swm = StringWithMapping.removeSymbol(record.sequence, params.getPaddingCharacter());
            NucleotideSequence seq = new NucleotideSequence(swm.getModifiedString());
            if (seq.containsWildcards()) {
                System.out.println("Sequence dropped because contain wildcards: " + record.description);
                continue;
            }
            String[] fields = record.description.split("\\|");
            String geneName = fields[params.nameIndex];
            boolean functionality = true;
            if (params.functionalityIndex != null)
                functionality = functionalityRegexp.matcher(fields[params.functionalityIndex]).matches();
            SortedMap<ReferencePoint, Long> anchorPoints = new TreeMap<>();
            for (Map.Entry<String, String> p : params.patterns.entrySet()) {
                ReferencePoint anchorPoint = ReferencePoint.getPointByName(p.getKey());
                if (anchorPoint == null)
                    throw new IllegalArgumentException("Unknown anchor point: " + p.getKey());
                if (anchorPoint.getGeneType() != null && anchorPoint.getGeneType() != geneType)
                    throw new IllegalArgumentException("Incompatible anchor point and gene type: " + anchorPoint + " / " + geneType);
                Pattern pattern = Pattern.compile(p.getValue());
                int position = -1;
                for (boolean stops : new boolean[] { false, true }) for (int f = 0; f < 3; f++) {
                    if (position != -1)
                        continue;
                    TranslationParameters tp = TranslationParameters.withoutIncompleteCodon(f);
                    AminoAcidSequence aa = AminoAcidSequence.translate(seq, tp);
                    if (!stops && aa.containStops())
                        continue;
                    String str = aa.toString();
                    Matcher matcher = pattern.matcher(str);
                    if (matcher.find()) {
                        int aaPosition = matcher.start(1);
                        position = AminoAcidSequence.convertAAPositionToNt(aaPosition, seq.size(), tp);
                    }
                }
                if (position == -1)
                    continue;
                anchorPoints.put(anchorPoint, (long) position);
            }
            for (Map.Entry<ReferencePoint, Integer> p : points.entrySet()) {
                // AA patterns have priority over positional anchor points
                if (anchorPoints.containsKey(p.getKey()))
                    continue;
                // Converting position using
                int position = swm.convertPosition(p.getValue());
                // Can't be converted (e.g. position of padding symbol) skipping
                if (position == -1)
                    continue;
                anchorPoints.put(p.getKey(), (long) position);
            }
            if (genes.containsKey(geneName)) {
                if (params.getIgnoreDuplicates()) {
                    System.out.println("Ignored: Duplicate records for " + geneName);
                    continue;
                } else
                    throw new IllegalArgumentException("Duplicate records for " + geneName);
            }
            BaseSequence baseSequence = storage.storeSequence(seq, geneName, record.description);
            VDJCGeneData gene = new VDJCGeneData(baseSequence, geneName, geneType, functionality, new Chains(params.chain), new TreeMap<String, SortedSet<String>>(), anchorPoints).addMetaValue(KnownVDJCGeneMetaFields.COMMENTS, record.description);
            genes.put(geneName, gene);
        }
        VDJCLibraryData library = new VDJCLibraryData(params.taxonId, params.speciesNames, new ArrayList<>(genes.values()), new TreeMap<String, SortedSet<String>>(), storage.getBase()).addMetaValue(KnownVDJCLibraryMetaFields.COMMENTS, "Imported from: " + params.getInput());
        VDJCDataUtils.writeToFile(new VDJCLibraryData[] { library }, params.getOutputJSON(), false);
    }
}
Also used : FastaReader(com.milaboratory.core.io.sequence.fasta.FastaReader) Matcher(java.util.regex.Matcher) AminoAcidSequence(com.milaboratory.core.sequence.AminoAcidSequence) Path(java.nio.file.Path) Pattern(java.util.regex.Pattern) StringWithMapping(io.repseq.util.StringWithMapping) TranslationParameters(com.milaboratory.core.sequence.TranslationParameters) NucleotideSequence(com.milaboratory.core.sequence.NucleotideSequence)

Aggregations

FastaReader (com.milaboratory.core.io.sequence.fasta.FastaReader)1 AminoAcidSequence (com.milaboratory.core.sequence.AminoAcidSequence)1 NucleotideSequence (com.milaboratory.core.sequence.NucleotideSequence)1 TranslationParameters (com.milaboratory.core.sequence.TranslationParameters)1 StringWithMapping (io.repseq.util.StringWithMapping)1 Path (java.nio.file.Path)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1