Search in sources :

Example 1 with LocationType

use of org.uniprot.LocationType in project jvarkit by lindenb.

the class MapUniProtFeatures method doWork.

@Override
public int doWork(List<String> args) {
    PrintWriter pw = null;
    try {
        JAXBContext jc = JAXBContext.newInstance("org.uniprot");
        Unmarshaller uniprotUnmarshaller = jc.createUnmarshaller();
        Marshaller uniprotMarshaller = jc.createMarshaller();
        pw = super.openFileOrStdoutAsPrintWriter(OUT);
        LOG.info("read " + REF);
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(REF);
        if (this.indexedFastaSequenceFile.getSequenceDictionary() == null) {
            LOG.error("fasta sequence is not indexed");
            return -1;
        }
        LOG.info("readubf " + kgUri);
        String line;
        Pattern tab = Pattern.compile("[\t]");
        BufferedReader r = IOUtils.openURIForBufferedReading(kgUri);
        while ((line = r.readLine()) != null) {
            String[] tokens = tab.split(line);
            KnownGene kg = new KnownGene();
            kg.setName(tokens[0]);
            kg.setChrom(tokens[1]);
            kg.setStrand(tokens[2].charAt(0));
            kg.setTxStart(Integer.parseInt(tokens[3]));
            kg.setTxEnd(Integer.parseInt(tokens[4]));
            kg.setCdsStart(Integer.parseInt(tokens[5]));
            kg.setCdsEnd(Integer.parseInt(tokens[6]));
            kg.setExonBounds(Integer.parseInt(tokens[7]), tokens[8], tokens[9]);
            List<KnownGene> L = prot2genes.get(tokens[10]);
            if (L == null) {
                L = new ArrayList<KnownGene>();
                prot2genes.put(tokens[10], L);
            }
            if (indexedFastaSequenceFile.getSequenceDictionary().getSequence(kg.getContig()) == null) {
                LOG.info("ignoring " + line);
                continue;
            }
            L.add(kg);
        }
        r.close();
        if (OUT != null) {
            LOG.info("opening " + OUT);
            pw = new PrintWriter(OUT);
        }
        LOG.info("knownGenes: " + this.prot2genes.size());
        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlInputFactory.setXMLResolver(new XMLResolver() {

            @Override
            public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
                LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
                return null;
            }
        });
        // SortingCollection<UBed> mappedFeatures=SortingCollection.newInstance(UBed.class, new UBedCodec(),new UBedCmp(),super.MAX_RECORDS_IN_RAM);
        // mappedFeatures.setDestructiveIteration(true);
        LOG.info("Scanning " + UNIPROT);
        Reader fr = IOUtils.openURIForBufferedReading(UNIPROT);
        XMLEventReader rx = xmlInputFactory.createXMLEventReader(fr);
        final QName uEntry = new QName(UNIPROT_NS, "entry");
        GenomicSequence genomic = null;
        while (rx.hasNext()) {
            XMLEvent evt = rx.peek();
            if (!(evt.isStartElement() && evt.asStartElement().getName().equals(uEntry))) {
                rx.next();
                continue;
            }
            JAXBElement<Entry> jaxbElement = uniprotUnmarshaller.unmarshal(rx, Entry.class);
            Entry entry = jaxbElement.getValue();
            if (entry.getFeature().isEmpty())
                continue;
            List<KnownGene> genes = null;
            for (String acn : entry.getAccession()) {
                genes = prot2genes.get(acn);
                if (genes != null)
                    break;
            }
            if (genes == null)
                continue;
            for (KnownGene kg : genes) {
                if (genomic == null || !genomic.getChrom().equals(kg.getChromosome())) {
                    genomic = new GenomicSequence(this.indexedFastaSequenceFile, kg.getChromosome());
                }
                KnownGene.Peptide pep = kg.getCodingRNA(genomic).getPeptide();
                int sameSequenceLength = 0;
                while (sameSequenceLength < entry.getSequence().getValue().length() && sameSequenceLength < pep.length()) {
                    if (Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength)) != Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength))) {
                        break;
                    }
                    sameSequenceLength++;
                }
                if (sameSequenceLength != pep.length()) {
                    System.err.println("Not Same sequence " + kg.getName() + " strand " + kg.getStrand() + " ok-up-to-" + sameSequenceLength);
                    System.err.println("P:" + pep.toString() + " " + pep.length());
                    System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
                    if (pep.toString().contains("?")) {
                        System.err.println("RNA:" + pep.getCodingRNA().toString());
                    }
                }
                if (sameSequenceLength == 0)
                    continue;
                for (FeatureType feat : entry.getFeature()) {
                    if (feat.getType() == null || feat.getType().isEmpty())
                        continue;
                    LocationType locType = feat.getLocation();
                    if (locType == null)
                        continue;
                    int pepStart, pepEnd;
                    if (locType.getPosition() != null && locType.getPosition().getPosition() != null) {
                        pepEnd = locType.getPosition().getPosition().intValue();
                        pepStart = pepEnd - 1;
                    } else if (locType.getBegin() != null && locType.getEnd() != null && locType.getBegin().getPosition() != null && locType.getEnd().getPosition() != null) {
                        pepStart = locType.getBegin().getPosition().intValue() - 1;
                        pepEnd = locType.getEnd().getPosition().intValue();
                    } else {
                        continue;
                    }
                    if (pepEnd >= sameSequenceLength)
                        continue;
                    List<Integer> genomicPos = new ArrayList<Integer>();
                    while (pepStart < pepEnd) {
                        if (pepStart >= pep.length()) {
                            System.err.println();
                            System.err.println("P:" + pep.toString() + " " + pep.length() + " " + kg.getStrand());
                            System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
                            uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
                        }
                        int[] codon = pep.convertToGenomicCoordinates(pepStart);
                        pepStart++;
                        for (int gP : codon) {
                            if (gP == -1) {
                                uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
                                LOG.error("error in genomoc for (" + pepStart + "/" + pepEnd + "):" + entry.getName());
                                System.exit(-1);
                            }
                            genomicPos.add(gP);
                        }
                    }
                    Collections.sort(genomicPos);
                    UBed ubed = new UBed();
                    ubed.tid = this.indexedFastaSequenceFile.getSequenceDictionary().getSequenceIndex(genomic.getChrom());
                    int k0 = 0;
                    while (k0 < genomicPos.size()) {
                        Range range = new Range(genomicPos.get(k0), genomicPos.get(k0) + 1);
                        k0++;
                        while (k0 < genomicPos.size() && range.end == genomicPos.get(k0)) {
                            range = new Range(range.start, range.end + 1);
                            ++k0;
                        }
                        ubed.positions.add(range);
                    }
                    ubed.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    ubed.name = feat.getType();
                    ubed.print(pw);
                }
            }
        }
        rx.close();
        fr.close();
        LOG.info("End scan uniprot");
    /*
			mappedFeatures.doneAdding();
			
			
			CloseableIterator<UBed> iter=mappedFeatures.iterator();
			while(iter.hasNext())
				{
				UBed ubed=iter.next();
				ubed.print();
				}
			mappedFeatures.cleanup();
			*/
    } catch (Exception err) {
        err.printStackTrace();
        if (OUT != null)
            OUT.deleteOnExit();
        return -1;
    } finally {
        pw.flush();
        pw.close();
        CloserUtil.close(this.indexedFastaSequenceFile);
    }
    return 0;
}
Also used : FeatureType(org.uniprot.FeatureType) ArrayList(java.util.ArrayList) XMLEventReader(javax.xml.stream.XMLEventReader) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader) JAXBContext(javax.xml.bind.JAXBContext) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Entry(org.uniprot.Entry) XMLEventReader(javax.xml.stream.XMLEventReader) XMLResolver(javax.xml.stream.XMLResolver) Unmarshaller(javax.xml.bind.Unmarshaller) PrintWriter(java.io.PrintWriter) Pattern(java.util.regex.Pattern) Marshaller(javax.xml.bind.Marshaller) QName(javax.xml.namespace.QName) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) XMLStreamException(javax.xml.stream.XMLStreamException) XMLStreamException(javax.xml.stream.XMLStreamException) BufferedReader(java.io.BufferedReader) XMLEvent(javax.xml.stream.events.XMLEvent) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) LocationType(org.uniprot.LocationType) XMLInputFactory(javax.xml.stream.XMLInputFactory)

Aggregations

GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)1 KnownGene (com.github.lindenb.jvarkit.util.ucsc.KnownGene)1 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)1 BufferedReader (java.io.BufferedReader)1 PrintWriter (java.io.PrintWriter)1 Reader (java.io.Reader)1 ArrayList (java.util.ArrayList)1 Pattern (java.util.regex.Pattern)1 JAXBContext (javax.xml.bind.JAXBContext)1 Marshaller (javax.xml.bind.Marshaller)1 Unmarshaller (javax.xml.bind.Unmarshaller)1 QName (javax.xml.namespace.QName)1 XMLEventReader (javax.xml.stream.XMLEventReader)1 XMLInputFactory (javax.xml.stream.XMLInputFactory)1 XMLResolver (javax.xml.stream.XMLResolver)1 XMLStreamException (javax.xml.stream.XMLStreamException)1 XMLEvent (javax.xml.stream.events.XMLEvent)1 Entry (org.uniprot.Entry)1 FeatureType (org.uniprot.FeatureType)1 LocationType (org.uniprot.LocationType)1