use of org.uniprot.LocationType in project jvarkit by lindenb.
the class MapUniProtFeatures method doWork.
@Override
public int doWork(List<String> args) {
PrintWriter pw = null;
try {
JAXBContext jc = JAXBContext.newInstance("org.uniprot");
Unmarshaller uniprotUnmarshaller = jc.createUnmarshaller();
Marshaller uniprotMarshaller = jc.createMarshaller();
pw = super.openFileOrStdoutAsPrintWriter(OUT);
LOG.info("read " + REF);
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(REF);
if (this.indexedFastaSequenceFile.getSequenceDictionary() == null) {
LOG.error("fasta sequence is not indexed");
return -1;
}
LOG.info("readubf " + kgUri);
String line;
Pattern tab = Pattern.compile("[\t]");
BufferedReader r = IOUtils.openURIForBufferedReading(kgUri);
while ((line = r.readLine()) != null) {
String[] tokens = tab.split(line);
KnownGene kg = new KnownGene();
kg.setName(tokens[0]);
kg.setChrom(tokens[1]);
kg.setStrand(tokens[2].charAt(0));
kg.setTxStart(Integer.parseInt(tokens[3]));
kg.setTxEnd(Integer.parseInt(tokens[4]));
kg.setCdsStart(Integer.parseInt(tokens[5]));
kg.setCdsEnd(Integer.parseInt(tokens[6]));
kg.setExonBounds(Integer.parseInt(tokens[7]), tokens[8], tokens[9]);
List<KnownGene> L = prot2genes.get(tokens[10]);
if (L == null) {
L = new ArrayList<KnownGene>();
prot2genes.put(tokens[10], L);
}
if (indexedFastaSequenceFile.getSequenceDictionary().getSequence(kg.getContig()) == null) {
LOG.info("ignoring " + line);
continue;
}
L.add(kg);
}
r.close();
if (OUT != null) {
LOG.info("opening " + OUT);
pw = new PrintWriter(OUT);
}
LOG.info("knownGenes: " + this.prot2genes.size());
XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
xmlInputFactory.setXMLResolver(new XMLResolver() {
@Override
public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
return null;
}
});
// SortingCollection<UBed> mappedFeatures=SortingCollection.newInstance(UBed.class, new UBedCodec(),new UBedCmp(),super.MAX_RECORDS_IN_RAM);
// mappedFeatures.setDestructiveIteration(true);
LOG.info("Scanning " + UNIPROT);
Reader fr = IOUtils.openURIForBufferedReading(UNIPROT);
XMLEventReader rx = xmlInputFactory.createXMLEventReader(fr);
final QName uEntry = new QName(UNIPROT_NS, "entry");
GenomicSequence genomic = null;
while (rx.hasNext()) {
XMLEvent evt = rx.peek();
if (!(evt.isStartElement() && evt.asStartElement().getName().equals(uEntry))) {
rx.next();
continue;
}
JAXBElement<Entry> jaxbElement = uniprotUnmarshaller.unmarshal(rx, Entry.class);
Entry entry = jaxbElement.getValue();
if (entry.getFeature().isEmpty())
continue;
List<KnownGene> genes = null;
for (String acn : entry.getAccession()) {
genes = prot2genes.get(acn);
if (genes != null)
break;
}
if (genes == null)
continue;
for (KnownGene kg : genes) {
if (genomic == null || !genomic.getChrom().equals(kg.getChromosome())) {
genomic = new GenomicSequence(this.indexedFastaSequenceFile, kg.getChromosome());
}
KnownGene.Peptide pep = kg.getCodingRNA(genomic).getPeptide();
int sameSequenceLength = 0;
while (sameSequenceLength < entry.getSequence().getValue().length() && sameSequenceLength < pep.length()) {
if (Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength)) != Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength))) {
break;
}
sameSequenceLength++;
}
if (sameSequenceLength != pep.length()) {
System.err.println("Not Same sequence " + kg.getName() + " strand " + kg.getStrand() + " ok-up-to-" + sameSequenceLength);
System.err.println("P:" + pep.toString() + " " + pep.length());
System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
if (pep.toString().contains("?")) {
System.err.println("RNA:" + pep.getCodingRNA().toString());
}
}
if (sameSequenceLength == 0)
continue;
for (FeatureType feat : entry.getFeature()) {
if (feat.getType() == null || feat.getType().isEmpty())
continue;
LocationType locType = feat.getLocation();
if (locType == null)
continue;
int pepStart, pepEnd;
if (locType.getPosition() != null && locType.getPosition().getPosition() != null) {
pepEnd = locType.getPosition().getPosition().intValue();
pepStart = pepEnd - 1;
} else if (locType.getBegin() != null && locType.getEnd() != null && locType.getBegin().getPosition() != null && locType.getEnd().getPosition() != null) {
pepStart = locType.getBegin().getPosition().intValue() - 1;
pepEnd = locType.getEnd().getPosition().intValue();
} else {
continue;
}
if (pepEnd >= sameSequenceLength)
continue;
List<Integer> genomicPos = new ArrayList<Integer>();
while (pepStart < pepEnd) {
if (pepStart >= pep.length()) {
System.err.println();
System.err.println("P:" + pep.toString() + " " + pep.length() + " " + kg.getStrand());
System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
}
int[] codon = pep.convertToGenomicCoordinates(pepStart);
pepStart++;
for (int gP : codon) {
if (gP == -1) {
uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
LOG.error("error in genomoc for (" + pepStart + "/" + pepEnd + "):" + entry.getName());
System.exit(-1);
}
genomicPos.add(gP);
}
}
Collections.sort(genomicPos);
UBed ubed = new UBed();
ubed.tid = this.indexedFastaSequenceFile.getSequenceDictionary().getSequenceIndex(genomic.getChrom());
int k0 = 0;
while (k0 < genomicPos.size()) {
Range range = new Range(genomicPos.get(k0), genomicPos.get(k0) + 1);
k0++;
while (k0 < genomicPos.size() && range.end == genomicPos.get(k0)) {
range = new Range(range.start, range.end + 1);
++k0;
}
ubed.positions.add(range);
}
ubed.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
ubed.name = feat.getType();
ubed.print(pw);
}
}
}
rx.close();
fr.close();
LOG.info("End scan uniprot");
/*
mappedFeatures.doneAdding();
CloseableIterator<UBed> iter=mappedFeatures.iterator();
while(iter.hasNext())
{
UBed ubed=iter.next();
ubed.print();
}
mappedFeatures.cleanup();
*/
} catch (Exception err) {
err.printStackTrace();
if (OUT != null)
OUT.deleteOnExit();
return -1;
} finally {
pw.flush();
pw.close();
CloserUtil.close(this.indexedFastaSequenceFile);
}
return 0;
}
Aggregations