Search in sources :

Example 1 with NcbiGeneHistory

Use of ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory in the project Gemma by PavlidisLab.

From the class NcbiGeneHistoryParser, method parseOneLine:

/**
 * Parses a single tab-separated line of the NCBI gene_history file.
 * <p>
 * Columns read (0-based): 0 = taxon id, 1 = current gene id, 2 = discontinued gene id,
 * 3 = discontinued symbol (only read when there is no current gene id).
 * <ul>
 * <li>Lines starting with {@code #} are skipped.</li>
 * <li>When the current gene id is blank or {@code "-"}, the discontinued symbol is mapped to the
 * discontinued gene id in {@code discontinuedGenes} under the line's taxon, and {@code null} is
 * returned.</li>
 * <li>Otherwise a history is returned: an existing entry keyed by the discontinued id in
 * {@code id2history} is updated and re-keyed under the current gene id; if there is none, a new
 * {@link NcbiGeneHistory} is created and updated.</li>
 * </ul>
 *
 * @param line one raw line of the file
 * @return the history affected by this line, or {@code null} for comment lines and for genes that
 *         were discontinued without a replacement
 * @throws IllegalStateException if the line has more columns than
 *         {@code GENE_HISTORY_FILE_NUM_FIELDS}, or fewer columns than this method needs to read
 * @throws NumberFormatException if the taxon id column is not an integer
 */
@Override
public NcbiGeneHistory parseOneLine(String line) {
    if (line.startsWith("#")) {
        return null;
    }
    // NOTE(review): if this is Apache Commons Lang StringUtils, adjacent tabs are treated as a
    // single separator, so empty columns shorten the array - confirm.
    String[] fields = StringUtils.split(line, '\t');
    if (fields.length > NcbiGeneHistoryParser.GENE_HISTORY_FILE_NUM_FIELDS) {
        // sanity check.
        throw new IllegalStateException("NCBI gene_history file has unexpected column count. Expected " + NcbiGeneHistoryParser.GENE_HISTORY_FILE_NUM_FIELDS + ", got " + fields.length + " in line=" + line);
    }
    // Columns 0..2 are needed on every path below; fail with a message that shows the line rather
    // than with a bare ArrayIndexOutOfBoundsException.
    if (fields.length < 3) {
        throw new IllegalStateException("NCBI gene_history file has too few columns. Expected at least 3, got " + fields.length + " in line=" + line);
    }
    String geneId = fields[1];
    String discontinuedGeneId = fields[2];
    if (StringUtils.isBlank(geneId) || geneId.equals("-")) {
        // Discontinued with no replacement: the symbol column is required as well.
        if (fields.length < 4) {
            throw new IllegalStateException("NCBI gene_history file has too few columns. Expected at least 4, got " + fields.length + " in line=" + line);
        }
        String taxonId = fields[0];
        String discontinuedSymbol = fields[3];
        Integer taxonInt = Integer.parseInt(taxonId);
        if (!(discontinuedGenes.containsKey(taxonInt))) {
            discontinuedGenes.put(taxonInt, new HashMap<String, String>());
        }
        discontinuedGenes.get(taxonInt).put(discontinuedSymbol, discontinuedGeneId);
        return null;
    }
    NcbiGeneHistory his;
    if (id2history.containsKey(discontinuedGeneId)) {
        // The discontinued id was itself the current id of an earlier history: extend that
        // history and move it under the new current id.
        his = id2history.get(discontinuedGeneId);
        his.update(discontinuedGeneId, geneId);
        id2history.remove(discontinuedGeneId);
        id2history.put(geneId, his);
    } else {
        his = new NcbiGeneHistory(discontinuedGeneId);
        his.update(discontinuedGeneId, geneId);
    }
    return his;
}
Also used : NcbiGeneHistory(ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory)

Example 2 with NcbiGeneHistory

Use of ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory in the project Gemma by PavlidisLab.

From the class NcbiGeneDomainObjectGenerator, method processLocalFiles:

/**
 * Parses the local NCBI gene files and starts a background thread that feeds gene data into
 * {@code geneDataQueue}.
 * <p>
 * Steps performed here:
 * <ol>
 * <li>Parse the gene history file, the optional Ensembl mapping file and the gene info file
 * synchronously.</li>
 * <li>For every parsed {@link NCBIGeneInfo}: attach its history (or, when there is none, the
 * discontinued id looked up by symbol and taxon), attach the Ensembl id when an Ensembl file was
 * given, count genes per taxon, and index the gene info by gene id.</li>
 * <li>Rebuild {@code supportedTaxaWithNCBIGenes} from the taxa that had genes.</li>
 * <li>Start the "gene2accession parser" thread, which parses the gene2accession file into
 * {@code geneDataQueue} and sets {@code producerDone} when it stops.</li>
 * </ol>
 * The method returns as soon as the thread is started; it does not wait for it.
 *
 * @param geneInfoFile gene info file (plain or compressed)
 * @param gene2AccessionFile gene2accession file, parsed on the background thread
 * @param geneHistoryFile gene history file
 * @param geneEnsemblFile gene-to-Ensembl file; may be {@code null}, in which case no Ensembl ids
 *        are assigned
 * @param geneDataQueue queue passed to the gene2accession parser as its output
 * @throws RuntimeException wrapping any {@link IOException} raised by the synchronous parsing
 */
private void processLocalFiles(final LocalFile geneInfoFile, final LocalFile gene2AccessionFile, LocalFile geneHistoryFile, LocalFile geneEnsemblFile, final BlockingQueue<NcbiGeneData> geneDataQueue) {
    final NcbiGeneInfoParser infoParser = new NcbiGeneInfoParser();
    infoParser.setFilter(this.filter);
    if (this.filter) {
        infoParser.setSupportedTaxa(supportedTaxa.keySet());
    }
    final NcbiGeneEnsemblFileParser ensemblParser = new NcbiGeneEnsemblFileParser();
    final NcbiGene2AccessionParser accParser = new NcbiGene2AccessionParser();
    accParser.setStartingNbiId(startingNcbiId);
    final File gene2accessionFileHandle = gene2AccessionFile.asFile();
    final NcbiGeneHistoryParser historyParser = new NcbiGeneHistoryParser();
    try {
        NcbiGeneDomainObjectGenerator.log.debug("Parsing gene history");
        historyParser.parse(geneHistoryFile.asFile());
        if (geneEnsemblFile != null) {
            NcbiGeneDomainObjectGenerator.log.debug("Parsing ensembl");
            ensemblParser.parse(geneEnsemblFile.asFile());
        }
        NcbiGeneDomainObjectGenerator.log.debug("Parsing GeneInfo =" + geneInfoFile.asFile().getAbsolutePath());
        try (InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(geneInfoFile.asFile().getAbsolutePath())) {
            infoParser.parse(is);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    Collection<NCBIGeneInfo> geneInfoList = infoParser.getResults();
    // Index gene infos by gene id and count genes per taxon.
    final Map<String, NCBIGeneInfo> geneInfoMap = new HashMap<>();
    Map<Integer, Integer> taxaCount = new HashMap<>();
    for (NCBIGeneInfo geneInfo : geneInfoList) {
        NcbiGeneHistory history = historyParser.get(geneInfo.getGeneId());
        geneInfo.setHistory(history);
        if (history == null) {
            // No id history: fall back to a discontinued id recorded for the same symbol and taxon.
            String discontinuedIdForGene = historyParser.discontinuedIdForSymbol(geneInfo.getDefaultSymbol(), geneInfo.getTaxId());
            geneInfo.setDiscontinuedId(discontinuedIdForGene);
        }
        if (geneEnsemblFile != null) {
            String ensemblId = ensemblParser.get(geneInfo.getGeneId());
            geneInfo.setEnsemblId(ensemblId);
        }
        int taxId = geneInfo.getTaxId();
        Integer seenSoFar = taxaCount.get(taxId);
        taxaCount.put(taxId, seenSoFar == null ? 1 : seenSoFar + 1);
        geneInfoMap.put(geneInfo.getGeneId(), geneInfo);
    }
    supportedTaxaWithNCBIGenes = new HashSet<>();
    if (supportedTaxa != null) {
        for (Map.Entry<Integer, Integer> taxonAndCount : taxaCount.entrySet()) {
            if (taxonAndCount.getValue() > 0) {
                Integer taxId = taxonAndCount.getKey();
                NcbiGeneDomainObjectGenerator.log.debug("Taxon " + taxId + ": " + taxonAndCount.getValue() + " genes");
                // NOTE(review): supportedTaxa.get may return null for a taxon that is not in the
                // map, in which case null is added to the set - confirm this is intended.
                Taxon t = supportedTaxa.get(taxId);
                supportedTaxaWithNCBIGenes.add(t);
            }
        }
    }
    // 1) use a producer-consumer model for Gene2Accession conversion
    // 1a) Parse Gene2Accession until the gene id changes. This means that
    // all accessions for the gene are done.
    // 1b) Create a Collection<Gene2Accession>, and push into BlockingQueue
    Thread parseThread = new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                NcbiGeneDomainObjectGenerator.log.debug("Parsing gene2accession=" + gene2AccessionFile.asFile().getAbsolutePath());
                accParser.setStartingNbiId(startingNcbiId);
                accParser.parse(gene2accessionFileHandle, geneDataQueue, geneInfoMap);
                NcbiGeneDomainObjectGenerator.log.debug("Domain object generator done");
            } catch (IOException e) {
                throw new RuntimeException(e);
            } finally {
                // Always flag the end of production, including on failure, so that anything
                // polling producerDone is not left waiting after this thread has died.
                producerDone.set(true);
            }
        }
    }, "gene2accession parser");
    parseThread.start();
    // 1c) As elements get added to BlockingQueue, NCBIGeneConverter
    // consumes
    // and creates Gene/GeneProduct/DatabaseEntry objects.
    // 1d) Push Gene to another BlockingQueue genePersistence
    // 2) use producer-consumer model for Gene persistence
    // 2a) as elements get added to genePersistence, persist Gene and
    // associated entries.
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) NcbiGeneHistory(ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Taxon(ubic.gemma.model.genome.Taxon) IOException(java.io.IOException) File(java.io.File) LocalFile(ubic.gemma.model.common.description.LocalFile)

Aggregations

NcbiGeneHistory (ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory)2 File (java.io.File)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 HashMap (java.util.HashMap)1 NCBIGeneInfo (ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo)1 LocalFile (ubic.gemma.model.common.description.LocalFile)1 Taxon (ubic.gemma.model.genome.Taxon)1