Search in sources :

Example 1 with NCBIGeneInfo

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.

the class NcbiGene2AccessionParser method parse.

/**
 * Parses the gene2accession stream, then post-processes what the line-by-line parse left behind: the last gene
 * that had accessions (if available) is queued, followed by every remaining gene that has no accessions.
 *
 * @param is stream over the gene2accession data, handed to {@code super.parse}
 * @throws IOException      declared for failures propagated from {@code super.parse}
 * @throws RuntimeException wrapping the {@link InterruptedException} if the thread is interrupted while putting
 *                          onto the queue; the interrupt flag is restored before throwing
 */
@Override
public void parse(InputStream is) throws IOException {
    // With no starting id configured, mark the parser as started right away.
    if (startingNcbiId == null)
        hasStarted = true;
    super.parse(is);
    // add last gene with an accession
    if (geneData.getGeneInfo() != null) {
        try {
            queue.put(geneData);
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // Drop it from the map so the loop below does not queue it a second time
        // (presumably lastGeneId is the id of the gene held in geneData -- confirm).
        geneInfo.remove(lastGeneId);
    }
    // push in remaining genes that did not have accessions
    Collection<NCBIGeneInfo> remainingGenes = geneInfo.values();
    for (NCBIGeneInfo o : remainingGenes) {
        NcbiGeneData geneCollection = new NcbiGeneData();
        geneCollection.setGeneInfo(o);
        try {
            queue.put(geneCollection);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            // Keep the cause; the original threw a bare RuntimeException here and lost it.
            throw new RuntimeException(e);
        }
    }
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo)

Example 2 with NCBIGeneInfo

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.

the class NcbiGeneConverter method convert.

/**
 * Builds a {@link Gene} from the bundled NCBI data: the gene itself comes from the gene info record, and its
 * products are the union of whatever each accession record converts to.
 *
 * @param data gene info plus the accessions that belong to it
 * @return the converted gene with its products set
 */
public Gene convert(NcbiGeneData data) {
    // The gene proper is derived from the gene info record alone.
    NCBIGeneInfo info = data.getGeneInfo();
    Gene result = this.convert(info);

    // Each accession may yield several products; gather them all (set semantics) and attach them to the gene.
    Collection<NCBIGene2Accession> accessions = data.getAccessions();
    Collection<GeneProduct> products = new HashSet<>();
    for (NCBIGene2Accession accession : accessions) {
        Collection<GeneProduct> converted = this.convert(accession, result);
        products.addAll(converted);
    }

    result.setProducts(products);
    return result;
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) NCBIGene2Accession(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashSet(java.util.HashSet)

Example 3 with NCBIGeneInfo

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.

the class NcbiGeneDomainObjectGenerator method processLocalFiles.

/**
 * Parses the local NCBI gene files and starts a background thread that feeds {@code geneDataQueue}.
 * <p>
 * The gene history, the optional Ensembl mapping and the gene info files are parsed on the calling thread. Each
 * gene info record is then annotated with its history (or a discontinued id when there is no history) and its
 * Ensembl id, and indexed by gene id. Finally a thread named "gene2accession parser" is started to parse the
 * gene2accession file into the queue; this method returns without waiting for it.
 *
 * @param geneInfoFile       gene info file (plain or compressed)
 * @param gene2AccessionFile gene2accession file, parsed on the background thread
 * @param geneHistoryFile    gene history file
 * @param geneEnsemblFile    gene-to-Ensembl file; may be null, in which case no Ensembl ids are set
 * @param geneDataQueue      queue that receives the parsed gene data
 * @throws RuntimeException wrapping any {@link IOException} raised while parsing on the calling thread
 */
private void processLocalFiles(final LocalFile geneInfoFile, final LocalFile gene2AccessionFile, LocalFile geneHistoryFile, LocalFile geneEnsemblFile, final BlockingQueue<NcbiGeneData> geneDataQueue) {
    final NcbiGeneInfoParser infoParser = new NcbiGeneInfoParser();
    infoParser.setFilter(this.filter);
    if (this.filter) {
        infoParser.setSupportedTaxa(supportedTaxa.keySet());
    }
    final NcbiGeneEnsemblFileParser ensemblParser = new NcbiGeneEnsemblFileParser();
    final NcbiGene2AccessionParser accParser = new NcbiGene2AccessionParser();
    accParser.setStartingNbiId(startingNcbiId);
    final File gene2accessionFileHandle = gene2AccessionFile.asFile();
    final NcbiGeneHistoryParser historyParser = new NcbiGeneHistoryParser();
    try {
        NcbiGeneDomainObjectGenerator.log.debug("Parsing gene history");
        historyParser.parse(geneHistoryFile.asFile());
        if (geneEnsemblFile != null) {
            NcbiGeneDomainObjectGenerator.log.debug("Parsing ensembl");
            ensemblParser.parse(geneEnsemblFile.asFile());
        }
        NcbiGeneDomainObjectGenerator.log.debug("Parsing GeneInfo =" + geneInfoFile.asFile().getAbsolutePath());
        try (InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(geneInfoFile.asFile().getAbsolutePath())) {
            infoParser.parse(is);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    Collection<NCBIGeneInfo> geneInfoList = infoParser.getResults();
    // Index the gene info records by gene id, and count genes per taxon along the way.
    final Map<String, NCBIGeneInfo> geneInfoMap = new HashMap<>();
    Map<Integer, Integer> taxaCount = new HashMap<>();
    for (NCBIGeneInfo geneInfo : geneInfoList) {
        NcbiGeneHistory history = historyParser.get(geneInfo.getGeneId());
        geneInfo.setHistory(history);
        if (history == null) {
            // No history under this id: look for a discontinued id by symbol and taxon instead.
            String discontinuedIdForGene = historyParser.discontinuedIdForSymbol(geneInfo.getDefaultSymbol(), geneInfo.getTaxId());
            geneInfo.setDiscontinuedId(discontinuedIdForGene);
        }
        if (geneEnsemblFile != null) {
            String ensemblId = ensemblParser.get(geneInfo.getGeneId());
            geneInfo.setEnsemblId(ensemblId);
        }
        int taxId = geneInfo.getTaxId();
        if (!taxaCount.containsKey(taxId)) {
            taxaCount.put(taxId, 0);
        }
        taxaCount.put(taxId, taxaCount.get(taxId) + 1);
        geneInfoMap.put(geneInfo.getGeneId(), geneInfo);
    }
    supportedTaxaWithNCBIGenes = new HashSet<>();
    if (supportedTaxa != null) {
        for (Integer taxId : taxaCount.keySet()) {
            if (taxaCount.get(taxId) > 0) {
                NcbiGeneDomainObjectGenerator.log.debug("Taxon " + taxId + ": " + taxaCount.get(taxId) + " genes");
                Taxon t = supportedTaxa.get(taxId);
                // A taxon can be counted without being in supportedTaxa (e.g. when filtering is off);
                // skip it rather than adding null to the result.
                if (t != null) {
                    supportedTaxaWithNCBIGenes.add(t);
                }
            }
        }
    }
    // 1) use a producer-consumer model for Gene2Accession conversion
    // 1a) Parse Gene2Accession until the gene id changes. This means that
    // all accessions for the gene are done.
    // 1b) Create a Collection<Gene2Accession>, and push into BlockingQueue
    Thread parseThread = new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                NcbiGeneDomainObjectGenerator.log.debug("Parsing gene2accession=" + gene2AccessionFile.asFile().getAbsolutePath());
                accParser.setStartingNbiId(startingNcbiId);
                accParser.parse(gene2accessionFileHandle, geneDataQueue, geneInfoMap);
                NcbiGeneDomainObjectGenerator.log.debug("Domain object generator done");
            } catch (IOException e) {
                throw new RuntimeException(e);
            } finally {
                // Always flag completion, even when parsing fails, so that anything waiting on
                // producerDone is not left waiting forever after an error.
                producerDone.set(true);
            }
        }
    }, "gene2accession parser");
    parseThread.start();
    // 1c) As elements get added to BlockingQueue, NCBIGeneConverter
    // consumes
    // and creates Gene/GeneProduct/DatabaseEntry objects.
    // 1d) Push Gene to another BlockingQueue genePersistence
    // 2) use producer-consumer model for Gene persistence
    // 2a) as elements get added to genePersistence, persist Gene and
    // associated entries.
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) NcbiGeneHistory(ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Taxon(ubic.gemma.model.genome.Taxon) IOException(java.io.IOException) File(java.io.File) LocalFile(ubic.gemma.model.common.description.LocalFile)

Example 4 with NCBIGeneInfo

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.

the class NcbiGeneInfoParser method parseOneLine.

/**
 * Parses one tab-delimited line of the NCBI gene_info file into an {@link NCBIGeneInfo}.
 *
 * @param line a single data line from gene_info
 * @return the parsed record, or {@code null} when filtering is on and the line's taxon is not among the
 *         configured taxon ids
 * @throws FileFormatException if the line has an unexpected number of fields, the taxon id is not an integer,
 *                             or a dbXref entry cannot be split into database and identifier
 */
@Override
public NCBIGeneInfo parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
    if (fields.length != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW) {
        // Backwards compatibility with the old format, hopefully okay: they keep adding fields at the
        // end...we only need the first few.
        boolean olderFormat = fields.length == 13 || fields.length == 14 || fields.length == 15;
        if (!olderFormat) {
            throw new FileFormatException("Line " + line + " is not in the right format: has " + fields.length + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW);
        }
    }
    NCBIGeneInfo geneInfo = new NCBIGeneInfo();
    try {
        // Skip taxa that we don't support.
        int taxonId = Integer.parseInt(fields[0]);
        if (filter && ncbiTaxonIds != null) {
            if (!ncbiTaxonIds.contains(taxonId)) {
                return null;
            }
        }
        // See ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README
        // #Format (0-based field index):
        // 0 tax_id
        // 1 GeneID
        // 2 Symbol
        // 3 LocusTag
        // 4 Synonyms
        // 5 dbXrefs, separated by |
        // 6 chromosome
        // 7 map_location
        // 8 description
        // 9 type_of_gene
        // 10 Symbol_from_nomenclature_authority
        // 11 Full_name_from_nomenclature_authority
        // 12 Nomenclature_status
        // 13 Other_designations
        // 14 Modification_date
        // 15 Feature type
        geneInfo.setTaxId(taxonId);
        geneInfo.setGeneId(fields[1]);
        geneInfo.setDefaultSymbol(fields[2]);
        geneInfo.setLocusTag(fields[3]);
        String[] synonyms = StringUtils.splitPreserveAllTokens(fields[4], '|');
        for (String synonym : synonyms) {
            // "-" is the placeholder for "no value".
            if (synonym.equals("-"))
                continue;
            geneInfo.addToSynonyms(synonym);
        }
        if (!fields[5].equals("-")) {
            String[] dbXRefs = StringUtils.splitPreserveAllTokens(fields[5], '|');
            for (String dbXr : dbXRefs) {
                String[] dbF = StringUtils.split(dbXr, ':');
                if (dbF.length != 2) {
                    /*
                     * Annoyingly, HGNC identifiers now have the format HGNC:X where X is an integer. This is
                     * apparent from downloading files from HGNC (http://www.genenames.org/cgi-bin/statistics).
                     * Same situation for MGI.
                     *
                     * Therefore we have a special case: "HGNC:HGNC:5" splits into three parts and the last two
                     * are rejoined into the identifier.
                     */
                    if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) {
                        dbF[1] = dbF[1] + ":" + dbF[2];
                    } else {
                        // we're very stringent to avoid data corruption.
                        throw new FileFormatException("Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'");
                    }
                }
                geneInfo.addToDbXRefs(dbF[0], dbF[1]);
            }
        }
        geneInfo.setChromosome(fields[6]);
        geneInfo.setMapLocation(fields[7]);
        geneInfo.setDescription(fields[8]);
        geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9]));
        geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-"));
        geneInfo.setNameIsFromAuthority(!fields[11].equals("-"));
        // Nomenclature_status lives in field 12; the previous code tested field 11 (the full name) for "O",
        // so OFFICIAL was effectively never assigned.
        String nomenclatureStatus = fields[12];
        if (nomenclatureStatus.equals("-")) {
            geneInfo.setNomenclatureStatus(NomenclatureStatus.UNKNOWN);
        } else if (nomenclatureStatus.equals("O")) {
            geneInfo.setNomenclatureStatus(NomenclatureStatus.OFFICIAL);
        } else {
            geneInfo.setNomenclatureStatus(NomenclatureStatus.INTERIM);
        }
        // ignore 14th field for now - it stores alternate protein names
        // ignore 15th, modification date
    } catch (NumberFormatException e) {
        throw new FileFormatException(e);
    }
    return geneInfo;
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) FileFormatException(ubic.gemma.core.loader.util.parser.FileFormatException)

Aggregations

NCBIGeneInfo (ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo)4 File (java.io.File)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 NCBIGene2Accession (ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession)1 NcbiGeneHistory (ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory)1 FileFormatException (ubic.gemma.core.loader.util.parser.FileFormatException)1 LocalFile (ubic.gemma.model.common.description.LocalFile)1 Gene (ubic.gemma.model.genome.Gene)1 Taxon (ubic.gemma.model.genome.Taxon)1 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)1