Search in sources :

Example 1 with NCBIGene2Accession

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession in project Gemma by PavlidisLab.

the class NcbiGene2AccessionParser method processFields.

/**
 * Builds an {@link NCBIGene2Accession} from the already-split columns of one gene2accession row.
 * <p>
 * Column layout expected in {@code fields}:
 * <pre>
 *  0 tax_id
 *  1 GeneID
 *  2 status
 *  3 RNA_nucleotide_accession.version
 *  4 RNA_nucleotide_gi
 *  5 protein_accession.version
 *  6 protein_gi
 *  7 genomic_nucleotide_accession.version
 *  8 genomic_nucleotide_gi
 *  9 start_position_on_the_genomic_accession
 * 10 end_position_on_the_genomic_accession
 * 11 orientation
 * 12 assembly
 * 13 mature_peptide_accession.version
 * 14 mature_peptide_gi
 * 15 Symbol
 * </pre>
 * A value of {@code "-"} (or {@code "?"} for orientation) is stored as {@code null}.
 *
 * @param fields the columns of one row; the caller is expected to have verified the column count
 * @return the populated record, or {@code null} when the row is skipped (its assembly column starts with
 *         "Alternate assembly", or the configured starting gene has not been reached yet)
 * @throws RuntimeException              wrapping a {@link NumberFormatException} if a numeric column cannot be
 *                                       parsed
 * @throws UnsupportedOperationException if an accession contains more than one dot
 */
private NCBIGene2Accession processFields(String[] fields) {
    NCBIGene2Accession newGene = new NCBIGene2Accession();
    try {
        // Skip lines that refer to locations in non-reference assemblies.
        if (fields[12].startsWith("Alternate assembly")) {
            return null;
        }
        newGene.setGeneId(fields[1]);
        if (!hasStarted) {
            // Rows are discarded until the configured starting gene id shows up.
            assert startingNcbiId != null;
            if (startingNcbiId.equals(Integer.parseInt(fields[1]))) {
                log.info("Found the starting gene " + startingNcbiId);
                hasStarted = true;
            } else {
                return null;
            }
        }
        newGene.setTaxId(Integer.parseInt(fields[0]));
        newGene.setStatus(fields[2].equals("-") ? null : fields[2]);
        newGene.setRnaNucleotideAccession(fields[3].equals("-") ? null : fields[3]);
        newGene.setRnaNucleotideGI(fields[4].equals("-") ? null : fields[4]);
        newGene.setProteinAccession(fields[5].equals("-") ? null : fields[5]);
        newGene.setProteinGI(fields[6].equals("-") ? null : fields[6]);
        newGene.setGenomicNucleotideAccession(fields[7].equals("-") ? null : fields[7]);
        newGene.setGenomicNucleotideGI(fields[8].equals("-") ? null : fields[8]);
        newGene.setStartPosition(fields[9].equals("-") ? null : Long.parseLong(fields[9]));
        newGene.setEndPosition(fields[10].equals("-") ? null : Long.parseLong(fields[10]));
        newGene.setOrientation(fields[11].equals("?") ? null : fields[11]);

        // Separate each "accession.version" value into its two parts. A blank accession now
        // clears BOTH the accession and its version (previously the version setter was called
        // twice and a blank accession value was left in place).
        String[] rna = splitAccessionVersion(newGene.getRnaNucleotideAccession());
        newGene.setRnaNucleotideAccession(rna[0]);
        newGene.setRnaNucleotideAccessionVersion(rna[1]);

        String[] protein = splitAccessionVersion(newGene.getProteinAccession());
        newGene.setProteinAccession(protein[0]);
        newGene.setProteinAccessionVersion(protein[1]);

        // Genome (chromosome information)
        String[] genomic = splitAccessionVersion(newGene.getGenomicNucleotideAccession());
        newGene.setGenomicNucleotideAccession(genomic[0]);
        newGene.setGenomicNucleotideAccessionVersion(genomic[1]);
    } catch (NumberFormatException e) {
        throw new RuntimeException(e);
    }
    return newGene;
}

/**
 * Splits an {@code accession.version} string on the dot.
 * <p>
 * The assumption is that the string has at most one dot with one version number (i.e. GS001.1, not
 * GS001.1.1).
 *
 * @param accession the raw accession, possibly {@code null} or blank
 * @return a two-element array {@code {accession, version}}; both elements are {@code null} for a blank input,
 *         and the version is {@code null} when the input has no dot
 * @throws UnsupportedOperationException if the input contains more than one dot
 */
private static String[] splitAccessionVersion(String accession) {
    if (!StringUtils.isNotBlank(accession)) {
        return new String[] { null, null };
    }
    String[] tokens = StringUtils.splitPreserveAllTokens(accession, '.');
    switch (tokens.length) {
        case 1:
            return new String[] { tokens[0], null };
        case 2:
            return new String[] { tokens[0], tokens[1] };
        default:
            throw new UnsupportedOperationException("Don't know how to deal with " + accession);
    }
}
Also used : NCBIGene2Accession(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession)

Example 2 with NCBIGene2Accession

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession in project Gemma by PavlidisLab.

the class NcbiGene2AccessionParser method parseOneLine.

/**
 * Parses one tab-delimited gene2accession line and groups the resulting accession with the other
 * accessions of the same gene. When the gene id changes, the data accumulated for the previous gene is
 * pushed onto the queue and a fresh accumulator is started.
 *
 * @param line one raw line of the file
 * @return the parsed accession, or {@code null} if the line was skipped by {@code processFields} or its gene
 *         is not among the genes of interest
 * @throws IllegalArgumentException if the line has fewer columns than expected
 * @throws RuntimeException         if the thread is interrupted while waiting to enqueue a finished gene; the
 *                                  interrupt status is restored before throwing
 */
@Override
public NCBIGene2Accession parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
    if (fields.length < NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW) {
        throw new IllegalArgumentException("Line is not in the right format: has " + fields.length + " fields, expected " + NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW);
    }
    NCBIGene2Accession currentAccession = this.processFields(fields);
    if (currentAccession == null) {
        return null;
    }
    // really doesn't serve much of a purpose
    this.addResult(currentAccession);
    /*
     * Only some genes are relevant - for example, we might have filtered them by taxon.
     */
    if (geneInfo != null && !geneInfo.containsKey(currentAccession.getGeneId())) {
        return null;
    }
    // The gene id changed, so the previous gene is complete: push its collected data onto the queue.
    if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(currentAccession.getGeneId())) {
        try {
            queue.put(geneData);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // start a fresh accumulator for the next gene
        geneData = new NcbiGeneData();
        if (geneInfo != null) {
            geneInfo.remove(lastGeneId);
        }
    }
    assert currentAccession.getGeneId() != null;
    // we're either starting a new one, or continuing with an old one.
    lastGeneId = currentAccession.getGeneId();
    geneData.addAccession(currentAccession);
    // geneInfo is treated as optional by the checks above, so guard this lookup the same way
    // instead of dereferencing a possibly-null map.
    if (geneInfo != null) {
        geneData.setGeneInfo(geneInfo.get(currentAccession.getGeneId()));
    }
    // this will be a trailing accession.?
    return currentAccession;
}
Also used : NCBIGene2Accession(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession)

Example 3 with NCBIGene2Accession

use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession in project Gemma by PavlidisLab.

the class NcbiGeneConverter method convert.

/**
 * Converts the collected NCBI data for one gene into a {@link Gene}: the gene itself is built from the
 * gene-info record, and every accession attached to the data is converted into gene products that are
 * then set on the gene.
 *
 * @param data gene info plus the accessions gathered for that gene
 * @return the converted gene with its products set
 */
public Gene convert(NcbiGeneData data) {
    // Build the gene from its info record first; the accession conversion needs it.
    NCBIGeneInfo info = data.getGeneInfo();
    Gene result = this.convert(info);

    // Each accession may contribute several products; a set collapses duplicates.
    Collection<GeneProduct> products = new HashSet<>();
    Collection<NCBIGene2Accession> accessions = data.getAccessions();
    for (NCBIGene2Accession accession : accessions) {
        Collection<GeneProduct> converted = this.convert(accession, result);
        products.addAll(converted);
    }

    result.setProducts(products);
    return result;
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) NCBIGene2Accession(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashSet(java.util.HashSet)

Aggregations

NCBIGene2Accession (ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession): 3; HashSet (java.util.HashSet): 1; NCBIGeneInfo (ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo): 1; Gene (ubic.gemma.model.genome.Gene): 1; GeneProduct (ubic.gemma.model.genome.gene.GeneProduct): 1