use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.
the class NcbiGene2AccessionParser method parse.
/**
 * Parses the gene2accession stream and then post-processes the result.
 * <p>
 * After the superclass has consumed the stream, two things are pushed onto {@code queue}:
 * <ol>
 * <li>the last gene that had accessions (the current {@code geneData}), if it carries gene info; its
 * entry is then removed from {@code geneInfo} under {@code lastGeneId};</li>
 * <li>every gene still left in {@code geneInfo}, i.e. the genes that did not have accessions, each
 * wrapped in a fresh {@link NcbiGeneData}.</li>
 * </ol>
 * If {@code startingNcbiId} is {@code null}, {@code hasStarted} is set to {@code true} before parsing.
 *
 * @param is the gene2accession input stream
 * @throws IOException      propagated from the superclass parse
 * @throws RuntimeException wrapping the {@link InterruptedException} if the thread is interrupted while
 *                          waiting to put an element on the queue; the interrupt flag is restored first
 */
@Override
public void parse(InputStream is) throws IOException {
    if (startingNcbiId == null) {
        hasStarted = true;
    }
    super.parse(is);

    // add last gene with an accession
    if (geneData.getGeneInfo() != null) {
        try {
            queue.put(geneData);
        } catch (InterruptedException e) {
            // restore the flag so code further up the stack can still observe the interruption
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        geneInfo.remove(lastGeneId);
    }

    // push in remaining genes that did not have accessions
    for (NCBIGeneInfo o : geneInfo.values()) {
        NcbiGeneData geneCollection = new NcbiGeneData();
        geneCollection.setGeneInfo(o);
        try {
            queue.put(geneCollection);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            // keep the cause; the original threw a bare RuntimeException here
            throw new RuntimeException(e);
        }
    }
}
use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.
the class NcbiGeneConverter method convert.
/**
 * Builds a {@link Gene} from the given NCBI gene data.
 * <p>
 * The gene itself is created from the data's gene info via the single-argument {@code convert}
 * overload. Each accession is then converted (together with the gene) into gene products, which are
 * collected into a set and attached to the gene.
 *
 * @param data gene info plus its accessions
 * @return the converted gene with its products set
 */
public Gene convert(NcbiGeneData data) {
    // the gene is built first because the accession conversion needs it
    Gene result = this.convert(data.getGeneInfo());

    // every accession contributes zero or more products associated with the gene
    Collection<GeneProduct> products = new HashSet<>();
    for (NCBIGene2Accession accession : data.getAccessions()) {
        products.addAll(this.convert(accession, result));
    }

    result.setProducts(products);
    return result;
}
use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.
the class NcbiGeneDomainObjectGenerator method processLocalFiles.
/**
 * Parses the local NCBI files and starts the gene2accession producer thread.
 * <p>
 * Steps, in order:
 * <ol>
 * <li>Parse the gene history file, the Ensembl file (if given) and the gene_info file on the calling
 * thread.</li>
 * <li>For each parsed gene info: attach its history (or, when there is none, a discontinued id looked up
 * by symbol and taxon), attach its Ensembl id when an Ensembl file was given, count it per taxon, and
 * index it by gene id.</li>
 * <li>Reset {@code supportedTaxaWithNCBIGenes} and, when {@code supportedTaxa} is not {@code null}, fill
 * it with the {@code supportedTaxa} entry of every taxon id that was seen.</li>
 * <li>Start a thread named "gene2accession parser" that parses gene2accession into
 * {@code geneDataQueue} and sets {@code producerDone} to {@code true} when it finishes.</li>
 * </ol>
 * The method returns as soon as the thread is started; it does not wait for it.
 *
 * @param geneInfoFile       gene_info file (plain or compressed)
 * @param gene2AccessionFile gene2accession file
 * @param geneHistoryFile    gene history file
 * @param geneEnsemblFile    gene-to-Ensembl file; may be {@code null}, in which case Ensembl ids are not set
 * @param geneDataQueue      queue that receives the parsed gene data
 * @throws RuntimeException wrapping any {@link IOException} raised while parsing history, Ensembl or
 *                          gene_info
 */
private void processLocalFiles(final LocalFile geneInfoFile, final LocalFile gene2AccessionFile, LocalFile geneHistoryFile, LocalFile geneEnsemblFile, final BlockingQueue<NcbiGeneData> geneDataQueue) {
    final NcbiGeneInfoParser infoParser = new NcbiGeneInfoParser();
    infoParser.setFilter(this.filter);
    if (this.filter) {
        infoParser.setSupportedTaxa(supportedTaxa.keySet());
    }

    final NcbiGeneEnsemblFileParser ensemblParser = new NcbiGeneEnsemblFileParser();
    final NcbiGene2AccessionParser accParser = new NcbiGene2AccessionParser();
    // configured once here; the parser is not touched again until the thread below runs it
    accParser.setStartingNbiId(startingNcbiId);
    final File gene2accessionFileHandle = gene2AccessionFile.asFile();
    final NcbiGeneHistoryParser historyParser = new NcbiGeneHistoryParser();

    try {
        NcbiGeneDomainObjectGenerator.log.debug("Parsing gene history");
        historyParser.parse(geneHistoryFile.asFile());

        if (geneEnsemblFile != null) {
            NcbiGeneDomainObjectGenerator.log.debug("Parsing ensembl");
            ensemblParser.parse(geneEnsemblFile.asFile());
        }

        NcbiGeneDomainObjectGenerator.log.debug("Parsing GeneInfo =" + geneInfoFile.asFile().getAbsolutePath());
        try (InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(geneInfoFile.asFile().getAbsolutePath())) {
            infoParser.parse(is);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    Collection<NCBIGeneInfo> geneInfoList = infoParser.getResults();

    // index gene infos by gene id and count genes per taxon
    final Map<String, NCBIGeneInfo> geneInfoMap = new HashMap<>();
    Map<Integer, Integer> taxaCount = new HashMap<>();
    for (NCBIGeneInfo geneInfo : geneInfoList) {
        NcbiGeneHistory history = historyParser.get(geneInfo.getGeneId());
        geneInfo.setHistory(history);
        if (history == null) {
            // no history for this id: look for a discontinued id with the same symbol and taxon
            String discontinuedIdForGene = historyParser.discontinuedIdForSymbol(geneInfo.getDefaultSymbol(), geneInfo.getTaxId());
            geneInfo.setDiscontinuedId(discontinuedIdForGene);
        }

        if (geneEnsemblFile != null) {
            String ensemblId = ensemblParser.get(geneInfo.getGeneId());
            geneInfo.setEnsemblId(ensemblId);
        }

        int taxId = geneInfo.getTaxId();
        Integer seenSoFar = taxaCount.get(taxId);
        taxaCount.put(taxId, seenSoFar == null ? 1 : seenSoFar + 1);

        geneInfoMap.put(geneInfo.getGeneId(), geneInfo);
    }

    supportedTaxaWithNCBIGenes = new HashSet<>();
    if (supportedTaxa != null) {
        // every key in taxaCount has a count of at least 1, so no "> 0" check is needed
        for (Map.Entry<Integer, Integer> entry : taxaCount.entrySet()) {
            Integer taxId = entry.getKey();
            NcbiGeneDomainObjectGenerator.log.debug("Taxon " + taxId + ": " + entry.getValue() + " genes");
            // NOTE(review): if a parsed taxon id is not a key of supportedTaxa (possible when filtering
            // is off?), this adds null to the set - confirm whether that is intended.
            Taxon t = supportedTaxa.get(taxId);
            supportedTaxaWithNCBIGenes.add(t);
        }
    }

    // 1) use a producer-consumer model for Gene2Accession conversion
    // 1a) Parse Gene2Accession until the gene id changes. This means that
    // all accessions for the gene are done.
    // 1b) Create a Collection<Gene2Accession>, and push into BlockingQueue
    Thread parseThread = new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                NcbiGeneDomainObjectGenerator.log.debug("Parsing gene2accession=" + gene2AccessionFile.asFile().getAbsolutePath());
                accParser.parse(gene2accessionFileHandle, geneDataQueue, geneInfoMap);
                NcbiGeneDomainObjectGenerator.log.debug("Domain object generator done");
            } catch (IOException e) {
                throw new RuntimeException(e);
            } finally {
                // Always flip the flag, including on failure; previously an exception left it false
                // forever. Presumably consumers wait on this flag - verify against callers.
                producerDone.set(true);
            }
        }
    }, "gene2accession parser");
    parseThread.start();

    // 1c) As elements get added to BlockingQueue, NCBIGeneConverter
    // consumes
    // and creates Gene/GeneProduct/DatabaseEntry objects.
    // 1d) Push Gene to another BlockingQueue genePersistence
    // 2) use producer-consumer model for Gene persistence
    // 2a) as elements get added to genePersistence, persist Gene and
    // associated entries.
}
use of ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo in project Gemma by PavlidisLab.
the class NcbiGeneInfoParser method parseOneLine.
/**
 * Parses one tab-delimited line of the NCBI gene_info file into an {@link NCBIGeneInfo}.
 * <p>
 * A line must have either {@code NCBI_GENEINFO_FIELDS_PER_ROW} fields or one of the older layouts with
 * 13, 14 or 15 fields; only the first 13 fields are read.
 *
 * @param line a single raw line of the file
 * @return the populated gene info, or {@code null} when filtering is on, a set of taxon ids is
 *         configured, and the line's taxon is not in it
 * @throws FileFormatException if the field count is not accepted, the taxon id is not an integer, or a
 *                             dbXref entry does not split into a database and an identifier
 */
@Override
public NCBIGeneInfo parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    // They keep adding fields at the end...we only need the first few, so the older 13-15 column
    // layouts are still accepted for backwards compatibility.
    boolean legacyLayout = fields.length == 13 || fields.length == 14 || fields.length == 15;
    if (fields.length != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW && !legacyLayout) {
        throw new FileFormatException("Line + " + line + " is not in the right format: has " + fields.length + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW);
    }

    NCBIGeneInfo geneInfo = new NCBIGeneInfo();
    try {
        // Skip taxa that we don't support.
        int taxonId = Integer.parseInt(fields[0]);
        if (filter && ncbiTaxonIds != null && !ncbiTaxonIds.contains(taxonId)) {
            return null;
        }

        // See ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README
        // #Format (zero-based index):
        // 0 tax_id
        // 1 GeneID
        // 2 Symbol
        // 3 LocusTag
        // 4 Synonyms
        // 5 dbXrefs, separated by |
        // 6 chromosome
        // 7 map_location
        // 8 description
        // 9 type_of_gene
        // 10 Symbol_from_nomenclature_authority
        // 11 Full_name_from_nomenclature_authority
        // 12 Nomenclature_status
        // 13 Other_designations
        // 14 Modification_date
        // 15 Feature type
        geneInfo.setTaxId(taxonId);
        geneInfo.setGeneId(fields[1]);
        geneInfo.setDefaultSymbol(fields[2]);
        geneInfo.setLocusTag(fields[3]);

        String[] synonyms = StringUtils.splitPreserveAllTokens(fields[4], '|');
        for (String synonym : synonyms) {
            // "-" is the placeholder for "no value"
            if (synonym.equals("-")) {
                continue;
            }
            geneInfo.addToSynonyms(synonym);
        }

        if (!fields[5].equals("-")) {
            String[] dbXRefs = StringUtils.splitPreserveAllTokens(fields[5], '|');
            for (String dbXr : dbXRefs) {
                String[] dbF = StringUtils.split(dbXr, ':');
                if (dbF.length != 2) {
                    /*
                     * Annoyingly, HGNC identifiers now have the format HGNC:X where X is an integer. This is
                     * apparent from downloading files from HGNC (http://www.genenames.org/cgi-bin/statistics).
                     * Same situation for MGI.
                     *
                     * Therefore we have a special case: "HGNC:HGNC:5" splits into three parts and the last
                     * two are re-joined into the identifier.
                     */
                    if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) {
                        dbF[1] = dbF[1] + ":" + dbF[2];
                    } else {
                        // we're very stringent to avoid data corruption.
                        throw new FileFormatException("Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'");
                    }
                }
                geneInfo.addToDbXRefs(dbF[0], dbF[1]);
            }
        }

        geneInfo.setChromosome(fields[6]);
        geneInfo.setMapLocation(fields[7]);
        geneInfo.setDescription(fields[8]);
        geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9]));
        geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-"));
        geneInfo.setNameIsFromAuthority(!fields[11].equals("-"));

        // Nomenclature_status is column 12. The original compared column 11 (the full name) with "O",
        // so OFFICIAL was practically never assigned.
        String status = fields[12];
        NomenclatureStatus nomenclatureStatus;
        if (status.equals("-")) {
            nomenclatureStatus = NomenclatureStatus.UNKNOWN;
        } else if (status.equals("O")) {
            nomenclatureStatus = NomenclatureStatus.OFFICIAL;
        } else {
            nomenclatureStatus = NomenclatureStatus.INTERIM;
        }
        geneInfo.setNomenclatureStatus(nomenclatureStatus);

        // ignore 14th field for now - it stores alternate protein names
        // ignore 15th, modification date
    } catch (NumberFormatException e) {
        throw new FileFormatException(e);
    }
    return geneInfo;
}
Aggregations