use of ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory in project Gemma by PavlidisLab.
the class NcbiGeneHistoryParser method parseOneLine.
/**
 * Parses one tab-delimited line of the NCBI gene_history file.
 *
 * <p>Columns read, by position: 0 = taxon id, 1 = gene id, 2 = discontinued gene id,
 * 3 = discontinued symbol.
 *
 * <ul>
 * <li>Lines starting with {@code #} are skipped.</li>
 * <li>Lines whose gene id is blank or {@code "-"} are recorded in {@code discontinuedGenes}
 * (taxon id -> discontinued symbol -> discontinued gene id) and produce no history.</li>
 * <li>Otherwise the history keyed by the discontinued id is updated and re-keyed under the
 * gene id, or a new history is created if none is known yet.</li>
 * </ul>
 *
 * @param line one raw line of the file
 * @return the updated or newly created history, or {@code null} for comment lines and for
 *         lines that only record a discontinued gene
 * @throws IllegalStateException if the line has more columns than
 *         {@code GENE_HISTORY_FILE_NUM_FIELDS} or fewer columns than this method reads
 * @throws NumberFormatException if the taxon id of a discontinued-gene line is not an integer
 */
@Override
public NcbiGeneHistory parseOneLine(String line) {
    if (line.startsWith("#")) {
        return null;
    }

    String[] fields = StringUtils.split(line, '\t');
    if (fields.length > NcbiGeneHistoryParser.GENE_HISTORY_FILE_NUM_FIELDS) {
        // sanity check.
        throw new IllegalStateException("NCBI gene_history file has unexpected column count. Expected " + NcbiGeneHistoryParser.GENE_HISTORY_FILE_NUM_FIELDS + ", got " + fields.length + " in line=" + line);
    }

    // Indices 0..3 are dereferenced below; reject short lines here so the failure names the
    // offending line instead of surfacing as a bare ArrayIndexOutOfBoundsException.
    final int minFields = 4;
    if (fields.length < minFields) {
        throw new IllegalStateException("NCBI gene_history file has too few columns. Expected at least " + minFields + ", got " + fields.length + " in line=" + line);
    }

    String geneId = fields[1];
    String discontinuedGeneId = fields[2];

    if (StringUtils.isBlank(geneId) || geneId.equals("-")) {
        // No replacement gene id: remember the discontinued id by taxon and symbol only.
        String taxonId = fields[0];
        String discontinuedSymbol = fields[3];
        Integer taxonInt = Integer.parseInt(taxonId);
        if (!(discontinuedGenes.containsKey(taxonInt))) {
            discontinuedGenes.put(taxonInt, new HashMap<String, String>());
        }
        discontinuedGenes.get(taxonInt).put(discontinuedSymbol, discontinuedGeneId);
        return null;
    }

    NcbiGeneHistory his;
    if (id2history.containsKey(discontinuedGeneId)) {
        // The discontinued id already has a history: extend it and re-key it under the new id.
        his = id2history.get(discontinuedGeneId);
        his.update(discontinuedGeneId, geneId);
        id2history.remove(discontinuedGeneId);
        id2history.put(geneId, his);
    } else {
        // First time this id is seen. The new history is only returned, not stored here.
        his = new NcbiGeneHistory(discontinuedGeneId);
        his.update(discontinuedGeneId, geneId);
    }
    return his;
}
use of ubic.gemma.core.loader.genome.gene.ncbi.model.NcbiGeneHistory in project Gemma by PavlidisLab.
the class NcbiGeneDomainObjectGenerator method processLocalFiles.
/**
 * Parses the local NCBI files and starts a background thread that feeds gene data into
 * {@code geneDataQueue}.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>Parse the gene history file, the optional Ensembl mapping file and the gene info file
 * on the calling thread.</li>
 * <li>Attach history (or, when there is none, a discontinued id looked up by symbol and
 * taxon) and the Ensembl id to each gene info, and index the gene infos by gene id.</li>
 * <li>Populate {@code supportedTaxaWithNCBIGenes} with the supported taxa that had at least
 * one gene.</li>
 * <li>Start the "gene2accession parser" thread, which parses gene2accession into the queue
 * and sets {@code producerDone} when it finishes.</li>
 * </ol>
 *
 * <p>This method returns as soon as the parser thread has been started; it does not wait for
 * the thread to finish.
 *
 * @param geneInfoFile gene info file (plain or compressed)
 * @param gene2AccessionFile gene2accession file, parsed on the background thread
 * @param geneHistoryFile gene history file
 * @param geneEnsemblFile gene-to-Ensembl file; may be {@code null}, in which case no Ensembl
 *        ids are set
 * @param geneDataQueue queue that receives the parsed gene data
 * @throws RuntimeException wrapping any {@link IOException} raised while parsing the history,
 *         Ensembl or gene info files
 */
private void processLocalFiles(final LocalFile geneInfoFile, final LocalFile gene2AccessionFile, LocalFile geneHistoryFile, LocalFile geneEnsemblFile, final BlockingQueue<NcbiGeneData> geneDataQueue) {
    final NcbiGeneInfoParser infoParser = new NcbiGeneInfoParser();
    infoParser.setFilter(this.filter);
    if (this.filter) {
        infoParser.setSupportedTaxa(supportedTaxa.keySet());
    }

    final NcbiGeneEnsemblFileParser ensemblParser = new NcbiGeneEnsemblFileParser();
    final NcbiGene2AccessionParser accParser = new NcbiGene2AccessionParser();
    accParser.setStartingNbiId(startingNcbiId);
    final File gene2accessionFileHandle = gene2AccessionFile.asFile();
    final NcbiGeneHistoryParser historyParser = new NcbiGeneHistoryParser();

    try {
        NcbiGeneDomainObjectGenerator.log.debug("Parsing gene history");
        historyParser.parse(geneHistoryFile.asFile());

        if (geneEnsemblFile != null) {
            NcbiGeneDomainObjectGenerator.log.debug("Parsing ensembl");
            ensemblParser.parse(geneEnsemblFile.asFile());
        }

        NcbiGeneDomainObjectGenerator.log.debug("Parsing GeneInfo =" + geneInfoFile.asFile().getAbsolutePath());
        try (InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(geneInfoFile.asFile().getAbsolutePath())) {
            infoParser.parse(is);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    Collection<NCBIGeneInfo> geneInfoList = infoParser.getResults();

    // Index gene infos by gene id and count genes per taxon.
    final Map<String, NCBIGeneInfo> geneInfoMap = new HashMap<>();
    Map<Integer, Integer> taxaCount = new HashMap<>();
    for (NCBIGeneInfo geneInfo : geneInfoList) {
        NcbiGeneHistory history = historyParser.get(geneInfo.getGeneId());
        geneInfo.setHistory(history);
        if (history == null) {
            // No history by gene id: fall back to a discontinued id matched by symbol and taxon.
            String discontinuedIdForGene = historyParser.discontinuedIdForSymbol(geneInfo.getDefaultSymbol(), geneInfo.getTaxId());
            geneInfo.setDiscontinuedId(discontinuedIdForGene);
        }

        if (geneEnsemblFile != null) {
            String ensemblId = ensemblParser.get(geneInfo.getGeneId());
            geneInfo.setEnsemblId(ensemblId);
        }

        int taxId = geneInfo.getTaxId();
        Integer seenSoFar = taxaCount.get(taxId);
        taxaCount.put(taxId, seenSoFar == null ? 1 : seenSoFar + 1);

        geneInfoMap.put(geneInfo.getGeneId(), geneInfo);
    }

    supportedTaxaWithNCBIGenes = new HashSet<>();
    if (supportedTaxa != null) {
        // Every key in taxaCount has a count of at least 1, so no positivity check is needed.
        for (Map.Entry<Integer, Integer> entry : taxaCount.entrySet()) {
            Integer taxId = entry.getKey();
            NcbiGeneDomainObjectGenerator.log.debug("Taxon " + taxId + ": " + entry.getValue() + " genes");
            Taxon t = supportedTaxa.get(taxId);
            // A taxon can have genes without being in supportedTaxa (e.g. when filtering is
            // off); do not put a null entry into the result set in that case.
            if (t != null) {
                supportedTaxaWithNCBIGenes.add(t);
            }
        }
    }

    // 1) use a producer-consumer model for Gene2Accession conversion
    // 1a) Parse Gene2Accession until the gene id changes. This means that
    // all accessions for the gene are done.
    // 1b) Create a Collection<Gene2Accession>, and push into BlockingQueue
    Thread parseThread = new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                NcbiGeneDomainObjectGenerator.log.debug("Parsing gene2accession=" + gene2AccessionFile.asFile().getAbsolutePath());
                // Already set before the thread was created; repeated here on the parser thread.
                accParser.setStartingNbiId(startingNcbiId);
                accParser.parse(gene2accessionFileHandle, geneDataQueue, geneInfoMap);
                NcbiGeneDomainObjectGenerator.log.debug("Domain object generator done");
            } catch (IOException e) {
                throw new RuntimeException(e);
            } finally {
                // Always raise the flag, even when parsing fails, so the flag is not left
                // unset forever after an error. The exception still propagates from this thread.
                producerDone.set(true);
            }
        }
    }, "gene2accession parser");
    parseThread.start();

    // 1c) As elements get added to BlockingQueue, NCBIGeneConverter
    // consumes
    // and creates Gene/GeneProduct/DatabaseEntry objects.
    // 1d) Push Gene to another BlockingQueue genePersistence
    // 2) use producer-consumer model for Gene persistence
    // 2a) as elements get added to genePersistence, persist Gene and
    // associated entries.
}
Aggregations