Use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
The class AbstractMatrixRowPairAnalysis, method crossHybridizes:
/**
 * Determine if the probes at this location in the matrix assay any of the same gene(s). This has nothing to do with
 * whether the probes are specific, though non-specific probes (which hit more than one gene) are more likely to be
 * affected by this.
 *
 * @return true if the probes hit the same gene; false otherwise. If the probes hit more than one gene, and any of
 *         the genes are in common, the result is 'true'.
 */
private boolean crossHybridizes(int i, int j) {
    if (this.dataMatrix == null)
        // can happen in tests.
        return false;
    ExpressionDataMatrixRowElement itemA = this.dataMatrix.getRowElement(i);
    ExpressionDataMatrixRowElement itemB = this.dataMatrix.getRowElement(j);
    Collection<Gene> genesA = this.probeToGeneMap.get(itemA.getDesignElement());
    Collection<Gene> genesB = this.probeToGeneMap.get(itemB.getDesignElement());
    return CollectionUtils.containsAny(genesA, genesB);
}
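The containment test above delegates to Apache Commons Collections' CollectionUtils.containsAny, which returns true as soon as the two collections share any element. Below is a minimal, self-contained sketch of the same check, using hypothetical gene-symbol strings in place of Gene entities, and assuming commons-collections4 on the classpath (the original code may use an older Commons Collections release):

import java.util.Arrays;
import java.util.Collection;

import org.apache.commons.collections4.CollectionUtils;

public class CrossHybridizationSketch {
    public static void main(String[] args) {
        // Hypothetical gene sets for three probes; the first probe is non-specific and hits two genes.
        Collection<String> genesA = Arrays.asList("GENE_1", "GENE_2");
        Collection<String> genesB = Arrays.asList("GENE_2", "GENE_3");
        Collection<String> genesC = Arrays.asList("GENE_4");

        // true: A and B share GENE_2, so the corresponding probes would be flagged as cross-hybridizing.
        System.out.println(CollectionUtils.containsAny(genesA, genesB));
        // false: A and C have no gene in common.
        System.out.println(CollectionUtils.containsAny(genesA, genesC));
    }
}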
Use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
The class AbstractMatrixRowPairAnalysis, method init:
/**
 * Initialize caches.
 */
private void init() {
    this.initGeneToProbeMap();
    List<ExpressionDataMatrixRowElement> rowElements = this.dataMatrix.getRowElements();
    hasGenesCache = new boolean[rowElements.size()];
    for (ExpressionDataMatrixRowElement element : rowElements) {
        CompositeSequence de = element.getDesignElement();
        rowMapCache.put(element, de);
        Set<Gene> geneIdSet = this.probeToGeneMap.get(de);
        Integer i = element.getIndex();
        hasGenesCache[i] = geneIdSet != null && geneIdSet.size() > 0;
    }
    assert rowMapCache.size() > 0;
    AbstractMatrixRowPairAnalysis.log.info("Initialized caches for probe/gene information");
}
Use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
The class AbstractMatrixRowPairAnalysis, method numberOfTestsForGeneAtRow:
private double numberOfTestsForGeneAtRow(int index) {
    double testCount = 0;
    Set<Gene> clusters = this.getGenesForRow(index);
    for (Gene geneId : clusters) {
        // how many probes assay that gene
        int c = this.geneToProbeMap.get(geneId).size();
        if (c > testCount)
            testCount = c;
    }
    return testCount;
}
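In other words, the number of tests attributed to a row is the largest probe count among the genes that the row assays. A small illustrative sketch of the same logic with a hypothetical gene-to-probes map (all names here are made up, not from Gemma), using strings in place of Gene and CompositeSequence:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TestCountSketch {
    static double numberOfTests(Map<String, List<String>> geneToProbes, List<String> genesAtRow) {
        double testCount = 0;
        for (String gene : genesAtRow) {
            int c = geneToProbes.get(gene).size(); // how many probes assay this gene
            if (c > testCount)
                testCount = c; // keep the maximum over all genes hit by the row
        }
        return testCount;
    }

    public static void main(String[] args) {
        Map<String, List<String>> geneToProbes = new HashMap<>();
        geneToProbes.put("GENE_A", Arrays.asList("probe1", "probe2"));
        geneToProbes.put("GENE_B", Arrays.asList("probe2", "probe3", "probe4"));
        // GENE_B is assayed by three probes, so the row counts as three tests.
        System.out.println(numberOfTests(geneToProbes, Arrays.asList("GENE_A", "GENE_B"))); // 3.0
    }
}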
Use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
The class DiffExMetaAnalyzerServiceImpl, method doMetaAnalysis:
private GeneDifferentialExpressionMetaAnalysis doMetaAnalysis(Collection<ExpressionAnalysisResultSet> updatedResultSets,
        Map<Gene, Collection<DifferentialExpressionAnalysisResult>> gene2result) {
    DiffExMetaAnalyzerServiceImpl.log.info("Computing pvalues ...");
    DoubleArrayList pvaluesUp = new DoubleArrayList();
    DoubleArrayList pvaluesDown = new DoubleArrayList();
    // Third pass: collate to get p-values. First we have to aggregate within result set for genes which have more
    // than one probe.
    List<GeneDifferentialExpressionMetaAnalysisResult> metaAnalysisResultsUp = new ArrayList<>();
    List<GeneDifferentialExpressionMetaAnalysisResult> metaAnalysisResultsDown = new ArrayList<>();
    for (Gene g : gene2result.keySet()) {
        Map<ExpressionAnalysisResultSet, Collection<DifferentialExpressionAnalysisResult>> resultSet2Results4Gene = this
                .getResults4GenePerResultSet(g, gene2result);
        if (g.getOfficialSymbol().equals("GUK1")) {
            DiffExMetaAnalyzerServiceImpl.log.info(g);
        }
        /*
         * Compute the pvalues for each resultset.
         */
        DoubleArrayList pvalues4geneUp = new DoubleArrayList();
        DoubleArrayList pvalues4geneDown = new DoubleArrayList();
        DoubleArrayList foldChanges4gene = new DoubleArrayList();
        Collection<DifferentialExpressionAnalysisResult> resultsUsed = new HashSet<>();
        for (ExpressionAnalysisResultSet rs : resultSet2Results4Gene.keySet()) {
            Collection<DifferentialExpressionAnalysisResult> res = resultSet2Results4Gene.get(rs);
            if (res.isEmpty()) {
                // shouldn't happen?
                DiffExMetaAnalyzerServiceImpl.log.warn("Unexpectedly no results in resultSet for gene " + g);
                continue;
            }
            Double foldChange4GeneInOneResultSet = this.aggregateFoldChangeForGeneWithinResultSet(res);
            if (foldChange4GeneInOneResultSet == null) {
                // we can't go on.
                continue;
            }
            // We use the pvalue for the 'best' direction, and set the other to be 1 - that. An alternative would be
            // to use _only_ the best one.
            Double pvalue4GeneInOneResultSetUp;
            Double pvalue4GeneInOneResultSetDown;
            if (foldChange4GeneInOneResultSet < 0) {
                pvalue4GeneInOneResultSetDown = this.aggregatePvaluesForGeneWithinResultSet(res, false);
                assert pvalue4GeneInOneResultSetDown != null;
                pvalue4GeneInOneResultSetUp = 1.0 - pvalue4GeneInOneResultSetDown;
            } else {
                pvalue4GeneInOneResultSetUp = this.aggregatePvaluesForGeneWithinResultSet(res, true);
                assert pvalue4GeneInOneResultSetUp != null;
                pvalue4GeneInOneResultSetDown = 1.0 - pvalue4GeneInOneResultSetUp;
            }
            // If we have missing values, skip them. (this should be impossible!)
            if (Double.isNaN(pvalue4GeneInOneResultSetUp) || Double.isNaN(pvalue4GeneInOneResultSetDown)) {
                continue;
            }
            /*
             * Now when we correct, we have to 1) Bonferroni-correct for multiple probes and 2) clip really small
             * pvalues. We do this now, so that we don't skew the converse pvalues (up vs. down).
             */
            pvalue4GeneInOneResultSetUp = this.correctAndClip(res, pvalue4GeneInOneResultSetUp);
            pvalue4GeneInOneResultSetDown = this.correctAndClip(res, pvalue4GeneInOneResultSetDown);
            // results used for this one gene's meta-analysis.
            boolean added = resultsUsed.addAll(res);
            assert added;
            pvalues4geneUp.add(pvalue4GeneInOneResultSetUp);
            pvalues4geneDown.add(pvalue4GeneInOneResultSetDown);
            foldChanges4gene.add(foldChange4GeneInOneResultSet);
            if (DiffExMetaAnalyzerServiceImpl.log.isDebugEnabled())
                DiffExMetaAnalyzerServiceImpl.log.debug(String.format("%s %.4f %.4f %.1f", g.getOfficialSymbol(),
                        pvalue4GeneInOneResultSetUp, pvalue4GeneInOneResultSetDown, foldChange4GeneInOneResultSet));
        }
        // end of loop over results for one gene
        assert resultsUsed.size() >= pvalues4geneUp.size();
        if (pvalues4geneUp.size() < 2) {
            continue;
        }
        /*
         * Note that this value can be misleading. It should not be used to determine the change that was
         * "looked for". Use the 'upperTail' field instead.
         */
        assert !Double.isNaN(Descriptive.mean(foldChanges4gene));
        double fisherPvalueUp = MetaAnalysis.fisherCombinePvalues(pvalues4geneUp);
        double fisherPvalueDown = MetaAnalysis.fisherCombinePvalues(pvalues4geneDown);
        if (Double.isNaN(fisherPvalueUp) || Double.isNaN(fisherPvalueDown)) {
            continue;
        }
        pvaluesUp.add(fisherPvalueUp);
        GeneDifferentialExpressionMetaAnalysisResult metaAnalysisResultUp = this.makeMetaAnalysisResult(g, resultsUsed,
                fisherPvalueUp, Boolean.TRUE);
        metaAnalysisResultsUp.add(metaAnalysisResultUp);
        pvaluesDown.add(fisherPvalueDown);
        GeneDifferentialExpressionMetaAnalysisResult metaAnalysisResultDown = this.makeMetaAnalysisResult(g, resultsUsed,
                fisherPvalueDown, Boolean.FALSE);
        metaAnalysisResultsDown.add(metaAnalysisResultDown);
        this.debug(g, fisherPvalueUp, fisherPvalueDown);
    }
    assert metaAnalysisResultsUp.size() == metaAnalysisResultsDown.size();
    if (metaAnalysisResultsDown.isEmpty()) {
        // can happen if platforms don't have any genes that match etc.
        DiffExMetaAnalyzerServiceImpl.log.warn("No meta-analysis results were obtained");
        return null;
    }
    DiffExMetaAnalyzerServiceImpl.log.info(metaAnalysisResultsUp.size() + " initial meta-analysis results");
    DoubleArrayList qvaluesUp = MultipleTestCorrection.benjaminiHochberg(pvaluesUp);
    assert qvaluesUp.size() == metaAnalysisResultsUp.size();
    DoubleArrayList qvaluesDown = MultipleTestCorrection.benjaminiHochberg(pvaluesDown);
    assert qvaluesDown.size() == metaAnalysisResultsDown.size();
    return this.makeMetaAnalysisObject(updatedResultSets, metaAnalysisResultsUp, metaAnalysisResultsDown, qvaluesUp,
            qvaluesDown);
}
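For each gene, the per-result-set p-values are combined across studies with Fisher's method (MetaAnalysis.fisherCombinePvalues above), and the resulting gene-level p-values are then Benjamini-Hochberg adjusted. As a reference for what the Fisher combination computes, here is a minimal sketch of the textbook statistic, X = -2 * sum(ln p_i), compared against a chi-squared distribution with 2k degrees of freedom. It assumes commons-math3 on the classpath and is not Gemma's own implementation:

import org.apache.commons.math3.distribution.ChiSquaredDistribution;

public class FisherCombineSketch {
    /**
     * Textbook Fisher combination: under the null, X = -2 * sum(ln p_i) follows a chi-squared
     * distribution with 2k degrees of freedom; the combined p-value is the upper tail at X.
     */
    static double fisherCombine(double[] pvalues) {
        double x = 0.0;
        for (double p : pvalues) {
            x += -2.0 * Math.log(p); // assumes 0 < p <= 1; clipping tiny p-values (as done above) avoids infinities
        }
        ChiSquaredDistribution chiSq = new ChiSquaredDistribution(2.0 * pvalues.length);
        return 1.0 - chiSq.cumulativeProbability(x);
    }

    public static void main(String[] args) {
        // Two concordant small p-values reinforce each other ...
        System.out.println(fisherCombine(new double[] { 0.02, 0.03 })); // ~0.005
        // ... while adding a discordant large p-value weakens the combined evidence.
        System.out.println(fisherCombine(new double[] { 0.02, 0.03, 0.9 })); // ~0.02
    }
}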
Use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
The class CoexpressionDaoImpl, method createOrUpdate:
/*
 * Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
 */
@Override
public void createOrUpdate(BioAssaySet bioAssaySet, List<NonPersistentNonOrderedCoexpLink> links, LinkCreator c,
        Set<Gene> genesTested) {
    // assumption is that these are _all_ the links for this experiment
    assert !links.isEmpty();
    assert bioAssaySet != null;
    assert c != null;
    Collections.sort(links);
    Session sess = this.getSessionFactory().getCurrentSession();
    sess.setCacheMode(CacheMode.IGNORE);
    // to determine the species
    Gene gene = (Gene) sess.get(Gene.class, links.iterator().next().getFirstGene());
    String geneLinkClassName = CoexpressionQueryUtils.getGeneLinkClassName(gene);
    /*
     * Check that there are no links for this experiment.
     */
    if (this.countLinks(gene.getTaxon(), bioAssaySet) > 0) {
        throw new IllegalStateException(
                "There are already links for given bioAssaySet; they must be deleted before proceeding");
    }
    /*
     * Attempt to save database trips
     */
    Map<NonPersistentNonOrderedCoexpLink, Boolean> existingResults = this.preFetch(links);
    Query q = sess.createQuery(
            "from " + geneLinkClassName + " where firstGene =:f and secondGene=:s and positiveCorrelation=:pc");
    SQLQuery updateFlippedLinkQuery = sess.createSQLQuery(
            "UPDATE " + CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon())
                    + " SET SUPPORT=:s WHERE FIRST_GENE_FK=:g2 AND SECOND_GENE_FK=:g1 AND POSITIVE=:po");
    // map of link id to links, for establishing the EE-level links; keep order so they are in order for this
    // experiment.
    TreeMap<Long, NonPersistentNonOrderedCoexpLink> linkIds = new TreeMap<>();
    // for sanity checks.
    Set<Long> seenExistingLinks = new HashSet<>();
    // for sanity checks.
    Set<NonPersistentNonOrderedCoexpLink> seenNewLinks = new HashSet<>();
    // for sanity checks.
    Set<SupportDetails> seenNewSupportDetails = new HashSet<>();
    int numNew = 0;
    int numUpdated = 0;
    int progress = 0;
    // make a multiple of the JDBC batch size...
    int BATCH_SIZE = 1024;
    Map<SupportDetails, Gene2GeneCoexpression> batchToCreate = new LinkedHashMap<>();
    List<Gene2GeneCoexpression> newFlippedLinks = new ArrayList<>();
    Set<Long> genesWithUpdatedData = new HashSet<>();
    sess.flush();
    sess.clear();
    // for each link see if there is already an entry; make a new one if necessary or update the old one.
    CoexpressionDaoImpl.log.info("Starting link processing");
    for (NonPersistentNonOrderedCoexpLink proposedG2G : links) {
        Long firstGene = proposedG2G.getFirstGene();
        Long secondGene = proposedG2G.getSecondGene();
        // There is an index for f+s, but querying one-at-a-time is going to be slow. I attempted to speed it up by
        // fetching all links for a gene when we see it, but this causes problems with data being stale. Prefetching
        // with just the ability to tell if a link is new or not takes a lot of memory and doesn't speed things up
        // much. Try keeping an index of which links a gene has, so we know whether we need to check the database
        // or not.
        //
        // Currently it takes about 1 minute to process 10k links on a relatively small database; much of this is
        // the findLink call.
        Gene2GeneCoexpression existingLink = this.findLink(q, proposedG2G, existingResults);
        if (existingLink == null) {
            // initialize the supportdetails
            SupportDetails sd = c.createSupportDetails(firstGene, secondGene, proposedG2G.isPositiveCorrelation());
            sd.addEntity(bioAssaySet.getId());
            assert sd.getNumIds() > 0;
            assert sd.isIncluded(bioAssaySet.getId());
            // Must be unique
            assert !seenNewSupportDetails.contains(sd) : "Already saw " + sd + " while processing " + proposedG2G;
            assert proposedG2G.getLink() != null;
            batchToCreate.put(sd, proposedG2G.getLink());
            if (seenNewLinks.contains(proposedG2G)) {
                CoexpressionDaoImpl.log
                        .warn("The data passed had the same new link represented more than once: " + proposedG2G);
                continue;
            }
            seenNewSupportDetails.add(sd);
            seenNewLinks.add(proposedG2G);
            if (CoexpressionDaoImpl.log.isDebugEnabled())
                CoexpressionDaoImpl.log.debug("New: " + proposedG2G);
            numNew++;
        } else {
            // Sanity check. If this happens, there must be two versions of the same link already in the input.
            if (seenExistingLinks.contains(existingLink.getId())) {
                throw new IllegalStateException(
                        "The data passed had the same existing link represented more than once: " + existingLink);
            }
            /* sanity check that we aren't adding the dataset twice; we might be able to make this an assertion instead. */
            if (existingLink.isSupportedBy(bioAssaySet)) {
                throw new IllegalStateException(
                        "Support for this experiment already exists for " + existingLink + ", must be deleted first");
            }
            // cache old support for sanity check
            int oldSupport = existingLink.getSupportDetails().getNumIds();
            // update the support
            existingLink.getSupportDetails().addEntity(bioAssaySet.getId());
            existingLink.updateNumDatasetsSupporting();
            // there is no cascade... on purpose.
            sess.update(existingLink.getSupportDetails());
            assert oldSupport + 1 == existingLink.getNumDatasetsSupporting();
            assert existingLink.getSupportDetails().getNumIds() == oldSupport + 1;
            // track so we add corresponding Experiment-level links later.
            linkIds.put(existingLink.getId(), new NonPersistentNonOrderedCoexpLink(existingLink));
            seenExistingLinks.add(existingLink.getId());
            /*
             * The flipped link is asserted to be in the database. The support details are already dealt with; we
             * just have to update the support value.
             */
            int numFlippedUpdated = updateFlippedLinkQuery.setParameter("s", existingLink.getNumDatasetsSupporting())
                    .setParameter("g2", proposedG2G.getSecondGene()).setParameter("g1", proposedG2G.getFirstGene())
                    .setParameter("po", proposedG2G.isPositiveCorrelation() ? 1 : 0).executeUpdate();
            assert numFlippedUpdated == 1 : "Flipped link missing for " + proposedG2G + " [" + numFlippedUpdated + "]";
            numUpdated++;
            if (CoexpressionDaoImpl.log.isDebugEnabled())
                CoexpressionDaoImpl.log.debug("Updated: " + proposedG2G);
        }
        genesWithUpdatedData.add(firstGene);
        genesWithUpdatedData.add(secondGene);
        if (++progress % 5000 == 0) {
            CoexpressionDaoImpl.log.info("Processed " + progress + "/" + links.size() + " gene-level links..."
                    + numUpdated + " updated, " + numNew + " new");
        }
        if (batchToCreate.size() >= BATCH_SIZE) {
            newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
        } else if (numUpdated > 0 && numUpdated % BATCH_SIZE == 0) {
            sess.flush();
            sess.clear();
        }
    }
    // tail end batch
    if (!batchToCreate.isEmpty()) {
        // we make the flipped links later to optimize their ordering.
        newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
    }
    // flush the updated ones one last time...
    if (numUpdated > 0) {
        sess.flush();
        sess.clear();
    }
    assert links.size() == linkIds.size();
    CoexpressionDaoImpl.log.info(numUpdated + " updated, " + numNew + " new links");
    /*
     * Sort and save the accumulated new flipped versions of the new links, which reuse the supportDetails. In the
     * flipped links, the first gene is the second gene and vice versa. Continue to accumulate the flipped links.
     */
    CoexpressionDaoImpl.log.info("Saving " + newFlippedLinks.size() + " flipped versions of new links ...");
    Collections.sort(newFlippedLinks, new Comparator<Gene2GeneCoexpression>() {
        @Override
        public int compare(Gene2GeneCoexpression o1, Gene2GeneCoexpression o2) {
            return o1.getFirstGene().compareTo(o2.getFirstGene());
        }
    });
    progress = 0;
    for (Gene2GeneCoexpression gl : newFlippedLinks) {
        sess.save(gl);
        if (++progress % 5000 == 0) {
            CoexpressionDaoImpl.log.info(
                    "Processed " + progress + "/" + newFlippedLinks.size() + " new flipped gene-level links...");
        }
        if (progress % BATCH_SIZE == 0) {
            sess.flush();
            sess.clear();
        }
    }
    /*
     * Save experiment-level links
     */
    CoexpressionDaoImpl.log.info("Saving " + linkIds.size() + " experiment-level links (plus flipped versions) ...");
    this.saveExperimentLevelLinks(sess, c, linkIds, bioAssaySet);
    if (genesTested != null)
        this.updatedTestedIn(bioAssaySet, genesTested);
    this.updateGeneCoexpressedWith(links);
    // kick anything we updated out of the cache.
    int numRemovedFromCache = this.gene2GeneCoexpressionCache.remove(genesWithUpdatedData);
    if (numRemovedFromCache > 0)
        CoexpressionDaoImpl.log.info(numRemovedFromCache + " results evicted from cache");
    // flush happens on commit...
    CoexpressionDaoImpl.log.info("Done, flushing changes ...");
}
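The creation and flipped-link loops above follow the usual Hibernate bulk-write idiom: persist entities in batches and call flush() followed by clear() so the session's first-level cache does not grow without bound. Below is a minimal sketch of that pattern with a placeholder entity list; as the "multiple of the JDBC batch size" comment in the original suggests, the batch size would ideally align with hibernate.jdbc.batch_size:

import java.util.List;

import org.hibernate.Session;

public class BatchSaveSketch {
    // Placeholder bulk save; in createOrUpdate the analogous entities are Gene2GeneCoexpression links
    // and their SupportDetails.
    static void saveInBatches(Session sess, List<?> entities) {
        final int BATCH_SIZE = 1024; // ideally a multiple of hibernate.jdbc.batch_size
        int i = 0;
        for (Object entity : entities) {
            sess.save(entity);
            if (++i % BATCH_SIZE == 0) {
                sess.flush(); // push the pending inserts to the database
                sess.clear(); // detach them so the session stays small
            }
        }
        // any tail-end entities are flushed when the surrounding transaction commits
    }
}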