use of ubic.gemma.model.analysis.expression.coexpression.SupportDetails in project Gemma by PavlidisLab.
the class CoexpressionDaoImpl method createOrUpdate.
/**
 * Persists the gene-level coexpression links of one experiment. For each proposed link: if a matching link already
 * exists, this experiment is added to its support details and the stored support of the flipped link is updated; if
 * not, a new link is queued and saved in batches, and its flipped counterpart is saved afterwards. Finally the
 * experiment-level links, the 'tested-in' information and the gene 'coexpressed-with' index are updated and affected
 * genes are removed from the gene2gene cache.
 * <p>
 * Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
 * <p>
 * NOTE(review): unlike deleteLinks, this method carries no {@code @Transactional} annotation here; presumably the
 * transaction is supplied by the caller or by class-level configuration - confirm.
 *
 * @param bioAssaySet the experiment the links come from; there must be no links stored for it yet
 * @param links       the links for the experiment (assumed to be all of them); must not be empty. The list is sorted
 *                    in place.
 * @param c           creates the support details and is passed on to the helpers that save new/flipped links and
 *                    experiment-level links
 * @param genesTested if not null, passed to updatedTestedIn together with the experiment
 * @throws IllegalStateException if links already exist for the experiment, if the same existing link is matched by
 *                               more than one input link, or if an existing link is already supported by the
 *                               experiment
 */
@Override
public void createOrUpdate(BioAssaySet bioAssaySet, List<NonPersistentNonOrderedCoexpLink> links, LinkCreator c, Set<Gene> genesTested) {
// assumption is that these are _all_ the links for this experiment
// NOTE: these preconditions are only enforced when assertions are enabled (-ea).
assert !links.isEmpty();
assert bioAssaySet != null;
assert c != null;
// sorts the caller's list in place (natural ordering of NonPersistentNonOrderedCoexpLink)
Collections.sort(links);
Session sess = this.getSessionFactory().getCurrentSession();
// bulk operation: keep the session from interacting with the second-level cache
sess.setCacheMode(CacheMode.IGNORE);
// to determine the species: load the first gene of the first link; its taxon selects the link class/table
Gene gene = (Gene) sess.get(Gene.class, links.iterator().next().getFirstGene());
String geneLinkClassName = CoexpressionQueryUtils.getGeneLinkClassName(gene);
/*
* Check that there are no links for this experiment.
*/
if (this.countLinks(gene.getTaxon(), bioAssaySet) > 0) {
throw new IllegalStateException("There are already links for given bioAssaySet; they must be deleted before proceeding");
}
/*
* Attempt to save database trips
*/
Map<NonPersistentNonOrderedCoexpLink, Boolean> existingResults = this.preFetch(links);
// HQL query handed to findLink() to look up an existing link by genes and direction of correlation
Query q = sess.createQuery("from " + geneLinkClassName + " where firstGene =:f and secondGene=:s and positiveCorrelation=:pc");
// native update that sets SUPPORT on the flipped row (first/second gene swapped) of an existing link
SQLQuery updateFlippedLinkQuery = sess.createSQLQuery("UPDATE " + CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon()) + " SET SUPPORT=:s WHERE FIRST_GENE_FK=:g2 AND SECOND_GENE_FK=:g1 AND POSITIVE=:po");
// map of link id to link, for establishing the EE-level (experiment-level) links for this experiment;
// a TreeMap so the entries are kept in id order.
TreeMap<Long, NonPersistentNonOrderedCoexpLink> linkIds = new TreeMap<>();
// The three sets below exist only for sanity checks on the input.
// ids of existing links already updated in this call
Set<Long> seenExistingLinks = new HashSet<>();
// new links already encountered in this call
Set<NonPersistentNonOrderedCoexpLink> seenNewLinks = new HashSet<>();
// support details already created for new links in this call
Set<SupportDetails> seenNewSupportDetails = new HashSet<>();
int numNew = 0;
int numUpdated = 0;
int progress = 0;
// make a multiple of jdbc batch size...
int BATCH_SIZE = 1024;
// new links waiting to be saved, keyed by their (not yet persisted) support details; insertion-ordered
Map<SupportDetails, Gene2GeneCoexpression> batchToCreate = new LinkedHashMap<>();
// flipped versions of the new links; accumulated here and saved after the main loop
List<Gene2GeneCoexpression> newFlippedLinks = new ArrayList<>();
// ids of all genes touched, used at the end to evict them from the cache
Set<Long> genesWithUpdatedData = new HashSet<>();
// start the main loop with a flushed, empty session
sess.flush();
sess.clear();
// for each link see if there is already an entry; make a new one if necessary or update the old one.
CoexpressionDaoImpl.log.info("Starting link processing");
for (NonPersistentNonOrderedCoexpLink proposedG2G : links) {
Long firstGene = proposedG2G.getFirstGene();
Long secondGene = proposedG2G.getSecondGene();
// There is an index for f+s, but querying one-at-a-time is going to be slow. I attempted to speed it up by
// fetching all links for a gene when we see it, but this causes problems with data being stale. Prefetching
// with just the ability to tell if a link is new or not takes a lot of memory and doesn't speed things up
// much. Trying keeping an index of which links a gene has, so we know whether we need to check the database
// or not.
//
// Currently it takes about 1 minute to process 10k links on a relatively small database, much of this is
// the findLink call.
Gene2GeneCoexpression existingLink = this.findLink(q, proposedG2G, existingResults);
if (existingLink == null) {
// New link: initialize the supportdetails with this experiment as its only supporter
SupportDetails sd = c.createSupportDetails(firstGene, secondGene, proposedG2G.isPositiveCorrelation());
sd.addEntity(bioAssaySet.getId());
assert sd.getNumIds() > 0;
assert sd.isIncluded(bioAssaySet.getId());
// Must be unique
assert !seenNewSupportDetails.contains(sd) : "Already saw " + sd + " while processing " + proposedG2G;
assert proposedG2G.getLink() != null;
batchToCreate.put(sd, proposedG2G.getLink());
// NOTE(review): by the time this duplicate check runs the link has already been put into batchToCreate
// above, and 'continue' skips the gene tracking, progress logging and batch handling at the bottom of the
// loop - confirm this is intended.
if (seenNewLinks.contains(proposedG2G)) {
CoexpressionDaoImpl.log.warn("The data passed had the same new link represented more than once: " + proposedG2G);
continue;
}
seenNewSupportDetails.add(sd);
seenNewLinks.add(proposedG2G);
if (CoexpressionDaoImpl.log.isDebugEnabled())
CoexpressionDaoImpl.log.debug("New: " + proposedG2G);
numNew++;
} else {
// Existing link: add this experiment to its support.
// Sanity check. If this happens, there must be two versions of the same link already in the input.
if (seenExistingLinks.contains(existingLink.getId())) {
throw new IllegalStateException("The data passed had the same existing link represented more than once: " + existingLink);
}
/* sanity check that we aren't adding dataset twice; we might be able make this an assertion instead. */
if (existingLink.isSupportedBy(bioAssaySet)) {
throw new IllegalStateException("Support for this experiment already exists for " + existingLink + ", must be deleted first");
}
// cache old support for sanity check
int oldSupport = existingLink.getSupportDetails().getNumIds();
// update the support
existingLink.getSupportDetails().addEntity(bioAssaySet.getId());
existingLink.updateNumDatasetsSupporting();
// there is no cascade... on purpose. So the support details are updated explicitly.
sess.update(existingLink.getSupportDetails());
assert oldSupport + 1 == existingLink.getNumDatasetsSupporting();
assert existingLink.getSupportDetails().getNumIds() == oldSupport + 1;
// track so we add corresponding Experiment-level links later.
linkIds.put(existingLink.getId(), new NonPersistentNonOrderedCoexpLink(existingLink));
seenExistingLinks.add(existingLink.getId());
/*
* The flipped link is asserted to be in the database. The support details is already dealt with; we
* just have to update the support value.
*/
int numFlippedUpdated = updateFlippedLinkQuery.setParameter("s", existingLink.getNumDatasetsSupporting()).setParameter("g2", proposedG2G.getSecondGene()).setParameter("g1", proposedG2G.getFirstGene()).setParameter("po", proposedG2G.isPositiveCorrelation() ? 1 : 0).executeUpdate();
assert numFlippedUpdated == 1 : "Flipped link missing for " + proposedG2G + " [" + numFlippedUpdated + "]";
numUpdated++;
if (CoexpressionDaoImpl.log.isDebugEnabled())
CoexpressionDaoImpl.log.debug("Updated: " + proposedG2G);
}
genesWithUpdatedData.add(firstGene);
genesWithUpdatedData.add(secondGene);
if (++progress % 5000 == 0) {
CoexpressionDaoImpl.log.info("Processed " + progress + "/" + links.size() + " gene-level links..." + numUpdated + " updated, " + numNew + " new");
}
// A full batch of new links is saved (saveBatchAndMakeFlipped also flushes and clears the session and empties
// the batch); otherwise flush/clear when the number of updates is a multiple of the batch size.
// NOTE(review): numUpdated does not change while only new links are seen, so the second branch can fire on
// consecutive iterations once numUpdated sits on a multiple of BATCH_SIZE.
if (batchToCreate.size() >= BATCH_SIZE) {
newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
} else if (numUpdated > 0 && numUpdated % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
// tail end batch
if (!batchToCreate.isEmpty()) {
// we make the flipped links later to optimize their ordering.
newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
}
// flush the updated ones one last time...
if (numUpdated > 0) {
sess.flush();
sess.clear();
}
// every input link is expected to have an id recorded by now, whether it was updated or newly saved
assert links.size() == linkIds.size();
CoexpressionDaoImpl.log.info(numUpdated + " updated, " + numNew + " new links");
/*
* sort and save the accumulated new flipped versions of the new links, which reuse the supportDetails. In the
* flipped links, the first gene is the second gene and vice versa. Continue to accumulate the flipped links.
*/
CoexpressionDaoImpl.log.info("Saving " + newFlippedLinks.size() + " flipped versions of new links ...");
// order the flipped links by their first gene before saving
Collections.sort(newFlippedLinks, new Comparator<Gene2GeneCoexpression>() {
@Override
public int compare(Gene2GeneCoexpression o1, Gene2GeneCoexpression o2) {
return o1.getFirstGene().compareTo(o2.getFirstGene());
}
});
progress = 0;
for (Gene2GeneCoexpression gl : newFlippedLinks) {
sess.save(gl);
if (++progress % 5000 == 0) {
CoexpressionDaoImpl.log.info("Processed " + progress + "/" + newFlippedLinks.size() + " new flipped gene-level links...");
}
if (progress % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
/*
* Save experiment-level links
*/
CoexpressionDaoImpl.log.info("Saving " + linkIds.size() + " experiment-level links (plus flipped versions) ...");
this.saveExperimentLevelLinks(sess, c, linkIds, bioAssaySet);
// record which genes were tested in this experiment, if the caller supplied them
if (genesTested != null)
this.updatedTestedIn(bioAssaySet, genesTested);
this.updateGeneCoexpressedWith(links);
// kick anything we updated out of the cache.
int numRemovedFromCache = this.gene2GeneCoexpressionCache.remove(genesWithUpdatedData);
if (numRemovedFromCache > 0)
CoexpressionDaoImpl.log.info(numRemovedFromCache + " results evicted from cache");
// flush happens on commit...
CoexpressionDaoImpl.log.info("Done, flushing changes ...");
}
use of ubic.gemma.model.analysis.expression.coexpression.SupportDetails in project Gemma by PavlidisLab.
the class CoexpressionDaoImpl method initializeFromOldData.
/**
 * Creates stub gene-level links (support of zero) for one gene from the rows of the old link table in which the gene
 * is the first gene, and saves them together with their support details.
 * <p>
 * This assumes that we're going to do this for all genes, so we get links in both directions eventually. We don't
 * have to explicitly make the flipped links here.
 *
 * @param gene       the gene whose old links are converted; its taxon selects the link tables
 * @param geneIdMap  genes by id; expected to contain both genes of every old link that is not skipped
 * @param linksSoFar support details of links created by earlier calls; when the link is found here its support
 *                   details are reused instead of creating new ones. This map is only read.
 * @param skipGenes  ids of genes to ignore (as first or second gene); may be null
 * @return the saved links keyed by their support details, in creation order, or {@code null} if the old table has no
 *         links for the gene (callers rely on the null, so it is kept)
 */
@Override
public Map<SupportDetails, Gene2GeneCoexpression> initializeFromOldData(Gene gene, Map<Long, Gene> geneIdMap, Map<NonPersistentNonOrderedCoexpLink, SupportDetails> linksSoFar, Set<Long> skipGenes) {
    Session sess = this.getSessionFactory().getCurrentSession();
    LinkCreator c = new LinkCreator(gene.getTaxon());
    String geneLinkTableName = CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon());
    // the old table name is derived from the current one by spelling COEX as CO_EX
    String oldGeneLinkTableName = geneLinkTableName.replace("COEX", "CO_EX");
    assert oldGeneLinkTableName.contains("CO_EX");
    final int batchSize = 1024;

    /*
     * Query the old table
     */
    SQLQuery oldLinkQuery = sess.createSQLQuery("select FIRST_GENE_FK, SECOND_GENE_FK, EFFECT from " + oldGeneLinkTableName + " where FIRST_GENE_FK=?");
    List<Object[]> oldLinks = oldLinkQuery.setLong(0, gene.getId()).list();
    if (oldLinks.isEmpty()) {
        return null;
    }

    Map<SupportDetails, Gene2GeneCoexpression> linksToSave = new LinkedHashMap<>();

    /*
     * Make new links.
     */
    Collection<NonPersistentNonOrderedCoexpLink> links = new HashSet<>();
    int numCreated = 0;
    for (Object[] o : oldLinks) {
        Long fgid = ((BigInteger) o[0]).longValue();
        Long sgid = ((BigInteger) o[1]).longValue();
        if (skipGenes != null && (skipGenes.contains(fgid) || skipGenes.contains(sgid))) {
            continue;
        }
        Double eff = (Double) o[2];
        // self-links are not carried over
        if (fgid.equals(sgid)) {
            continue;
        }
        assert geneIdMap.containsKey(fgid);
        assert geneIdMap.containsKey(sgid);

        Gene2GeneCoexpression g2g = c.create(eff, fgid, sgid);
        Gene firstGene = geneIdMap.get(fgid);
        Gene secondGene = geneIdMap.get(sgid);
        boolean positive = eff > 0;

        /*
         * Check if we already have a link like this for the reverse - if so, reuse the supportdetails; the keys of
         * linksSoFar are id-less, so equals() is by genes and direction.
         */
        SupportDetails sd = linksSoFar.get(new NonPersistentNonOrderedCoexpLink(firstGene, secondGene, positive));
        if (sd == null) {
            // we haven't saved the flipped link already so make a new support details.
            sd = c.createSupportDetails(firstGene, secondGene, positive);
            sess.save(sd);
        }

        g2g.setNumDatasetsSupporting(0);
        g2g.setSupportDetails(sd);
        assert sd.getId() != null;
        linksToSave.put(sd, g2g);
        links.add(new NonPersistentNonOrderedCoexpLink(g2g));

        // Flush once per full batch. Pre-increment so the first flush happens after batchSize links rather than
        // right after the very first one.
        if (++numCreated % batchSize == 0) {
            sess.flush();
            sess.clear();
        }
    }

    // Save the links themselves. A separate counter keeps the flush cadence independent of how many links the loop
    // above happened to process.
    int numSaved = 0;
    for (Map.Entry<SupportDetails, Gene2GeneCoexpression> entry : linksToSave.entrySet()) {
        assert entry.getKey().getId() != null;
        sess.save(entry.getValue());
        if (++numSaved % batchSize == 0) {
            sess.flush();
            sess.clear();
        }
    }

    this.updateGeneCoexpressedWith(links);
    return linksToSave;
}
use of ubic.gemma.model.analysis.expression.coexpression.SupportDetails in project Gemma by PavlidisLab.
the class CoexpressionDaoImpl method deleteLinks.
/*
 * Removes the coexpression information contributed by one experiment: the experiment is taken out of the support of
 * every gene-level link it supports, links left without support are deleted or kept depending on
 * DELETE_ORPHAN_LINKS, the experiment-level links are deleted, and the 'tested-in' information is removed.
 *
 * Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
 */
@Override
@Transactional
public void deleteLinks(Taxon t, BioAssaySet experiment) {
    Session session = this.getSessionFactory().getCurrentSession();
    session.setCacheMode(CacheMode.IGNORE);

    CoexpressionDaoImpl.log.info("Fetching any old coexpression ...");
    Collection<Gene2GeneCoexpression> links = this.getCoexpression(t, experiment);

    // links deleted outright; these also have to go from the quick index at the end
    Set<NonPersistentNonOrderedCoexpLink> deletedLinks = new HashSet<>();

    // An empty result is not a reason to return early: the 'tested-in' information might still be there.
    if (!links.isEmpty()) {
        CoexpressionDaoImpl.log.info("Removing coexpression information for " + experiment + "; updating " + links.size() + " links (count includes flipped versions).");

        final int batchSize = 1024;
        int processed = 0;
        int zeroSupportCount = 0;
        Collection<SupportDetails> detailsToDelete = new HashSet<>();
        Collection<SupportDetails> detailsToUpdate = new HashSet<>();
        Collection<Long> touchedGenes = new HashSet<>();

        // Adjust the gene-level links one by one.
        for (Gene2GeneCoexpression link : links) {
            touchedGenes.add(link.getFirstGene());
            touchedGenes.add(link.getSecondGene());

            // A link and its flipped version share one support details object, so decrement it only the first
            // time it is seen.
            SupportDetails details = link.getSupportDetails();
            boolean alreadyHandled = detailsToUpdate.contains(details) || detailsToDelete.contains(details);
            if (!alreadyHandled) {
                // Support can only already be zero if the details were handled before; on first sight it can't.
                assert link.getNumDatasetsSupporting() > 0 : "Support was " + link.getNumDatasetsSupporting() + " for " + link;
                details.removeEntity(experiment.getId());
                assert !details.getIds().contains(experiment.getId());
                detailsToUpdate.add(details);
            }

            link.updateNumDatasetsSupporting();
            assert link.getNumDatasetsSupporting() >= 0;

            boolean unsupported = link.getNumDatasetsSupporting() == 0;
            if (unsupported && CoexpressionDaoImpl.DELETE_ORPHAN_LINKS) {
                session.delete(link);
                // may already be in there via the flipped link; adding again is harmless
                detailsToDelete.add(details);
                // deleted links are removed from the quick index; links that are kept stay in it
                deletedLinks.add(new NonPersistentNonOrderedCoexpLink(link));
            } else {
                // still supported, or unsupported but kept on the presumption it will get filled back in
                session.update(link);
            }
            if (unsupported) {
                zeroSupportCount++;
            }

            processed++;
            if (processed % 10000 == 0) {
                CoexpressionDaoImpl.log.info("Removed support for " + processed + " links for " + experiment + "...");
            }
            if (processed % batchSize == 0) {
                session.flush();
                session.clear();
            }
        }

        session.flush();
        session.clear();

        this.updateModifiedSupportDetails(experiment, detailsToDelete, detailsToUpdate);

        String outcome = CoexpressionDaoImpl.DELETE_ORPHAN_LINKS
                ? " links removed from the system as support dropped to zero."
                : " gene-level links now have support dropped to zero but they were left in place";
        CoexpressionDaoImpl.log.info("Adjusted " + links.size() + " gene-level links supported by the experiment; " + zeroSupportCount + outcome);

        // Remove the experiment-level links.
        Query deleteExperimentLinks = session.createQuery("delete from " + CoexpressionQueryUtils.getExperimentLinkClassName(t) + " where experiment=:ee");
        int numDeleted = deleteExperimentLinks.setParameter("ee", experiment).executeUpdate();
        CoexpressionDaoImpl.log.info("Deleted " + numDeleted + " experiment-level links");

        // Invalidate cached results for every gene that was touched.
        int evicted = gene2GeneCoexpressionCache.remove(touchedGenes);
        if (evicted > 0) {
            CoexpressionDaoImpl.log.info(evicted + " results evicted from cache");
        }
    }

    // Node degree information is NOT redone here; it is refreshed "periodically".
    // The tested-in information always has to be removed, even if there were no links.
    this.removeTestedIn(t, experiment);

    // update our quick index
    if (!deletedLinks.isEmpty()) {
        this.removeCoexpressedWith(deletedLinks);
    }
}
use of ubic.gemma.model.analysis.expression.coexpression.SupportDetails in project Gemma by PavlidisLab.
the class CoexpressionDaoImpl method saveBatchAndMakeFlipped.
/**
 * Persists a batch of <strong>new</strong> links along with their support details, and builds (but does not save)
 * the flipped version of each link. The session is flushed and cleared before returning.
 *
 * @param session session used for saving
 * @param linkIds receives an entry (generated id to link) for every link saved by this call
 * @param batch   the links to save, keyed by their support details; emptied by this call
 * @param c       creates the flipped versions in the appropriate class
 * @return the flipped versions, for the caller to accumulate, sort and save later
 */
private List<Gene2GeneCoexpression> saveBatchAndMakeFlipped(Session session, Map<Long, NonPersistentNonOrderedCoexpLink> linkIds, Map<SupportDetails, Gene2GeneCoexpression> batch, LinkCreator c) {
    StopWatch timer = new StopWatch();
    timer.start();

    List<Gene2GeneCoexpression> flippedLinks = new ArrayList<>();

    // First pass: save the support details and prepare the flipped copies.
    for (Map.Entry<SupportDetails, Gene2GeneCoexpression> entry : batch.entrySet()) {
        // Take the link before saving the support details: saving assigns the id, which changes the hashcode of
        // the map key.
        Gene2GeneCoexpression link = entry.getValue();
        SupportDetails details = entry.getKey();
        assert link != null;

        session.save(details);
        assert details.getNumIds() > 0;

        link.setSupportDetails(details);
        assert details.getNumIds() > 0;
        assert link.getNumDatasetsSupporting() > 0;
        assert link.getSupportDetails().getNumIds() > 0;

        // Copy with the genes swapped; it shares the support details of the original.
        int effectSign = link.isPositiveCorrelation() ? 1 : -1;
        Gene2GeneCoexpression flippedLink = c.create(effectSign, link.getSecondGene(), link.getFirstGene());
        flippedLink.setSupportDetails(link.getSupportDetails());
        flippedLinks.add(flippedLink);

        assert flippedLink.getFirstGene().equals(link.getSecondGene());
        assert flippedLink.getSecondGene().equals(link.getFirstGene());
    }

    // Second pass: save the links themselves and record their generated ids.
    for (Gene2GeneCoexpression link : batch.values()) {
        Long savedId = (Long) session.save(link);
        linkIds.put(savedId, new NonPersistentNonOrderedCoexpLink(link));
    }

    session.flush();
    session.clear();
    batch.clear();

    // only report batches that took more than a second
    if (timer.getTime() > 1000) {
        CoexpressionDaoImpl.log.info("Saved batch: " + timer.getTime() + "ms");
    }

    return flippedLinks;
}
use of ubic.gemma.model.analysis.expression.coexpression.SupportDetails in project Gemma by PavlidisLab.
the class LinkAnalysisPersisterImpl method initializeLinksFromOldData.
/**
 * Creates stub links from the old link data for all genes of a taxon, one gene at a time, skipping genes that have
 * too few old links.
 *
 * @param t the taxon whose genes are processed
 * @throws IllegalStateException if every gene has too few old links
 */
@Override
public void initializeLinksFromOldData(Taxon t) {
    Collection<Gene> genes = geneService.loadAll(t);
    Map<Long, Gene> idMap = EntityUtils.getIdMap(genes);

    /*
     * First count the old links for every gene, and remove genes that have too few. That set of genes has to be
     * passed in to the service so they would be recognized in the second gene. We have to do that counting as a
     * separate step because we need to know ahead of time. This might be more trouble than it is worth...
     */
    LinkAnalysisPersisterImpl.log.info("Counting old links for " + genes.size() + " genes.");
    Map<Gene, Integer> counts = gene2GeneCoexpressionService.countOldLinks(genes);

    final int minOldLinks = 100;
    Set<Long> skipGenes = new HashSet<>();
    for (Map.Entry<Gene, Integer> geneCount : counts.entrySet()) {
        if (geneCount.getValue() < minOldLinks) {
            skipGenes.add(geneCount.getKey().getId());
        }
    }

    if (skipGenes.size() == genes.size()) {
        throw new IllegalStateException("There weren't enough links to bother making any stubs.");
    }

    // Links seen in one direction only so far, with the support details to reuse for the other direction.
    Map<NonPersistentNonOrderedCoexpLink, SupportDetails> linksSoFar = new HashMap<>();
    LinkAnalysisPersisterImpl.log.info("Creating stub links for up to " + genes.size() + " genes; " + skipGenes.size() + " genes will be ignored because they have too few links.");

    int genesProcessed = 0;
    int totalLinks = 0;
    for (Gene gene : genes) {
        Map<SupportDetails, Gene2GeneCoexpression> created = gene2GeneCoexpressionService.initializeLinksFromOldData(gene, idMap, linksSoFar, skipGenes);
        if (created == null || created.isEmpty()) {
            continue;
        }
        totalLinks += created.size();

        /*
         * Keep track of links created so far (ignoring "direction") so we can reuse the supportDetails.
         */
        for (Map.Entry<SupportDetails, Gene2GeneCoexpression> entry : created.entrySet()) {
            SupportDetails details = entry.getKey();
            assert details.getId() != null;
            Gene2GeneCoexpression link = entry.getValue();
            assert link.getId() != null;
            assert link.getSupportDetails() != null && link.getSupportDetails().getId() != null;
            assert details.equals(link.getSupportDetails());

            NonPersistentNonOrderedCoexpLink key = new NonPersistentNonOrderedCoexpLink(link.getFirstGene(), link.getSecondGene(), link.isPositiveCorrelation());
            if (!linksSoFar.containsKey(key)) {
                linksSoFar.put(key, details);
            } else {
                // The link was already recorded from the other direction, so both directions are now done.
                // Removing it will help us free up memory.
                assert details.equals(linksSoFar.get(key));
                linksSoFar.remove(key);
            }
        }

        LinkAnalysisPersisterImpl.log.info(created.size() + " links created for " + gene + ", " + totalLinks + " links created so far.");
        genesProcessed++;
        if (genesProcessed % 500 == 0) {
            LinkAnalysisPersisterImpl.log.info("***** " + genesProcessed + " processed");
        }
    }
}
Aggregations