Use of ubic.gemma.model.association.coexpression.Gene2GeneCoexpression in project Gemma by PavlidisLab.
The class CoexpressionDaoImpl, method createOrUpdate.
/*
* Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
*
*/
@Override
public void createOrUpdate(BioAssaySet bioAssaySet, List<NonPersistentNonOrderedCoexpLink> links, LinkCreator c, Set<Gene> genesTested) {
// assumption is that these are _all_ the links for this experiment
assert !links.isEmpty();
assert bioAssaySet != null;
assert c != null;
Collections.sort(links);
Session sess = this.getSessionFactory().getCurrentSession();
sess.setCacheMode(CacheMode.IGNORE);
// to determine the species
Gene gene = (Gene) sess.get(Gene.class, links.iterator().next().getFirstGene());
String geneLinkClassName = CoexpressionQueryUtils.getGeneLinkClassName(gene);
/*
* Check that there are no links for this experiment.
*/
if (this.countLinks(gene.getTaxon(), bioAssaySet) > 0) {
throw new IllegalStateException("There are already links for given bioAssaySet; they must be deleted before proceeding");
}
/*
* Attempt to save database trips
*/
Map<NonPersistentNonOrderedCoexpLink, Boolean> existingResults = this.preFetch(links);
Query q = sess.createQuery("from " + geneLinkClassName + " where firstGene =:f and secondGene=:s and positiveCorrelation=:pc");
SQLQuery updateFlippedLinkQuery = sess.createSQLQuery("UPDATE " + CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon()) + " SET SUPPORT=:s WHERE FIRST_GENE_FK=:g2 AND SECOND_GENE_FK=:g1 AND POSITIVE=:po");
// map of link id -> link, for establishing the EE-level links; a TreeMap keeps them in order for this experiment.
TreeMap<Long, NonPersistentNonOrderedCoexpLink> linkIds = new TreeMap<>();
// for sanity checks.
Set<Long> seenExistingLinks = new HashSet<>();
// for sanity checks.
Set<NonPersistentNonOrderedCoexpLink> seenNewLinks = new HashSet<>();
// for sanity checks.
Set<SupportDetails> seenNewSupportDetails = new HashSet<>();
int numNew = 0;
int numUpdated = 0;
int progress = 0;
// make a multiple of jdbc batch size...
int BATCH_SIZE = 1024;
Map<SupportDetails, Gene2GeneCoexpression> batchToCreate = new LinkedHashMap<>();
List<Gene2GeneCoexpression> newFlippedLinks = new ArrayList<>();
Set<Long> genesWithUpdatedData = new HashSet<>();
sess.flush();
sess.clear();
// for each link see if there is already an entry; make a new one if necessary or update the old one.
CoexpressionDaoImpl.log.info("Starting link processing");
for (NonPersistentNonOrderedCoexpLink proposedG2G : links) {
Long firstGene = proposedG2G.getFirstGene();
Long secondGene = proposedG2G.getSecondGene();
// There is an index for f+s, but querying one-at-a-time is going to be slow. I attempted to speed it up by
// fetching all links for a gene when we see it, but this causes problems with data being stale. Prefetching
// with just the ability to tell if a link is new or not takes a lot of memory and doesn't speed things up
// much. One thing to try is keeping an index of which links a gene has, so we know whether we need to check the database
// or not.
//
// Currently it takes about 1 minute to process 10k links on a relatively small database, much of this is
// the findLink call.
Gene2GeneCoexpression existingLink = this.findLink(q, proposedG2G, existingResults);
if (existingLink == null) {
// initialize the supportdetails
SupportDetails sd = c.createSupportDetails(firstGene, secondGene, proposedG2G.isPositiveCorrelation());
sd.addEntity(bioAssaySet.getId());
assert sd.getNumIds() > 0;
assert sd.isIncluded(bioAssaySet.getId());
// Must be unique
assert !seenNewSupportDetails.contains(sd) : "Already saw " + sd + " while processing " + proposedG2G;
assert proposedG2G.getLink() != null;
batchToCreate.put(sd, proposedG2G.getLink());
if (seenNewLinks.contains(proposedG2G)) {
CoexpressionDaoImpl.log.warn("The data passed had the same new link represented more than once: " + proposedG2G);
continue;
}
seenNewSupportDetails.add(sd);
seenNewLinks.add(proposedG2G);
if (CoexpressionDaoImpl.log.isDebugEnabled())
CoexpressionDaoImpl.log.debug("New: " + proposedG2G);
numNew++;
} else {
// Sanity check. If this happens, there must be two versions of the same link already in the input.
if (seenExistingLinks.contains(existingLink.getId())) {
throw new IllegalStateException("The data passed had the same existing link represented more than once: " + existingLink);
}
/* sanity check that we aren't adding the dataset twice; we might be able to make this an assertion instead. */
if (existingLink.isSupportedBy(bioAssaySet)) {
throw new IllegalStateException("Support for this experiment already exists for " + existingLink + ", must be deleted first");
}
// cache old support for sanity check
int oldSupport = existingLink.getSupportDetails().getNumIds();
// update the support
existingLink.getSupportDetails().addEntity(bioAssaySet.getId());
existingLink.updateNumDatasetsSupporting();
// there is no cascade... on purpose.
sess.update(existingLink.getSupportDetails());
assert oldSupport + 1 == existingLink.getNumDatasetsSupporting();
assert existingLink.getSupportDetails().getNumIds() == oldSupport + 1;
// track so we add corresponding Experiment-level links later.
linkIds.put(existingLink.getId(), new NonPersistentNonOrderedCoexpLink(existingLink));
seenExistingLinks.add(existingLink.getId());
/*
* The flipped link is asserted to be in the database. The support details have already been dealt with; we
* just have to update the support value.
*/
int numFlippedUpdated = updateFlippedLinkQuery.setParameter("s", existingLink.getNumDatasetsSupporting()).setParameter("g2", proposedG2G.getSecondGene()).setParameter("g1", proposedG2G.getFirstGene()).setParameter("po", proposedG2G.isPositiveCorrelation() ? 1 : 0).executeUpdate();
assert numFlippedUpdated == 1 : "Flipped link missing for " + proposedG2G + " [" + numFlippedUpdated + "]";
numUpdated++;
if (CoexpressionDaoImpl.log.isDebugEnabled())
CoexpressionDaoImpl.log.debug("Updated: " + proposedG2G);
}
genesWithUpdatedData.add(firstGene);
genesWithUpdatedData.add(secondGene);
if (++progress % 5000 == 0) {
CoexpressionDaoImpl.log.info("Processed " + progress + "/" + links.size() + " gene-level links..." + numUpdated + " updated, " + numNew + " new");
}
if (batchToCreate.size() >= BATCH_SIZE) {
newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
} else if (numUpdated > 0 && numUpdated % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
// tail end batch
if (!batchToCreate.isEmpty()) {
// we make the flipped links later to optimize their ordering.
newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
}
// flush the updated ones one last time...
if (numUpdated > 0) {
sess.flush();
sess.clear();
}
assert links.size() == linkIds.size();
CoexpressionDaoImpl.log.info(numUpdated + " updated, " + numNew + " new links");
/*
* sort and save the accumulated new flipped versions of the new links, which reuse the supportDetails. In the
* flipped links, the first gene is the second gene and vice versa. Continue to accumulate the flipped links.
*/
CoexpressionDaoImpl.log.info("Saving " + newFlippedLinks.size() + " flipped versions of new links ...");
Collections.sort(newFlippedLinks, new Comparator<Gene2GeneCoexpression>() {
@Override
public int compare(Gene2GeneCoexpression o1, Gene2GeneCoexpression o2) {
return o1.getFirstGene().compareTo(o2.getFirstGene());
}
});
progress = 0;
for (Gene2GeneCoexpression gl : newFlippedLinks) {
sess.save(gl);
if (++progress % 5000 == 0) {
CoexpressionDaoImpl.log.info("Processed " + progress + "/" + newFlippedLinks.size() + " new flipped gene-level links...");
}
if (progress % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
/*
* Save experiment-level links
*/
CoexpressionDaoImpl.log.info("Saving " + linkIds.size() + " experiment-level links (plus flipped versions) ...");
this.saveExperimentLevelLinks(sess, c, linkIds, bioAssaySet);
if (genesTested != null)
this.updatedTestedIn(bioAssaySet, genesTested);
this.updateGeneCoexpressedWith(links);
// kick anything we updated out of the cache.
int numRemovedFromCache = this.gene2GeneCoexpressionCache.remove(genesWithUpdatedData);
if (numRemovedFromCache > 0)
CoexpressionDaoImpl.log.info(numRemovedFromCache + " results evicted from cache");
// flush happens on commit...
CoexpressionDaoImpl.log.info("Done, flushing changes ...");
}
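The batching discipline above, saving or updating entities and then calling flush() and clear() on the Session every BATCH_SIZE operations, is what keeps Hibernate's first-level cache from growing without bound during a large load. Below is a minimal sketch of that pattern on its own, assuming only a Hibernate Session; the helper class name and the untyped entity list are hypothetical stand-ins, not Gemma code.

import org.hibernate.Session;
import java.util.List;

// Minimal sketch of the flush/clear batching pattern used in createOrUpdate.
public class BatchSaveSketch {

    // ideally a multiple of the JDBC batch size (hibernate.jdbc.batch_size)
    private static final int BATCH_SIZE = 1024;

    public static void saveInBatches(Session sess, List<?> entities) {
        int count = 0;
        for (Object entity : entities) {
            sess.save(entity);
            if (++count % BATCH_SIZE == 0) {
                // push pending SQL to the database and detach everything,
                // so the session's first-level cache stays small
                sess.flush();
                sess.clear();
            }
        }
        // tail-end flush for the last partial batch
        sess.flush();
        sess.clear();
    }
}

Keeping BATCH_SIZE a multiple of the configured JDBC batch size, as the comment in the method above suggests, lets Hibernate group the generated statements efficiently.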
Use of ubic.gemma.model.association.coexpression.Gene2GeneCoexpression in project Gemma by PavlidisLab.
The class CoexpressionDaoImpl, method initializeFromOldData.
/*
* This assumes that we're going to do this for all genes, so we get links in both directions eventually. We don't
* have to explicitly make the flipped links here.
*/
@Override
public Map<SupportDetails, Gene2GeneCoexpression> initializeFromOldData(Gene gene, Map<Long, Gene> geneIdMap, Map<NonPersistentNonOrderedCoexpLink, SupportDetails> linksSoFar, Set<Long> skipGenes) {
Session sess = this.getSessionFactory().getCurrentSession();
LinkCreator c = new LinkCreator(gene.getTaxon());
String geneLinkTableName = CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon());
String oldGeneLinkTableName = geneLinkTableName.replace("COEX", "CO_EX");
assert oldGeneLinkTableName.contains("CO_EX");
int BATCH_SIZE = 1024;
/*
* Query the old table
*/
SQLQuery oldLinkQuery = sess.createSQLQuery("select FIRST_GENE_FK, SECOND_GENE_FK, EFFECT from " + oldGeneLinkTableName + " where FIRST_GENE_FK=?");
List<Object[]> oldLinks = oldLinkQuery.setLong(0, gene.getId()).list();
if (oldLinks.isEmpty()) {
return null;
}
Map<SupportDetails, Gene2GeneCoexpression> linksToSave = new LinkedHashMap<>();
/*
* Make new links.
*/
Collection<NonPersistentNonOrderedCoexpLink> links = new HashSet<>();
int i = 0;
for (Object[] o : oldLinks) {
Long fgid = ((BigInteger) o[0]).longValue();
Long sgid = ((BigInteger) o[1]).longValue();
if (skipGenes != null && (skipGenes.contains(fgid) || skipGenes.contains(sgid))) {
continue;
}
Double eff = (Double) o[2];
if (fgid.equals(sgid)) {
continue;
}
assert geneIdMap.containsKey(fgid);
assert geneIdMap.containsKey(sgid);
Gene2GeneCoexpression g2g = c.create(eff, fgid, sgid);
/*
* Check if we already have a link like this for the reverse - if so, reuse the supportdetails; the keys of
* linksSoFar are id-less, so equals() is by genes and direction.
*/
SupportDetails sdOfFlipped = linksSoFar.get(new NonPersistentNonOrderedCoexpLink(geneIdMap.get(fgid), geneIdMap.get(sgid), eff > 0));
SupportDetails sd;
if (sdOfFlipped != null) {
sd = sdOfFlipped;
} else {
// we haven't saved the flipped link already so make a new support details.
sd = c.createSupportDetails(geneIdMap.get(fgid), geneIdMap.get(sgid), eff > 0);
sess.save(sd);
}
g2g.setNumDatasetsSupporting(0);
g2g.setSupportDetails(sd);
assert sd.getId() != null;
linksToSave.put(sd, g2g);
links.add(new NonPersistentNonOrderedCoexpLink(g2g));
if (i++ % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
for (SupportDetails sd : linksToSave.keySet()) {
assert sd.getId() != null;
sess.save(linksToSave.get(sd));
if (i++ % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
this.updateGeneCoexpressedWith(links);
return linksToSave;
}
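The linksSoFar lookup above works because the keys are id-less and compare only by the two genes and the correlation sign, so the a->b and b->a versions of a link resolve to the same SupportDetails. The class below is a hypothetical illustration of that kind of order-insensitive key; it is a sketch of the idea, not the actual NonPersistentNonOrderedCoexpLink implementation.

import java.util.Objects;

// Order-insensitive key: equals()/hashCode() depend on the two gene ids (in either order)
// plus the correlation sign, so both orientations of a link map to the same entry.
public final class UnorderedLinkKey {

    private final long lowGeneId;
    private final long highGeneId;
    private final boolean positive;

    public UnorderedLinkKey(long geneA, long geneB, boolean positive) {
        // normalize the order so (a, b) and (b, a) produce the same key
        this.lowGeneId = Math.min(geneA, geneB);
        this.highGeneId = Math.max(geneA, geneB);
        this.positive = positive;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o)
            return true;
        if (!(o instanceof UnorderedLinkKey))
            return false;
        UnorderedLinkKey other = (UnorderedLinkKey) o;
        return lowGeneId == other.lowGeneId && highGeneId == other.highGeneId && positive == other.positive;
    }

    @Override
    public int hashCode() {
        return Objects.hash(lowGeneId, highGeneId, positive);
    }
}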
Use of ubic.gemma.model.association.coexpression.Gene2GeneCoexpression in project Gemma by PavlidisLab.
The class CoexpressionDaoImpl, method loadAndConvertLinks.
/**
* Load links given their ids (e.g. retrieved from the EE link tables). This is predicted to be slow when fetching
* many links, because of random seeks in the g2g table.
*
* @param t taxon
* @param linkIds ids of links to fetch; should be unique. Can already be stringency-filtered to some extent, but this will be
* checked again.
* @param queryGenes can be null if the query was unconstrained
* @param quick if true, the 'testedin' details will not be populated.
* @return map
*/
private Map<Long, List<CoexpressionValueObject>> loadAndConvertLinks(Taxon t, List<Long> linkIds, Collection<Long> queryGenes, boolean quick) {
assert !linkIds.isEmpty();
/*
* Note that we are not checking the cache here, though we could, by getting the firstGene from the EE-level links.
*/
Query q = this.getSessionFactory().getCurrentSession().createQuery("from " + CoexpressionQueryUtils.getGeneLinkClassName(t) + " g2g join fetch g2g.supportDetails where g2g.id in (:ids)");
/*
* It is possible that we are retrieving the same underlying link twice - in the a-b and b-a orientations. Those
* have to be merged. This is taken care of in convertToValueObjects.
*/
int BATCH_SIZE = 1024;
// more efficient querying.
Collections.sort(linkIds);
BatchIterator<Long> idBatches = BatchIterator.batches(linkIds, BATCH_SIZE);
StopWatch timer = new StopWatch();
timer.start();
List<Gene2GeneCoexpression> rawResults = new ArrayList<>();
while (idBatches.hasNext()) {
rawResults.addAll(q.setParameterList("ids", idBatches.next()).list());
}
if (rawResults.isEmpty()) {
CoexpressionDaoImpl.log.warn("Ids were invalid: no results for linkIds including " + linkIds.get(0));
return new HashMap<>();
} else if (rawResults.size() < linkIds.size() && rawResults.size() < new HashSet<>(linkIds).size()) {
// maybe linkIds has repeats?
CoexpressionDaoImpl.log.warn("Some ids were invalid, only got " + rawResults.size() + ", expected " + linkIds.size() + " results");
}
if (timer.getTime() > 2000) {
CoexpressionDaoImpl.log.info("Load and convert " + rawResults.size() + " links: " + timer.getTime() + "ms");
}
Map<Long, List<CoexpressionValueObject>> results = this.convertToValueObjects(rawResults, queryGenes);
for (Long g : results.keySet()) {
if (!quick) {
assert queryGenes == null || queryGenes.contains(g);
this.populateTestedInDetails(results.get(g));
}
}
return results;
}
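The id handling above, sorting the ids and then querying them in IN-clause chunks of BATCH_SIZE, can be expressed without BatchIterator as well. The sketch below shows the same pattern with a plain subList loop; the Fetcher callback stands in for the Hibernate query and is an assumption made for the example, not a Gemma interface.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Sketch of the id-batching in loadAndConvertLinks: sort, then fetch in chunks.
public class IdBatchSketch {

    private static final int BATCH_SIZE = 1024;

    public interface Fetcher<T> {
        List<T> fetchByIds(List<Long> ids);
    }

    public static <T> List<T> loadInBatches(List<Long> ids, Fetcher<T> fetcher) {
        // sorting first means each batch covers a mostly contiguous id range ("more efficient querying")
        Collections.sort(ids);
        List<T> results = new ArrayList<>();
        for (int start = 0; start < ids.size(); start += BATCH_SIZE) {
            int end = Math.min(start + BATCH_SIZE, ids.size());
            results.addAll(fetcher.fetchByIds(ids.subList(start, end)));
        }
        return results;
    }
}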
Use of ubic.gemma.model.association.coexpression.Gene2GeneCoexpression in project Gemma by PavlidisLab.
The class CoexpressionDaoImpl, method convertToValueObjects.
/**
* Remove duplicates and convert to value objects. Links are marked as "interQuery" if geneIds is non-null and
* the link is between two of the query genes.
*
* @param rawResults from the database. The support details might not have been fetched.
* @param geneIds gene IDs used in the query, can be null
* @return value objects, organized by the "first" gene of each entity. Note: For some query genes, we might not
* have gotten any results.
*/
private Map<Long, List<CoexpressionValueObject>> convertToValueObjects(List<Gene2GeneCoexpression> rawResults, Collection<Long> geneIds) {
int removed = 0;
Set<NonPersistentNonOrderedCoexpLink> allSeen = new HashSet<>(rawResults.size());
// raw results from db.
Map<Long, List<CoexpressionValueObject>> results = new HashMap<>();
int numUnsupported = 0;
for (Gene2GeneCoexpression g2g : rawResults) {
if (g2g.getNumDatasetsSupporting() == 0) {
throw new IllegalArgumentException("Links should not be unsupported: " + g2g);
}
Long queryGeneId = g2g.getFirstGene();
if (geneIds != null && !geneIds.contains(queryGeneId)) {
continue;
}
NonPersistentNonOrderedCoexpLink seen = new NonPersistentNonOrderedCoexpLink(g2g);
/*
* remove duplicates, since each link can be here twice (x->y and y->x). This can happen; + and - links are
* counted separately.
*/
if (allSeen.contains(seen)) {
++removed;
continue;
}
allSeen.add(seen);
if (!results.containsKey(queryGeneId)) {
results.put(queryGeneId, new ArrayList<CoexpressionValueObject>());
}
CoexpressionValueObject g2gvo = new CoexpressionValueObject(g2g);
assert g2gvo.getNumDatasetsSupporting() > 0;
results.get(queryGeneId).add(g2gvo);
if (geneIds != null && geneIds.contains(g2gvo.getCoexGeneId())) {
g2gvo.setInterQueryLink(true);
}
}
if (removed > 0)
CoexpressionDaoImpl.log.debug("Removed " + removed + " duplicate links");
// noinspection ConstantConditions // Can change
if (numUnsupported > 0)
CoexpressionDaoImpl.log.info("Removed " + numUnsupported + " links that had support of zero.");
if (results.isEmpty())
throw new IllegalStateException("Removed everything! (of" + rawResults.size() + " results)");
return results;
}
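The duplicate-removal step above hinges on NonPersistentNonOrderedCoexpLink treating the x->y and y->x orientations of a link as equal. The following tiny, self-contained demonstration of that idea uses made-up (firstGene, secondGene, sign) triples; it illustrates the technique only and is not Gemma code.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Demonstration of orientation-insensitive de-duplication.
public class DedupSketch {
    public static void main(String[] args) {
        // second entry duplicates the first, just flipped
        long[][] links = { { 1, 2, 1 }, { 2, 1, 1 }, { 1, 3, 0 } };
        Set<List<Long>> seen = new HashSet<>();
        int kept = 0;
        for (long[] link : links) {
            // normalize gene order so both orientations produce the same key
            List<Long> key = Arrays.asList(Math.min(link[0], link[1]), Math.max(link[0], link[1]), link[2]);
            if (!seen.add(key)) {
                continue; // already saw this link in the other orientation
            }
            kept++;
        }
        System.out.println("kept " + kept + " of " + links.length); // prints "kept 2 of 3"
    }
}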
Use of ubic.gemma.model.association.coexpression.Gene2GeneCoexpression in project Gemma by PavlidisLab.
The class CoexpressionDaoImpl, method deleteLinks.
/*
* Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
*
*/
@Override
@Transactional
public void deleteLinks(Taxon t, BioAssaySet experiment) {
Session sess = this.getSessionFactory().getCurrentSession();
sess.setCacheMode(CacheMode.IGNORE);
CoexpressionDaoImpl.log.info("Fetching any old coexpression ...");
Collection<Gene2GeneCoexpression> links = this.getCoexpression(t, experiment);
Set<NonPersistentNonOrderedCoexpLink> toRemove = new HashSet<>();
// even if there are no links, we shouldn't assume we can bail; the 'tested-in' information might be there.
if (!links.isEmpty()) {
CoexpressionDaoImpl.log.info("Removing coexpression information for " + experiment + "; updating " + links.size() + " links (count includes flipped versions).");
// adjust gene-level links
int count = 0;
int numWithZeroSupportLeft = 0;
int BATCH_SIZE = 1024;
Collection<SupportDetails> supportDetailsToDelete = new HashSet<>();
Collection<SupportDetails> supportDetailsToUpdate = new HashSet<>();
Collection<Long> genesAffected = new HashSet<>();
for (Gene2GeneCoexpression g2g : links) {
genesAffected.add(g2g.getFirstGene());
genesAffected.add(g2g.getSecondGene());
// decrement support; details are shared by both links, just update it once!
SupportDetails sd = g2g.getSupportDetails();
if (!supportDetailsToUpdate.contains(sd) && !supportDetailsToDelete.contains(sd)) {
/*
* If we already saw the supportDetails it might already be zero. But if we didn't, it can't.
*/
assert g2g.getNumDatasetsSupporting() > 0 : "Support was " + g2g.getNumDatasetsSupporting() + " for " + g2g;
sd.removeEntity(experiment.getId());
assert !sd.getIds().contains(experiment.getId());
supportDetailsToUpdate.add(sd);
}
g2g.updateNumDatasetsSupporting();
assert g2g.getNumDatasetsSupporting() >= 0;
if (g2g.getNumDatasetsSupporting() == 0) {
/*
* we might still want to keep it, on the presumption that it will get filled back in.
*/
if (CoexpressionDaoImpl.DELETE_ORPHAN_LINKS) {
sess.delete(g2g);
// it might be in here already (flipped), but that's okay.
supportDetailsToDelete.add(sd);
// mark it for removal from the quick index; otherwise we leave it there.
toRemove.add(new NonPersistentNonOrderedCoexpLink(g2g));
} else {
sess.update(g2g);
}
numWithZeroSupportLeft++;
} else {
sess.update(g2g);
}
if (++count % 10000 == 0) {
CoexpressionDaoImpl.log.info("Removed support for " + count + " links for " + experiment + "...");
}
if (count % BATCH_SIZE == 0) {
sess.flush();
sess.clear();
}
}
sess.flush();
sess.clear();
this.updateModifiedSupportDetails(experiment, supportDetailsToDelete, supportDetailsToUpdate);
if (CoexpressionDaoImpl.DELETE_ORPHAN_LINKS) {
CoexpressionDaoImpl.log.info("Adjusted " + links.size() + " gene-level links supported by the experiment; " + numWithZeroSupportLeft + " links removed from the system as support dropped to zero.");
} else {
CoexpressionDaoImpl.log.info("Adjusted " + links.size() + " gene-level links supported by the experiment; " + numWithZeroSupportLeft + " gene-level links now have support dropped to zero but they were left in place");
}
// remove the ExperimentCoexpressionLinks
int numDeleted = sess.createQuery("delete from " + CoexpressionQueryUtils.getExperimentLinkClassName(t) + " where experiment=:ee").setParameter("ee", experiment).executeUpdate();
CoexpressionDaoImpl.log.info("Deleted " + numDeleted + " experiment-level links");
// invalidate the cache.
int numRemovedFromCache = gene2GeneCoexpressionCache.remove(genesAffected);
if (numRemovedFromCache > 0)
CoexpressionDaoImpl.log.info(numRemovedFromCache + " results evicted from cache");
}
// we do NOT redo the node degree information, which will be refreshed "periodically"
// we always have to do this, even if there are no links.
this.removeTestedIn(t, experiment);
// update our quick index
if (!toRemove.isEmpty())
this.removeCoexpressedWith(toRemove);
}
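A central subtlety in deleteLinks is that both orientations of a gene-level link share a single SupportDetails, so the experiment id must be removed from it exactly once, while every link still refreshes its cached support count. The sketch below isolates that pattern with simplified stand-in classes; it is not the Gemma entity model and omits persistence entirely.

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

// Sketch of the "shared support details, update once" handling in deleteLinks.
public class RemoveSupportSketch {

    static class Support {
        final Set<Long> datasetIds = new HashSet<>();
    }

    static class Link {
        final Support support;
        int numDatasetsSupporting;

        Link(Support support) {
            this.support = support;
        }
    }

    public static void removeExperiment(Collection<Link> links, long experimentId) {
        // Support here uses default identity equality, so this set tracks which shared
        // instances have already been adjusted.
        Set<Support> updated = new HashSet<>();
        for (Link link : links) {
            if (updated.add(link.support)) {
                // first time we see this shared support object: remove the experiment from it
                link.support.datasetIds.remove(experimentId);
            }
            // every link refreshes its cached count from the (now possibly smaller) support set
            link.numDatasetsSupporting = link.support.datasetIds.size();
        }
    }
}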