use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ExpressionExperimentPlatformSwitchService method switchExperimentToArrayDesign.
/**
* If you know the arraydesigns are already in a merged state, you should use switchExperimentToMergedPlatform
*
* @param ee ee
* @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be
* changed for them.
*/
public ExpressionExperiment switchExperimentToArrayDesign(ExpressionExperiment ee, ArrayDesign arrayDesign) {
assert arrayDesign != null;
// remove stuff that will be in the way.
processedExpressionDataVectorService.removeProcessedDataVectors(ee);
sampleCoexpressionMatrixService.delete(ee);
for (ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets(ee)) {
subsetService.remove(subset);
}
// get relation between sequence and designelements.
Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>();
Collection<CompositeSequence> elsWithNoSeq = new HashSet<>();
this.populateCSeq(arrayDesign, designElementMap, elsWithNoSeq);
ee = expressionExperimentService.thaw(ee);
ExpressionExperimentPlatformSwitchService.log.info(elsWithNoSeq.size() + " elements on the new platform have no associated sequence.");
designElementMap.put(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq);
boolean multiPlatformPerSample = this.checkMultiPerSample(ee, arrayDesign);
/*
* For a multiplatform-per-sample case: (note that some samples might just be on one platform...)
* 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it).
* 2. Switch vectors to use it - may require adding NaNs and reordering the vectors
* 3. Delete the Bioassays that are using other BADs
*/
/*
* Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that
* includes all BioMaterials.
*/
Collection<BioAssayDimension> unusedBADs = new HashSet<>();
BioAssayDimension maxBAD = null;
int maxSize = 0;
if (multiPlatformPerSample) {
maxBAD = this.doMultiSample(ee, unusedBADs, maxSize);
}
Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>();
for (ArrayDesign oldAd : oldArrayDesigns) {
this.runOldAd(ee, arrayDesign, designElementMap, maxBAD, usedDesignElements, oldAd);
}
ee.setDescription(ee.getDescription() + " [Switched to use " + arrayDesign.getShortName() + " by Gemma]");
helperService.persist(ee, arrayDesign);
/*
* This might need to be done inside the transaction we're using to make the switch.
*/
if (maxBAD != null && !unusedBADs.isEmpty()) {
this.checkUnused(unusedBADs, maxBAD);
}
return ee;
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ExpressionExperimentPlatformSwitchService method runOldAd.
private void runOldAd(ExpressionExperiment ee, ArrayDesign arrayDesign, Map<BioSequence, Collection<CompositeSequence>> designElementMap, BioAssayDimension maxBAD, Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, ArrayDesign oldAd) {
if (oldAd.equals(arrayDesign))
return;
oldAd = arrayDesignService.thaw(oldAd);
if (oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals(TechnologyType.NONE)) {
/*
* Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload.
*/
throw new IllegalStateException(oldAd + " has no elements");
}
Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes(ee, oldAd);
ExpressionExperimentPlatformSwitchService.log.info("Processing " + qts.size() + " quantitation types for vectors on " + oldAd);
for (QuantitationType type : qts) {
// use each design element only once per quantitation type + bioassaydimension per array design
usedDesignElements.clear();
Collection<RawExpressionDataVector> rawForQt = this.getRawVectorsForOneQuantitationType(oldAd, type);
Collection<ProcessedExpressionDataVector> processedForQt = this.getProcessedVectorsForOneQuantitationType(oldAd, type);
if (//
(rawForQt == null || rawForQt.size() == 0) && (processedForQt == null || processedForQt.size() == 0)) {
/*
* This can happen when the quantitation types vary for the array designs.
*/
ExpressionExperimentPlatformSwitchService.log.debug("No vectors for " + type + " on " + oldAd);
continue;
}
// This check assures we do not mix raw and processed vectors further down the line
if ((rawForQt != null && rawForQt.size() > 0) && (processedForQt != null && processedForQt.size() > 0)) {
throw new IllegalStateException("Two types of vector for quantitationType " + type);
}
Collection<DesignElementDataVector> vectors = new HashSet<>();
if (rawForQt != null) {
vectors.addAll(rawForQt);
}
if (processedForQt != null) {
vectors.addAll(processedForQt);
}
ExpressionExperimentPlatformSwitchService.log.info("Switching " + vectors.size() + " vectors for " + type + " from " + oldAd.getShortName() + " to " + arrayDesign.getShortName());
int count = 0;
// noinspection MismatchedQueryAndUpdateOfCollection // Only used for logging
Collection<DesignElementDataVector> unMatched = new HashSet<>();
for (DesignElementDataVector vector : vectors) {
assert RawExpressionDataVector.class.isAssignableFrom(vector.getClass()) : "Unexpected class: " + vector.getClass().getName();
CompositeSequence oldDe = vector.getDesignElement();
if (oldDe.getArrayDesign().equals(arrayDesign)) {
continue;
}
this.processVector(designElementMap, usedDesignElements, vector, maxBAD);
if (++count % 20000 == 0) {
ExpressionExperimentPlatformSwitchService.log.info("Found matches for " + count + " vectors for " + type);
}
}
/*
* This is bad.
*/
if (unMatched.size() > 0) {
throw new IllegalStateException("There were " + unMatched.size() + " vectors that couldn't be matched to the new design for: " + type + ", example: " + unMatched.iterator().next());
}
// Force collection update
if (rawForQt != null && rawForQt.size() > 0) {
int s = ee.getRawExpressionDataVectors().size();
ee.getRawExpressionDataVectors().removeAll(rawForQt);
assert s > ee.getRawExpressionDataVectors().size();
ee.getRawExpressionDataVectors().addAll(rawForQt);
assert s == ee.getRawExpressionDataVectors().size();
} else if (processedForQt != null && processedForQt.size() > 0) {
int s = ee.getProcessedExpressionDataVectors().size();
ee.getProcessedExpressionDataVectors().removeAll(processedForQt);
assert s > ee.getProcessedExpressionDataVectors().size();
ee.getProcessedExpressionDataVectors().addAll(processedForQt);
assert s == ee.getProcessedExpressionDataVectors().size();
}
}
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.
@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException {
if (taxon == null && !ncbiIds) {
throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
}
if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
}
try (BufferedReader b = new BufferedReader(new FileReader(source))) {
String line;
int numSkipped = 0;
ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
arrayDesignService.deleteGeneProductAssociations(arrayDesign);
while ((line = b.readLine()) != null) {
if (StringUtils.isBlank(line)) {
continue;
}
if (line.startsWith("#")) {
continue;
}
String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
if (fields.length != 3) {
throw new IOException("Illegal format, expected three columns, got " + fields.length);
}
String probeId = fields[0];
String seqName = fields[1];
/*
* FIXME. We have to allow NCBI gene ids here.
*/
String geneSymbol = fields[2];
if (StringUtils.isBlank(geneSymbol)) {
numSkipped++;
continue;
}
CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);
if (c == null) {
if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
ArrayDesignProbeMapperServiceImpl.log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
numSkipped++;
continue;
}
// a probe can have more than one gene associated with it if so they are piped |
Collection<Gene> geneListProbe = new HashSet<>();
// indicate multiple genes
Gene geneDetails;
StringTokenizer st = new StringTokenizer(geneSymbol, "|");
while (st.hasMoreTokens()) {
String geneToken = st.nextToken().trim();
if (ncbiIds) {
geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
} else {
geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
}
if (geneDetails != null) {
geneListProbe.add(geneDetails);
}
}
if (geneListProbe.size() == 0) {
ArrayDesignProbeMapperServiceImpl.log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
numSkipped++;
continue;
} else if (geneListProbe.size() > 1) {
// this is a common situation, when the geneSymbol actually has |-separated genes, so no need to
// make a
// lot of fuss.
ArrayDesignProbeMapperServiceImpl.log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
}
BioSequence bs = c.getBiologicalCharacteristic();
if (bs != null) {
if (StringUtils.isNotBlank(seqName)) {
bs = bioSequenceService.thaw(bs);
if (!bs.getName().equals(seqName)) {
ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping");
numSkipped++;
continue;
}
}
// otherwise we assume everything is okay.
} else {
// create one based on the text provided.
if (StringUtils.isBlank(seqName)) {
ArrayDesignProbeMapperServiceImpl.log.warn("You must provide sequence names for probes which are not already mapped. probeName=" + probeId + " had no sequence associated and no name provided; skipping");
numSkipped++;
continue;
}
bs = BioSequence.Factory.newInstance();
bs.setName(seqName);
bs.setTaxon(taxon);
bs.setDescription("Imported from annotation file");
// Placeholder.
bs.setType(SequenceType.OTHER);
bs = bioSequenceService.create(bs);
c.setBiologicalCharacteristic(bs);
compositeSequenceService.update(c);
}
assert bs != null;
assert bs.getId() != null;
for (Gene gene : geneListProbe) {
gene = geneService.thaw(gene);
if (gene.getProducts().size() == 0) {
ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
numSkipped++;
continue;
}
for (GeneProduct gp : gene.getProducts()) {
AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
association.setBioSequence(bs);
association.setGeneProduct(gp);
association.setSource(sourceDB);
annotationAssociationService.create(association);
}
}
}
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
this.deleteOldFiles(arrayDesign);
ArrayDesignProbeMapperServiceImpl.log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
}
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.
@Override
public void processArrayDesign(ArrayDesign arrayDesign, ProbeMapperConfig config, boolean useDB) {
assert config != null;
if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
}
Collection<Taxon> taxa = arrayDesignService.getTaxa(arrayDesign.getId());
Taxon taxon = arrayDesign.getPrimaryTaxon();
if (taxa.size() > 1 && taxon == null) {
throw new IllegalArgumentException("Array design has sequence from multiple taxa and has no primary taxon set: " + arrayDesign);
}
GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(taxon);
BlockingQueue<BACS> persistingQueue = new ArrayBlockingQueue<>(ArrayDesignProbeMapperServiceImpl.QUEUE_SIZE);
AtomicBoolean generatorDone = new AtomicBoolean(false);
AtomicBoolean loaderDone = new AtomicBoolean(false);
this.load(persistingQueue, generatorDone, loaderDone, useDB);
if (useDB) {
ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
arrayDesignService.deleteGeneProductAssociations(arrayDesign);
}
int count = 0;
int hits = 0;
ArrayDesignProbeMapperServiceImpl.log.info("Start processing " + arrayDesign.getCompositeSequences().size() + " probes ...");
for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
if (compositeSequence.getName().equals("1431126_a_at")) {
ArrayDesignProbeMapperServiceImpl.log.debug("HERE");
}
Map<String, Collection<BlatAssociation>> results = this.processCompositeSequence(config, taxon, goldenPathDb, compositeSequence);
if (results == null)
continue;
for (Collection<BlatAssociation> col : results.values()) {
for (BlatAssociation association : col) {
if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
ArrayDesignProbeMapperServiceImpl.log.debug(association);
persistingQueue.add(new BACS(compositeSequence, association));
}
++hits;
}
if (++count % 200 == 0) {
ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences" + " with blat results; " + hits + " mappings found.");
}
}
generatorDone.set(true);
ArrayDesignProbeMapperServiceImpl.log.info("Waiting for loading to complete ...");
while (!loaderDone.get()) {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences with blat results; " + hits + " mappings found.");
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
this.deleteOldFiles(arrayDesign);
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAlignmentServiceImpl method getSequences.
/**
* @param taxon (specified in case array has multiple taxa)
*/
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public static Collection<BioSequence> getSequences(ArrayDesign ad, Taxon taxon) {
Collection<CompositeSequence> compositeSequences = ad.getCompositeSequences();
Collection<BioSequence> sequencesToBlat = new HashSet<>();
int numWithNoBioSequence = 0;
int numWithNoSequenceData = 0;
boolean warned = false;
for (CompositeSequence cs : compositeSequences) {
BioSequence bs = cs.getBiologicalCharacteristic();
if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
warned = true;
ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
}
if (bs == null) {
++numWithNoBioSequence;
if (!warned) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had no associated biosequence object");
}
continue;
}
if (bs.getTaxon() == null) {
warned = true;
ArrayDesignSequenceAlignmentServiceImpl.log.warn("There is no taxon defined for this biosequence ");
continue;
}
// if the taxon is null that means we want this run for all taxa for that array
if (taxon != null && !bs.getTaxon().equals(taxon)) {
continue;
}
// noinspection ConstantConditions // Better readability
if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
warned = true;
ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
}
if (StringUtils.isBlank(bs.getSequence())) {
++numWithNoSequenceData;
if (!warned) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had " + bs + " but no sequence, skipping");
}
continue;
}
sequencesToBlat.add(bs);
}
if (numWithNoBioSequence > 0 || numWithNoSequenceData > 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(numWithNoBioSequence + " composite sequences lacked biosequence associations; " + numWithNoSequenceData + " lacked sequence data ( out of " + compositeSequences.size() + " total).");
}
return sequencesToBlat;
}
Aggregations