use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class GeoConverterImpl method processId.
/**
 * Converts one platform element into a {@link CompositeSequence} with an associated minimal
 * {@link BioSequence}, or skips it when strict selection rules it out.
 * <p>
 * With {@code strictSelection} on, an element whose external accession is blank is kept only if the platform
 * has a {@code gene_assignment} column holding a usable value (neither blank nor {@code "---"}) for it.
 * Elements skipped because of an unusable {@code gene_assignment} value are appended to {@code skipped};
 * elements skipped because the column does not exist are not recorded there.
 * <p>
 * Every exit path returns {@code i + 1}, so the caller's index stays aligned with the per-element lists
 * whether or not the element was kept.
 * <p>
 * NOTE(review): skipped elements do not consume an entry from {@code descIter}. If that iterator is parallel
 * to the element ids, descriptions will drift after a skip -- confirm against the caller.
 *
 * @param platform                source platform; supplies column names/data and the id-to-probe-name map
 * @param arrayDesign             design the new composite sequence is attached to; its short name is the key
 *                                into {@code platformDesignElementMap}
 * @param probeOrganismColumn     when {@code null}, the probe takes {@code primaryTaxon}
 * @param externalDb              passed through to {@code setBsProps}
 * @param sequences               passed through to {@code setBsProps}
 * @param probeOrganism           per-element organism values indexed by {@code i}; may be {@code null}
 * @param primaryTaxon            taxon used when {@code probeOrganismColumn} is {@code null}
 * @param cloneIdentifiers        per-element clone identifiers indexed by {@code i}; may be {@code null}
 * @param externalRefs            external reference columns; may be {@code null}
 * @param descIter                source of descriptions, advanced once per kept element; may be {@code null}
 * @param refSeqAccessionPattern  passed through to {@code setBsProps}
 * @param strictSelection         whether elements lacking an external accession may be skipped
 * @param skipped                 receives the ids of elements skipped for an unusable gene assignment
 * @param compositeSequences      receives the composite sequence created for a kept element
 * @param i                       index of this element in the per-element lists
 * @param id                      identifier of this element on the platform
 * @return the index of the next element, i.e. {@code i + 1}
 */
private int processId(GeoPlatform platform, ArrayDesign arrayDesign, String probeOrganismColumn, ExternalDatabase externalDb, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon, List<String> cloneIdentifiers, List<List<String>> externalRefs, Iterator<String> descIter, Pattern refSeqAccessionPattern, boolean strictSelection, List<String> skipped, Collection<CompositeSequence> compositeSequences, int i, String id) {
    String externalAccession = null;
    if (externalRefs != null) {
        externalAccession = this.getExternalAccession(externalRefs, i);
    }

    if (strictSelection && StringUtils.isBlank(externalAccession)) {
        // currently this is crafted to deal with affymetrix exon arrays, but could be expanded.
        // mrna_assignment is less strict than gene_assignment
        // salvage it if it has a gene assignment.
        String filteringColumn = "gene_assignment";
        if (!platform.getColumnNames().contains(filteringColumn)) {
            // we just skip ones that don't have an external accession. The index must still advance:
            // returning it unchanged would make every following element re-read this same row.
            return i + 1;
        }
        String cd = platform.getColumnData(filteringColumn).get(i);
        if (StringUtils.isBlank(cd) || cd.equals("---")) {
            skipped.add(id);
            if (skipped.size() % 10000 == 0) {
                GeoConverterImpl.log.info("Skipped " + skipped.size() + " elements due to strict selection; last was " + id);
            }
            return i + 1;
        }
        // remaining case here: externalAccession is blank, but there is another column that we think saves it.
    }

    String cloneIdentifier = cloneIdentifiers == null ? null : cloneIdentifiers.get(i);

    String description = "";
    if (externalAccession != null) {
        String[] refs = externalAccession.split(",");
        if (refs.length > 1) {
            // keep the full list in the description, but only the first reference is used as the accession
            description = "Multiple external sequence references: " + externalAccession + "; ";
            externalAccession = refs[0];
        }
    }
    if (descIter != null) {
        description = description + " " + descIter.next();
    }

    CompositeSequence cs = CompositeSequence.Factory.newInstance();
    String probeName = platform.getProbeNamesInGemma().get(id);
    if (probeName == null) {
        probeName = id;
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Probe retaining original name: " + probeName);
        }
        // must make sure this is populated.
        platform.getProbeNamesInGemma().put(id, id);
    } else {
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Found probe: " + probeName);
        }
    }
    cs.setName(probeName);
    cs.setDescription(description);
    cs.setArrayDesign(arrayDesign);

    // LMD:1647- If There is a Organism Column given for the probe then set taxon from that overwriting platform
    // if probeOrganismColumn is set but for this probe no taxon do not set probeTaxon and thus create no
    // biosequence
    Taxon probeTaxon = Taxon.Factory.newInstance();
    if (probeOrganism != null && StringUtils.isNotBlank(probeOrganism.get(i))) {
        probeTaxon = this.convertProbeOrganism(probeOrganism.get(i));
    }
    // if there are no probe taxons then all the probes should take the taxon from the primary taxon
    if (probeOrganismColumn == null) {
        probeTaxon = primaryTaxon;
    }

    BioSequence bs = this.createMinimalBioSequence(probeTaxon);
    this.setBsProps(platform, externalDb, sequences, refSeqAccessionPattern, i, id, externalAccession, cloneIdentifier, bs);
    this.checkCs(arrayDesign, externalAccession, cloneIdentifier, cs, probeTaxon, bs);

    compositeSequences.add(cs);
    platformDesignElementMap.get(arrayDesign.getShortName()).put(probeName, cs);
    return i + 1;
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method getRanks.
/**
 * Looks up, for every processed expression data vector of an experiment, the rank stored for its design
 * element.
 * <p>
 * A rank that is {@code null} in the query result is reported as {@link Double#NaN}. For a
 * {@code RankMethod} other than {@code mean} or {@code max} nothing is added to the result.
 *
 * @param expressionExperiment experiment whose id selects the vectors
 * @param method               which stored rank to report ({@code mean} or {@code max})
 * @return map from design element to the selected rank
 */
@Override
public Map<CompositeSequence, Double> getRanks(ExpressionExperiment expressionExperiment, RankMethod method) {
    // language=HQL
    final String hql = "select dedv.designElement, dedv.rankByMean, dedv.rankByMax from ProcessedExpressionDataVector dedv "
            + "where dedv.expressionExperiment.id = :ee";

    List<?> rows = this.getSessionFactory().getCurrentSession()
            .createQuery(hql)
            .setParameter("ee", expressionExperiment.getId())
            .list();

    Map<CompositeSequence, Double> ranksByProbe = new HashMap<>();
    for (Object row : rows) {
        // each row is [designElement, rankByMean, rankByMax], in the order of the select clause
        Object[] columns = (Object[]) row;
        CompositeSequence probe = (CompositeSequence) columns[0];

        Double rankByMean = Double.NaN;
        if (columns[1] != null) {
            rankByMean = (Double) columns[1];
        }
        Double rankByMax = Double.NaN;
        if (columns[2] != null) {
            rankByMax = (Double) columns[2];
        }

        switch (method) {
            case mean:
                ranksByProbe.put(probe, rankByMean);
                break;
            case max:
                ranksByProbe.put(probe, rankByMax);
                break;
            default:
                // any other method contributes nothing
                break;
        }
    }
    return ranksByProbe;
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method maskAndUnpack.
/**
 * Unpacks the preferred data vectors and applies the missing-value masks to them.
 * <p>
 * Wherever a mask holds {@code false}, the corresponding data value is overwritten with
 * {@link Double#NaN}. Every returned vector is flagged as masked, including the cases where no mask data was
 * supplied at all or where an individual vector has no matching mask.
 *
 * @param preferredData    raw vectors to unpack
 * @param missingValueData raw vectors holding the masks; may be empty
 * @return the result of unpacking {@code preferredData}, with masks applied in place
 * @throws IllegalStateException if a mask and its data vector differ in length
 */
private Map<CompositeSequence, DoubleVectorValueObject> maskAndUnpack(Collection<RawExpressionDataVector> preferredData, Collection<RawExpressionDataVector> missingValueData) {
    Map<CompositeSequence, DoubleVectorValueObject> vectorsByProbe = this.unpack(preferredData);

    if (missingValueData.isEmpty()) {
        AbstractDao.log.info("There is no separate missing data information, simply using the data as is");
        for (DoubleVectorValueObject vector : vectorsByProbe.values()) {
            vector.setMasked(true);
        }
        return vectorsByProbe;
    }

    // index the masks by design element so each data vector can find its own
    Map<CompositeSequenceValueObject, BooleanVectorValueObject> maskByProbe = new HashMap<>();
    for (BooleanVectorValueObject mask : this.unpackBooleans(missingValueData)) {
        maskByProbe.put(mask.getDesignElement(), mask);
    }

    boolean alreadyWarned = false;
    for (DoubleVectorValueObject vector : vectorsByProbe.values()) {
        double[] values = vector.getData();
        CompositeSequenceValueObject probe = vector.getDesignElement();
        BooleanVectorValueObject mask = maskByProbe.get(probe);

        if (mask == null) {
            // only the first missing mask is reported
            if (!alreadyWarned && AbstractDao.log.isWarnEnabled()) {
                AbstractDao.log.warn("No mask vector for " + probe + ", additional warnings for missing masks for this job will be skipped");
            }
            // we're missing a mask vector for it for some reason, but still flag it as effectively masked.
            vector.setMasked(true);
            alreadyWarned = true;
        } else {
            boolean[] present = mask.getData();
            if (present.length != values.length) {
                throw new IllegalStateException("Missing value data didn't match data length");
            }
            for (int k = 0; k < values.length; k++) {
                if (present[k]) {
                    continue;
                }
                values[k] = Double.NaN;
            }
            vector.setMasked(true);
        }
    }
    return vectorsByProbe;
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method createProcessedDataVectors.
/**
 * Rebuilds the processed expression data vectors of an experiment from its preferred raw data vectors.
 * <p>
 * Steps, in order: load the experiment through the current session; remove and clear its existing processed
 * vectors; unpack and mask the preferred vectors (missing-value vectors are fetched only when
 * {@code isTwoChannel} reports true); renormalize unless the quantitation type is a ratio or there are too
 * few vectors; build one processed vector per design element; update the experiment through the session and
 * clear the processed data vector cache for it.
 *
 * @param ee experiment to process; must not be {@code null}
 * @return the experiment instance loaded from the current session, carrying the new processed vectors
 * @throws IllegalStateException    if {@code ee} is {@code null}, or a design element is encountered twice
 * @throws IllegalArgumentException if the experiment has no preferred data vectors
 */
@Override
public ExpressionExperiment createProcessedDataVectors(ExpressionExperiment ee) {
    if (ee == null) {
        throw new IllegalStateException("ExpressionExperiment cannot be null");
    }

    ExpressionExperiment expressionExperiment = (ExpressionExperiment) this.getSessionFactory().getCurrentSession()
            .get(ExpressionExperiment.class, ee.getId());
    assert expressionExperiment != null;

    // discard whatever processed vectors exist before building new ones
    this.removeProcessedDataVectors(expressionExperiment);
    Hibernate.initialize(expressionExperiment);
    Hibernate.initialize(expressionExperiment.getQuantitationTypes());
    Hibernate.initialize(expressionExperiment.getProcessedExpressionDataVectors());
    expressionExperiment.getProcessedExpressionDataVectors().clear();

    AbstractDao.log.info("Computing processed expression vectors for " + expressionExperiment);

    Collection<RawExpressionDataVector> missingValueVectors;
    if (this.isTwoChannel(expressionExperiment)) {
        missingValueVectors = this.getMissingValueVectors(expressionExperiment);
    } else {
        missingValueVectors = new HashSet<>();
    }

    Collection<RawExpressionDataVector> preferredDataVectors = this.getPreferredDataVectors(expressionExperiment);
    if (preferredDataVectors.isEmpty()) {
        throw new IllegalArgumentException("No preferred data vectors for " + expressionExperiment);
    }

    Map<CompositeSequence, DoubleVectorValueObject> maskedVectorObjects = this.maskAndUnpack(preferredDataVectors, missingValueVectors);

    // the quantitation type for the processed vectors is derived from an arbitrary preferred vector
    RawExpressionDataVector exemplar = preferredDataVectors.iterator().next();
    QuantitationType maskedQuantitationType = this.getPreferredMaskedDataQuantitationType(exemplar.getQuantitationType());

    /*
     * Note that we used to not normalize count data, but we've removed this restriction; and in any case we have
     * moved to using non-count summaries for the primary data type.
     */
    if (maskedQuantitationType.getType().equals(StandardQuantitationType.COUNT)) {
        /*
         * Backfill target
         */
        AbstractDao.log.warn("Preferred data are counts; please convert to log2cpm");
    }

    if (maskedQuantitationType.getIsRatio() || maskedVectorObjects.size() <= ProcessedExpressionDataVectorDaoImpl.MIN_SIZE_FOR_RENORMALIZATION) {
        AbstractDao.log.info("Normalization skipped for this data set (not suitable)");
    } else {
        AbstractDao.log.info("Normalizing the data");
        this.renormalize(maskedVectorObjects);
    }

    /*
     * Create the vectors. Do a sanity check that we don't have more than we should
     */
    Collection<CompositeSequence> seenDesignElements = new HashSet<>();
    int built = 0;
    for (Map.Entry<CompositeSequence, DoubleVectorValueObject> entry : maskedVectorObjects.entrySet()) {
        CompositeSequence designElement = entry.getKey();
        if (seenDesignElements.contains(designElement)) {
            // defensive programming, this happens.
            throw new IllegalStateException("Duplicated design element: " + designElement + "; make sure the experiment has only one 'preferred' quantitation type. " + "Perhaps you need to run vector merging following an array design switch?");
        }

        ProcessedExpressionDataVector processed = (ProcessedExpressionDataVector) entry.getValue()
                .toDesignElementDataVector(ee, designElement, maskedQuantitationType);
        expressionExperiment.getProcessedExpressionDataVectors().add(processed);
        seenDesignElements.add(designElement);

        built++;
        if (built % 5000 == 0) {
            AbstractDao.log.info(built + " vectors built");
        }
    }

    AbstractDao.log.info("Persisting " + expressionExperiment.getProcessedExpressionDataVectors().size() + " processed data vectors");

    expressionExperiment.getQuantitationTypes().add(maskedQuantitationType);
    expressionExperiment.setNumberOfDataVectors(expressionExperiment.getProcessedExpressionDataVectors().size());
    this.getSessionFactory().getCurrentSession().update(expressionExperiment);
    assert expressionExperiment.getNumberOfDataVectors() != null;

    this.processedDataVectorCache.clearCache(expressionExperiment.getId());
    return expressionExperiment;
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class CompositeSequenceDaoImpl method getGenesWithSpecificity.
/**
 * Builds the composite sequence to {@link BioSequence2GeneProduct} map for the given composite sequences.
 * <p>
 * The input is processed in batches of {@code PROBE_TO_GENE_MAP_BATCH_SIZE}; each batch is handed to
 * {@code batchGetGenesWithSpecificity} together with the shared result map. A timing message is logged only
 * when the elapsed time reported by the stopwatch exceeds 10000.
 *
 * @param compositeSequences composite sequences to look up
 * @return the map passed to {@code batchGetGenesWithSpecificity} for every batch
 */
@Override
public Map<CompositeSequence, Collection<BioSequence2GeneProduct>> getGenesWithSpecificity(Collection<CompositeSequence> compositeSequences) {
    AbstractDao.log.info("Getting cs -> alignment specificity map for " + compositeSequences.size() + " composite sequences");

    Map<CompositeSequence, Collection<BioSequence2GeneProduct>> specificityByProbe = new HashMap<>();
    BatchIterator<CompositeSequence> batches = BatchIterator.batches(compositeSequences, CompositeSequenceDaoImpl.PROBE_TO_GENE_MAP_BATCH_SIZE);

    StopWatch stopWatch = new StopWatch();
    stopWatch.start();

    int processed = 0;
    while (batches.hasNext()) {
        Collection<CompositeSequence> currentBatch = batches.next();
        this.batchGetGenesWithSpecificity(currentBatch, specificityByProbe);
        processed += currentBatch.size();
    }

    stopWatch.stop();

    // only slow runs are worth reporting
    if (stopWatch.getTime() > 10000) {
        AbstractDao.log.info("Probe to gene map finished: " + processed + " retrieved in " + stopWatch.getTime() + "ms");
    }
    return specificityByProbe;
}
Aggregations