use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ArrayDesignAnnotationServiceImpl method generateAnnotationFile.
/**
 * Writes one annotation line per composite sequence (probe) in {@code genesWithSpecificity} and then closes
 * the writer.
 * <p>
 * Three cases are distinguished per probe: no gene products (a line with empty gene fields is written), exactly
 * one gene product (the gene's fields are written directly), and several gene products (symbols, names, ids and
 * NCBI ids are each de-duplicated and joined with {@code |}, and the GO terms of all genes are pooled).
 * <p>
 * The writer is closed when this method exits, whether it returns normally or throws.
 *
 * @param writer               destination for the annotation lines; closed by this method
 * @param genesWithSpecificity probes mapped to their gene product associations; a {@code null} or empty
 *                             collection is treated as "no genes" for that probe
 * @param ty                   passed through to {@code getGoTerms} for each gene
 * @return the number of composite sequences processed
 * @throws IOException if writing a line or closing the writer fails
 */
@Override
public int generateAnnotationFile(Writer writer, Map<CompositeSequence, Collection<BioSequence2GeneProduct>> genesWithSpecificity, OutputType ty) throws IOException {
    int compositeSequencesProcessed = 0;
    int simple = 0;
    int empty = 0;
    int complex = 0;
    // We use LinkedHashSets to keep everything in a predictable order - this is important for the gene symbols,
    // descriptions and NCBI ids (but not important for GO terms). When a probe maps to multiple genes, we list
    // those items for the genes in the same order. There is a feature request to make the order deterministic
    // (i.e., lexicographic sort); this could be done by using little gene objects or similar.
    Collection<OntologyTerm> goTerms = new LinkedHashSet<>();
    Set<String> genes = new LinkedHashSet<>();
    Set<String> geneDescriptions = new LinkedHashSet<>();
    Set<String> geneIds = new LinkedHashSet<>();
    Set<String> ncbiIds = new LinkedHashSet<>();
    // try-with-resources: the original closed the writer only on the success path, leaking it on any exception.
    try (Writer out = writer) {
        Map<Gene, Collection<VocabCharacteristic>> goMappings = this.getGOMappings(genesWithSpecificity);
        for (Map.Entry<CompositeSequence, Collection<BioSequence2GeneProduct>> entry : genesWithSpecificity.entrySet()) {
            CompositeSequence cs = entry.getKey();
            Collection<BioSequence2GeneProduct> geneclusters = entry.getValue();
            if (++compositeSequencesProcessed % 2000 == 0 && ArrayDesignAnnotationServiceImpl.log.isInfoEnabled()) {
                ArrayDesignAnnotationServiceImpl.log.info("Processed " + compositeSequencesProcessed + "/" + genesWithSpecificity.size() + " compositeSequences " + empty + " empty; " + simple + " simple; " + complex + " complex;");
            }
            if (geneclusters == null || geneclusters.isEmpty()) {
                this.writeAnnotationLine(out, cs.getName(), "", "", null, "", "");
                empty++;
                continue;
            }
            if (geneclusters.size() == 1) {
                // common case, do it quickly.
                BioSequence2GeneProduct b2g = geneclusters.iterator().next();
                Gene g = b2g.getGeneProduct().getGene();
                // Keep the helper's result in its own local. Assigning it to the shared goTerms accumulator
                // (as the original did) meant a later multi-gene probe would clear()/addAll() a collection this
                // method does not own - which may be unmodifiable or shared; getGoTerms is not visible here.
                Collection<OntologyTerm> singleGeneTerms = this.getGoTerms(goMappings.get(g), ty);
                String gemmaId = g.getId() == null ? "" : g.getId().toString();
                String ncbiId = g.getNcbiGeneId() == null ? "" : g.getNcbiGeneId().toString();
                this.writeAnnotationLine(out, cs.getName(), g.getOfficialSymbol(), g.getOfficialName(), singleGeneTerms, gemmaId, ncbiId);
                simple++;
                continue;
            }
            // Multi-gene case: reset the reusable accumulators before pooling this probe's genes.
            goTerms.clear();
            genes.clear();
            geneDescriptions.clear();
            geneIds.clear();
            ncbiIds.clear();
            for (BioSequence2GeneProduct bioSequence2GeneProduct : geneclusters) {
                Gene g = bioSequence2GeneProduct.getGeneProduct().getGene();
                genes.add(g.getOfficialSymbol());
                geneDescriptions.add(g.getOfficialName());
                // Guard against a null id, consistent with the single-gene branch and the NCBI id handling below.
                Long geneId = g.getId();
                if (geneId != null) {
                    geneIds.add(geneId.toString());
                }
                Integer ncbiGeneId = g.getNcbiGeneId();
                if (ncbiGeneId != null) {
                    ncbiIds.add(ncbiGeneId.toString());
                }
                goTerms.addAll(this.getGoTerms(goMappings.get(g), ty));
            }
            String geneString = StringUtils.join(genes, "|");
            String geneDescriptionString = StringUtils.join(geneDescriptions, "|");
            String geneIdsString = StringUtils.join(geneIds, "|");
            String ncbiIdsString = StringUtils.join(ncbiIds, "|");
            this.writeAnnotationLine(out, cs.getName(), geneString, geneDescriptionString, goTerms, geneIdsString, ncbiIdsString);
            complex++;
        }
    }
    return compositeSequencesProcessed;
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class AffyProbeNameFilter method filter.
/**
 * Drops rows whose probe name matches one of the enabled Affymetrix naming rules and returns a matrix built
 * from the remaining rows.
 * <p>
 * The name tested is that of the probe's biological characteristic (sequence) when one is present, otherwise
 * the probe's own name. Each rule is only evaluated when its {@code skip_*} flag is set.
 *
 * @param data the matrix to filter
 * @return a new matrix constructed from {@code data} and the design elements that were kept
 */
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    int rowCount = data.rows();
    List<CompositeSequence> kept = new ArrayList<>();
    for (int row = 0; row < rowCount; row++) {
        CompositeSequence probe = data.getDesignElementForRow(row);
        assert probe != null;
        BioSequence bioSequence = probe.getBiologicalCharacteristic();
        String label = bioSequence == null ? probe.getName() : bioSequence.getName();
        // Rules, checked in order with short-circuiting:
        //   "_st"   - 'st' means sense strand
        //   "AFFX"  - control probes
        //   "_f_at" - gene family
        //   "_x_at", "_g_at"
        boolean excluded = (skip_ST && label.contains("_st"))
                || (skip_AFFX && label.contains("AFFX"))
                || (skip_F && label.contains("_f_at"))
                || (skip_X && label.contains("_x_at"))
                || (skip_G && label.contains("_g_at"));
        if (!excluded) {
            kept.add(probe);
        }
    }
    AffyProbeNameFilter.log.info("There are " + kept.size() + " rows left after Affy probe name filtering.");
    return new ExpressionDataDoubleMatrix(data, kept);
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class RowLevelFilter method filter.
/**
 * Keeps the rows whose per-row criterion lies in the interval (low threshold, high threshold] and returns a
 * matrix built from them.
 * <p>
 * If both {@code lowCut} and {@code highCut} are at their sentinel extremes, the input is returned unchanged.
 * The actual thresholds are obtained from {@code getLowThreshold}/{@code getHighThreshold} using a sorted copy
 * of the criteria; when {@code removeAllNegative} is set, the count returned by {@code computeCriteria} is
 * subtracted from the number of considered rows and used as the start index.
 *
 * @param data the matrix to filter
 * @return {@code data} itself when no filtering is requested, otherwise a new matrix of the kept rows
 * @throws IllegalStateException if the computed high threshold is NaN
 * @throws RuntimeException      if the high threshold is not strictly greater than the low threshold
 */
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    boolean noLowerBound = lowCut == -Double.MAX_VALUE;
    boolean noUpperBound = highCut == Double.MAX_VALUE;
    if (noLowerBound && noUpperBound) {
        RowLevelFilter.log.info("No filtering requested");
        return data;
    }

    int rowCount = data.rows();
    DoubleArrayList rowScores = new DoubleArrayList(new double[rowCount]);
    int allNegativeCount = this.computeCriteria(data, rowScores);
    DoubleArrayList orderedScores = rowScores.copy();
    orderedScores.sort();

    // When all-negative rows are to be removed they are excluded from the considered rows and skipped over.
    int firstIndex = removeAllNegative ? allNegativeCount : 0;
    int eligibleRows = rowCount - firstIndex;

    double upperBound = this.getHighThreshold(orderedScores, eligibleRows);
    double lowerBound = this.getLowThreshold(rowCount, orderedScores, eligibleRows, firstIndex);
    if (Double.isNaN(upperBound)) {
        throw new IllegalStateException("High threshold cut is NaN");
    }
    RowLevelFilter.log.debug("Low cut = " + lowerBound);
    RowLevelFilter.log.debug("High cut = " + upperBound);
    if (upperBound <= lowerBound) {
        throw new RuntimeException("High cut " + upperBound + " is lower or same as low cut " + lowerBound);
    }

    List<CompositeSequence> kept = new ArrayList<>();
    for (int row = 0; row < rowCount; row++) {
        double score = rowScores.get(row);
        // The low bound is exclusive and the high bound inclusive (the original note here says zeros should
        // always be removed). A NaN score fails both comparisons and is therefore dropped.
        boolean withinBounds = score > lowerBound && score <= upperBound;
        if (withinBounds) {
            kept.add(data.getDesignElementForRow(row));
        }
    }
    this.logInfo(rowCount, kept);
    return new ExpressionDataDoubleMatrix(data, kept);
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class ArrayDesignProbeRenamerCli method rename.
/**
 * Renames the composite sequences of an array design according to an old-name to new-name mapping parsed from
 * the given stream, then updates the array design through {@code arrayDesignService}.
 * <p>
 * Each renamed composite sequence gets a note recording its previous name appended to its description (or set
 * as its description when the existing one is blank).
 *
 * @param arrayDesign the array design whose composite sequences are renamed
 * @param newIdFile   stream handed to {@code parseIdFile} to obtain the mapping
 * @throws RuntimeException wrapping the {@link IOException} if the mapping cannot be parsed
 */
private void rename(ArrayDesign arrayDesign, InputStream newIdFile) {
    Map<String, String> renamings;
    try {
        renamings = this.parseIdFile(newIdFile);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    AbstractCLI.log.info(renamings.size() + " potential renaming items read");

    int renamed = 0;
    for (CompositeSequence probe : arrayDesign.getCompositeSequences()) {
        String previousName = probe.getName();
        if (!renamings.containsKey(previousName)) {
            continue;
        }
        // Record the old name in the description before it is overwritten.
        String note = " [Renamed by Gemma from " + previousName + "]";
        String existingDescription = probe.getDescription();
        if (StringUtils.isNotBlank(existingDescription)) {
            probe.setDescription(existingDescription + note);
        } else {
            probe.setDescription(note);
        }
        probe.setName(renamings.get(previousName));
        renamed++;
        if (renamed % 2000 == 0) {
            AbstractCLI.log.info("Renamed " + renamed + " composite sequences, last to be renamed was " + probe);
        }
    }
    arrayDesignService.update(arrayDesign);
}
use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.
the class SVDServiceHelperImpl method getTopLoadedVectors.
/**
 * Retrieves the processed expression vectors for the top-loaded probes of a principal component.
 * <p>
 * An empty map is returned when the experiment has no PCA, when no top-loaded probes are available, or when no
 * vectors come back for those probes. Vectors whose design element is not among the top-loaded probes are
 * skipped.
 *
 * @param ee        the experiment
 * @param component passed to {@code getTopLoadedProbes}
 * @param count     passed to {@code getTopLoadedProbes}; asserted to bound the number of distinct probes
 * @return a map from probe loading to its vector, with the vector's rank set from the loading rank and its
 *         experiment set to a value object for {@code ee}
 */
@Override
public Map<ProbeLoading, DoubleVectorValueObject> getTopLoadedVectors(ExpressionExperiment ee, int component, int count) {
    PrincipalComponentAnalysis analysis = principalComponentAnalysisService.loadForExperiment(ee);
    Map<ProbeLoading, DoubleVectorValueObject> vectorsByLoading = new HashMap<>();
    if (analysis == null) {
        return vectorsByLoading;
    }

    List<ProbeLoading> loadings = principalComponentAnalysisService.getTopLoadedProbes(ee, component, count);
    if (loadings == null) {
        SVDServiceHelperImpl.log.warn("No probes?");
        return vectorsByLoading;
    }

    // Index the loadings by probe id (insertion-ordered) and collect the probes for the vector lookup.
    Map<Long, ProbeLoading> loadingByProbeId = new LinkedHashMap<>();
    Set<CompositeSequence> probeSet = new HashSet<>();
    for (ProbeLoading loading : loadings) {
        CompositeSequence probe = loading.getProbe();
        loadingByProbeId.put(probe.getId(), loading);
        probeSet.add(probe);
    }
    if (loadingByProbeId.isEmpty()) {
        return vectorsByLoading;
    }
    assert loadingByProbeId.size() <= count;

    Collection<ExpressionExperiment> experiments = new HashSet<>();
    experiments.add(ee);
    Collection<DoubleVectorValueObject> vectors = processedExpressionDataVectorService.getProcessedDataArraysByProbe(experiments, probeSet);
    if (vectors.isEmpty()) {
        SVDServiceHelperImpl.log.warn("No vectors came back from the call; check the Gene2CS table?");
        return vectorsByLoading;
    }

    // Note (from the original): the vectors might have come from a cache.
    BioAssayDimension dimension = analysis.getBioAssayDimension();
    assert dimension != null;
    assert !dimension.getBioAssays().isEmpty();

    for (DoubleVectorValueObject vector : vectors) {
        ProbeLoading loading = loadingByProbeId.get(vector.getDesignElement().getId());
        // A missing loading is okay and expected, because the lookup goes through the genes: the vector
        // belongs to another probe for a gene that _was_ highly loaded, so it is skipped.
        if (loading != null) {
            assert dimension.getBioAssays().size() == vector.getData().length;
            vector.setRank(loading.getLoadingRank().doubleValue());
            vector.setExpressionExperiment(new ExpressionExperimentValueObject(ee));
            vectorsByLoading.put(loading, vector);
        }
    }

    if (vectorsByLoading.isEmpty()) {
        SVDServiceHelperImpl.log.warn("No results, something went wrong; there were " + vectors.size() + " vectors to start but they all got filtered out.");
    }
    return vectorsByLoading;
}
Aggregations