use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class DifferentialExpressionAnalysisUtil method filterFactorValuesFromBiomaterials.
/**
 * Restricts the factor values of every biomaterial to those that belong to one of the given experimental factors.
 * <p>
 * The biomaterials are modified in place: each one has its factor values replaced (via {@code setFactorValues})
 * by the filtered set, and the very same instances are returned.
 *
 * @param factors      experimental factors whose factor values are to be retained (asserted to be non-empty)
 * @param biomaterials biomaterials to filter (asserted to be non-empty)
 * @return the given biomaterials, collected in a set, each holding only factor values of the given factors
 * @throws IllegalStateException if not a single factor value was present on any of the biomaterials
 */
private static Collection<BioMaterial> filterFactorValuesFromBiomaterials(Collection<ExperimentalFactor> factors, Collection<BioMaterial> biomaterials) {
    assert !biomaterials.isEmpty();
    assert !factors.isEmpty();

    // Pool the factor values of all requested factors; membership in this pool decides what is kept.
    Collection<FactorValue> retainable = new HashSet<>();
    for (ExperimentalFactor factor : factors) {
        retainable.addAll(factor.getFactorValues());
    }

    Collection<BioMaterial> filtered = new HashSet<>();
    // Counts every factor value seen on any biomaterial, before filtering.
    int factorValuesSeen = 0;
    for (BioMaterial biomaterial : biomaterials) {
        Collection<FactorValue> original = biomaterial.getFactorValues();
        // Work on a copy so the biomaterial's current collection is not altered while being read.
        Collection<FactorValue> kept = new HashSet<>(original);
        factorValuesSeen += original.size();
        kept.retainAll(retainable);
        biomaterial.setFactorValues(kept);
        filtered.add(biomaterial);
    }

    if (factorValuesSeen == 0) {
        throw new IllegalStateException("No biomaterials had any factor values");
    }
    return filtered;
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class DifferentialExpressionAnalysisUtil method generateFactorValuePairings.
/**
 * Builds every pairing of two factor values that belong to different experimental factors.
 * <p>
 * Pairings are unordered sets, so (a, b) and (b, a) are the same pairing and appear only once in the result.
 * Factor values of the same experimental factor are never paired with each other.
 *
 * @param experimentalFactors exp. factors
 * @return A collection of hashSets, where each hashSet is a pairing.
 */
private static Collection<Set<FactorValue>> generateFactorValuePairings(Collection<ExperimentalFactor> experimentalFactors) {
    // Flatten the factor values of all factors into one pool.
    Collection<FactorValue> pool = new HashSet<>();
    for (ExperimentalFactor factor : experimentalFactors) {
        pool.addAll(factor.getFactorValues());
    }

    // Backed by a HashSet, so adding a pairing that is already present is a no-op.
    Collection<Set<FactorValue>> pairings = new HashSet<>();
    for (FactorValue first : pool) {
        for (FactorValue second : pool) {
            boolean sameFactor = second.getExperimentalFactor().equals(first.getExperimentalFactor());
            if (!sameFactor) {
                HashSet<FactorValue> pairing = new HashSet<>();
                pairing.add(first);
                pairing.add(second);
                pairings.add(pairing);
            }
        }
    }
    return pairings;
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method orderBiomaterialsBySortedFactors.
/**
 * Orders biomaterials by the first factor of the list and then, recursively, orders each resulting chunk by the
 * remaining factors.
 * <p>
 * If the first factor is continuous, ordering stops after that factor. If the biomaterials cannot be chunked on
 * the first factor, the input list is returned unchanged.
 *
 * @param start   biomaterials to sort
 * @param factors sorted list of factors to define sort order for biomaterials, cannot be null
 * @return the ordered biomaterials; this may be the {@code start} list itself when no ordering is needed or
 *         possible
 * @throws IllegalArgumentException if {@code start} is empty, or if {@code factors} is null while {@code start}
 *                                  holds more than one biomaterial
 */
private static List<BioMaterial> orderBiomaterialsBySortedFactors(List<BioMaterial> start, List<ExperimentalFactor> factors) {
    // A single biomaterial is trivially ordered; this check deliberately precedes argument validation.
    if (start.size() == 1) {
        return start;
    }
    if (start.size() == 0) {
        throw new IllegalArgumentException("Must provide some biomaterials");
    }
    if (factors == null) {
        throw new IllegalArgumentException("Must provide sorted factors, or at least an empty list");
    }

    // Nothing (left) to order by.
    if (factors.isEmpty()) {
        return start;
    }
    ExperimentalFactor current = factors.get(0);
    if (current == null) {
        return start;
    }

    // Order this chunk by the selected factor.
    Map<FactorValue, List<BioMaterial>> byFactorValue = ExpressionDataMatrixColumnSort.buildFv2BmMap(start);
    List<BioMaterial> sorted = ExpressionDataMatrixColumnSort.orderByFactor(current, byFactorValue, start);

    // A continuous factor ends the ordering: we are ordered only by the first continuous factor.
    if (ExperimentalDesignUtils.isContinuous(current)) {
        assert sorted != null;
        return sorted;
    }

    List<ExperimentalFactor> remaining = new LinkedList<>(factors);
    remaining.remove(current);
    if (remaining.isEmpty()) {
        // No more ordering is necessary.
        return sorted;
    }

    ExpressionDataMatrixColumnSort.log.debug("Factors: " + factors.size());

    // Split into chunks per factor value, retaining the order just established, then recurse into each chunk.
    Map<FactorValue, List<BioMaterial>> chunks = ExpressionDataMatrixColumnSort.chunkOnFactor(current, sorted);
    if (chunks == null) {
        // this means we should bail, gracefully.
        return start;
    }

    List<BioMaterial> result = new ArrayList<>();
    for (List<BioMaterial> chunk : chunks.values()) {
        if (chunk.size() >= 2) {
            result.addAll(ExpressionDataMatrixColumnSort.orderBiomaterialsBySortedFactors(chunk, remaining));
        } else {
            // Zero or one element: nothing to order within this chunk.
            result.addAll(chunk);
        }
    }
    return result;
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method getBaselineLevels.
/**
 * Identify the FactorValue that should be treated as 'Baseline' for each of the given factors. This is done
 * heuristically, and if all else fails we choose arbitrarily. For continuous factors, the minimum value is treated
 * as baseline.
 * <p>
 * For non-continuous factors the choice is made in this order: a forced baseline (which ends the search
 * immediately), otherwise the first factor value recognised as a baseline condition, otherwise an arbitrary factor
 * value (one that is used by the samples, when samples are given).
 *
 * @param samplesUsed These are used to make sure we don't bother using factor values as baselines if they are not
 *                    used by any of the samples. This is important for subsets. If null, this is ignored.
 * @param factors     factors
 * @return map of factors to the baseline factorvalue for that factor.
 * @throws IllegalStateException if a factor has no factor values, if a value of a continuous factor has no
 *                               measurement, or if none of a factor's values is used by the given samples
 */
public static Map<ExperimentalFactor, FactorValue> getBaselineLevels(List<BioMaterial> samplesUsed, Collection<ExperimentalFactor> factors) {
    Map<ExperimentalFactor, FactorValue> result = new HashMap<>();

    for (ExperimentalFactor factor : factors) {
        if (factor.getFactorValues().isEmpty()) {
            throw new IllegalStateException("Factor has no factor values: " + factor);
        }

        if (ExperimentalDesignUtils.isContinuous(factor)) {
            // then there is no baseline, but we'll take the minimum value.
            // Keyed by the numeric measurement; values sharing the same measurement overwrite one another.
            TreeMap<Double, FactorValue> sortedVals = new TreeMap<>();
            for (FactorValue fv : factor.getFactorValues()) {
                /*
                 * Check that this factor value is used by at least one of the given samples. Only matters if this
                 * is a subset of the full data set.
                 */
                if (samplesUsed != null && !ExpressionDataMatrixColumnSort.used(fv, samplesUsed)) {
                    // this factorValue cannot be a candidate baseline for this subset.
                    continue;
                }
                if (fv.getMeasurement() == null) {
                    throw new IllegalStateException("Continuous factors should have Measurements as values");
                }
                Double v = Double.parseDouble(fv.getMeasurement().getValue());
                sortedVals.put(v, fv);
            }

            // Every value may have been skipped as unused; fail with a clear message instead of a
            // NullPointerException from firstEntry() returning null on an empty map.
            if (sortedVals.isEmpty()) {
                throw new IllegalStateException("No baseline could be identified for factor: " + factor + " has "
                        + factor.getFactorValues().size() + " factor values");
            }
            result.put(factor, sortedVals.firstEntry().getValue());
        } else {
            for (FactorValue fv : factor.getFactorValues()) {
                /*
                 * Check that this factor value is used by at least one of the given samples. Only matters if this
                 * is a subset of the full data set.
                 */
                if (samplesUsed != null && !ExpressionDataMatrixColumnSort.used(fv, samplesUsed)) {
                    // this factorValue cannot be a candidate baseline for this subset.
                    continue;
                }

                // A forced baseline wins over anything found so far and ends the search.
                if (BaselineSelection.isForcedBaseline(fv)) {
                    ExpressionDataMatrixColumnSort.log.info("Baseline chosen: " + fv);
                    result.put(factor, fv);
                    break;
                }

                if (BaselineSelection.isBaselineCondition(fv)) {
                    if (result.containsKey(factor)) {
                        // Keep the first candidate, but keep scanning in case a forced baseline follows.
                        ExpressionDataMatrixColumnSort.log.warn("A second potential baseline was found for " + factor + ": " + fv);
                        continue;
                    }
                    ExpressionDataMatrixColumnSort.log.info("Baseline chosen: " + fv);
                    result.put(factor, fv);
                }
            }

            if (!result.containsKey(factor)) {
                // fallback
                FactorValue arbitraryBaselineFV = null;
                if (samplesUsed != null) {
                    // make sure we choose a fv that is actually used (see above for non-arbitrary case)
                    search:
                    for (FactorValue fv : factor.getFactorValues()) {
                        for (BioMaterial bm : samplesUsed) {
                            for (FactorValue bfv : bm.getFactorValues()) {
                                if (fv.equals(bfv)) {
                                    arbitraryBaselineFV = fv;
                                    break search;
                                }
                            }
                        }
                    }
                } else {
                    arbitraryBaselineFV = factor.getFactorValues().iterator().next();
                }

                if (arbitraryBaselineFV == null) {
                    throw new IllegalStateException("No baseline could be identified for factor: " + factor + " has " + factor.getFactorValues().size() + " factor values");
                }
                ExpressionDataMatrixColumnSort.log.info("Falling back on choosing baseline arbitrarily: " + arbitraryBaselineFV);
                result.put(factor, arbitraryBaselineFV);
            }
        }
    }
    return result;
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method chooseSimplestFactor.
/**
 * Picks the factor that has the fewest factor values actually used by the given biomaterials, requiring at least
 * two used values. Two special cases apply: a continuous factor is returned as soon as it is encountered, and a
 * 'batch' factor is passed over whenever more than one factor is given.
 *
 * @param bms     biomaterials whose factor values determine which values count as used
 * @param factors candidate factors
 * @return null if no factor has at least 2 values represented, or the factor with the fewest number of values (at
 *         least 2 values that is)
 */
private static ExperimentalFactor chooseSimplestFactor(List<BioMaterial> bms, Collection<ExperimentalFactor> factors) {
    // All factor values carried by at least one of the biomaterials.
    Collection<FactorValue> valuesInUse = new HashSet<>();
    for (BioMaterial biomaterial : bms) {
        valuesInUse.addAll(biomaterial.getFactorValues());
    }

    ExperimentalFactor best = null;
    int bestCount = Integer.MAX_VALUE;
    for (ExperimentalFactor candidate : factors) {
        // A continuous factor takes precedence over everything else.
        if (ExperimentalDesignUtils.isContinuous(candidate)) {
            return candidate;
        }

        // Always push 'batch' down the list, unless it is the only factor.
        boolean deferBatch = factors.size() > 1 && ExperimentalDesignUtils.isBatch(candidate);
        if (!deferBatch) {
            int usedCount = 0;
            for (FactorValue value : candidate.getFactorValues()) {
                if (valuesInUse.contains(value)) {
                    usedCount++;
                }
            }
            // Strict comparison: on ties the factor encountered first is kept.
            if (usedCount >= 2 && usedCount < bestCount) {
                bestCount = usedCount;
                best = candidate;
            }
        }
    }
    return best;
}
Aggregations