use of ubic.gemma.model.expression.experiment.FactorValue in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method organizeByFactorValues.
/**
 * Organizes a chunk of biomaterials by the values of a single factor.
 * <p>
 * For each factor value, in the order given, the biomaterials of the chunk that carry that value are collected
 * into a per-value list (stored in {@code chunks}) and appended to {@code organized}. Biomaterials of the chunk
 * that were not reached through any of the given factor values are appended last, in the order in which they
 * appear in {@code bioMaterialChunk}, and are stored in {@code chunks} under the {@code null} key.
 *
 * @param fv2bms           master map of factor values to the biomaterials that have them; this may cover more
 *                         biomaterials than the chunk being processed
 * @param bioMaterialChunk biomaterials to organize
 * @param factorValues     factor values to consider - biomaterials will be organized in the order given
 * @param chunks           map of factor values to chunks goes here; leftovers (if any) go under the null key
 * @param organized        the results go here
 */
private static void organizeByFactorValues(Map<FactorValue, List<BioMaterial>> fv2bms, List<BioMaterial> bioMaterialChunk, List<FactorValue> factorValues, LinkedHashMap<FactorValue, List<BioMaterial>> chunks, List<BioMaterial> organized) {
    // Hash-based view of the chunk, so the membership test in the inner loop is not a linear scan of the list.
    Collection<BioMaterial> inChunk = new HashSet<>(bioMaterialChunk);
    Collection<BioMaterial> seenBioMaterials = new HashSet<>();

    for (FactorValue fv : factorValues) {
        // All biomaterials in the entire experiment for this value; we may be processing only a small chunk.
        List<BioMaterial> bioMsForFv = fv2bms.get(fv);
        if (bioMsForFv == null) {
            /*
             * This can happen if a factorvalue has been created but not yet associated with any biomaterials.
             * This can also be cruft.
             */
            continue;
        }

        List<BioMaterial> chunk = chunks.get(fv);
        for (BioMaterial bioMaterial : bioMsForFv) {
            if (inChunk.contains(bioMaterial)) {
                if (chunk == null) {
                    chunk = new ArrayList<>();
                    chunks.put(fv, chunk);
                }
                // shouldn't be there twice, but guard against duplicates anyway.
                if (!chunk.contains(bioMaterial)) {
                    chunk.add(bioMaterial);
                }
            }
            // Recorded whether or not it is in the chunk; only used for the leftover check below.
            seenBioMaterials.add(bioMaterial);
        }

        // If we used that fv, the result is now at least in order of this factor.
        if (chunk != null) {
            organized.addAll(chunk);
        }
    }

    /*
     * Leftovers are biomaterials which have no factorvalue assigned for this factor. They are collected in the
     * order of the incoming chunk (not hash order) so the tail of the result is reproducible; add() returning
     * false also filters out repeated entries of the chunk.
     */
    List<BioMaterial> leftovers = new ArrayList<>();
    for (BioMaterial bm : bioMaterialChunk) {
        if (seenBioMaterials.add(bm)) {
            leftovers.add(bm);
        }
    }
    if (!leftovers.isEmpty()) {
        organized.addAll(leftovers);
        chunks.put(null, leftovers);
    }
}
use of ubic.gemma.model.expression.experiment.FactorValue in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method getBaselineLevels.
/**
 * Identify the FactorValue that should be treated as 'Baseline' for each of the given factors. This is done
 * heuristically, and if all else fails we choose arbitrarily. For continuous factors, the minimum value is treated
 * as baseline.
 * <p>
 * For non-continuous factors, a value reported by {@code BaselineSelection.isForcedBaseline} wins immediately;
 * otherwise the first value reported by {@code BaselineSelection.isBaselineCondition} is used; otherwise the first
 * factor value that is used by the samples (or simply the first factor value, if {@code samplesUsed} is null) is
 * chosen.
 *
 * @param samplesUsed These are used to make sure we don't bother using factor values as baselines if they are not
 *                    used by any of the samples. This is important for subsets. If null, this is ignored.
 * @param factors     factors
 * @return map of factors to the baseline factorvalue for that factor.
 * @throws IllegalStateException if a factor has no factor values, if a value of a continuous factor has no
 *                               measurement, or if no factor value of a factor is used by the given samples
 */
public static Map<ExperimentalFactor, FactorValue> getBaselineLevels(List<BioMaterial> samplesUsed, Collection<ExperimentalFactor> factors) {
    Map<ExperimentalFactor, FactorValue> result = new HashMap<>();
    for (ExperimentalFactor factor : factors) {
        if (factor.getFactorValues().isEmpty()) {
            throw new IllegalStateException("Factor has no factor values: " + factor);
        }

        if (ExperimentalDesignUtils.isContinuous(factor)) {
            // then there is no baseline, but we'll take the minimum value.
            TreeMap<Double, FactorValue> sortedVals = new TreeMap<>();
            for (FactorValue fv : factor.getFactorValues()) {
                /*
                 * Check that this factor value is used by at least one of the given samples. Only matters if
                 * this is a subset of the full data set.
                 */
                if (samplesUsed != null && !ExpressionDataMatrixColumnSort.used(fv, samplesUsed)) {
                    // this factorValue cannot be a candidate baseline for this subset.
                    continue;
                }
                if (fv.getMeasurement() == null) {
                    throw new IllegalStateException("Continuous factors should have Measurements as values");
                }
                double v = Double.parseDouble(fv.getMeasurement().getValue());
                sortedVals.put(v, fv);
            }
            if (sortedVals.isEmpty()) {
                // Every value was filtered out above; firstEntry() would be null, so fail with a clear message.
                throw new IllegalStateException("No baseline could be identified for continuous factor: " + factor + "; none of its " + factor.getFactorValues().size() + " factor values is used by the given samples");
            }
            result.put(factor, sortedVals.firstEntry().getValue());
            continue;
        }

        for (FactorValue fv : factor.getFactorValues()) {
            /*
             * Check that this factor value is used by at least one of the given samples. Only matters if this
             * is a subset of the full data set.
             */
            if (samplesUsed != null && !ExpressionDataMatrixColumnSort.used(fv, samplesUsed)) {
                // this factorValue cannot be a candidate baseline for this subset.
                continue;
            }
            if (BaselineSelection.isForcedBaseline(fv)) {
                // A forced baseline overrides any candidate found earlier and ends the search.
                ExpressionDataMatrixColumnSort.log.info("Baseline chosen: " + fv);
                result.put(factor, fv);
                break;
            }
            if (BaselineSelection.isBaselineCondition(fv)) {
                if (result.containsKey(factor)) {
                    // Keep the first candidate; keep scanning in case a forced baseline follows.
                    ExpressionDataMatrixColumnSort.log.warn("A second potential baseline was found for " + factor + ": " + fv);
                    continue;
                }
                ExpressionDataMatrixColumnSort.log.info("Baseline chosen: " + fv);
                result.put(factor, fv);
            }
        }

        if (!result.containsKey(factor)) {
            // fallback: make sure we choose a fv that is actually used (see above for non-arbitrary case)
            FactorValue arbitraryBaselineFV = samplesUsed != null
                    ? ExpressionDataMatrixColumnSort.firstFactorValueUsedBySamples(factor, samplesUsed)
                    : factor.getFactorValues().iterator().next();
            if (arbitraryBaselineFV == null) {
                throw new IllegalStateException("No baseline could be identified for factor: " + factor + " has " + factor.getFactorValues().size() + " factor values");
            }
            ExpressionDataMatrixColumnSort.log.info("Falling back on choosing baseline arbitrarily: " + arbitraryBaselineFV);
            result.put(factor, arbitraryBaselineFV);
        }
    }
    return result;
}

/**
 * Finds the first factor value of the factor (in the factor's own iteration order) that is equal to a factor
 * value of at least one of the given samples.
 *
 * @param factor  the factor whose values are candidates
 * @param samples the samples to check against; must not be null
 * @return the first matching factor value, or null if none of the factor's values is carried by any sample
 */
private static FactorValue firstFactorValueUsedBySamples(ExperimentalFactor factor, List<BioMaterial> samples) {
    for (FactorValue fv : factor.getFactorValues()) {
        for (BioMaterial bm : samples) {
            for (FactorValue bfv : bm.getFactorValues()) {
                if (fv.equals(bfv)) {
                    return fv;
                }
            }
        }
    }
    return null;
}
use of ubic.gemma.model.expression.experiment.FactorValue in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method orderByFactor.
/**
 * Orders a chunk of biomaterials by the values of one factor.
 * <p>
 * The factor's values are sorted first (via {@code sortByControl} for non-continuous factors, via
 * {@code sortIfMeasurement} for continuous ones) and the biomaterials are then grouped in that order. Note that
 * {@code bms} is passed to {@code sortBioMaterials} before grouping; the return value of that call is not used,
 * so it presumably sorts the list in place.
 *
 * @param ef     the factor to order by
 * @param fv2bms map of factorValues to lists of biomaterials that have that factorValue.
 * @param bms    Chunk of biomaterials to organize.
 * @return ordered list, or null if there was a problem. The input list itself is returned when it has exactly
 *         one element or when the factor has fewer than two values.
 */
private static List<BioMaterial> orderByFactor(ExperimentalFactor ef, Map<FactorValue, List<BioMaterial>> fv2bms, List<BioMaterial> bms) {
    if (bms.size() == 1) {
        return bms;
    }

    ExpressionDataMatrixColumnSort.log.debug("Ordering " + bms.size() + " biomaterials by " + ef);

    // probably redundant.
    ExpressionDataMatrixColumnSort.sortBioMaterials(bms);

    List<FactorValue> factorValues = new ArrayList<>(ef.getFactorValues());
    if (factorValues.size() < 2) {
        /*
         * Not strictly disallowed, but useless.
         */
        return bms;
    }

    if (!ExperimentalDesignUtils.isContinuous(ef)) {
        ExpressionDataMatrixColumnSort.sortByControl(factorValues);
    } else {
        ExpressionDataMatrixColumnSort.sortIfMeasurement(factorValues);
    }

    LinkedHashMap<FactorValue, List<BioMaterial>> chunks = new LinkedHashMap<>();
    List<BioMaterial> organized = new ArrayList<>();
    ExpressionDataMatrixColumnSort.organizeByFactorValues(fv2bms, bms, factorValues, chunks, organized);

    if (ExpressionDataMatrixColumnSort.log.isDebugEnabled()) {
        // Dump the resulting order through the logger rather than directly to stderr.
        for (BioMaterial b : organized) {
            for (FactorValue f : b.getFactorValues()) {
                if (f.getExperimentalFactor().equals(ef)) {
                    ExpressionDataMatrixColumnSort.log.debug(b.getId() + " " + f);
                }
            }
        }
    }

    if (organized.size() != bms.size()) {
        // fail gracefully: the caller is told via null that ordering by this factor was not possible.
        ExpressionDataMatrixColumnSort.log.error("Could not order by factor: " + ef + " Biomaterial count (" + bms.size() + ") does not equal the size of the reorganized biomaterial list (" + organized.size() + "). Check the experimental design for completeness/correctness");
        return null;
    }
    return organized;
}
use of ubic.gemma.model.expression.experiment.FactorValue in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method buildFv2BmMap.
/**
 * Builds a map from each factor value to the biomaterials that carry it.
 * <p>
 * Every list in the map is passed to {@code sortBioMaterials} before the map is returned. A biomaterial with no
 * factor values does not appear in the map at all.
 *
 * @param bms the biomaterials to index
 * @return map of factor values to the biomaterials (from {@code bms}) that have that factor value
 */
private static Map<FactorValue, List<BioMaterial>> buildFv2BmMap(Collection<BioMaterial> bms) {
    Map<FactorValue, List<BioMaterial>> fv2bms = new HashMap<>();

    for (BioMaterial bm : bms) {
        for (FactorValue fv : bm.getFactorValues()) {
            List<BioMaterial> bmsForFv = fv2bms.get(fv);
            if (bmsForFv == null) {
                bmsForFv = new ArrayList<>();
                fv2bms.put(fv, bmsForFv);
            }
            bmsForFv.add(bm);
        }
    }

    for (List<BioMaterial> biomaterials : fv2bms.values()) {
        ExpressionDataMatrixColumnSort.sortBioMaterials(biomaterials);
    }
    return fv2bms;
}
use of ubic.gemma.model.expression.experiment.FactorValue in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSort method chooseSimplestFactor.
/**
 * Pick the factor that has the fewest values actually represented among the given biomaterials.
 * <p>
 * Two special cases apply. A continuous factor is returned as soon as it is encountered, regardless of the other
 * factors. A 'batch' factor is skipped whenever more than one factor was supplied, so it is only a candidate when
 * it is the sole factor.
 *
 * @param bms     the biomaterials whose factor values determine which values count as "used"
 * @param factors the candidate factors
 * @return null if no factor has at least 2 values represented, or the factor with the fewest number of values (at
 *         least 2 values that is); on ties the factor encountered first wins
 */
private static ExperimentalFactor chooseSimplestFactor(List<BioMaterial> bms, Collection<ExperimentalFactor> factors) {
    // Pool every factor value carried by any of the given biomaterials.
    Collection<FactorValue> valuesInUse = new HashSet<>();
    for (BioMaterial sample : bms) {
        valuesInUse.addAll(sample.getFactorValues());
    }

    final boolean severalFactors = factors.size() > 1;
    ExperimentalFactor best = null;
    int fewestLevels = Integer.MAX_VALUE;

    for (ExperimentalFactor candidate : factors) {
        if (ExperimentalDesignUtils.isContinuous(candidate)) {
            // Continuous factors short-circuit the search.
            return candidate;
        }
        if (severalFactors && ExperimentalDesignUtils.isBatch(candidate)) {
            // Always push 'batch' down the list.
            continue;
        }

        int levelsInUse = 0;
        for (FactorValue level : candidate.getFactorValues()) {
            levelsInUse += valuesInUse.contains(level) ? 1 : 0;
        }

        // Needs at least two represented values, and must strictly beat the current best.
        if (levelsInUse < 2 || levelsInUse >= fewestLevels) {
            continue;
        }
        fewestLevels = levelsInUse;
        best = candidate;
    }
    return best;
}
Aggregations