use of ubic.gemma.model.expression.bioAssayData.BioAssayDimension in project Gemma by PavlidisLab.
the class TwoChannelMissingValuesTest method print.
/**
 * Debug code: dumps the given vectors to {@code System.err} as a tab-delimited table.
 * <p>
 * The first printed row lists the bioassays of the dimension attached to the first vector; each following row
 * starts with a vector's design element followed by its data decoded as booleans.
 *
 * @param calls vectors whose data are decoded as booleans; when null or empty nothing is printed
 */
@SuppressWarnings("unused")
private void print(Collection<RawExpressionDataVector> calls) {
    // Without this guard, iterator().next() below throws NoSuchElementException on an empty collection.
    if (calls == null || calls.isEmpty()) {
        return;
    }
    ByteArrayConverter bac = new ByteArrayConverter();
    // The header is taken from the first vector only.
    BioAssayDimension dim = calls.iterator().next().getBioAssayDimension();
    System.err.print("\n");
    for (BioAssay bas : dim.getBioAssays()) {
        System.err.print("\t" + bas);
    }
    System.err.print("\n");
    for (DesignElementDataVector vector : calls) {
        System.err.print(vector.getDesignElement());
        byte[] dat = vector.getData();
        boolean[] row = bac.byteArrayToBooleans(dat);
        for (boolean b : row) {
            System.err.print("\t" + b);
        }
        System.err.print("\n");
    }
}
use of ubic.gemma.model.expression.bioAssayData.BioAssayDimension in project Gemma by PavlidisLab.
the class TwoChannelMissingValuesTest method testMissingValue.
/**
 * Parses the GSE2221 test series, converts and persists it, computes missing-value calls and spot-checks the
 * calls for design elements "26" and "27" in the bioassay named "expression array ME-TMZ".
 *
 * @throws Exception if loading, parsing or persisting the test data fails
 */
@Test
public void testMissingValue() throws Exception {
    // Remove any copy of the experiment that is already stored under this short name.
    ExpressionExperiment old = eeService.findByShortName("GSE2221");
    if (old != null)
        eeService.remove(old);

    GeoFamilyParser parser = new GeoFamilyParser();
    // try-with-resources closes both the raw resource stream and the gzip wrapper, even when parsing fails.
    try (InputStream raw = this.getClass().getResourceAsStream("/data/loader/expression/geo/shortGenePix/GSE2221_family.soft.gz");
            InputStream is = new GZIPInputStream(raw)) {
        parser.parse(is);
    }

    GeoSeries series = ((GeoParseResult) parser.getResults().iterator().next()).getSeriesMap().get("GSE2221");
    DatasetCombiner datasetCombiner = new DatasetCombiner();
    GeoSampleCorrespondence correspondence = datasetCombiner.findGSECorrespondence(series);
    series.setSampleCorrespondence(correspondence);
    Object result = this.gc.convert(series);
    assertNotNull(result);
    ExpressionExperiment expExp = (ExpressionExperiment) ((Collection<?>) result).iterator().next();
    expExp = persisterHelper.persist(expExp, persisterHelper.prepare(expExp));

    Collection<RawExpressionDataVector> calls = tcmv.computeMissingValues(expExp, 2.0, new ArrayList<Double>());
    assertEquals(500, calls.size());
    BioAssayDimension dim = calls.iterator().next().getBioAssayDimension();

    // Spot check the results. For sample ME-TMZ, ID #27 should be 'true' and 26 should be false.
    ByteArrayConverter bac = new ByteArrayConverter();
    boolean foundA = false;
    boolean foundB = false;
    for (DesignElementDataVector vector : calls) {
        if (vector.getDesignElement().getName().equals("26")) {
            byte[] dat = vector.getData();
            boolean[] row = bac.byteArrayToBooleans(dat);
            // i tracks the position of each bioassay within the dimension, which indexes into the row.
            int i = 0;
            for (BioAssay bas : dim.getBioAssays()) {
                if (bas.getName().equals("expression array ME-TMZ")) {
                    assertTrue(!row[i]);
                    foundA = true;
                }
                i++;
            }
        }
        if (vector.getDesignElement().getName().equals("27")) {
            byte[] dat = vector.getData();
            boolean[] row = bac.byteArrayToBooleans(dat);
            int i = 0;
            for (BioAssay bas : dim.getBioAssays()) {
                if (bas.getName().equals("expression array ME-TMZ")) {
                    assertTrue(row[i]);
                    foundB = true;
                }
                i++;
            }
        }
    }
    // Both spot checks must actually have run; otherwise the test would pass without checking anything.
    assertTrue("Design element 26 was not checked for sample ME-TMZ", foundA);
    assertTrue("Design element 27 was not checked for sample ME-TMZ", foundB);
}
use of ubic.gemma.model.expression.bioAssayData.BioAssayDimension in project Gemma by PavlidisLab.
the class ExpressionDataMatrixColumnSortTest method testOrderByExperimentalDesignB.
/**
 * Builds 100 bioassays whose biomaterials receive randomly chosen (and possibly missing) factor values from five
 * factors, then checks that ordering by experimental design returns all 100 biomaterials.
 */
@Test
public void testOrderByExperimentalDesignB() {
    BioAssayDimension dimension = BioAssayDimension.Factory.newInstance();

    // Five factors with three values each. The factor at index 4 is a measurement (continuous) factor.
    Collection<ExperimentalFactor> factors = new HashSet<>();
    for (int factorIndex = 0; factorIndex < 5; factorIndex++) {
        boolean isMeasurementFactor = factorIndex == 4;
        int multiplier = factorIndex + 1;

        ExperimentalFactor factor = ExperimentalFactor.Factory.newInstance();
        factor.setType(FactorType.CATEGORICAL);
        factor.setName("factor" + factorIndex);
        if (isMeasurementFactor) {
            factor.setName("mfact" + factorIndex);
        }
        factor.setId((long) factorIndex);

        for (int level = 0; level < 3; level++) {
            int scaledLevel = (level + 1) * multiplier;

            FactorValue value = FactorValue.Factory.newInstance();
            value.setValue("fv" + scaledLevel);
            value.setId((long) scaledLevel);
            value.setExperimentalFactor(factor);
            factor.getFactorValues().add(value);

            // The last value of every categorical factor is relabelled as the control group.
            if (!isMeasurementFactor && level == 2) {
                value.setValue("control_group");
            }

            if (isMeasurementFactor) {
                factor.setType(FactorType.CONTINUOUS);
                Measurement measurement = Measurement.Factory.newInstance();
                measurement.setId((long) level * multiplier);
                measurement.setValue(level + ".00");
                measurement.setRepresentation(PrimitiveType.DOUBLE);
                value.setMeasurement(measurement);
            }
        }
        factors.add(factor);
    }

    Random random = new Random();
    for (int sampleIndex = 0; sampleIndex < 100; sampleIndex++) {
        BioAssay assay = BioAssay.Factory.newInstance();
        assay.setName("ba" + sampleIndex);
        assay.setId((long) sampleIndex);
        dimension.getBioAssays().add(assay);

        BioMaterial material = BioMaterial.Factory.newInstance();
        material.setId((long) sampleIndex);
        material.setName("bm" + sampleIndex);
        assay.setSampleUsed(material);

        for (ExperimentalFactor factor : factors) {
            // Each factor has three values, so drawing from four positions sometimes selects none and leaves
            // the biomaterial without a value for that factor. This is realistic; draw from three positions
            // instead to fill the design in completely.
            int remainingToSkip = random.nextInt(4);
            FactorValue chosen = null;
            for (FactorValue candidate : factor.getFactorValues()) {
                if (remainingToSkip == 0) {
                    chosen = candidate;
                    break;
                }
                remainingToSkip--;
            }
            if (chosen != null) {
                material.getFactorValues().add(chosen);
            }
        }
    }

    EmptyExpressionMatrix matrix = new EmptyExpressionMatrix(dimension);
    assertEquals(100, matrix.columns());

    List<BioMaterial> ordered = ExpressionDataMatrixColumnSort.orderByExperimentalDesign(matrix);
    assertEquals(100, ordered.size());
}
use of ubic.gemma.model.expression.bioAssayData.BioAssayDimension in project Gemma by PavlidisLab.
the class BaseExpressionDataMatrix method getBestBioAssayDimension.
/**
 * Picks the single BioAssayDimension used to represent the columns of this matrix.
 * <p>
 * When there is only one distinct dimension it is returned directly. When there are several, the one with the
 * most bioassays is chosen (on a tie, whichever is encountered first while iterating the set of dimensions), and
 * it must include the sample of every bioassay found in any of the dimensions.
 *
 * @return the chosen dimension
 * @throws IllegalStateException            if there are several dimensions and the largest one does not cover all
 *                                          of the biomaterials used across them
 * @throws java.util.NoSuchElementException if no dimensions are registered
 */
@Override
public BioAssayDimension getBestBioAssayDimension() {
    Collection<BioAssayDimension> dims = new HashSet<>(this.bioAssayDimensions.values());
    BioAssayDimension b = dims.iterator().next();
    if (dims.size() > 1) {
        /*
         * Special complication if there is more than one BioAssayDimension
         */
        int s = -1;
        Collection<BioMaterial> allBioMaterials = new HashSet<>();
        // find the largest BioAssayDimension, and gather the biomaterials of every dimension along the way
        for (BioAssayDimension bioAssayDimension : dims) {
            if (bioAssayDimension.getBioAssays().size() > s) {
                s = bioAssayDimension.getBioAssays().size();
                b = bioAssayDimension;
            }
            // Must iterate the current dimension, not the best so far; otherwise samples that only occur in
            // smaller dimensions are never recorded and the coverage check below can never fail.
            for (BioAssay ba : bioAssayDimension.getBioAssays()) {
                allBioMaterials.add(ba.getSampleUsed());
            }
        }

        Collection<BioMaterial> coveredBioMaterials = new HashSet<>();
        for (BioAssay ba : b.getBioAssays()) {
            coveredBioMaterials.add(ba.getSampleUsed());
        }
        if (!coveredBioMaterials.containsAll(allBioMaterials)) {
            /*
             * In rare cases none of the usual ones has all the samples.
             *
             * This can also happen if the data are not sample-matched or vector-merged
             */
            throw new IllegalStateException("Could not find an appropriate BioAssayDimension to represent the data matrix; data might need to be matched or merged");
        }
    }
    return b;
}
use of ubic.gemma.model.expression.bioAssayData.BioAssayDimension in project Gemma by PavlidisLab.
the class BaseExpressionDataMatrix method setUpColumnElements.
/**
 * Assigns a matrix column to every biomaterial and records which bioassays belong to each column.
 * <p>
 * Note: current versions of Gemma require a single BioAssayDimension, so this code is more general than strictly
 * needed. When an experiment has multiple BioAssayDimensions (due to multiple arrays), the vectors are merged
 * (e.g., needed in the last case shown below). The issue of having multiple "BioMaterials" per "BioAssay" still
 * exists, however.
 * </p>
 * <p>
 * Bioassay dimensions can differ in size and need not overlap in the biomaterials used. With a single
 * BioAssayDimension this reduces to associating each column with a bioassay (though an integer is used under the
 * hood).
 * </p>
 * <p>
 * In the diagrams below "-" indicates a biomaterial and "*" indicates a bioassay. Each row of "*" indicates
 * samples run on a different microarray design (a different bio assay material). The examples assume a single
 * biomaterial dimension.
 * </p>
 * <pre>
 * ---------------
 * *****              -- only a few samples run on this platform
 * **********         -- ditto
 *            ****    -- these samples were not run on any of the other platforms .
 * </pre>
 * <p>
 * A simpler case:
 * </p>
 * <pre>
 * ---------------
 * ***************
 * ***********
 * *******
 * </pre>
 * <p>
 * A more typical and easy case (one microarray design used):
 * </p>
 * <pre>
 * ----------------
 * ****************
 * </pre>
 * <p>
 * If every sample was run on two different array designs:
 * </p>
 * <pre>
 * ----------------
 * ****************
 * ****************
 * </pre>
 * <p>
 * Every sample was run on a different array design:
 * </p>
 * <pre>
 * -----------------------
 * ******
 *       *********
 *                ********
 * </pre>
 * <p>
 * Because the bioassay dimensions may overlap only partly or not at all, the matrix dimensions cannot be assumed
 * to be defined by the longest BioAssayDimension. Later processing fixes this possible lack of overlap by sample
 * matching or vector merging; this class has to deal with the general case.
 * </p>
 *
 * @return the number of columns that have a biomaterial assigned
 */
int setUpColumnElements() {
    BaseExpressionDataMatrix.log.debug("Setting up column elements");
    assert this.bioAssayDimensions != null && this.bioAssayDimensions.size() > 0 : "No bioAssayDimensions defined";

    // Group the assays of every dimension by biomaterial; the map's insertion order decides the column order.
    Map<BioMaterial, Collection<BioAssay>> assaysByMaterial = new LinkedHashMap<>();
    for (BioAssayDimension dimension : this.bioAssayDimensions.values()) {
        List<BioAssay> dimensionAssays = dimension.getBioAssays();
        BaseExpressionDataMatrix.log.debug("Processing: " + dimension + " with " + dimensionAssays.size() + " assays");
        this.getBioMaterialGroupsForAssays(assaysByMaterial, dimensionAssays);
    }

    if (BaseExpressionDataMatrix.log.isDebugEnabled()) {
        BaseExpressionDataMatrix.log.debug(assaysByMaterial.size() + " biomaterialGroups (correspond to columns)");
    }

    int nextColumn = 0;
    for (Map.Entry<BioMaterial, Collection<BioAssay>> group : assaysByMaterial.entrySet()) {
        BioMaterial bioMaterial = group.getKey();
        if (BaseExpressionDataMatrix.log.isDebugEnabled()) {
            BaseExpressionDataMatrix.log.debug("Column " + nextColumn + " **--->>>> " + bioMaterial);
        }

        for (BioAssay assay : group.getValue()) {
            int targetColumn;
            if (this.columnBioMaterialMap.containsKey(bioMaterial)) {
                // The biomaterial already owns a column: the assay joins it.
                targetColumn = columnBioMaterialMap.get(bioMaterial);
            } else {
                // First assay seen for this biomaterial: claim the next free column in both lookup directions.
                targetColumn = nextColumn;
                if (BaseExpressionDataMatrix.log.isDebugEnabled()) {
                    BaseExpressionDataMatrix.log.debug(bioMaterial + " --> column " + targetColumn);
                }
                this.columnBioMaterialMap.put(bioMaterial, targetColumn);
                columnBioMaterialMapByInteger.put(targetColumn, bioMaterial);
            }

            if (BaseExpressionDataMatrix.log.isDebugEnabled()) {
                BaseExpressionDataMatrix.log.debug(assay + " --> column " + targetColumn);
            }
            this.columnAssayMap.put(assay, targetColumn);

            // Create the per-column assay set on first use, then register the assay in it.
            if (columnBioAssayMapByInteger.get(targetColumn) == null) {
                columnBioAssayMapByInteger.put(targetColumn, new HashSet<BioAssay>());
            }
            columnBioAssayMapByInteger.get(targetColumn).add(assay);
        }

        // The counter advances once per biomaterial group, whichever branch its assays took.
        nextColumn++;
    }

    if (BaseExpressionDataMatrix.log.isDebugEnabled()) {
        for (BioAssay o : this.columnAssayMap.keySet()) {
            BaseExpressionDataMatrix.log.debug(o + " " + this.columnAssayMap.get(o));
        }
    }

    assert assaysByMaterial.size() == columnBioMaterialMapByInteger.keySet().size();
    return columnBioMaterialMapByInteger.keySet().size();
}
Aggregations