Use of org.apache.commons.math3.ml.clustering.Cluster in project hadoop by apache.
The class TestClusterTopology, method testChooseRandom.
/**
* Test how well we pick random nodes.
*/
@Test
public void testChooseRandom() {
// create the topology
NetworkTopology cluster = NetworkTopology.getInstance(new Configuration());
NodeElement node1 = getNewNode("node1", "/d1/r1");
cluster.add(node1);
NodeElement node2 = getNewNode("node2", "/d1/r2");
cluster.add(node2);
NodeElement node3 = getNewNode("node3", "/d1/r3");
cluster.add(node3);
NodeElement node4 = getNewNode("node4", "/d1/r3");
cluster.add(node4);
// Number of iterations to do the test
int numIterations = 100;
// Pick random nodes
HashMap<String, Integer> histogram = new HashMap<String, Integer>();
for (int i = 0; i < numIterations; i++) {
String randomNode = cluster.chooseRandom(NodeBase.ROOT).getName();
if (!histogram.containsKey(randomNode)) {
histogram.put(randomNode, 0);
}
histogram.put(randomNode, histogram.get(randomNode) + 1);
}
assertEquals("Random is not selecting all nodes", 4, histogram.size());
// Check with 99% confidence (alpha = 0.01, since confidence = 100 * (1 - alpha))
ChiSquareTest chiSquareTest = new ChiSquareTest();
double[] expected = new double[histogram.size()];
long[] observed = new long[histogram.size()];
int j = 0;
for (Integer occurrence : histogram.values()) {
expected[j] = 1.0 * numIterations / histogram.size();
observed[j] = occurrence;
j++;
}
boolean chiSquareTestRejected = chiSquareTest.chiSquareTest(expected, observed, 0.01);
// Check that they have the proper distribution
assertFalse("Not choosing nodes randomly", chiSquareTestRejected);
// Pick random nodes excluding the 2 nodes in /d1/r3
histogram = new HashMap<String, Integer>();
for (int i = 0; i < numIterations; i++) {
String randomNode = cluster.chooseRandom("~/d1/r3").getName();
if (!histogram.containsKey(randomNode)) {
histogram.put(randomNode, 0);
}
histogram.put(randomNode, histogram.get(randomNode) + 1);
}
assertEquals("Random is not selecting the nodes it should", 2, histogram.size());
}
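
For reference, the same uniformity check can be run outside the Hadoop test harness. Below is a minimal, self-contained sketch; the class name UniformPickDemo is invented and the node picking is simulated with java.util.Random rather than NetworkTopology, but the ChiSquareTest call is the same commons-math3 API used above.

import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import org.apache.commons.math3.stat.inference.ChiSquareTest;

public class UniformPickDemo {
    public static void main(String[] args) {
        // Simulate picking one of 4 nodes uniformly at random, 100 times.
        String[] nodes = { "node1", "node2", "node3", "node4" };
        Random rnd = new Random();
        Map<String, Integer> histogram = new HashMap<>();
        for (int i = 0; i < 100; i++) {
            histogram.merge(nodes[rnd.nextInt(nodes.length)], 1, Integer::sum);
        }
        // Under the null hypothesis each node is expected 100 / 4 = 25 times.
        double[] expected = new double[nodes.length];
        long[] observed = new long[nodes.length];
        for (int k = 0; k < nodes.length; k++) {
            expected[k] = 100.0 / nodes.length;
            observed[k] = histogram.getOrDefault(nodes[k], 0);
        }
        // chiSquareTest(expected, observed, alpha) returns true iff uniformity
        // can be rejected with confidence 1 - alpha.
        boolean rejected = new ChiSquareTest().chiSquareTest(expected, observed, 0.01);
        System.out.println("uniformity rejected at 99% confidence: " + rejected);
    }
}

With only 100 draws the test has limited statistical power, which is one reason the test above asserts non-rejection at a strict alpha of 0.01 rather than a tight match to the expected counts.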
Use of org.apache.commons.math3.ml.clustering.Cluster in project GDSC-SMLM by aherbert.
The class PCPALMFitting, method fitClusteredModel.
/**
 * Fits the correlation curve with r > 0 to the clustered model using the estimated density and precision.
 * Parameters must be fit within a tolerance of the starting values.
 *
 * @param gr
 * The correlation curve: gr[0] holds the radii r, gr[1] the correlation values g(r)
 * @param sigmaS
 * The estimated precision
 * @param proteinDensity
 * The estimated protein density
 * @param resultColour
 * The colour used when adding the result
 * @return The fitted parameters [precision, density, clusterRadius, clusterDensity]
 */
private double[] fitClusteredModel(double[][] gr, double sigmaS, double proteinDensity, String resultColour) {
final ClusteredModelFunctionGradient function = new ClusteredModelFunctionGradient();
clusteredModel = function;
log("Fitting %s: Estimated precision = %f nm, estimated protein density = %g um^-2", clusteredModel.getName(), sigmaS, proteinDensity * 1e6);
clusteredModel.setLogging(true);
for (int i = offset; i < gr[0].length; i++) {
// Only fit the curve above the estimated resolution (points below it will be subject to error)
if (gr[0][i] > sigmaS * fitAboveEstimatedPrecision)
clusteredModel.addPoint(gr[0][i], gr[1][i]);
}
double[] parameters;
// The model is: sigma, density, range, amplitude
double[] initialSolution = new double[] { sigmaS, proteinDensity, sigmaS * 5, 1 };
int evaluations = 0;
// Constrain the fitting to be close to the estimated precision (sigmaS) and protein density.
// LVM fitting does not support constrained fitting, so use a bounded optimiser.
SumOfSquaresModelFunction clusteredModelMulti = new SumOfSquaresModelFunction(clusteredModel);
double[] x = clusteredModelMulti.x;
// Put some bounds around the initial guess. Use the fitting tolerance (in %) if provided.
double limit = (fittingTolerance > 0) ? 1 + fittingTolerance / 100 : 2;
double[] lB = new double[] { initialSolution[0] / limit, initialSolution[1] / limit, 0, 0 };
// The amplitude and range should not extend beyond the limits of the g(r) curve.
double[] uB = new double[] { initialSolution[0] * limit, initialSolution[1] * limit, Maths.max(x), Maths.max(gr[1]) };
log("Fitting %s using a bounded search: %s < precision < %s & %s < density < %s", clusteredModel.getName(), Utils.rounded(lB[0], 4), Utils.rounded(uB[0], 4), Utils.rounded(lB[1] * 1e6, 4), Utils.rounded(uB[1] * 1e6, 4));
PointValuePair constrainedSolution = runBoundedOptimiser(gr, initialSolution, lB, uB, clusteredModelMulti);
if (constrainedSolution == null)
return null;
parameters = constrainedSolution.getPointRef();
evaluations = boundedEvaluations;
// Refit using an LVM (Levenberg-Marquardt) optimiser
if (useLSE) {
log("Re-fitting %s using a gradient optimisation", clusteredModel.getName());
LevenbergMarquardtOptimizer optimizer = new LevenbergMarquardtOptimizer();
Optimum lvmSolution;
try {
//@formatter:off
LeastSquaresProblem problem = new LeastSquaresBuilder()
        .maxEvaluations(Integer.MAX_VALUE)
        .maxIterations(3000)
        .start(parameters)
        .target(function.getY())
        .weight(new DiagonalMatrix(function.getWeights()))
        .model(function, new MultivariateMatrixFunction() {
            public double[][] value(double[] point) throws IllegalArgumentException {
                return function.jacobian(point);
            }
        })
        .build();
//@formatter:on
lvmSolution = optimizer.optimize(problem);
evaluations += lvmSolution.getEvaluations();
double ss = lvmSolution.getResiduals().dotProduct(lvmSolution.getResiduals());
if (ss < constrainedSolution.getValue()) {
log("Re-fitting %s improved the SS from %s to %s (-%s%%)", clusteredModel.getName(), Utils.rounded(constrainedSolution.getValue(), 4), Utils.rounded(ss, 4), Utils.rounded(100 * (constrainedSolution.getValue() - ss) / constrainedSolution.getValue(), 4));
parameters = lvmSolution.getPoint().toArray();
}
} catch (TooManyIterationsException e) {
log("Failed to re-fit %s: Too many iterations (%s)", clusteredModel.getName(), e.getMessage());
} catch (ConvergenceException e) {
log("Failed to re-fit %s: %s", clusteredModel.getName(), e.getMessage());
}
}
clusteredModel.setLogging(false);
// Ensure the width is positive
parameters[0] = Math.abs(parameters[0]);
//parameters[2] = Math.abs(parameters[2]);
double ss = 0;
double[] obs = clusteredModel.getY();
double[] exp = clusteredModel.value(parameters);
for (int i = 0; i < obs.length; i++) ss += (obs[i] - exp[i]) * (obs[i] - exp[i]);
ic2 = Maths.getAkaikeInformationCriterionFromResiduals(ss, clusteredModel.size(), parameters.length);
final double fitSigmaS = parameters[0];
final double fitProteinDensity = parameters[1];
// The radius of the cluster domain
final double domainRadius = parameters[2];
// The density of the cluster domain
final double domainDensity = parameters[3];
// This is from the PC-PALM paper. However that paper fits the g(r)protein exponential convolved in 2D
// with the g(r)PSF. In this method we have just fit the exponential
final double nCluster = 2 * domainDensity * Math.PI * domainRadius * domainRadius * fitProteinDensity;
double e1 = parameterDrift(sigmaS, fitSigmaS);
double e2 = parameterDrift(proteinDensity, fitProteinDensity);
log(" %s fit: SS = %f. cAIC = %f. %d evaluations", clusteredModel.getName(), ss, ic2, evaluations);
log(" %s parameters:", clusteredModel.getName());
log(" Average precision = %s nm (%s%%)", Utils.rounded(fitSigmaS, 4), Utils.rounded(e1, 4));
log(" Average protein density = %s um^-2 (%s%%)", Utils.rounded(fitProteinDensity * 1e6, 4), Utils.rounded(e2, 4));
log(" Domain radius = %s nm", Utils.rounded(domainRadius, 4));
log(" Domain density = %s", Utils.rounded(domainDensity, 4));
log(" nCluster = %s", Utils.rounded(nCluster, 4));
// Check the fitted parameters are within tolerance of the initial estimates
valid2 = true;
if (fittingTolerance > 0 && (Math.abs(e1) > fittingTolerance || Math.abs(e2) > fittingTolerance)) {
log(" Failed to fit %s within tolerance (%s%%): Average precision = %f nm (%s%%), average protein density = %g um^-2 (%s%%)", clusteredModel.getName(), Utils.rounded(fittingTolerance, 4), fitSigmaS, Utils.rounded(e1, 4), fitProteinDensity * 1e6, Utils.rounded(e2, 4));
valid2 = false;
}
// Check the extra parameters: the domain radius should be larger than the precision, and the density should be positive
if (domainRadius < fitSigmaS) {
log(" Failed to fit %s: Domain radius is smaller than the average precision (%s < %s)", clusteredModel.getName(), Utils.rounded(domainRadius, 4), Utils.rounded(fitSigmaS, 4));
valid2 = false;
}
if (domainDensity < 0) {
log(" Failed to fit %s: Domain density is negative (%s)", clusteredModel.getName(), Utils.rounded(domainDensity, 4));
valid2 = false;
}
if (ic2 > ic1) {
log(" Failed to fit %s - Information Criterion has increased %s%%", clusteredModel.getName(), Utils.rounded((100 * (ic2 - ic1) / ic1), 4));
valid2 = false;
}
addResult(clusteredModel.getName(), resultColour, valid2, fitSigmaS, fitProteinDensity, domainRadius, domainDensity, nCluster, 0, ic2);
return parameters;
}
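
The bounded search above goes through project classes (runBoundedOptimiser, SumOfSquaresModelFunction), so only the Levenberg-Marquardt refit touches commons-math3 directly. Here is a minimal sketch of that refit pattern on its own, using the same LeastSquaresBuilder/LevenbergMarquardtOptimizer API to fit a two-parameter exponential decay; the model, data, and class name are invented for illustration.

import org.apache.commons.math3.analysis.MultivariateMatrixFunction;
import org.apache.commons.math3.analysis.MultivariateVectorFunction;
import org.apache.commons.math3.fitting.leastsquares.LeastSquaresBuilder;
import org.apache.commons.math3.fitting.leastsquares.LeastSquaresOptimizer.Optimum;
import org.apache.commons.math3.fitting.leastsquares.LeastSquaresProblem;
import org.apache.commons.math3.fitting.leastsquares.LevenbergMarquardtOptimizer;

public class LvmFitDemo {
    public static void main(String[] args) {
        // Synthetic observations of y = a * exp(-x / b) with a = 2, b = 5.
        final double[] x = { 0, 1, 2, 3, 4, 5, 6, 7 };
        final double[] y = new double[x.length];
        for (int i = 0; i < x.length; i++) y[i] = 2 * Math.exp(-x[i] / 5);

        // Model value for parameters p = [a, b].
        MultivariateVectorFunction value = p -> {
            double[] v = new double[x.length];
            for (int i = 0; i < x.length; i++) v[i] = p[0] * Math.exp(-x[i] / p[1]);
            return v;
        };
        // Analytical Jacobian: d/da = exp(-x/b), d/db = a * x / b^2 * exp(-x/b).
        MultivariateMatrixFunction jacobian = p -> {
            double[][] jac = new double[x.length][2];
            for (int i = 0; i < x.length; i++) {
                double e = Math.exp(-x[i] / p[1]);
                jac[i][0] = e;
                jac[i][1] = p[0] * x[i] / (p[1] * p[1]) * e;
            }
            return jac;
        };

        LeastSquaresProblem problem = new LeastSquaresBuilder()
                .start(new double[] { 1.5, 3 }) // deliberately rough initial guess
                .model(value, jacobian)
                .target(y)
                .maxEvaluations(1000)
                .maxIterations(1000)
                .build();
        Optimum optimum = new LevenbergMarquardtOptimizer().optimize(problem);
        System.out.println("fitted parameters: " + optimum.getPoint());
        System.out.println("evaluations: " + optimum.getEvaluations());
    }
}

As the comment in fitClusteredModel notes, LVM fitting is unconstrained, which is why the method seeds it with the bounded result and validates the parameter drift afterwards.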
Use of org.apache.commons.math3.ml.clustering.Cluster in project GDSC-SMLM by aherbert.
The class TraceMolecules, method summarise.
private void summarise(Trace[] traces, int filtered, double dThreshold, double tThreshold) {
IJ.showStatus("Calculating summary ...");
// Create summary table
createSummaryTable();
Statistics[] stats = new Statistics[NAMES.length];
for (int i = 0; i < stats.length; i++) {
stats[i] = (settings.showHistograms || settings.saveTraceData) ? new StoredDataStatistics() : new Statistics();
}
int singles = 0;
for (Trace trace : traces) {
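// blinks = on-bursts - 1: a trace that stays on in one continuous burst has zero blinks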
int nBlinks = trace.getNBlinks() - 1;
stats[BLINKS].add(nBlinks);
int[] onTimes = trace.getOnTimes();
int[] offTimes = trace.getOffTimes();
double tOn = 0;
for (int t : onTimes) {
stats[T_ON].add(t * exposureTime);
tOn += t * exposureTime;
}
stats[TOTAL_T_ON].add(tOn);
if (offTimes != null) {
double tOff = 0;
for (int t : offTimes) {
stats[T_OFF].add(t * exposureTime);
tOff += t * exposureTime;
}
stats[TOTAL_T_OFF].add(tOff);
}
final double signal = trace.getSignal() / results.getGain();
stats[TOTAL_SIGNAL].add(signal);
stats[SIGNAL_PER_FRAME].add(signal / trace.size());
if (trace.size() == 1)
singles++;
}
// Add to the summary table
StringBuilder sb = new StringBuilder();
sb.append(results.getName()).append("\t");
sb.append(outputName.equals("Cluster") ? settings.getClusteringAlgorithm() : settings.getTraceMode()).append("\t");
sb.append(Utils.rounded(exposureTime * 1000, 3)).append("\t");
sb.append(Utils.rounded(dThreshold, 3)).append("\t");
sb.append(Utils.rounded(tThreshold, 3));
if (settings.splitPulses)
sb.append(" *");
sb.append("\t");
sb.append(timeInFrames2(tThreshold)).append("\t");
sb.append(traces.length).append("\t");
sb.append(filtered).append("\t");
sb.append(singles).append("\t");
sb.append(traces.length - singles).append("\t");
for (int i = 0; i < stats.length; i++) {
sb.append(Utils.rounded(stats[i].getMean(), 3)).append("\t");
}
if (java.awt.GraphicsEnvironment.isHeadless()) {
IJ.log(sb.toString());
return;
} else {
summaryTable.append(sb.toString());
}
if (settings.showHistograms) {
IJ.showStatus("Calculating histograms ...");
int[] idList = new int[NAMES.length];
int count = 0;
boolean requireRetile = false;
for (int i = 0; i < NAMES.length; i++) {
if (displayHistograms[i]) {
idList[count++] = Utils.showHistogram(TITLE, (StoredDataStatistics) stats[i], NAMES[i], (integerDisplay[i]) ? 1 : 0, (settings.removeOutliers || alwaysRemoveOutliers[i]) ? 2 : 0, settings.histogramBins);
requireRetile = requireRetile || Utils.isNewWindow();
}
}
if (count > 0 && requireRetile) {
idList = Arrays.copyOf(idList, count);
new WindowOrganiser().tileWindows(idList);
}
}
if (settings.saveTraceData) {
saveTraceData(stats);
}
IJ.showStatus("");
}
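
Statistics and StoredDataStatistics are GDSC-SMLM classes. The accumulate-then-summarise pattern they support can be sketched with commons-math3's DescriptiveStatistics as a hypothetical stand-in; the trace data, exposure time, and class name below are invented.

import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;

public class TraceStatsDemo {
    public static void main(String[] args) {
        // Hypothetical per-trace on-times in frames, at 100 ms per frame.
        int[][] onTimesPerTrace = { { 3, 1 }, { 5 }, { 2, 2, 4 } };
        double exposureTime = 0.1; // seconds per frame

        DescriptiveStatistics tOn = new DescriptiveStatistics();
        DescriptiveStatistics totalTOn = new DescriptiveStatistics();
        for (int[] onTimes : onTimesPerTrace) {
            double sum = 0;
            for (int t : onTimes) {
                tOn.addValue(t * exposureTime);   // per-burst on-time
                sum += t * exposureTime;
            }
            totalTOn.addValue(sum);               // per-trace total on-time
        }
        // DescriptiveStatistics stores its values (as StoredDataStatistics is
        // used for above), so percentiles are available as well as the mean.
        System.out.printf("mean t-on = %.3f s, median total t-on = %.3f s%n",
                tOn.getMean(), totalTOn.getPercentile(50));
    }
}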
Use of org.apache.commons.math3.ml.clustering.Cluster in project incubator-systemml by apache.
The class DataGenMR, method runJob.
/**
* <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
*
* @param inst MR job instruction
* @param dataGenInstructions array of data gen instructions
* @param instructionsInMapper instructions in mapper
* @param aggInstructionsInReducer aggregate instructions in reducer
* @param otherInstructionsInReducer other instructions in reducer
* @param numReducers number of reducers
* @param replication file replication
* @param resultIndexes result indexes for each random object
* @param dimsUnknownFilePrefix file path prefix when dimensions unknown
* @param outputs output file for each random object
* @param outputInfos output information for each random object
* @return matrix characteristics for each random object
* @throws Exception if Exception occurs
*/
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
JobConf job = new JobConf(DataGenMR.class);
job.setJobName("DataGen-MR");
// whether to use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, true);
byte[] realIndexes = new byte[dataGenInstructions.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
String[] inputs = new String[dataGenInstructions.length];
InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
long[] rlens = new long[dataGenInstructions.length];
long[] clens = new long[dataGenInstructions.length];
int[] brlens = new int[dataGenInstructions.length];
int[] bclens = new int[dataGenInstructions.length];
FileSystem fs = FileSystem.get(job);
String dataGenInsStr = "";
int numblocks = 0;
int maxbrlen = -1, maxbclen = -1;
double maxsparsity = -1;
for (int i = 0; i < dataGenInstructions.length; i++) {
dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
MRType mrtype = mrins.getMRInstructionType();
DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
rlens[i] = genInst.getRows();
clens[i] = genInst.getCols();
brlens[i] = genInst.getRowsInBlock();
bclens[i] = genInst.getColsInBlock();
maxbrlen = Math.max(maxbrlen, brlens[i]);
maxbclen = Math.max(maxbclen, bclens[i]);
if (mrtype == MRType.Rand) {
RandInstruction randInst = (RandInstruction) mrins;
inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
PrintWriter pw = null;
try {
pw = new PrintWriter(fs.create(new Path(inputs[i])));
// reuse the StringBuilder to avoid repeated buffer re-allocations
StringBuilder sb = new StringBuilder();
// seed generation: one random long per block, drawn from a Well1024a RNG seeded from the instruction's seed
Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
for (long r = 0; r < Math.max(rlens[i], 1); r += brlens[i]) {
long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
for (long c = 0; c < Math.max(clens[i], 1); c += bclens[i]) {
long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
sb.append((r / brlens[i]) + 1);
sb.append(',');
sb.append((c / bclens[i]) + 1);
sb.append(',');
sb.append(curBlockRowSize);
sb.append(',');
sb.append(curBlockColSize);
sb.append(',');
sb.append(bigrand.nextLong());
pw.println(sb.toString());
sb.setLength(0);
numblocks++;
}
}
} finally {
IOUtilFunctions.closeSilently(pw);
}
inputInfos[i] = InputInfo.TextCellInputInfo;
} else if (mrtype == MRType.Seq) {
SeqInstruction seqInst = (SeqInstruction) mrins;
inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
// always dense
maxsparsity = 1.0;
double from = seqInst.fromValue;
double to = seqInst.toValue;
double incr = seqInst.incrValue;
// handle the default increment (1), flipping it to -1 for the special case of from > to
incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
// Correctness checks on (from, to, incr)
boolean neg = (from > to);
if (incr == 0)
throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
if (neg != (incr < 0))
throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
// Compute the number of rows in the sequence
long numrows = UtilFunctions.getSeqLength(from, to, incr);
if (rlens[i] > 0) {
if (numrows != rlens[i])
throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
} else {
rlens[i] = numrows;
}
if (clens[i] > 0 && clens[i] != 1)
throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
else
clens[i] = 1;
PrintWriter pw = null;
try {
pw = new PrintWriter(fs.create(new Path(inputs[i])));
StringBuilder sb = new StringBuilder();
double temp = from;
double block_from, block_to;
for (long r = 0; r < rlens[i]; r += brlens[i]) {
long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
// block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
long bid_i = ((r / brlens[i]) + 1);
long bid_j = 1;
block_from = temp;
block_to = temp + (curBlockRowSize - 1) * incr;
// next block starts from here
temp = block_to + incr;
sb.append(bid_i);
sb.append(',');
sb.append(bid_j);
sb.append(',');
sb.append(block_from);
sb.append(',');
sb.append(block_to);
sb.append(',');
sb.append(incr);
pw.println(sb.toString());
sb.setLength(0);
numblocks++;
}
} finally {
IOUtilFunctions.closeSilently(pw);
}
inputInfos[i] = InputInfo.TextCellInputInfo;
} else {
throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
}
}
// remove the first ","
dataGenInsStr = dataGenInsStr.substring(1);
RunningJob runjob;
MatrixCharacteristics[] stats;
try {
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
// set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
// set up the rand Instructions
MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
// set up unary instructions that will perform in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
// set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
// set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
// set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// determine degree of parallelism (nmappers: 1<=n<=capacity)
// TODO use maxsparsity whenever we have a way of generating sparse rand data
int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
// correct the max number of mappers on YARN clusters
if (InfrastructureAnalyzer.isYarnEnabled())
capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
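// Heuristic: at 8 bytes per cell, 8 * maxbrlen * maxbclen * numblocks approximates the volume of generated data; target roughly one mapper per DFS block of it, clamped to [1, capacity]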
int nmappers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
job.setNumMapTasks(nmappers);
// set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
stats = ret.stats;
// set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// print the complete MRJob instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
byte[] resultDimsUnknown = new byte[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) {
if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
resultDimsUnknown[i] = (byte) 1;
} else {
resultDimsUnknown[i] = (byte) 0;
}
}
boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
// configure mapper and the mapper output key value pairs
job.setMapperClass(DataGenMapper.class);
if (numReducers == 0) {
job.setMapOutputKeyClass(Writable.class);
job.setMapOutputValueClass(Writable.class);
} else {
job.setMapOutputKeyClass(MatrixIndexes.class);
job.setMapOutputValueClass(TaggedMatrixBlock.class);
}
// set up combiner
if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
job.setCombinerClass(GMRCombiner.class);
// configure reducer
job.setReducerClass(GMRReducer.class);
// job.setReducerClass(PassThroughReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
runjob = JobClient.runJob(job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
stats = MapReduceTool.processDimsFiles(dir, stats);
MapReduceTool.deleteFileIfExistOnHDFS(dir);
} finally {
for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
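
The Seq branch above describes each row block by (block id, block_from, block_to, incr). The partitioning arithmetic can be sketched standalone as below; UtilFunctions.getSeqLength is a SystemML helper, so it is re-derived here under the assumed (and usual) definition floor((to - from) / incr) + 1, and the class name is invented.

public class SeqBlockDemo {
    // Number of terms in seq(from, to, incr), assuming the usual definition.
    static long seqLength(double from, double to, double incr) {
        return 1L + (long) Math.floor((to - from) / incr);
    }

    public static void main(String[] args) {
        double from = 1, to = 10, incr = 2; // seq: 1, 3, 5, 7, 9
        int brlen = 2;                      // rows per block
        long numrows = seqLength(from, to, incr);
        double temp = from;
        for (long r = 0; r < numrows; r += brlen) {
            long curBlockRowSize = Math.min(brlen, numrows - r);
            // Each block covers the closed interval [blockFrom, blockTo],
            // mirroring the block_from/block_to bookkeeping in runJob.
            double blockFrom = temp;
            double blockTo = temp + (curBlockRowSize - 1) * incr;
            temp = blockTo + incr; // the next block starts one increment later
            System.out.printf("block %d: [%s, %s] step %s%n",
                    r / brlen + 1, blockFrom, blockTo, incr);
        }
    }
}

Each mapper can then expand its block from these five numbers alone, which is why runJob only writes one small descriptor line per block rather than the data itself.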
Use of org.apache.commons.math3.ml.clustering.Cluster in project pyramid by cheng-li.
The class MultiLabelSynthesizer, method sampleFromMix.
/**
 * Samples from a mixture of two clusters. Within cluster c, label y is assigned when w.x >= 0, with weights:
 * C0, y0: w=(0,1)
 * C0, y1: w=(1,1)
 * C1, y0: w=(1,0)
 * C1, y1: w=(1,-1)
 * @return a multi-label dataset sampled from the two-cluster mixture
 */
public static MultiLabelClfDataSet sampleFromMix() {
int numData = 10000;
int numClass = 2;
int numFeature = 2;
int numClusters = 2;
double[] proportions = { 0.4, 0.6 };
int[] indices = { 0, 1 };
MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature).numClasses(numClass).numDataPoints(numData).build();
// generate weights
Vector[][] weights = new Vector[numClusters][numClass];
for (int c = 0; c < numClusters; c++) {
for (int l = 0; l < numClass; l++) {
Vector vector = new DenseVector(numFeature);
weights[c][l] = vector;
}
}
weights[0][0].set(0, 0);
weights[0][0].set(1, 1);
weights[0][1].set(0, 1);
weights[0][1].set(1, 1);
weights[1][0].set(0, 1);
weights[1][0].set(1, 0);
weights[1][1].set(0, 1);
weights[1][1].set(1, -1);
// generate features
for (int i = 0; i < numData; i++) {
for (int j = 0; j < numFeature; j++) {
dataSet.setFeatureValue(i, j, Sampling.doubleUniform(-1, 1));
}
}
IntegerDistribution distribution = new EnumeratedIntegerDistribution(indices, proportions);
// assign labels
for (int i = 0; i < numData; i++) {
int cluster = distribution.sample();
System.out.println("cluster " + cluster);
for (int l = 0; l < numClass; l++) {
System.out.println("row = " + dataSet.getRow(i));
System.out.println("weight = " + weights[cluster][l]);
double dot = weights[cluster][l].dot(dataSet.getRow(i));
System.out.println("dot = " + dot);
if (dot >= 0) {
dataSet.addLabel(i, l);
}
}
}
return dataSet;
}
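
The cluster assignment above uses commons-math3's EnumeratedIntegerDistribution. A minimal sketch of that piece in isolation (the class name is invented; the indices and proportions mirror the ones above):

import org.apache.commons.math3.distribution.EnumeratedIntegerDistribution;

public class MixtureSampleDemo {
    public static void main(String[] args) {
        // Cluster 0 with probability 0.4, cluster 1 with probability 0.6.
        int[] indices = { 0, 1 };
        double[] proportions = { 0.4, 0.6 };
        EnumeratedIntegerDistribution distribution =
                new EnumeratedIntegerDistribution(indices, proportions);

        int[] counts = new int[2];
        for (int i = 0; i < 10000; i++) {
            counts[distribution.sample()]++;
        }
        // Expect counts near 4000 and 6000.
        System.out.println("cluster 0: " + counts[0] + ", cluster 1: " + counts[1]);
    }
}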