Example 11 with Cluster

Use of org.apache.commons.math3.ml.clustering.Cluster in project hadoop by apache.

The class TestClusterTopology, method testChooseRandom.

/**
   * Test how well we pick random nodes.
   */
@Test
public void testChooseRandom() {
    // create the topology
    NetworkTopology cluster = NetworkTopology.getInstance(new Configuration());
    NodeElement node1 = getNewNode("node1", "/d1/r1");
    cluster.add(node1);
    NodeElement node2 = getNewNode("node2", "/d1/r2");
    cluster.add(node2);
    NodeElement node3 = getNewNode("node3", "/d1/r3");
    cluster.add(node3);
    NodeElement node4 = getNewNode("node4", "/d1/r3");
    cluster.add(node4);
    // Number of iterations to do the test
    int numIterations = 100;
    // Pick random nodes
    HashMap<String, Integer> histogram = new HashMap<String, Integer>();
    for (int i = 0; i < numIterations; i++) {
        String randomNode = cluster.chooseRandom(NodeBase.ROOT).getName();
        if (!histogram.containsKey(randomNode)) {
            histogram.put(randomNode, 0);
        }
        histogram.put(randomNode, histogram.get(randomNode) + 1);
    }
    assertEquals("Random is not selecting all nodes", 4, histogram.size());
    // Check with 99% confidence (alpha = 0.01, since confidence = 100 * (1 - alpha))
    ChiSquareTest chiSquareTest = new ChiSquareTest();
    double[] expected = new double[histogram.size()];
    long[] observed = new long[histogram.size()];
    int j = 0;
    for (Integer occurrence : histogram.values()) {
        expected[j] = 1.0 * numIterations / histogram.size();
        observed[j] = occurrence;
        j++;
    }
    boolean chiSquareTestRejected = chiSquareTest.chiSquareTest(expected, observed, 0.01);
    // Check that they have the proper distribution
    assertFalse("Not choosing nodes randomly", chiSquareTestRejected);
    // Pick random nodes excluding the 2 nodes in /d1/r3
    histogram = new HashMap<String, Integer>();
    for (int i = 0; i < numIterations; i++) {
        String randomNode = cluster.chooseRandom("~/d1/r3").getName();
        if (!histogram.containsKey(randomNode)) {
            histogram.put(randomNode, 0);
        }
        histogram.put(randomNode, histogram.get(randomNode) + 1);
    }
    assertEquals("Random is not selecting the nodes it should", 2, histogram.size());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ChiSquareTest(org.apache.commons.math3.stat.inference.ChiSquareTest) Test(org.junit.Test)
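
The uniformity check above can be reproduced in isolation. Below is a minimal, self-contained sketch of the same commons-math ChiSquareTest usage; the observed counts are hypothetical stand-ins for the histogram built in the test.

import org.apache.commons.math3.stat.inference.ChiSquareTest;

public class UniformSelectionCheck {
    public static void main(String[] args) {
        // Hypothetical counts from 100 random picks over 4 nodes
        long[] observed = { 28, 22, 25, 25 };
        int numIterations = 100;
        // Under the null hypothesis of uniform selection, each node is
        // expected to be picked numIterations / observed.length times
        double[] expected = new double[observed.length];
        for (int i = 0; i < expected.length; i++) {
            expected[i] = (double) numIterations / observed.length;
        }
        ChiSquareTest test = new ChiSquareTest();
        // true => uniformity rejected at the alpha = 0.01 significance level
        boolean rejected = test.chiSquareTest(expected, observed, 0.01);
        System.out.println("Uniformity rejected: " + rejected);
        // The two-argument overload returns the p-value directly
        System.out.println("p-value: " + test.chiSquareTest(expected, observed));
    }
}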

Example 12 with Cluster

Use of org.apache.commons.math3.ml.clustering.Cluster in project GDSC-SMLM by aherbert.

The class PCPALMFitting, method fitClusteredModel.

/**
	 * Fits the correlation curve with r > 0 to the clustered model using the estimated density and precision.
	 * Parameters must be fit within a tolerance of the starting values.
	 *
	 * @param gr
	 *            The correlation curve g(r)
	 * @param sigmaS
	 *            The estimated precision
	 * @param proteinDensity
	 *            The estimated protein density
	 * @param resultColour
	 *            The colour used when adding the fit result
	 * @return The fitted parameters [precision, density, clusterRadius, clusterDensity]
	 */
private double[] fitClusteredModel(double[][] gr, double sigmaS, double proteinDensity, String resultColour) {
    final ClusteredModelFunctionGradient function = new ClusteredModelFunctionGradient();
    clusteredModel = function;
    log("Fitting %s: Estimated precision = %f nm, estimated protein density = %g um^-2", clusteredModel.getName(), sigmaS, proteinDensity * 1e6);
    clusteredModel.setLogging(true);
    for (int i = offset; i < gr[0].length; i++) {
        // Only fit the curve above the estimated resolution (points below it will be subject to error)
        if (gr[0][i] > sigmaS * fitAboveEstimatedPrecision)
            clusteredModel.addPoint(gr[0][i], gr[1][i]);
    }
    double[] parameters;
    // The model is: sigma, density, range, amplitude
    double[] initialSolution = new double[] { sigmaS, proteinDensity, sigmaS * 5, 1 };
    int evaluations = 0;
    // Constrain the fitting to be close to the estimated precision (sigmaS) and protein density.
    // LVM fitting does not support constrained fitting so use a bounded optimiser.
    SumOfSquaresModelFunction clusteredModelMulti = new SumOfSquaresModelFunction(clusteredModel);
    double[] x = clusteredModelMulti.x;
    // Put some bounds around the initial guess. Use the fitting tolerance (in %) if provided.
    double limit = (fittingTolerance > 0) ? 1 + fittingTolerance / 100 : 2;
    double[] lB = new double[] { initialSolution[0] / limit, initialSolution[1] / limit, 0, 0 };
    // The amplitude and range should not extend beyond the limits of the g(r) curve.
    double[] uB = new double[] { initialSolution[0] * limit, initialSolution[1] * limit, Maths.max(x), Maths.max(gr[1]) };
    log("Fitting %s using a bounded search: %s < precision < %s & %s < density < %s", clusteredModel.getName(), Utils.rounded(lB[0], 4), Utils.rounded(uB[0], 4), Utils.rounded(lB[1] * 1e6, 4), Utils.rounded(uB[1] * 1e6, 4));
    PointValuePair constrainedSolution = runBoundedOptimiser(gr, initialSolution, lB, uB, clusteredModelMulti);
    if (constrainedSolution == null)
        return null;
    parameters = constrainedSolution.getPointRef();
    evaluations = boundedEvaluations;
    // Refit using an LVM (Levenberg-Marquardt) gradient optimiser
    if (useLSE) {
        log("Re-fitting %s using a gradient optimisation", clusteredModel.getName());
        LevenbergMarquardtOptimizer optimizer = new LevenbergMarquardtOptimizer();
        Optimum lvmSolution;
        try {
            //@formatter:off
            LeastSquaresProblem problem = new LeastSquaresBuilder()
                    .maxEvaluations(Integer.MAX_VALUE)
                    .maxIterations(3000)
                    .start(parameters)
                    .target(function.getY())
                    .weight(new DiagonalMatrix(function.getWeights()))
                    .model(function, new MultivariateMatrixFunction() {
                        public double[][] value(double[] point) throws IllegalArgumentException {
                            return function.jacobian(point);
                        }
                    })
                    .build();
            //@formatter:on
            lvmSolution = optimizer.optimize(problem);
            evaluations += lvmSolution.getEvaluations();
            double ss = lvmSolution.getResiduals().dotProduct(lvmSolution.getResiduals());
            if (ss < constrainedSolution.getValue()) {
                log("Re-fitting %s improved the SS from %s to %s (-%s%%)", clusteredModel.getName(), Utils.rounded(constrainedSolution.getValue(), 4), Utils.rounded(ss, 4), Utils.rounded(100 * (constrainedSolution.getValue() - ss) / constrainedSolution.getValue(), 4));
                parameters = lvmSolution.getPoint().toArray();
            }
        } catch (TooManyIterationsException e) {
            log("Failed to re-fit %s: Too many iterations (%s)", clusteredModel.getName(), e.getMessage());
        } catch (ConvergenceException e) {
            log("Failed to re-fit %s: %s", clusteredModel.getName(), e.getMessage());
        }
    }
    clusteredModel.setLogging(false);
    // Ensure the width is positive
    parameters[0] = Math.abs(parameters[0]);
    //parameters[2] = Math.abs(parameters[2]);
    double ss = 0;
    double[] obs = clusteredModel.getY();
    double[] exp = clusteredModel.value(parameters);
    for (int i = 0; i < obs.length; i++) ss += (obs[i] - exp[i]) * (obs[i] - exp[i]);
    ic2 = Maths.getAkaikeInformationCriterionFromResiduals(ss, clusteredModel.size(), parameters.length);
    final double fitSigmaS = parameters[0];
    final double fitProteinDensity = parameters[1];
    //The radius of the cluster domain
    final double domainRadius = parameters[2];
    //The density of the cluster domain
    final double domainDensity = parameters[3];
    // This is from the PC-PALM paper. However, that paper fits the g(r)_protein exponential convolved in 2D
    // with the g(r)_PSF. In this method we have only fit the exponential.
    final double nCluster = 2 * domainDensity * Math.PI * domainRadius * domainRadius * fitProteinDensity;
    double e1 = parameterDrift(sigmaS, fitSigmaS);
    double e2 = parameterDrift(proteinDensity, fitProteinDensity);
    log("  %s fit: SS = %f. cAIC = %f. %d evaluations", clusteredModel.getName(), ss, ic2, evaluations);
    log("  %s parameters:", clusteredModel.getName());
    log("    Average precision = %s nm (%s%%)", Utils.rounded(fitSigmaS, 4), Utils.rounded(e1, 4));
    log("    Average protein density = %s um^-2 (%s%%)", Utils.rounded(fitProteinDensity * 1e6, 4), Utils.rounded(e2, 4));
    log("    Domain radius = %s nm", Utils.rounded(domainRadius, 4));
    log("    Domain density = %s", Utils.rounded(domainDensity, 4));
    log("    nCluster = %s", Utils.rounded(nCluster, 4));
    // Check the fitted parameters are within tolerance of the initial estimates
    valid2 = true;
    if (fittingTolerance > 0 && (Math.abs(e1) > fittingTolerance || Math.abs(e2) > fittingTolerance)) {
        log("  Failed to fit %s within tolerance (%s%%): Average precision = %f nm (%s%%), average protein density = %g um^-2 (%s%%)", clusteredModel.getName(), Utils.rounded(fittingTolerance, 4), fitSigmaS, Utils.rounded(e1, 4), fitProteinDensity * 1e6, Utils.rounded(e2, 4));
        valid2 = false;
    }
    // Check extra parameters. Domain radius should be higher than the precision. Density should be positive
    if (domainRadius < fitSigmaS) {
        log("  Failed to fit %s: Domain radius is smaller than the average precision (%s < %s)", clusteredModel.getName(), Utils.rounded(domainRadius, 4), Utils.rounded(fitSigmaS, 4));
        valid2 = false;
    }
    if (domainDensity < 0) {
        log("  Failed to fit %s: Domain density is negative (%s)", clusteredModel.getName(), Utils.rounded(domainDensity, 4));
        valid2 = false;
    }
    if (ic2 > ic1) {
        log("  Failed to fit %s - Information Criterion has increased %s%%", clusteredModel.getName(), Utils.rounded((100 * (ic2 - ic1) / ic1), 4));
        valid2 = false;
    }
    addResult(clusteredModel.getName(), resultColour, valid2, fitSigmaS, fitProteinDensity, domainRadius, domainDensity, nCluster, 0, ic2);
    return parameters;
}
Also used : PointValuePair(org.apache.commons.math3.optim.PointValuePair) LeastSquaresBuilder(org.apache.commons.math3.fitting.leastsquares.LeastSquaresBuilder) Optimum(org.apache.commons.math3.fitting.leastsquares.LeastSquaresOptimizer.Optimum) LevenbergMarquardtOptimizer(org.apache.commons.math3.fitting.leastsquares.LevenbergMarquardtOptimizer) DiagonalMatrix(org.apache.commons.math3.linear.DiagonalMatrix) ConvergenceException(org.apache.commons.math3.exception.ConvergenceException) TooManyIterationsException(org.apache.commons.math3.exception.TooManyIterationsException) LeastSquaresProblem(org.apache.commons.math3.fitting.leastsquares.LeastSquaresProblem) MultivariateMatrixFunction(org.apache.commons.math3.analysis.MultivariateMatrixFunction)
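
runBoundedOptimiser is not shown on this page. As a rough sketch of the same idea (a derivative-free, box-bounded search standing in where plain Levenberg-Marquardt cannot apply constraints), here is a minimal example using commons-math's BOBYQAOptimizer on a two-parameter exponential model; the data, model, bounds and initial guess are all invented for illustration, not the PC-PALM clustered model itself.

import org.apache.commons.math3.analysis.MultivariateFunction;
import org.apache.commons.math3.optim.InitialGuess;
import org.apache.commons.math3.optim.MaxEval;
import org.apache.commons.math3.optim.PointValuePair;
import org.apache.commons.math3.optim.SimpleBounds;
import org.apache.commons.math3.optim.nonlinear.scalar.GoalType;
import org.apache.commons.math3.optim.nonlinear.scalar.ObjectiveFunction;
import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.BOBYQAOptimizer;

public class BoundedFitSketch {
    public static void main(String[] args) {
        // Hypothetical observations of y = a * exp(-x / b) with a ~ 2, b ~ 3
        final double[] x = { 0, 1, 2, 3, 4, 5 };
        final double[] y = { 2.05, 1.41, 1.02, 0.75, 0.52, 0.38 };

        // Sum-of-squares objective over the two model parameters [a, b]
        MultivariateFunction ss = point -> {
            double sum = 0;
            for (int i = 0; i < x.length; i++) {
                double r = y[i] - point[0] * Math.exp(-x[i] / point[1]);
                sum += r * r;
            }
            return sum;
        };

        // BOBYQA is derivative-free and honours simple box constraints;
        // 5 = 2n+1 interpolation points is the recommended choice for n = 2
        BOBYQAOptimizer optimizer = new BOBYQAOptimizer(5, 1.0, 1e-8);
        PointValuePair solution = optimizer.optimize(
                new MaxEval(1000),
                new ObjectiveFunction(ss),
                GoalType.MINIMIZE,
                new InitialGuess(new double[] { 1, 1 }),
                new SimpleBounds(new double[] { 0.1, 0.1 }, new double[] { 10, 10 }));

        System.out.printf("a = %.3f, b = %.3f, SS = %.5f%n",
                solution.getPoint()[0], solution.getPoint()[1], solution.getValue());
    }
}

As in fitClusteredModel, the bounded solution could then seed an unconstrained Levenberg-Marquardt refit, keeping the refit only if it lowers the sum of squares.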

Example 13 with Cluster

Use of org.apache.commons.math3.ml.clustering.Cluster in project GDSC-SMLM by aherbert.

The class TraceMolecules, method summarise.

private void summarise(Trace[] traces, int filtered, double dThreshold, double tThreshold) {
    IJ.showStatus("Calculating summary ...");
    // Create summary table
    createSummaryTable();
    Statistics[] stats = new Statistics[NAMES.length];
    for (int i = 0; i < stats.length; i++) {
        stats[i] = (settings.showHistograms || settings.saveTraceData) ? new StoredDataStatistics() : new Statistics();
    }
    int singles = 0;
    for (Trace trace : traces) {
        int nBlinks = trace.getNBlinks() - 1;
        stats[BLINKS].add(nBlinks);
        int[] onTimes = trace.getOnTimes();
        int[] offTimes = trace.getOffTimes();
        double tOn = 0;
        for (int t : onTimes) {
            stats[T_ON].add(t * exposureTime);
            tOn += t * exposureTime;
        }
        stats[TOTAL_T_ON].add(tOn);
        if (offTimes != null) {
            double tOff = 0;
            for (int t : offTimes) {
                stats[T_OFF].add(t * exposureTime);
                tOff += t * exposureTime;
            }
            stats[TOTAL_T_OFF].add(tOff);
        }
        final double signal = trace.getSignal() / results.getGain();
        stats[TOTAL_SIGNAL].add(signal);
        stats[SIGNAL_PER_FRAME].add(signal / trace.size());
        if (trace.size() == 1)
            singles++;
    }
    // Add to the summary table
    StringBuilder sb = new StringBuilder();
    sb.append(results.getName()).append("\t");
    sb.append(outputName.equals("Cluster") ? settings.getClusteringAlgorithm() : settings.getTraceMode()).append("\t");
    sb.append(Utils.rounded(exposureTime * 1000, 3)).append("\t");
    sb.append(Utils.rounded(dThreshold, 3)).append("\t");
    sb.append(Utils.rounded(tThreshold, 3));
    if (settings.splitPulses)
        sb.append(" *");
    sb.append("\t");
    sb.append(timeInFrames2(tThreshold)).append("\t");
    sb.append(traces.length).append("\t");
    sb.append(filtered).append("\t");
    sb.append(singles).append("\t");
    sb.append(traces.length - singles).append("\t");
    for (int i = 0; i < stats.length; i++) {
        sb.append(Utils.rounded(stats[i].getMean(), 3)).append("\t");
    }
    if (java.awt.GraphicsEnvironment.isHeadless()) {
        IJ.log(sb.toString());
        return;
    } else {
        summaryTable.append(sb.toString());
    }
    if (settings.showHistograms) {
        IJ.showStatus("Calculating histograms ...");
        int[] idList = new int[NAMES.length];
        int count = 0;
        boolean requireRetile = false;
        for (int i = 0; i < NAMES.length; i++) {
            if (displayHistograms[i]) {
                idList[count++] = Utils.showHistogram(TITLE, (StoredDataStatistics) stats[i], NAMES[i], (integerDisplay[i]) ? 1 : 0, (settings.removeOutliers || alwaysRemoveOutliers[i]) ? 2 : 0, settings.histogramBins);
                requireRetile = requireRetile || Utils.isNewWindow();
            }
        }
        if (count > 0 && requireRetile) {
            idList = Arrays.copyOf(idList, count);
            new WindowOrganiser().tileWindows(idList);
        }
    }
    if (settings.saveTraceData) {
        saveTraceData(stats);
    }
    IJ.showStatus("");
}
Also used : Trace(gdsc.smlm.results.Trace) StoredDataStatistics(gdsc.core.utils.StoredDataStatistics) WindowOrganiser(ij.plugin.WindowOrganiser) Statistics(gdsc.core.utils.Statistics) SummaryStatistics(org.apache.commons.math3.stat.descriptive.SummaryStatistics) ClusterPoint(gdsc.core.clustering.ClusterPoint)
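
The per-trace accumulation above is a standard two-level pattern: add each on-time burst to one statistic and each trace total to another. A minimal sketch of that pattern with commons-math's SummaryStatistics (which appears in the imports); the trace data and exposure time are hypothetical.

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;

public class OnTimeSummary {
    public static void main(String[] args) {
        // Hypothetical on-times (in frames) for three traces
        int[][] onTimes = { { 3, 1, 2 }, { 5 }, { 2, 2 } };
        double exposureTime = 0.1; // seconds per frame (hypothetical)

        SummaryStatistics perBurst = new SummaryStatistics(); // like stats[T_ON]
        SummaryStatistics perTrace = new SummaryStatistics(); // like stats[TOTAL_T_ON]
        for (int[] trace : onTimes) {
            double total = 0;
            for (int t : trace) {
                perBurst.addValue(t * exposureTime);
                total += t * exposureTime;
            }
            perTrace.addValue(total);
        }
        System.out.printf("Mean on-time per burst = %.3f s%n", perBurst.getMean());
        System.out.printf("Mean total on-time per trace = %.3f s%n", perTrace.getMean());
    }
}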

Example 14 with Cluster

Use of org.apache.commons.math3.ml.clustering.Cluster in project incubator-systemml by apache.

The class DataGenMR, method runJob.

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");
    // whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);
    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];
    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;
    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRType mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();
        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);
        if (mrtype == MRType.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                // for object reuse, preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();
                // seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                for (long r = 0; r < Math.max(rlens[i], 1); r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < Math.max(clens[i], 1); c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRType.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            // always dense
            maxsparsity = 1.0;
            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;
            // handle the default increment of 1 to -1 for the special case of from > to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
            // Compute the number of rows in the sequence
            long numrows = UtilFunctions.getSeqLength(from, to, incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();
                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    // next block starts from here
                    temp = block_to + incr;
                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    // remove the leading instruction delimiter
    dataGenInsStr = dataGenInsStr.substring(1);
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        // set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        // set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
        // set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
        // set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        // set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
        // set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
        // set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
        // set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
        // set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        // set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        // set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        // determine degree of parallelism (nmappers: 1<=n<=capacity)
        // TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        // correct the max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);
        // set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;
        // set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);
        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }
        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
        // set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }
        // set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);
        // configure reducer
        job.setReducerClass(GMRReducer.class);
        // job.setReducerClass(PassThroughReducer.class);
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }
        // set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        runjob = JobClient.runJob(job);
        /* Process different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }
        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    } finally {
        for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) GMRCombiner(org.apache.sysml.runtime.matrix.mapred.GMRCombiner) FileSystem(org.apache.hadoop.fs.FileSystem) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) JobConf(org.apache.hadoop.mapred.JobConf) PrintWriter(java.io.PrintWriter) Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) SeqInstruction(org.apache.sysml.runtime.instructions.mr.SeqInstruction) RandInstruction(org.apache.sysml.runtime.instructions.mr.RandInstruction) MRType(org.apache.sysml.runtime.instructions.mr.MRInstruction.MRType) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) Well1024a(org.apache.commons.math3.random.Well1024a)
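
The block bookkeeping in the Seq branch above is easy to check by hand: each block of brlens rows covers a slice of the sequence starting at the running value temp, and the next block resumes at block_to + incr. The following standalone sketch replays that arithmetic for a hypothetical seq(1, 10, 1) blocked into 4 rows, using the standard sequence length floor((to - from) / incr) + 1 in place of UtilFunctions.getSeqLength.

public class SeqBlockSketch {
    public static void main(String[] args) {
        // Hypothetical sequence seq(from, to, incr) blocked into brlen rows
        double from = 1, to = 10, incr = 1;
        int brlen = 4;

        // Number of rows: floor((to - from) / incr) + 1
        long numrows = (long) Math.floor((to - from) / incr) + 1;

        double temp = from;
        for (long r = 0; r < numrows; r += brlen) {
            long curBlockRowSize = Math.min(brlen, numrows - r);
            long bid_i = (r / brlen) + 1; // 1-based block row index
            double block_from = temp;
            double block_to = temp + (curBlockRowSize - 1) * incr;
            temp = block_to + incr; // next block starts here
            // Matches the "bid_i,1,block_from,block_to,incr" descriptor lines
            System.out.printf("%d,1,%s,%s,%s%n", bid_i, block_from, block_to, incr);
        }
        // For seq(1, 10, 1) with brlen = 4 this yields blocks
        // covering 1..4, 5..8 and 9..10
    }
}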

Example 15 with Cluster

Use of org.apache.commons.math3.ml.clustering.Cluster in project pyramid by cheng-li.

The class MultiLabelSynthesizer, method sampleFromMix.

/**
     * Samples a dataset from a two-cluster mixture. The weight vectors are:
     * C0, y0: w=(0,1)
     * C0, y1: w=(1,1)
     * C1, y0: w=(1,0)
     * C1, y1: w=(1,-1)
     * @return the sampled multi-label dataset
     */
public static MultiLabelClfDataSet sampleFromMix() {
    int numData = 10000;
    int numClass = 2;
    int numFeature = 2;
    int numClusters = 2;
    double[] proportions = { 0.4, 0.6 };
    int[] indices = { 0, 1 };
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature).numClasses(numClass).numDataPoints(numData).build();
    // generate weights
    Vector[][] weights = new Vector[numClusters][numClass];
    for (int c = 0; c < numClusters; c++) {
        for (int l = 0; l < numClass; l++) {
            Vector vector = new DenseVector(numFeature);
            weights[c][l] = vector;
        }
    }
    weights[0][0].set(0, 0);
    weights[0][0].set(1, 1);
    weights[0][1].set(0, 1);
    weights[0][1].set(1, 1);
    weights[1][0].set(0, 1);
    weights[1][0].set(1, 0);
    weights[1][1].set(0, 1);
    weights[1][1].set(1, -1);
    // generate features
    for (int i = 0; i < numData; i++) {
        for (int j = 0; j < numFeature; j++) {
            dataSet.setFeatureValue(i, j, Sampling.doubleUniform(-1, 1));
        }
    }
    IntegerDistribution distribution = new EnumeratedIntegerDistribution(indices, proportions);
    // assign labels
    for (int i = 0; i < numData; i++) {
        int cluster = distribution.sample();
        System.out.println("cluster " + cluster);
        for (int l = 0; l < numClass; l++) {
            System.out.println("row = " + dataSet.getRow(i));
            System.out.println("weight = " + weights[cluster][l]);
            double dot = weights[cluster][l].dot(dataSet.getRow(i));
            System.out.println("dot = " + dot);
            if (dot >= 0) {
                dataSet.addLabel(i, l);
            }
        }
    }
    return dataSet;
}
Also used : EnumeratedIntegerDistribution(org.apache.commons.math3.distribution.EnumeratedIntegerDistribution) IntegerDistribution(org.apache.commons.math3.distribution.IntegerDistribution) DenseVector(org.apache.mahout.math.DenseVector) Vector(org.apache.mahout.math.Vector) MultiLabelClfDataSet(edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet)
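
The labelling rule above (assign label l when the cluster's weight vector has a non-negative dot product with the feature vector) can be verified by hand. Here is a minimal, dependency-free sketch with the same four weight vectors from the javadoc; the sample point is hypothetical.

public class MixLabelSketch {
    public static void main(String[] args) {
        // Same weights as the javadoc: weights[cluster][label] = (w0, w1)
        double[][][] weights = {
            { { 0, 1 }, { 1, 1 } },   // cluster 0: y0 = (0,1), y1 = (1,1)
            { { 1, 0 }, { 1, -1 } }   // cluster 1: y0 = (1,0), y1 = (1,-1)
        };
        double[] x = { 0.5, -0.5 }; // hypothetical feature vector in [-1,1]^2
        for (int cluster = 0; cluster < 2; cluster++) {
            for (int l = 0; l < 2; l++) {
                double dot = weights[cluster][l][0] * x[0] + weights[cluster][l][1] * x[1];
                // Label l is assigned when the dot product is non-negative
                System.out.printf("cluster %d, y%d: dot = %.2f -> %s%n",
                        cluster, l, dot, dot >= 0 ? "label" : "no label");
            }
        }
        // cluster 0: y0 dot = -0.5 (no), y1 dot = 0.0 (yes)
        // cluster 1: y0 dot = 0.5 (yes), y1 dot = 1.0 (yes)
    }
}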

Aggregations

ArrayList (java.util.ArrayList): 7
ClusterPoint (gdsc.core.clustering.ClusterPoint): 5
Plot2 (ij.gui.Plot2): 3
WeightedObservedPoint (org.apache.commons.math3.fitting.WeightedObservedPoint): 3
PointValuePair (org.apache.commons.math3.optim.PointValuePair): 3
Material (com.jme3.material.Material): 2
ColorRGBA (com.jme3.math.ColorRGBA): 2
Geometry (com.jme3.scene.Geometry): 2
Mesh (com.jme3.scene.Mesh): 2
Node (com.jme3.scene.Node): 2
VertexBuffer (com.jme3.scene.VertexBuffer): 2
TDoubleArrayList (gnu.trove.list.array.TDoubleArrayList): 2
PrintWriter (java.io.PrintWriter): 2
Collections (java.util.Collections): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
List (java.util.List): 2
Map (java.util.Map): 2
Set (java.util.Set): 2
Collectors (java.util.stream.Collectors): 2