Search in sources :

Example 1 with Well1024a

Use of org.apache.commons.math3.random.Well1024a in project incubator-systemml by apache.

In class RandSPInstruction, method generateRandData.

/**
 * Executes the rand operation and binds the result to the output variable.
 *
 * <p>If {@code isMemAvail} accepts the requested dimensions and the runtime platform is not
 * forced to Spark, the matrix is generated as a single in-memory block. Otherwise one seed per
 * matrix block is drawn on the driver (kept in memory for few blocks, written to a seed file
 * for many blocks) and the blocks are generated by mapping over the resulting seed RDD.
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the seed file cannot be created or completely written
 */
private void generateRandData(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    long lcols = sec.getScalarInput(cols).getLongValue();

    // step 1: generate pseudo-random seed (because not specified); one seed per invocation
    long lSeed = seed;
    if (lSeed == DataGenOp.UNSPECIFIED_SEED)
        lSeed = DataGenOp.generateRandomSeed();
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");

    // step 2: potential in-memory rand operations if applicable
    if (isMemAvail(lrows, lcols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
        RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) lrows, (int) lcols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
        MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);
        sec.setMatrixOutput(output.getName(), mb, getExtendedOpcode());
        // the instruction did not run on Spark, so take it out of the Spark instruction count
        Statistics.decrementNoOfExecutedSPInst();
        return;
    }

    // step 3: seed generation
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
    // overestimate for on disk, ensures hdfs block per partition
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(lrows, lcols, rowsInBlock, colsInBlock, sparsity);
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    MatrixCharacteristics tmp = new MatrixCharacteristics(lrows, lcols, rowsInBlock, colsInBlock);
    long numBlocks = tmp.getNumBlocks();
    long numColBlocks = tmp.getNumColBlocks();
    // for load balancing: degree of parallelism such that each partition holds roughly one
    // HDFS block of output, bounded by the number of blocks and at least 1
    int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);

    JavaPairRDD<MatrixIndexes, Long> seedsRDD;
    if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
        // a) in-memory seed rdd construction
        ArrayList<Tuple2<MatrixIndexes, Long>> seeds = new ArrayList<>();
        for (long i = 0; i < numBlocks; i++) {
            // 1-based block indexes in row-major order
            long r = 1 + i / numColBlocks;
            long c = 1 + i % numColBlocks;
            MatrixIndexes indx = new MatrixIndexes(r, c);
            Long seedForBlock = bigrand.nextLong();
            seeds.add(new Tuple2<>(indx, seedForBlock));
        }
        seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
    } else {
        // b) file-based seed rdd construction (for robustness wrt large number of blocks)
        Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
        try {
            FileSystem fs = IOUtilFunctions.getFileSystem(path);
            try (PrintWriter pw = new PrintWriter(fs.create(path))) {
                StringBuilder sb = new StringBuilder();
                for (long i = 0; i < numBlocks; i++) {
                    // one line per block: rowBlockIndex,colBlockIndex,seed
                    sb.append(1 + i / numColBlocks);
                    sb.append(',');
                    sb.append(1 + i % numColBlocks);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    pw.println(sb.toString());
                    sb.setLength(0);
                }
                // PrintWriter never throws on write failures; without this check a truncated
                // seed file would silently yield a matrix with missing blocks
                if (pw.checkError())
                    throw new DMLRuntimeException("Failed to write seed file " + path + ".");
            }
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
        seedsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).mapToPair(new ExtractSeedTuple());
    }

    // step 4: execute rand instruction over seed input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(lrows, lcols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));

    // step 5: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown(true)) {
        // note: we cannot compute the nnz from sparsity because this would not reflect the
        // actual number of non-zeros, except for extreme values of sparsity equals 0 or 1.
        long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * lrows * lcols) : -1;
        mcOut.set(lrows, lcols, rowsInBlock, colsInBlock, lnnz);
    }
    sec.setRDDHandleForVariable(output.getName(), out);
}
Also used : RandomMatrixGenerator(org.apache.sysml.runtime.matrix.data.RandomMatrixGenerator) Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) Well1024a(org.apache.commons.math3.random.Well1024a) PrintWriter(java.io.PrintWriter)

Example 2 with Well1024a

Use of org.apache.commons.math3.random.Well1024a in project incubator-systemml by apache.

In class RandSPInstruction, method generateSample.

/**
 * Helper function to construct a sample.
 *
 * <p>Creates one {@code SampleTask} per sub-range of {@code [1, maxValue]}, each with its own
 * seed, samples every sub-range in a distributed manner, shuffles the sampled elements, trims
 * them to the requested size and converts the result into a binary-block column vector that is
 * bound to the output variable.
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the requested sample size exceeds the population size and
 *         sampling without replacement was requested
 */
private void generateSample(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    // report the resolved sample size (lrows), not the unresolved operand (rows)
    if (maxValue < lrows && !replace)
        throw new DMLRuntimeException("Sample (size=" + lrows + ") larger than population (size=" + maxValue + ") can only be generated with replacement.");
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + lrows + ", replace=" + replace + ", seed=" + seed);
    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
    // NOTE(review): the (int) cast narrows lrows; confirm callers never request more than
    // Integer.MAX_VALUE sampled rows.
    double fraction = SamplingUtils.computeFractionForSampleSize((int) lrows, UtilFunctions.toLong(maxValue), replace);
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);
    // divide the population range across numPartitions by creating SampleTasks
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(lrows, 1);
    // at least one partition, so that neither the division below nor parallelize(...)
    // ever sees a partition count of zero
    int numPartitions = Math.max(1, (int) Math.ceil((double) outputSize / hdfsBlockSize));
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);
    ArrayList<SampleTask> offsets = new ArrayList<>();
    long st = 1;
    while (st <= maxValue) {
        // each task covers the sub-range starting at st and gets its own seed
        SampleTask s = new SampleTask();
        s.range_start = st;
        s.seed = bigrand.nextLong();
        offsets.add(s);
        st = st + partitionSize;
    }
    JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
    // Construct the sample in a distributed manner
    JavaRDD<Double> rdd = offsetRDD.flatMap((new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize)));
    // Randomize the sampled elements
    JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();
    // Trim the sampled list to required size & attach matrix indexes to randomized elements
    JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(lrows)).mapToPair(new Double2MatrixCell());
    MatrixCharacteristics mcOut = new MatrixCharacteristics(lrows, 1, rowsInBlock, colsInBlock, lrows);
    // Construct BinaryBlock representation
    JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);
    sec.getMatrixCharacteristics(output.getName()).setNonZeros(lrows);
    sec.setRDDHandleForVariable(output.getName(), mbRDD);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Well1024a(org.apache.commons.math3.random.Well1024a)

Example 3 with Well1024a

Use of org.apache.commons.math3.random.Well1024a in project incubator-systemml by apache.

In class LibMatrixDatagen, method setupSeedsForRand.

/**
 * A matrix of random numbers is generated by using multiple seeds, one for each
 * block. Such block-level seeds are produced via Well equidistributed long-period linear
 * generator (Well1024a). For a given seed, this function sets up the generator from which
 * the block-level seeds are subsequently drawn.
 *
 * This function is invoked from CP (RandCPInstruction.processInstruction()),
 * MR (RandMR.java while setting up the Rand job), and Spark (RandSPInstruction).
 *
 * @param seed seed for random generator; if equal to {@code DataGenOp.UNSPECIFIED_SEED},
 *             a random seed is generated instead
 * @return Well1024a pseudo-random number generator
 */
public static Well1024a setupSeedsForRand(long seed) {
    long lSeed = (seed == DataGenOp.UNSPECIFIED_SEED ? DataGenOp.generateRandomSeed() : seed);
    // guard the trace call so the message string is only built when tracing is enabled
    if (LOG.isTraceEnabled())
        LOG.trace("Setting up RandSeeds with initial seed = " + lSeed + ".");
    // expand the single long seed into 32 ints (Well1024a keeps 1024 bits = 32 ints of state)
    Random random = new Random(lSeed);
    int[] seeds = new int[32];
    for (int s = 0; s < seeds.length; s++)
        seeds[s] = random.nextInt();
    Well1024a bigrand = new Well1024a();
    bigrand.setSeed(seeds);
    return bigrand;
}
Also used : Random(java.util.Random) Well1024a(org.apache.commons.math3.random.Well1024a)

Example 4 with Well1024a

Use of org.apache.commons.math3.random.Well1024a in project systemml by apache.

In class PoissonPRNGenerator, method setup.

/**
 * (Re)initializes this generator: stores the given seed and mean and creates a new Poisson
 * distribution backed by a synchronized Well1024a generator seeded with {@code sd}.
 *
 * @param mean mean of the Poisson distribution
 * @param sd seed for the underlying random generator
 */
public void setup(double mean, long sd) {
    seed = sd;
    // honor the requested mean; previously the argument was ignored and the distribution
    // was always built from the old value of the _mean field
    _mean = mean;
    SynchronizedRandomGenerator srg = new SynchronizedRandomGenerator(new Well1024a());
    srg.setSeed(seed);
    _pdist = new PoissonDistribution(srg, _mean, PoissonDistribution.DEFAULT_EPSILON, PoissonDistribution.DEFAULT_MAX_ITERATIONS);
}
Also used : PoissonDistribution(org.apache.commons.math3.distribution.PoissonDistribution) SynchronizedRandomGenerator(org.apache.commons.math3.random.SynchronizedRandomGenerator) Well1024a(org.apache.commons.math3.random.Well1024a)

Example 5 with Well1024a

Use of org.apache.commons.math3.random.Well1024a in project systemml by apache.

In class RandSPInstruction, method generateSample.

/**
 * Helper function to construct a sample.
 *
 * <p>Creates one {@code SampleTask} per sub-range of {@code [1, maxValue]}, each with its own
 * seed, samples every sub-range in a distributed manner, shuffles the sampled elements, trims
 * them to the requested size and converts the result into a binary-block column vector that is
 * bound to the output variable.
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the requested sample size exceeds the population size and
 *         sampling without replacement was requested
 */
private void generateSample(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    // report the resolved sample size (lrows), not the unresolved operand (rows)
    if (maxValue < lrows && !replace)
        throw new DMLRuntimeException("Sample (size=" + lrows + ") larger than population (size=" + maxValue + ") can only be generated with replacement.");
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + lrows + ", replace=" + replace + ", seed=" + seed);
    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
    // NOTE(review): the (int) cast narrows lrows; confirm callers never request more than
    // Integer.MAX_VALUE sampled rows.
    double fraction = SamplingUtils.computeFractionForSampleSize((int) lrows, UtilFunctions.toLong(maxValue), replace);
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);
    // divide the population range across numPartitions by creating SampleTasks
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(lrows, 1);
    // at least one partition, so that neither the division below nor parallelize(...)
    // ever sees a partition count of zero
    int numPartitions = Math.max(1, (int) Math.ceil((double) outputSize / hdfsBlockSize));
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);
    ArrayList<SampleTask> offsets = new ArrayList<>();
    long st = 1;
    while (st <= maxValue) {
        // each task covers the sub-range starting at st and gets its own seed
        SampleTask s = new SampleTask();
        s.range_start = st;
        s.seed = bigrand.nextLong();
        offsets.add(s);
        st = st + partitionSize;
    }
    JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
    // Construct the sample in a distributed manner
    JavaRDD<Double> rdd = offsetRDD.flatMap((new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize)));
    // Randomize the sampled elements
    JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();
    // Trim the sampled list to required size & attach matrix indexes to randomized elements
    JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(lrows)).mapToPair(new Double2MatrixCell());
    MatrixCharacteristics mcOut = new MatrixCharacteristics(lrows, 1, rowsInBlock, colsInBlock, lrows);
    // Construct BinaryBlock representation
    JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);
    sec.getMatrixCharacteristics(output.getName()).setNonZeros(lrows);
    sec.setRDDHandleForVariable(output.getName(), mbRDD);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Well1024a(org.apache.commons.math3.random.Well1024a)

Aggregations

Well1024a (org.apache.commons.math3.random.Well1024a)18 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)6 PrintWriter (java.io.PrintWriter)4 ArrayList (java.util.ArrayList)4 FileSystem (org.apache.hadoop.fs.FileSystem)4 Path (org.apache.hadoop.fs.Path)4 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)4 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)4 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)4 RandomGenerator (org.apache.commons.math3.random.RandomGenerator)3 Test (org.junit.Test)3 NucleotideSequence (com.milaboratory.core.sequence.NucleotideSequence)2 IOException (java.io.IOException)2 Random (java.util.Random)2 PoissonDistribution (org.apache.commons.math3.distribution.PoissonDistribution)2 SynchronizedRandomGenerator (org.apache.commons.math3.random.SynchronizedRandomGenerator)2 Group (org.apache.hadoop.mapred.Counters.Group)2 JobConf (org.apache.hadoop.mapred.JobConf)2 RunningJob (org.apache.hadoop.mapred.RunningJob)2 DMLConfig (org.apache.sysml.conf.DMLConfig)2