Example 91 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class FrameRDDConverterUtils method matrixBlockToBinaryBlockLongIndex.

public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc, JavaPairRDD<MatrixIndexes, MatrixBlock> input, MatrixCharacteristics mcIn) {
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = input;
    MatrixCharacteristics mc = new MatrixCharacteristics(mcIn);
    // reblock matrix blocks if required (multiple column blocks)
    if (mcIn.getCols() > mcIn.getColsPerBlock()) {
        // split matrix blocks into extended matrix blocks
        in = in.flatMapToPair(new MatrixFrameReblockFunction(mcIn));
        mc.setBlockSize(MatrixFrameReblockFunction.computeBlockSize(mc), (int) mc.getCols());
        // shuffle matrix blocks (instead of frame blocks) in order to exploit
        // sparse formats (for sparse or wide matrices) during shuffle
        in = RDDAggregateUtils.mergeByKey(in, false);
    }
    // convert individual matrix blocks to frame blocks (w/o shuffle)
    return in.mapToPair(new MatrixToFrameBlockFunction(mc));
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)
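A minimal usage sketch (not from the source): the JavaSparkContext `sc`, the tiny 2x2 block, and the dimensions below are assumptions made for illustration, with imports matching the classes listed above plus FrameRDDConverterUtils, FrameBlock, scala.Tuple2, and java.util.Arrays.

// build a tiny binary-block matrix RDD: one 2x2 dense block at block index (1,1)
MatrixBlock blk = new MatrixBlock(2, 2, false);
blk.setValue(0, 0, 1.0); blk.setValue(0, 1, 2.0);
blk.setValue(1, 0, 3.0); blk.setValue(1, 1, 4.0);
JavaPairRDD<MatrixIndexes, MatrixBlock> matrixRdd =
    sc.parallelizePairs(Arrays.asList(new Tuple2<>(new MatrixIndexes(1, 1), blk)));
MatrixCharacteristics mcIn = new MatrixCharacteristics(2, 2, 1000, 1000, 4);
// convert to frame blocks keyed by row index
JavaPairRDD<Long, FrameBlock> frameRdd =
    FrameRDDConverterUtils.matrixBlockToBinaryBlockLongIndex(sc, matrixRdd, mcIn);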

Example 92 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RDDConverterUtils method csvToBinaryBlock.

public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc, JavaPairRDD<LongWritable, Text> input, MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue) {
    // determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values().map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    // prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text, Long> prepinput = input.values().zipWithIndex();
    // convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));
    // aggregate partial matrix blocks (w/ preferred number of output
    // partitions as the data is likely smaller in binary block format,
    // but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Text(org.apache.hadoop.io.Text) SerText(org.apache.sysml.runtime.instructions.spark.data.SerText) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) LongAccumulator(org.apache.spark.util.LongAccumulator)
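A hedged usage sketch, assuming the CSV file sits at a placeholder HDFS path and is read through the old Hadoop API; the path, delimiter, and block sizes are illustrative, not from the source.

// read raw CSV lines as (byte offset, line) pairs
JavaPairRDD<LongWritable, Text> csvIn = sc.hadoopFile("hdfs:/tmp/data.csv",
    org.apache.hadoop.mapred.TextInputFormat.class, LongWritable.class, Text.class);
// dimensions and nnz unknown (-1): the analysis pass above fills them in
MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000, -1);
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
    RDDConverterUtils.csvToBinaryBlock(sc, csvIn, mc, false, ",", true, 0.0);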

Example 93 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RDDConverterUtils method dataFrameToBinaryBlock.

public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc, Dataset<Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector) {
    // determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID ? 1 : 0) : ((Vector) tmp.first().get(containsID ? 1 : 0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    // ensure valid blocksizes
    if (mc.getRowsPerBlock() <= 1 || mc.getColsPerBlock() <= 1) {
        mc.setBlockSize(ConfigurationManager.getBlocksize());
    }
    // construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(df.schema().fieldIndex(DF_ID_COLUMN))) :
        // zip row index
        df.javaRDD().zipWithIndex();
    // convert dataframe rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
    // aggregate partial matrix blocks (w/ preferred number of output
    // partitions as the data is likely smaller in binary block format,
    // but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) LongAccumulator(org.apache.spark.util.LongAccumulator) Row(org.apache.spark.sql.Row)
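A usage sketch under the assumption of an existing SparkSession `spark` and a purely numeric DataFrame without a row-ID column; the path and block sizes are placeholders.

// a numeric DataFrame, e.g. loaded elsewhere
Dataset<Row> df = spark.read().parquet("hdfs:/tmp/features.parquet");
MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000, -1);
// containsID=false: row ids come from zipWithIndex; isVector=false: one numeric column per feature
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
    RDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false, false);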

Example 94 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RDDConverterUtils method libsvmToBinaryBlock.

/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, String pathX, String pathY, MatrixCharacteristics mcOutX) {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics " + "required to convert sparse input representation.");
    try {
        // cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);
        // convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
        // append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints = lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());
        // extract labels and convert to binary block
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);
        // extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);
        // asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) LongAccumulator(org.apache.spark.util.LongAccumulator)
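Because the method refuses to run without known dimensions, a usage sketch needs mcOutX set up front; the row/column counts and HDFS paths below are placeholders, not from the source.

// feature matrix X: 100,000 rows x 784 columns in 1,000 x 1,000 blocks (illustrative)
MatrixCharacteristics mcOutX = new MatrixCharacteristics(100000, 784, 1000, 1000);
// writes X and Y as binary block SequenceFiles plus their .mtd metadata files
RDDConverterUtils.libsvmToBinaryBlock(sc, "hdfs:/tmp/data.libsvm",
    "hdfs:/tmp/X", "hdfs:/tmp/Y", mcOutX);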

Example 95 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RDDConverterUtilsExt method convertPy4JArrayToMB.

public static MatrixBlock convertPy4JArrayToMB(byte[] data, int rlen, int clen, boolean isSparse) {
    MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1);
    if (isSparse) {
        throw new DMLRuntimeException("Convertion to sparse format not supported");
    } else {
        // compute the cell count in long arithmetic to avoid int overflow
        long limit = (long) rlen * clen;
        if (limit > Integer.MAX_VALUE)
            throw new DMLRuntimeException("Dense NumPy array of size " + limit + " cannot be converted to MatrixBlock");
        double[] denseBlock = new double[(int) limit];
        ByteBuffer buf = ByteBuffer.wrap(data);
        buf.order(ByteOrder.nativeOrder());
        for (int i = 0; i < limit; i++) {
            denseBlock[i] = buf.getDouble();
        }
        mb.init(denseBlock, rlen, clen);
    }
    mb.recomputeNonZeros();
    mb.examSparsity();
    return mb;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) ByteBuffer(java.nio.ByteBuffer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
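A small self-contained sketch that packs a 2x2 dense matrix the way the Python side would (doubles in native byte order) and converts it; the values are arbitrary.

double[] vals = { 1.0, 2.0, 3.0, 4.0 };
ByteBuffer buf = ByteBuffer.allocate(vals.length * 8).order(ByteOrder.nativeOrder());
for (double v : vals)
    buf.putDouble(v);
// 2 rows x 2 columns, dense input
MatrixBlock mb = RDDConverterUtilsExt.convertPy4JArrayToMB(buf.array(), 2, 2, false);
System.out.println(mb.getNumRows() + " x " + mb.getNumColumns() + ", nnz = " + mb.getNonZeros());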

Aggregations

MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 459
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 142
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 111
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 102
CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock): 48
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 48
IOException (java.io.IOException): 44
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 41
ArrayList (java.util.ArrayList): 40
Path (org.apache.hadoop.fs.Path): 29
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 23
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 23
JobConf (org.apache.hadoop.mapred.JobConf): 21
Tuple2 (scala.Tuple2): 19
SequenceFile (org.apache.hadoop.io.SequenceFile): 17
Row (org.apache.spark.sql.Row): 14
SparseBlock (org.apache.sysml.runtime.matrix.data.SparseBlock): 14
TestConfiguration (org.apache.sysml.test.integration.TestConfiguration): 14
IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue): 13