Example 61 with MatrixIndexes

Use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

In class ReplicateBlockFunction, method call():

@Override
public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Tuple2<MatrixIndexes, MatrixBlock> arg0) throws Exception {
    ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret = new ArrayList<>();
    MatrixIndexes ixIn = arg0._1();
    MatrixBlock blkIn = arg0._2();
    long numBlocks = (long) Math.ceil((double) _len / _blen);
    if (_left) { // LHS MATRIX
        // replicate wrt # column blocks in RHS
        long i = ixIn.getRowIndex();
        for (long j = 1; j <= numBlocks; j++) {
            MatrixIndexes tmpix = new MatrixIndexes(i, j);
            MatrixBlock tmpblk = _deep ? new MatrixBlock(blkIn) : blkIn;
            ret.add(new Tuple2<>(tmpix, tmpblk));
        }
    } else { // RHS MATRIX
        // replicate wrt # row blocks in LHS
        long j = ixIn.getColumnIndex();
        for (long i = 1; i <= numBlocks; i++) {
            MatrixIndexes tmpix = new MatrixIndexes(i, j);
            MatrixBlock tmpblk = _deep ? new MatrixBlock(blkIn) : blkIn;
            ret.add(new Tuple2<>(tmpix, tmpblk));
        }
    }
    // output list of new tuples
    return ret.iterator();
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), Tuple2 (scala.Tuple2), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), ArrayList (java.util.ArrayList)
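
A minimal usage sketch for the function above. The constructor shape ReplicateBlockFunction(len, blen, left) is an assumption inferred from the fields _len, _blen, and _left; the RDD lhs and the characteristics mcRhs are placeholders assumed to be in scope.

// lhs: JavaPairRDD<MatrixIndexes, MatrixBlock> of the left-hand-side matrix
// mcRhs: MatrixCharacteristics of the right-hand-side matrix
// replicate each LHS block once per column block of the RHS, so a plain
// join() afterwards pairs it with every RHS block it multiplies against
JavaPairRDD<MatrixIndexes, MatrixBlock> replicated = lhs.flatMapToPair(
    new ReplicateBlockFunction(mcRhs.getCols(), mcRhs.getColsPerBlock(), true));

Note that the shallow copy (_deep == false) avoids materializing duplicate block data, but is only safe if downstream operators never mutate their inputs, which is why the _deep flag exists at all.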

Example 62 with MatrixIndexes

Use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

In class FrameRDDConverterUtils, method matrixBlockToBinaryBlockLongIndex():

public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc, JavaPairRDD<MatrixIndexes, MatrixBlock> input, MatrixCharacteristics mcIn) {
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = input;
    MatrixCharacteristics mc = new MatrixCharacteristics(mcIn);
    // reblock matrix blocks if required (multiple column blocks)
    if (mcIn.getCols() > mcIn.getColsPerBlock()) {
        // split matrix blocks into extended matrix blocks
        in = in.flatMapToPair(new MatrixFrameReblockFunction(mcIn));
        mc.setBlockSize(MatrixFrameReblockFunction.computeBlockSize(mc), (int) mc.getCols());
        // shuffle matrix blocks (instead of frame blocks) in order to exploit
        // sparse formats (for sparse or wide matrices) during shuffle
        in = RDDAggregateUtils.mergeByKey(in, false);
    }
    // convert individual matrix blocks to frame blocks (w/o shuffle)
    return in.mapToPair(new MatrixToFrameBlockFunction(mc));
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
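
A short usage sketch, assuming a binary-block matrix RDD and its characteristics are already in scope:

// matrixBlocks: JavaPairRDD<MatrixIndexes, MatrixBlock> of an existing matrix
// mcIn: its MatrixCharacteristics, with dimensions and block sizes known
JavaPairRDD<Long, FrameBlock> frameBlocks =
    FrameRDDConverterUtils.matrixBlockToBinaryBlockLongIndex(sc, matrixBlocks, mcIn);

The returned keys are plain Long row offsets rather than MatrixIndexes, since frame blocks always span all columns; this is why multiple column blocks must be merged before the conversion.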

Example 63 with MatrixIndexes

Use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

In class RDDConverterUtils, method csvToBinaryBlock():

public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc, JavaPairRDD<LongWritable, Text> input, MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue) {
    // determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values().map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    // prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text, Long> prepinput = input.values().zipWithIndex();
    // convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));
    // aggregate partial matrix blocks (w/ preferred number of output
    // partitions as the data is likely smaller in binary block format,
    // but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), Text (org.apache.hadoop.io.Text), SerText (org.apache.sysml.runtime.instructions.spark.data.SerText), LabeledPoint (org.apache.spark.ml.feature.LabeledPoint), LongAccumulator (org.apache.spark.util.LongAccumulator)
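
A hedged invocation sketch; the input path, delimiter, and block sizes below are made-up values:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;

// read raw CSV lines via the classic Hadoop text input format
JavaPairRDD<LongWritable, Text> csv = sc.hadoopFile(
    "hdfs:/tmp/input.csv", TextInputFormat.class, LongWritable.class, Text.class);
// dimensions and nnz unknown (-1); the analysis pass above infers them
MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000, -1);
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = RDDConverterUtils.csvToBinaryBlock(
    sc, csv, mc, false, ",", true, 0.0); // no header, comma-delimited, fill gaps with 0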

Example 64 with MatrixIndexes

Use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

In class RDDConverterUtils, method dataFrameToBinaryBlock():

public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc, Dataset<Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector) {
    // determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID ? 1 : 0) : ((Vector) tmp.first().get(containsID ? 1 : 0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    // ensure valid blocksizes
    if (mc.getRowsPerBlock() <= 1 || mc.getColsPerBlock() <= 1) {
        mc.setBlockSize(ConfigurationManager.getBlocksize());
    }
    // construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
            df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); // zip row index
    // convert dataframe rows to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
    // aggregate partial matrix blocks (w/ preferred number of output
    // partitions as the data is likely smaller in binary block format,
    // but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), LabeledPoint (org.apache.spark.ml.feature.LabeledPoint), LongAccumulator (org.apache.spark.util.LongAccumulator), Row (org.apache.spark.sql.Row)
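
A usage sketch under assumed inputs: a hypothetical parquet path read into a Dataset<Row>, no ID column, and plain numeric columns rather than a single ML Vector column:

Dataset<Row> df = spark.read().parquet("hdfs:/tmp/features.parquet"); // hypothetical path
MatrixCharacteristics mc = new MatrixCharacteristics(); // dims unknown, inferred above
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
    RDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false, false);

With containsID set to true, the converter instead reuses the values of DF_ID_COLUMN as row ids and skips the zipWithIndex pass.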

Example 65 with MatrixIndexes

Use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

In class RDDConverterUtils, method libsvmToBinaryBlock():

/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * any existing files at the specified output locations, and determines and writes
 * the metadata files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, String pathX, String pathY, MatrixCharacteristics mcOutX) {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics " + "required to convert sparse input representation.");
    try {
        // cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);
        // convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
        // append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints = lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());
        // extract labels and convert to binary block
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);
        // extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);
        // asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), LabeledPoint (org.apache.spark.ml.feature.LabeledPoint), IOException (java.io.IOException), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), LongAccumulator (org.apache.spark.util.LongAccumulator)
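
A final usage sketch with hypothetical HDFS paths; unlike the converters above, the output dimensions must be known up front or the method throws a DMLRuntimeException:

// 10000 x 784 with 1000 x 1000 blocks is purely illustrative
MatrixCharacteristics mcX = new MatrixCharacteristics(10000, 784, 1000, 1000, -1);
RDDConverterUtils.libsvmToBinaryBlock(sc,
    "hdfs:/tmp/train.libsvm", // libsvm input (hypothetical path)
    "hdfs:/tmp/X",            // binary-block features output
    "hdfs:/tmp/y",            // binary-block labels output
    mcX);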

Aggregations

MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 165 uses
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 142 uses
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 70 uses
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 48 uses
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 41 uses
Path (org.apache.hadoop.fs.Path): 24 uses
SequenceFile (org.apache.hadoop.io.SequenceFile): 23 uses
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 22 uses
ArrayList (java.util.ArrayList): 21 uses
IOException (java.io.IOException): 20 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 20 uses
MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell): 19 uses
Tuple2 (scala.Tuple2): 19 uses
IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue): 17 uses
JobConf (org.apache.hadoop.mapred.JobConf): 14 uses
MatrixValue (org.apache.sysml.runtime.matrix.data.MatrixValue): 11 uses
CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock): 10 uses
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 10 uses
File (java.io.File): 9 uses
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 9 uses