Search in sources :

Example 91 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class MLContextTest method testOutputBinaryBlocksDML.

@Test
public void testOutputBinaryBlocksDML() {
    System.out.println("MLContextTest - output binary blocks DML");
    String s = "M = matrix('1 2 3 4', rows=2, cols=2);";
    MLResults results = ml.execute(dml(s).out("M"));
    Matrix m = results.getMatrix("M");
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = m.toBinaryBlocks();
    MatrixMetadata mm = m.getMatrixMetadata();
    MatrixCharacteristics mc = mm.asMatrixCharacteristics();
    JavaRDD<String> javaRDDStringIJV = RDDConverterUtils.binaryBlockToTextCell(binaryBlocks, mc);
    List<String> lines = javaRDDStringIJV.collect();
    Assert.assertEquals("1 1 1.0", lines.get(0));
    Assert.assertEquals("1 2 2.0", lines.get(1));
    Assert.assertEquals("2 1 3.0", lines.get(2));
    Assert.assertEquals("2 2 4.0", lines.get(3));
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) Matrix(org.apache.sysml.api.mlcontext.Matrix) MLResults(org.apache.sysml.api.mlcontext.MLResults) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) Test(org.junit.Test)

Example 92 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class FrameConverterTest method runConverter.

@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch(type) {
        case CSV2BIN:
            {
                InputInfo iinfo = InputInfo.CSVInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2CSV:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case TXTCELL2BIN:
            {
                InputInfo iinfo = InputInfo.TextCellInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2TXTCELL:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case MAT2BIN:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2MAT:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                break;
            }
        case DFRM2BIN:
            {
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                // Create DataFrame
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2DFRM:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                // Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        default:
            throw new RuntimeException("Unsuported converter type: " + type.toString());
    }
    sec.close();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)

Example 93 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class SparkExecutionContext method getRDDHandleForMatrixObject.

/**
 * This call returns an RDD handle for a given matrix object. This includes
 * the creation of RDDs for in-memory or binary-block HDFS data.
 *
 * @param mo matrix object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a matrix object
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo) {
    // NOTE: MB this logic should be integrated into MatrixObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    // rdd operations if already executed and cached
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        // return existing rdd handling (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    } else // CASE 2: dirty in memory data or cached result of rdd operations
    if (mo.isDirty() || mo.isCached(false)) {
        // get in-memory matrix block and parallelize it
        // w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (// write if necessary
            mo.isDirty() || !mo.isHDFSFileExists())
                mo.exportData();
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            // default case
            // pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
            // unpin matrix
            mo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        rddhandle.setParallelizedRDD(!fromFile);
        mo.setRDDHandle(rddhandle);
    } else // CASE 3: non-dirty (file exists on HDFS)
    {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            // cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyBinaryCellFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 94 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class SparkExecutionContext method repartitionAndCacheMatrixObject.

@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject(String var) {
    MatrixObject mo = getMatrixObject(var);
    MatrixCharacteristics mcIn = mo.getMatrixCharacteristics();
    // double check size to avoid unnecessary spark context creation
    if (!OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), (double) OptimizerUtils.estimateSizeExactSparsity(mcIn)))
        return;
    // get input rdd and default storage level
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
    // avoid unnecessary caching of input in order to reduce memory pressure
    if (mo.getRDDHandle().allowsShortCircuitRead() && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id())) {
        in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        if (numPartitions < in.getNumPartitions())
            in = in.coalesce(numPartitions);
    }
    // repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
    // executed on the original data, because there will be no merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);
    // convert mcsr into memory-efficient csr if potentially sparse
    if (OptimizerUtils.checkSparseBlockCSRConversion(mcIn)) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }
    // persist rdd in default storage level
    out.persist(Checkpoint.DEFAULT_STORAGE_LEVEL).count();
    // create new rdd handle, in-place of current matrix object
    // guaranteed to exist (see above)
    RDDObject inro = mo.getRDDHandle();
    // create new rdd object
    RDDObject outro = new RDDObject(out);
    // mark as checkpointed
    outro.setCheckpointRDD(true);
    // keep lineage to prevent cycles on cleanup
    outro.addLineageChild(inro);
    mo.setRDDHandle(outro);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CreateSparseBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Checkpoint(org.apache.sysml.lops.Checkpoint) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 95 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class SparkExecutionContext method toMatrixBlock.

/**
 * Utility method for creating a single matrix block out of a binary cell RDD.
 * Note that this collect call might trigger execution of any pending transformations.
 *
 * @param rdd JavaPairRDD for matrix block
 * @param rlen number of rows
 * @param clen number of columns
 * @param nnz number of non-zeros
 * @return matrix block
 */
public static MatrixBlock toMatrixBlock(JavaPairRDD<MatrixIndexes, MatrixCell> rdd, int rlen, int clen, long nnz) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    MatrixBlock out = null;
    // determine target sparse/dense representation
    long lnnz = (nnz >= 0) ? nnz : (long) rlen * clen;
    boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, lnnz);
    // create output matrix block (w/ lazy allocation)
    out = new MatrixBlock(rlen, clen, sparse);
    List<Tuple2<MatrixIndexes, MatrixCell>> list = rdd.collect();
    // copy blocks one-at-a-time into output matrix block
    for (Tuple2<MatrixIndexes, MatrixCell> keyval : list) {
        // unpack index-block pair
        MatrixIndexes ix = keyval._1();
        MatrixCell cell = keyval._2();
        // append cell to dense/sparse target in order to avoid shifting for sparse
        // note: this append requires a final sort of sparse rows
        out.appendValue((int) ix.getRowIndex() - 1, (int) ix.getColumnIndex() - 1, cell.getValue());
    }
    // post-processing output matrix
    if (sparse)
        out.sortSparseRows();
    out.recomputeNonZeros();
    out.examSparsity();
    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }
    return out;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) Tuple2(scala.Tuple2) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell)

Aggregations

MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)165 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)142 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)70 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)48 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)41 Path (org.apache.hadoop.fs.Path)24 SequenceFile (org.apache.hadoop.io.SequenceFile)23 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)22 ArrayList (java.util.ArrayList)21 IOException (java.io.IOException)20 FileSystem (org.apache.hadoop.fs.FileSystem)20 MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell)19 Tuple2 (scala.Tuple2)19 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)17 JobConf (org.apache.hadoop.mapred.JobConf)14 MatrixValue (org.apache.sysml.runtime.matrix.data.MatrixValue)11 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)10 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)10 File (java.io.File)9 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)9