Search in sources :

Example 86 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class MLContextConversionUtil method dataFrameToMatrixBinaryBlocks.

/**
 * Turn a Spark {@code DataFrame} into a binary-block matrix, i.e. a
 * {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}.
 * <p>
 * When {@code matrixMetadata} is non-null, the matrix characteristics used
 * for the conversion are stored back into it before returning.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame} to convert
 * @param matrixMetadata
 *            the matrix metadata; if it is {@code null} or yields no matrix
 *            characteristics, a fresh {@code MatrixCharacteristics} instance
 *            is used for the conversion
 * @return the contents of the {@code DataFrame} as a
 *         {@code JavaPairRDD<MatrixIndexes, MatrixBlock>} binary-block
 *         matrix
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToMatrixBinaryBlocks(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    // resolve the matrix format before any other metadata is consulted
    determineMatrixFormatIfNeeded(dataFrame, matrixMetadata);

    // fall back to empty characteristics when the metadata provides none
    final MatrixCharacteristics characteristics;
    if (matrixMetadata == null || matrixMetadata.asMatrixCharacteristics() == null) {
        characteristics = new MatrixCharacteristics();
    } else {
        characteristics = matrixMetadata.asMatrixCharacteristics();
    }
    final boolean hasIdColumn = isDataFrameWithIDColumn(matrixMetadata);
    final boolean vectorBased = isVectorBasedDataFrame(matrixMetadata);

    // run the actual data frame to binary-block conversion
    final JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks =
            RDDConverterUtils.dataFrameToBinaryBlock(jsc(), dataFrame, characteristics, hasIdColumn, vectorBased);

    // hand the characteristics used by the conversion back to the caller's metadata
    if (matrixMetadata != null) {
        matrixMetadata.setMatrixCharacteristics(characteristics);
    }
    return binaryBlocks;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 87 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class MLContextConversionUtil method dataFrameToMatrixObject.

/**
 * Convert a Spark {@code DataFrame} to a {@code MatrixObject}.
 * <p>
 * The data frame is first converted to binary blocks; the original data
 * frame is then registered as a lineage child of the resulting RDD handle.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata; a new, empty {@code MatrixMetadata} is
 *            used when this is {@code null}
 * @return the {@code DataFrame} matrix converted to a {@code MatrixObject}
 */
public static MatrixObject dataFrameToMatrixObject(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    // work on a local so the parameter itself is never reassigned
    MatrixMetadata metadata = matrixMetadata;
    if (metadata == null) {
        metadata = new MatrixMetadata();
    }

    final JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = dataFrameToMatrixBinaryBlocks(dataFrame, metadata);
    final MatrixObject matrixObject = binaryBlocksToMatrixObject(blocks, metadata, false);

    // keep lineage of original dataset to allow bypassing binary block
    // conversion if possible
    matrixObject.getRDDHandle().addLineageChild(
            new DatasetObject(
                    dataFrame,
                    isDataFrameWithIDColumn(metadata),
                    isVectorBasedDataFrame(metadata)));
    return matrixObject;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject)

Example 88 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class DataFrameMatrixConversionTest method testDataFrameConversion.

/**
 * Round-trips a random matrix through binary block -> data frame -> binary
 * block and checks that the result equals the input.
 *
 * @param vector
 *            passed through to both conversion calls as the vector flag
 * @param cols
 *            number of columns of the generated matrix
 * @param dense
 *            selects {@code sparsity1} when true, {@code sparsity2} otherwise
 * @param unknownDims
 *            when true, the back-conversion receives empty matrix
 *            characteristics instead of a copy of the known ones
 */
private void testDataFrameConversion(boolean vector, int cols, boolean dense, boolean unknownDims) {
    // remember global settings so they can be restored afterwards
    final boolean savedSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    final RUNTIME_PLATFORM savedPlatform = DMLScript.rtplatform;
    try {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

        // generate input data and setup metadata
        final int rows;
        if (cols == cols3) {
            rows = rows3;
        } else {
            rows = rows1;
        }
        final double sparsity;
        if (dense) {
            sparsity = sparsity1;
        } else {
            sparsity = sparsity2;
        }
        final double[][] expected = getRandomMatrix(rows, cols, -10, 10, sparsity, 2373);
        final MatrixBlock inputBlock = DataConverter.convertToMatrixBlock(expected);
        final int blockSize = ConfigurationManager.getBlocksize();
        final MatrixCharacteristics knownMc = new MatrixCharacteristics(rows, cols, blockSize, blockSize, inputBlock.getNonZeros());
        final MatrixCharacteristics backMc;
        if (unknownDims) {
            backMc = new MatrixCharacteristics();
        } else {
            backMc = new MatrixCharacteristics(knownMc);
        }

        // get binary block input rdd
        final JavaPairRDD<MatrixIndexes, MatrixBlock> inputRdd = SparkExecutionContext.toMatrixJavaPairRDD(sc, inputBlock, blockSize, blockSize);

        // matrix - dataframe - matrix conversion
        Dataset<Row> frame = RDDConverterUtils.binaryBlockToDataFrame(spark, inputRdd, knownMc, vector);
        if (rows == rows3) {
            frame = frame.repartition(rows);
        }
        final JavaPairRDD<MatrixIndexes, MatrixBlock> outputRdd = RDDConverterUtils.dataFrameToBinaryBlock(sc, frame, backMc, true, vector);

        // get output matrix block
        final MatrixBlock outputBlock = SparkExecutionContext.toMatrixBlock(outputRdd, rows, cols, blockSize, blockSize, -1);

        // compare matrix blocks
        final double[][] actual = DataConverter.convertToDoubleMatrix(outputBlock);
        TestUtils.compareMatrices(expected, actual, rows, cols, eps);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        DMLScript.USE_LOCAL_SPARK_CONFIG = savedSparkConfig;
        DMLScript.rtplatform = savedPlatform;
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) Row(org.apache.spark.sql.Row)

Example 89 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class DataFrameMatrixConversionTest method testDataFrameConversionUltraSparse.

/**
 * Round-trips a diagonal matrix (built from a random column vector) through
 * binary block -> data frame -> binary block, extracts the diagonal again and
 * compares it with the original vector.
 *
 * @param vector
 *            passed through to both conversion calls as the vector flag
 * @param unknownDims
 *            when true, the back-conversion receives empty matrix
 *            characteristics instead of a copy of the known ones
 */
private void testDataFrameConversionUltraSparse(boolean vector, boolean unknownDims) {
    // remember global settings so they can be restored afterwards
    final boolean savedSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    final RUNTIME_PLATFORM savedPlatform = DMLScript.rtplatform;
    try {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

        // generate input data and setup metadata
        final double[][] expected = getRandomMatrix(rows1, 1, -10, 10, 0.7, 2373);
        final MatrixBlock vectorBlock = DataConverter.convertToMatrixBlock(expected);
        final MatrixBlock diagTarget = new MatrixBlock(rows1, rows1, true);
        final MatrixBlock inputBlock = LibMatrixReorg.diag(vectorBlock, diagTarget);
        final int blockSize = ConfigurationManager.getBlocksize();
        final MatrixCharacteristics knownMc = new MatrixCharacteristics(rows1, rows1, blockSize, blockSize, inputBlock.getNonZeros());
        final MatrixCharacteristics backMc;
        if (unknownDims) {
            backMc = new MatrixCharacteristics();
        } else {
            backMc = new MatrixCharacteristics(knownMc);
        }

        // get binary block input rdd
        final JavaPairRDD<MatrixIndexes, MatrixBlock> inputRdd = SparkExecutionContext.toMatrixJavaPairRDD(sc, inputBlock, blockSize, blockSize);

        // matrix - dataframe - matrix conversion
        final Dataset<Row> frame = RDDConverterUtils.binaryBlockToDataFrame(spark, inputRdd, knownMc, vector);
        final JavaPairRDD<MatrixIndexes, MatrixBlock> outputRdd = RDDConverterUtils.dataFrameToBinaryBlock(sc, frame, backMc, true, vector);

        // get output matrix block
        final MatrixBlock outputDiagBlock = SparkExecutionContext.toMatrixBlock(outputRdd, rows1, rows1, blockSize, blockSize, -1);
        final MatrixBlock vectorTarget = new MatrixBlock(rows1, 1, false);
        final MatrixBlock outputBlock = LibMatrixReorg.diag(outputDiagBlock, vectorTarget);

        // compare matrix blocks
        final double[][] actual = DataConverter.convertToDoubleMatrix(outputBlock);
        TestUtils.compareMatrices(expected, actual, rows1, 1, eps);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        DMLScript.USE_LOCAL_SPARK_CONFIG = savedSparkConfig;
        DMLScript.rtplatform = savedPlatform;
    }
}
Also used : RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Row(org.apache.spark.sql.Row) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 90 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class GNMFTest method testGNMFWithRDMLAndJava.

/**
 * Runs the GNMF DML script through the MLContext API with a configurable
 * number of inputs/outputs registered as RDDs, runs the R script, and
 * compares the "w" and "h" results of both.
 *
 * @throws IOException
 *             declared by the test signature
 * @throws DMLException
 *             declared by the test signature; also raised (as
 *             {@code DMLRuntimeException}) when a stale output file cannot
 *             be deleted
 * @throws ParseException
 *             declared by the test signature
 */
@Test
public void testGNMFWithRDMLAndJava() throws IOException, DMLException, ParseException {
    System.out.println("------------ BEGIN " + TEST_NAME + " TEST {" + numRegisteredInputs + ", " + numRegisteredOutputs + "} ------------");
    this.scriptType = ScriptType.DML;
    int m = 2000;
    int n = 1500;
    int k = 50;
    int maxiter = 2;
    double Eps = Math.pow(10, -8);
    getAndLoadTestConfiguration(TEST_NAME);
    List<String> proArgs = new ArrayList<String>();
    proArgs.add(input("v"));
    proArgs.add(input("w"));
    proArgs.add(input("h"));
    proArgs.add(Integer.toString(maxiter));
    proArgs.add(output("w"));
    proArgs.add(output("h"));
    programArgs = proArgs.toArray(new String[proArgs.size()]);
    fullDMLScriptName = getScript();
    rCmd = getRCmd(inputDir(), Integer.toString(maxiter), expectedDir());
    double[][] v = getRandomMatrix(m, n, 1, 5, 0.2, System.currentTimeMillis());
    double[][] w = getRandomMatrix(m, k, 0, 1, 1, System.currentTimeMillis());
    double[][] h = getRandomMatrix(k, n, 0, 1, 1, System.currentTimeMillis());
    // inputs are written before the in-memory update loop below mutates w and h
    writeInputMatrixWithMTD("v", v, true);
    writeInputMatrixWithMTD("w", w, true);
    writeInputMatrixWithMTD("h", h, true);
    // in-memory multiplicative updates of h and w for maxiter iterations
    for (int i = 0; i < maxiter; i++) {
        double[][] tW = TestUtils.performTranspose(w);
        double[][] tWV = TestUtils.performMatrixMultiplication(tW, v);
        double[][] tWW = TestUtils.performMatrixMultiplication(tW, w);
        double[][] tWWH = TestUtils.performMatrixMultiplication(tWW, h);
        for (int j = 0; j < k; j++) {
            for (int l = 0; l < n; l++) {
                // Eps keeps the denominator away from zero
                h[j][l] = h[j][l] * (tWV[j][l] / (tWWH[j][l] + Eps));
            }
        }
        double[][] tH = TestUtils.performTranspose(h);
        double[][] vTH = TestUtils.performMatrixMultiplication(v, tH);
        double[][] hTH = TestUtils.performMatrixMultiplication(h, tH);
        double[][] wHTH = TestUtils.performMatrixMultiplication(w, hTH);
        for (int j = 0; j < m; j++) {
            for (int l = 0; l < k; l++) {
                w[j][l] = w[j][l] * (vTH[j][l] / (wHTH[j][l] + Eps));
            }
        }
    }
    // remember global settings so they can be restored in the finally block
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    RUNTIME_PLATFORM oldRT = DMLScript.rtplatform;
    try {
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;
        Script script = ScriptFactory.dmlFromFile(fullDMLScriptName);
        // set positional argument values
        for (int argNum = 1; argNum <= proArgs.size(); argNum++) {
            script.in("$" + argNum, proArgs.get(argNum - 1));
        }
        // Register up to three inputs as text RDDs, depending on numRegisteredInputs
        if (numRegisteredInputs >= 1) {
            JavaRDD<String> vIn = sc.sc().textFile(input("v"), 2).toJavaRDD();
            MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, m, n);
            script.in("V", vIn, mm);
        }
        if (numRegisteredInputs >= 2) {
            JavaRDD<String> wIn = sc.sc().textFile(input("w"), 2).toJavaRDD();
            MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, m, k);
            script.in("W", wIn, mm);
        }
        if (numRegisteredInputs >= 3) {
            JavaRDD<String> hIn = sc.sc().textFile(input("h"), 2).toJavaRDD();
            MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, k, n);
            script.in("H", hIn, mm);
        }
        // Register up to two outputs, depending on numRegisteredOutputs
        if (numRegisteredOutputs >= 1) {
            script.out("H");
        }
        if (numRegisteredOutputs >= 2) {
            script.out("W");
            ml.setConfigProperty(DMLConfig.CP_PARALLEL_OPS, "false");
        }
        MLResults results = ml.execute(script);
        if (numRegisteredOutputs >= 2) {
            // verify that the property set above is reflected in the active config
            String configStr = ConfigurationManager.getDMLConfig().getConfigInfo();
            if (configStr.contains("cp.parallel.ops: true")) {
                Assert.fail("Configuration not updated via setConfig");
            }
        }
        if (numRegisteredOutputs >= 1) {
            RDD<String> hOut = results.getRDDStringIJV("H");
            String fName = output("h");
            try {
                MapReduceTool.deleteFileIfExistOnHDFS(fName);
            } catch (IOException e) {
                // keep the original failure as the cause so it is not lost
                throw new DMLRuntimeException("Error: While deleting file on HDFS", e);
            }
            hOut.saveAsTextFile(fName);
        }
        if (numRegisteredOutputs >= 2) {
            // W is round-tripped: IJV strings -> CoordinateMatrix -> binary block -> text cells
            JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("W");
            JavaRDD<MatrixEntry> matRDD = javaRDDStringIJV.map(new StringToMatrixEntry());
            Matrix matrix = results.getMatrix("W");
            MatrixCharacteristics mcW = matrix.getMatrixMetadata().asMatrixCharacteristics();
            CoordinateMatrix coordinateMatrix = new CoordinateMatrix(matRDD.rdd(), mcW.getRows(), mcW.getCols());
            JavaPairRDD<MatrixIndexes, MatrixBlock> binaryRDD = RDDConverterUtilsExt.coordinateMatrixToBinaryBlock(sc, coordinateMatrix, mcW, true);
            JavaRDD<String> wOut = RDDConverterUtils.binaryBlockToTextCell(binaryRDD, mcW);
            String fName = output("w");
            try {
                MapReduceTool.deleteFileIfExistOnHDFS(fName);
            } catch (IOException e) {
                // keep the original failure as the cause so it is not lost
                throw new DMLRuntimeException("Error: While deleting file on HDFS", e);
            }
            wOut.saveAsTextFile(fName);
        }
        runRScript(true);
        // compare matrices
        HashMap<CellIndex, Double> hmWDML = readDMLMatrixFromHDFS("w");
        HashMap<CellIndex, Double> hmHDML = readDMLMatrixFromHDFS("h");
        HashMap<CellIndex, Double> hmWR = readRMatrixFromFS("w");
        HashMap<CellIndex, Double> hmHR = readRMatrixFromFS("h");
        TestUtils.compareMatrices(hmWDML, hmWR, 0.000001, "hmWDML", "hmWR");
        TestUtils.compareMatrices(hmHDML, hmHR, 0.000001, "hmHDML", "hmHR");
    } finally {
        DMLScript.rtplatform = oldRT;
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) MatrixEntry(org.apache.spark.mllib.linalg.distributed.MatrixEntry) CoordinateMatrix(org.apache.spark.mllib.linalg.distributed.CoordinateMatrix) Matrix(org.apache.sysml.api.mlcontext.Matrix) CellIndex(org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Script(org.apache.sysml.api.mlcontext.Script) DMLScript(org.apache.sysml.api.DMLScript) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IOException(java.io.IOException) CoordinateMatrix(org.apache.spark.mllib.linalg.distributed.CoordinateMatrix) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) Test(org.junit.Test)

Aggregations

MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)165 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)142 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)70 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)48 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)41 Path (org.apache.hadoop.fs.Path)24 SequenceFile (org.apache.hadoop.io.SequenceFile)23 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)22 ArrayList (java.util.ArrayList)21 IOException (java.io.IOException)20 FileSystem (org.apache.hadoop.fs.FileSystem)20 MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell)19 Tuple2 (scala.Tuple2)19 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)17 JobConf (org.apache.hadoop.mapred.JobConf)14 MatrixValue (org.apache.sysml.runtime.matrix.data.MatrixValue)11 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)10 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)10 File (java.io.File)9 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)9