use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class MLContextConversionUtil method dataFrameToMatrixBinaryBlocks.
/**
 * Build the binary-block representation, i.e. a
 * {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}, of a Spark
 * {@code DataFrame}.
 * <p>
 * If {@code matrixMetadata} is non-null, the matrix characteristics that
 * were handed to the converter are written back into it before this method
 * returns.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame} to convert
 * @param matrixMetadata
 *            the matrix metadata; when it is {@code null} or carries no
 *            matrix characteristics, a fresh {@code MatrixCharacteristics}
 *            instance is used for the conversion
 * @return the {@code DataFrame} converted to a
 *         {@code JavaPairRDD<MatrixIndexes, MatrixBlock>} binary-block
 *         matrix
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToMatrixBinaryBlocks(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    // metadata handling: settle the matrix format first, then pick the
    // characteristics object that the converter will work with
    determineMatrixFormatIfNeeded(dataFrame, matrixMetadata);

    MatrixCharacteristics characteristics;
    if (matrixMetadata == null || matrixMetadata.asMatrixCharacteristics() == null) {
        characteristics = new MatrixCharacteristics();
    } else {
        characteristics = matrixMetadata.asMatrixCharacteristics();
    }

    boolean hasIdColumn = isDataFrameWithIDColumn(matrixMetadata);
    boolean vectorBased = isVectorBasedDataFrame(matrixMetadata);

    // actual data frame -> binary block conversion
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = RDDConverterUtils.dataFrameToBinaryBlock(
            jsc(), dataFrame, characteristics, hasIdColumn, vectorBased);

    // hand the characteristics used for the conversion back to the caller
    if (matrixMetadata != null) {
        matrixMetadata.setMatrixCharacteristics(characteristics);
    }
    return binaryBlocks;
}
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class MLContextConversionUtil method dataFrameToMatrixObject.
/**
 * Turn a Spark {@code DataFrame} into a {@code MatrixObject}.
 * <p>
 * The data frame is first converted to binary blocks; the original data
 * frame is then registered as a lineage child of the resulting RDD handle.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata; a new, empty {@code MatrixMetadata} is
 *            substituted when this is {@code null}
 * @return the {@code DataFrame} matrix converted to a {@code MatrixObject}
 */
public static MatrixObject dataFrameToMatrixObject(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    // fall back to empty metadata so the steps below always have an instance
    if (matrixMetadata == null) {
        matrixMetadata = new MatrixMetadata();
    }

    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject matrixObject = binaryBlocksToMatrixObject(blocks, matrixMetadata, false);

    // remember the source dataset in the lineage so that the binary block
    // conversion can be bypassed if possible
    matrixObject.getRDDHandle().addLineageChild(
            new DatasetObject(
                    dataFrame,
                    isDataFrameWithIDColumn(matrixMetadata),
                    isVectorBasedDataFrame(matrixMetadata)));

    return matrixObject;
}
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class DataFrameMatrixConversionTest method testDataFrameConversion.
/**
 * Round-trips a randomly generated matrix through the chain
 * binary block -> {@code DataFrame} -> binary block and checks that the
 * result matches the input within {@code eps}.
 * <p>
 * The static {@code DMLScript} settings changed for the run are restored
 * in a {@code finally} block; any exception is rethrown wrapped in a
 * {@code RuntimeException}.
 *
 * @param vector
 *            passed through to both converters as the vector flag
 * @param cols
 *            number of columns; {@code rows3} rows are used when this
 *            equals {@code cols3}, otherwise {@code rows1}
 * @param dense
 *            selects {@code sparsity1} when true, {@code sparsity2}
 *            otherwise
 * @param unknownDims
 *            when true the back-conversion receives empty matrix
 *            characteristics instead of a copy of the known ones
 */
private void testDataFrameConversion(boolean vector, int cols, boolean dense, boolean unknownDims) {
    final boolean savedSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    final RUNTIME_PLATFORM savedPlatform = DMLScript.rtplatform;

    try {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

        // input data and metadata
        int rows;
        if (cols == cols3) {
            rows = rows3;
        } else {
            rows = rows1;
        }
        double sparsity;
        if (dense) {
            sparsity = sparsity1;
        } else {
            sparsity = sparsity2;
        }
        double[][] expected = getRandomMatrix(rows, cols, -10, 10, sparsity, 2373);
        MatrixBlock inputBlock = DataConverter.convertToMatrixBlock(expected);
        int blockSize = ConfigurationManager.getBlocksize();
        MatrixCharacteristics knownMc = new MatrixCharacteristics(rows, cols, blockSize, blockSize,
                inputBlock.getNonZeros());
        MatrixCharacteristics backMc;
        if (unknownDims) {
            backMc = new MatrixCharacteristics();
        } else {
            backMc = new MatrixCharacteristics(knownMc);
        }

        // binary block input rdd
        JavaPairRDD<MatrixIndexes, MatrixBlock> inputRdd = SparkExecutionContext.toMatrixJavaPairRDD(sc,
                inputBlock, blockSize, blockSize);

        // matrix -> dataframe -> matrix
        Dataset<Row> frame = RDDConverterUtils.binaryBlockToDataFrame(spark, inputRdd, knownMc, vector);
        if (rows == rows3) {
            frame = frame.repartition(rows);
        }
        JavaPairRDD<MatrixIndexes, MatrixBlock> outputRdd = RDDConverterUtils.dataFrameToBinaryBlock(sc,
                frame, backMc, true, vector);

        // collect the output into a single matrix block
        MatrixBlock outputBlock = SparkExecutionContext.toMatrixBlock(outputRdd, rows, cols, blockSize,
                blockSize, -1);

        // compare input and output cell by cell
        double[][] actual = DataConverter.convertToDoubleMatrix(outputBlock);
        TestUtils.compareMatrices(expected, actual, rows, cols, eps);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        DMLScript.USE_LOCAL_SPARK_CONFIG = savedSparkConfig;
        DMLScript.rtplatform = savedPlatform;
    }
}
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class DataFrameMatrixConversionTest method testDataFrameConversionUltraSparse.
/**
 * Round-trip test for a diagonal matrix: a random {@code rows1 x 1} column
 * is expanded with {@code LibMatrixReorg.diag} into a
 * {@code rows1 x rows1} block, converted binary block ->
 * {@code DataFrame} -> binary block, reduced back to a column with
 * {@code diag}, and compared against the original column within
 * {@code eps}.
 * <p>
 * The static {@code DMLScript} settings changed for the run are restored
 * in a {@code finally} block; any exception is rethrown wrapped in a
 * {@code RuntimeException}.
 *
 * @param vector
 *            passed through to both converters as the vector flag
 * @param unknownDims
 *            when true the back-conversion receives empty matrix
 *            characteristics instead of a copy of the known ones
 */
private void testDataFrameConversionUltraSparse(boolean vector, boolean unknownDims) {
    final boolean savedSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    final RUNTIME_PLATFORM savedPlatform = DMLScript.rtplatform;

    try {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

        // input data and metadata: random column placed on a diagonal
        double[][] expected = getRandomMatrix(rows1, 1, -10, 10, 0.7, 2373);
        MatrixBlock columnBlock = DataConverter.convertToMatrixBlock(expected);
        MatrixBlock diagBlock = LibMatrixReorg.diag(columnBlock, new MatrixBlock(rows1, rows1, true));
        int blockSize = ConfigurationManager.getBlocksize();
        MatrixCharacteristics knownMc = new MatrixCharacteristics(rows1, rows1, blockSize, blockSize,
                diagBlock.getNonZeros());
        MatrixCharacteristics backMc;
        if (unknownDims) {
            backMc = new MatrixCharacteristics();
        } else {
            backMc = new MatrixCharacteristics(knownMc);
        }

        // binary block input rdd
        JavaPairRDD<MatrixIndexes, MatrixBlock> inputRdd = SparkExecutionContext.toMatrixJavaPairRDD(sc,
                diagBlock, blockSize, blockSize);

        // matrix -> dataframe -> matrix
        Dataset<Row> frame = RDDConverterUtils.binaryBlockToDataFrame(spark, inputRdd, knownMc, vector);
        JavaPairRDD<MatrixIndexes, MatrixBlock> outputRdd = RDDConverterUtils.dataFrameToBinaryBlock(sc,
                frame, backMc, true, vector);

        // collect the output and reduce the diagonal back to a column
        MatrixBlock outputDiag = SparkExecutionContext.toMatrixBlock(outputRdd, rows1, rows1, blockSize,
                blockSize, -1);
        MatrixBlock outputColumn = LibMatrixReorg.diag(outputDiag, new MatrixBlock(rows1, 1, false));

        // compare input and output cell by cell
        double[][] actual = DataConverter.convertToDoubleMatrix(outputColumn);
        TestUtils.compareMatrices(expected, actual, rows1, 1, eps);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        DMLScript.USE_LOCAL_SPARK_CONFIG = savedSparkConfig;
        DMLScript.rtplatform = savedPlatform;
    }
}
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class GNMFTest method testGNMFWithRDMLAndJava.
/**
 * Runs the GNMF DML script through the MLContext API and compares its
 * {@code w} and {@code h} outputs against the results of the R script.
 * <p>
 * Depending on {@code numRegisteredInputs}, up to three inputs (V, W, H)
 * are passed to the script as IJV text RDDs; depending on
 * {@code numRegisteredOutputs}, H and W are fetched from the results and
 * written out as text files before the comparison.
 * <p>
 * The static {@code DMLScript} settings changed for the run are restored
 * in a {@code finally} block.
 *
 * @throws IOException
 *             declared by the test signature
 * @throws DMLException
 *             if script execution fails, or if an existing output file
 *             cannot be deleted (the underlying {@code IOException} is
 *             attached as the cause)
 * @throws ParseException
 *             declared by the test signature
 */
@Test
public void testGNMFWithRDMLAndJava() throws IOException, DMLException, ParseException {
    System.out.println("------------ BEGIN " + TEST_NAME + " TEST {" + numRegisteredInputs + ", " + numRegisteredOutputs + "} ------------");
    this.scriptType = ScriptType.DML;

    int m = 2000;
    int n = 1500;
    int k = 50;
    int maxiter = 2;
    double Eps = Math.pow(10, -8);

    getAndLoadTestConfiguration(TEST_NAME);

    List<String> proArgs = new ArrayList<String>();
    proArgs.add(input("v"));
    proArgs.add(input("w"));
    proArgs.add(input("h"));
    proArgs.add(Integer.toString(maxiter));
    proArgs.add(output("w"));
    proArgs.add(output("h"));
    programArgs = proArgs.toArray(new String[proArgs.size()]);

    fullDMLScriptName = getScript();
    rCmd = getRCmd(inputDir(), Integer.toString(maxiter), expectedDir());

    double[][] v = getRandomMatrix(m, n, 1, 5, 0.2, System.currentTimeMillis());
    double[][] w = getRandomMatrix(m, k, 0, 1, 1, System.currentTimeMillis());
    double[][] h = getRandomMatrix(k, n, 0, 1, 1, System.currentTimeMillis());

    writeInputMatrixWithMTD("v", v, true);
    writeInputMatrixWithMTD("w", w, true);
    writeInputMatrixWithMTD("h", h, true);

    // Java-side multiplicative updates of h and w.
    // NOTE(review): the inputs were already written above and neither w nor
    // h is read again in this method, so these updated values do not feed
    // into the comparison below (which is DML vs. R) - confirm whether this
    // loop is still needed.
    for (int i = 0; i < maxiter; i++) {
        double[][] tW = TestUtils.performTranspose(w);
        double[][] tWV = TestUtils.performMatrixMultiplication(tW, v);
        double[][] tWW = TestUtils.performMatrixMultiplication(tW, w);
        double[][] tWWH = TestUtils.performMatrixMultiplication(tWW, h);
        for (int j = 0; j < k; j++) {
            for (int l = 0; l < n; l++) {
                h[j][l] = h[j][l] * (tWV[j][l] / (tWWH[j][l] + Eps));
            }
        }
        double[][] tH = TestUtils.performTranspose(h);
        double[][] vTH = TestUtils.performMatrixMultiplication(v, tH);
        double[][] hTH = TestUtils.performMatrixMultiplication(h, tH);
        double[][] wHTH = TestUtils.performMatrixMultiplication(w, hTH);
        for (int j = 0; j < m; j++) {
            for (int l = 0; l < k; l++) {
                w[j][l] = w[j][l] * (vTH[j][l] / (wHTH[j][l] + Eps));
            }
        }
    }

    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    RUNTIME_PLATFORM oldRT = DMLScript.rtplatform;
    try {
        // both overrides live inside the try so the finally block below is
        // guaranteed to undo them
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

        Script script = ScriptFactory.dmlFromFile(fullDMLScriptName);
        // set positional argument values
        for (int argNum = 1; argNum <= proArgs.size(); argNum++) {
            script.in("$" + argNum, proArgs.get(argNum - 1));
        }

        // Read two matrices through RDD and one through HDFS
        if (numRegisteredInputs >= 1) {
            JavaRDD<String> vIn = sc.sc().textFile(input("v"), 2).toJavaRDD();
            MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, m, n);
            script.in("V", vIn, mm);
        }
        if (numRegisteredInputs >= 2) {
            JavaRDD<String> wIn = sc.sc().textFile(input("w"), 2).toJavaRDD();
            MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, m, k);
            script.in("W", wIn, mm);
        }
        if (numRegisteredInputs >= 3) {
            JavaRDD<String> hIn = sc.sc().textFile(input("h"), 2).toJavaRDD();
            MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, k, n);
            script.in("H", hIn, mm);
        }

        // Output one matrix to HDFS and get one as RDD
        if (numRegisteredOutputs >= 1) {
            script.out("H");
        }
        if (numRegisteredOutputs >= 2) {
            script.out("W");
            ml.setConfigProperty(DMLConfig.CP_PARALLEL_OPS, "false");
        }

        MLResults results = ml.execute(script);

        // the property set above must be reflected in the active DML config
        if (numRegisteredOutputs >= 2) {
            String configStr = ConfigurationManager.getDMLConfig().getConfigInfo();
            if (configStr.contains("cp.parallel.ops: true"))
                Assert.fail("Configuration not updated via setConfig");
        }

        if (numRegisteredOutputs >= 1) {
            RDD<String> hOut = results.getRDDStringIJV("H");
            String fName = output("h");
            try {
                MapReduceTool.deleteFileIfExistOnHDFS(fName);
            } catch (IOException e) {
                // keep the IOException as the cause so the real failure is visible
                throw new DMLRuntimeException("Error: While deleting file on HDFS", e);
            }
            hOut.saveAsTextFile(fName);
        }

        if (numRegisteredOutputs >= 2) {
            // W takes a detour: IJV strings -> CoordinateMatrix -> binary
            // blocks -> text cells, before being saved
            JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("W");
            JavaRDD<MatrixEntry> matRDD = javaRDDStringIJV.map(new StringToMatrixEntry());
            Matrix matrix = results.getMatrix("W");
            MatrixCharacteristics mcW = matrix.getMatrixMetadata().asMatrixCharacteristics();
            CoordinateMatrix coordinateMatrix = new CoordinateMatrix(matRDD.rdd(), mcW.getRows(), mcW.getCols());
            JavaPairRDD<MatrixIndexes, MatrixBlock> binaryRDD = RDDConverterUtilsExt.coordinateMatrixToBinaryBlock(sc, coordinateMatrix, mcW, true);
            JavaRDD<String> wOut = RDDConverterUtils.binaryBlockToTextCell(binaryRDD, mcW);
            String fName = output("w");
            try {
                MapReduceTool.deleteFileIfExistOnHDFS(fName);
            } catch (IOException e) {
                // keep the IOException as the cause so the real failure is visible
                throw new DMLRuntimeException("Error: While deleting file on HDFS", e);
            }
            wOut.saveAsTextFile(fName);
        }

        runRScript(true);

        // compare matrices
        HashMap<CellIndex, Double> hmWDML = readDMLMatrixFromHDFS("w");
        HashMap<CellIndex, Double> hmHDML = readDMLMatrixFromHDFS("h");
        HashMap<CellIndex, Double> hmWR = readRMatrixFromFS("w");
        HashMap<CellIndex, Double> hmHR = readRMatrixFromFS("h");
        TestUtils.compareMatrices(hmWDML, hmWR, 0.000001, "hmWDML", "hmWR");
        TestUtils.compareMatrices(hmHDML, hmHR, 0.000001, "hmHDML", "hmHR");
    } finally {
        DMLScript.rtplatform = oldRT;
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    }
}
Aggregations