Use of org.apache.sysml.parser.Expression.ValueType in project incubator-systemml by apache.
The class DataFrameRowFrameConversionTest, method testDataFrameConversion:
private void testDataFrameConversion(ValueType vt, boolean singleColBlock, boolean dense, boolean unknownDims) {
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    RUNTIME_PLATFORM oldPlatform = DMLScript.rtplatform;
    try {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;
        // generate input data and set up metadata
        int cols = singleColBlock ? cols1 : cols2;
        double sparsity = dense ? sparsity1 : sparsity2;
        double[][] A = getRandomMatrix(rows1, cols, -10, 10, sparsity, 2373);
        A = (vt == ValueType.INT) ? TestUtils.round(A) : A;
        MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
        FrameBlock fbA = DataConverter.convertToFrameBlock(mbA, vt);
        int blksz = ConfigurationManager.getBlocksize();
        MatrixCharacteristics mc1 = new MatrixCharacteristics(rows1, cols, blksz, blksz, mbA.getNonZeros());
        MatrixCharacteristics mc2 = unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);
        ValueType[] schema = UtilFunctions.nCopies(cols, vt);
        // get binary block input rdd
        JavaPairRDD<Long, FrameBlock> in = SparkExecutionContext.toFrameJavaPairRDD(sc, fbA);
        // frame -> dataframe -> frame conversion
        Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(spark, in, mc1, schema);
        JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, true);
        // get output frame block
        FrameBlock fbB = SparkExecutionContext.toFrameBlock(out, schema, rows1, cols);
        // compare frame blocks
        MatrixBlock mbB = DataConverter.convertToMatrixBlock(fbB);
        double[][] B = DataConverter.convertToDoubleMatrix(mbB);
        TestUtils.compareMatrices(A, B, rows1, cols, eps);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
        DMLScript.rtplatform = oldPlatform;
    }
}
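For reference, the round trip this test exercises can be distilled into a few lines. The sketch below is ours, not part of the test suite; it assumes a live SparkSession spark and JavaSparkContext sc, and uses only the converter calls that appear above (the final boolean of dataFrameToBinaryBlock signals that the DataFrame carries the generated row-index column).

// Hedged sketch: frame binary blocks -> DataFrame -> frame binary blocks.
// Class and method names are ours; the converter calls mirror the test above.
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.FrameBlock;

public class FrameDataFrameRoundTripSketch {
    public static JavaPairRDD<Long, FrameBlock> roundTrip(SparkSession spark, JavaSparkContext sc,
            JavaPairRDD<Long, FrameBlock> in, MatrixCharacteristics mc, ValueType[] schema) {
        // forward conversion: binary blocks to DataFrame (adds a row-index column)
        Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(spark, in, mc, schema);
        // backward conversion: 'true' = the DataFrame contains the row-index column
        return FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, new MatrixCharacteristics(mc), true);
    }
}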
Use of org.apache.sysml.parser.Expression.ValueType in project incubator-systemml by apache.
The class FrameTest, method testFrameGeneral:
private void testFrameGeneral(InputInfo iinfo, OutputInfo oinfo, boolean bFromDataFrame, boolean bToDataFrame) throws IOException, DMLException, ParseException {
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    RUNTIME_PLATFORM oldRT = DMLScript.rtplatform;
    DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;
    int rowstart = 234, rowend = 1478, colstart = 125, colend = 568;
    int bRows = rowend - rowstart + 1, bCols = colend - colstart + 1;
    int rowstartC = 124, rowendC = 1178, colstartC = 143, colendC = 368;
    int cRows = rowendC - rowstartC + 1, cCols = colendC - colstartC + 1;
    HashMap<String, ValueType[]> outputSchema = new HashMap<String, ValueType[]>();
    HashMap<String, MatrixCharacteristics> outputMC = new HashMap<String, MatrixCharacteristics>();
    TestConfiguration config = getTestConfiguration(TEST_NAME);
    loadTestConfiguration(config);
    List<String> proArgs = new ArrayList<String>();
    proArgs.add(input("A"));
    proArgs.add(Integer.toString(rows));
    proArgs.add(Integer.toString(cols));
    proArgs.add(input("B"));
    proArgs.add(Integer.toString(bRows));
    proArgs.add(Integer.toString(bCols));
    proArgs.add(Integer.toString(rowstart));
    proArgs.add(Integer.toString(rowend));
    proArgs.add(Integer.toString(colstart));
    proArgs.add(Integer.toString(colend));
    proArgs.add(output("A"));
    proArgs.add(Integer.toString(rowstartC));
    proArgs.add(Integer.toString(rowendC));
    proArgs.add(Integer.toString(colstartC));
    proArgs.add(Integer.toString(colendC));
    proArgs.add(output("C"));
    fullDMLScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".dml";
    ValueType[] schema = schemaMixedLarge;
    // initialize the frame data
    List<ValueType> lschema = Arrays.asList(schema);
    fullRScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".R";
    rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + rowstart + " " + rowend
        + " " + colstart + " " + colend + " " + expectedDir() + " " + rowstartC + " " + rowendC
        + " " + colstartC + " " + colendC;
    double sparsity = sparsity1;
    double[][] A = getRandomMatrix(rows, cols, min, max, sparsity, 1111);
    writeInputFrameWithMTD("A", A, true, schema, oinfo);
    sparsity = sparsity2;
    double[][] B = getRandomMatrix(bRows, bCols, min, max, sparsity, 2345);
    ValueType[] schemaB = new ValueType[bCols];
    for (int i = 0; i < bCols; ++i)
        schemaB[i] = schema[colstart - 1 + i];
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    writeInputFrameWithMTD("B", B, true, schemaB, oinfo);
    ValueType[] schemaC = new ValueType[cCols];
    for (int i = 0; i < cCols; ++i)
        schemaC[i] = schema[colstartC - 1 + i];
    Dataset<Row> dfA = null, dfB = null;
    if (bFromDataFrame) {
        // create DataFrame for input A
        StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schema, false);
        JavaRDD<Row> rowRDDA = FrameRDDConverterUtils.csvToRowRDD(sc, input("A"), DataExpression.DEFAULT_DELIM_DELIMITER, schema);
        dfA = spark.createDataFrame(rowRDDA, dfSchemaA);
        // create DataFrame for input B
        StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
        JavaRDD<Row> rowRDDB = FrameRDDConverterUtils.csvToRowRDD(sc, input("B"), DataExpression.DEFAULT_DELIM_DELIMITER, schemaB);
        dfB = spark.createDataFrame(rowRDDB, dfSchemaB);
    }
    try {
        Script script = ScriptFactory.dmlFromFile(fullDMLScriptName);
        String format = "csv";
        if (oinfo == OutputInfo.TextCellOutputInfo)
            format = "text";
        if (bFromDataFrame) {
            script.in("A", dfA);
        } else {
            JavaRDD<String> aIn = sc.textFile(input("A"));
            FrameSchema fs = new FrameSchema(lschema);
            FrameFormat ff = format.equals("text") ? FrameFormat.IJV : FrameFormat.CSV;
            FrameMetadata fm = new FrameMetadata(ff, fs, rows, cols);
            script.in("A", aIn, fm);
        }
        if (bFromDataFrame) {
            script.in("B", dfB);
        } else {
            JavaRDD<String> bIn = sc.textFile(input("B"));
            FrameSchema fs = new FrameSchema(lschemaB);
            FrameFormat ff = format.equals("text") ? FrameFormat.IJV : FrameFormat.CSV;
            FrameMetadata fm = new FrameMetadata(ff, fs, bRows, bCols);
            script.in("B", bIn, fm);
        }
        // output one frame to HDFS and get one as an RDD (TODO: full HDFS input/output)
        script.out("A", "C");
        // set positional argument values
        for (int argNum = 1; argNum <= proArgs.size(); argNum++) {
            script.in("$" + argNum, proArgs.get(argNum - 1));
        }
        MLResults results = ml.execute(script);
        format = "csv";
        if (iinfo == InputInfo.TextCellInputInfo)
            format = "text";
        String fName = output("AB");
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
        } catch (IOException e) {
            throw new DMLRuntimeException("Error while deleting file on HDFS: " + fName);
        }
        if (!bToDataFrame) {
            if (format.equals("text")) {
                JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("A");
                javaRDDStringIJV.saveAsTextFile(fName);
            } else {
                JavaRDD<String> javaRDDStringCSV = results.getJavaRDDStringCSV("A");
                javaRDDStringCSV.saveAsTextFile(fName);
            }
        } else {
            Dataset<Row> df = results.getDataFrame("A");
            // convert the DataFrame back to binary blocks so the binary -> DataFrame -> binary round trip can be compared
            MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, -1, -1, -1);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fName, LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        }
        fName = output("C");
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
        } catch (IOException e) {
            throw new DMLRuntimeException("Error while deleting file on HDFS: " + fName);
        }
        if (!bToDataFrame) {
            if (format.equals("text")) {
                JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("C");
                javaRDDStringIJV.saveAsTextFile(fName);
            } else {
                JavaRDD<String> javaRDDStringCSV = results.getJavaRDDStringCSV("C");
                javaRDDStringCSV.saveAsTextFile(fName);
            }
        } else {
            Dataset<Row> df = results.getDataFrame("C");
            // convert the DataFrame back to binary blocks so the binary -> DataFrame -> binary round trip can be compared
            MatrixCharacteristics mc = new MatrixCharacteristics(cRows, cCols, -1, -1, -1);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fName, LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        }
        runRScript(true);
        outputSchema.put("AB", schema);
        outputMC.put("AB", new MatrixCharacteristics(rows, cols, -1, -1));
        outputSchema.put("C", schemaC);
        outputMC.put("C", new MatrixCharacteristics(cRows, cCols, -1, -1));
        for (String file : config.getOutputFiles()) {
            MatrixCharacteristics md = outputMC.get(file);
            FrameBlock frameBlock = readDMLFrameFromHDFS(file, iinfo, md);
            FrameBlock frameRBlock = readRFrameFromHDFS(file + ".csv", InputInfo.CSVInputInfo, md);
            ValueType[] schemaOut = outputSchema.get(file);
            verifyFrameData(frameBlock, frameRBlock, schemaOut);
            System.out.println("File " + file + " processed successfully.");
        }
        System.out.println("Frame MLContext test completed successfully.");
    } finally {
        DMLScript.rtplatform = oldRT;
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    }
}
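Stripped of the file plumbing, the MLContext frame-passing pattern above reduces to a few calls. Below is a minimal sketch assuming an existing MLContext ml; class, method, and the one-line DML script are ours for illustration (the real test loads a script file and binds positional arguments), while the FrameSchema/FrameMetadata and fluent in/out/execute calls mirror the test.

// Hedged sketch: bind a CSV JavaRDD<String> to a DML frame variable with explicit
// metadata, run a trivial script, and read the result back as a DataFrame.
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.sysml.api.mlcontext.FrameFormat;
import org.apache.sysml.api.mlcontext.FrameMetadata;
import org.apache.sysml.api.mlcontext.FrameSchema;
import org.apache.sysml.api.mlcontext.MLContext;
import org.apache.sysml.api.mlcontext.MLResults;
import org.apache.sysml.api.mlcontext.Script;
import org.apache.sysml.api.mlcontext.ScriptFactory;
import org.apache.sysml.parser.Expression.ValueType;

public class FrameMLContextSketch {
    public static Dataset<Row> passThrough(MLContext ml, JavaRDD<String> csvIn,
            ValueType[] schema, int rows, int cols) {
        // describe the text input as a CSV frame with a known schema and shape
        FrameSchema fs = new FrameSchema(Arrays.asList(schema));
        FrameMetadata fm = new FrameMetadata(FrameFormat.CSV, fs, rows, cols);
        // illustrative one-line DML: copy frame A to output B
        Script script = ScriptFactory.dml("B = A;").in("A", csvIn, fm).out("B");
        MLResults results = ml.execute(script);
        return results.getDataFrame("B");
    }
}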
Use of org.apache.sysml.parser.Expression.ValueType in project incubator-systemml by apache.
The class FrameAppendDistTest, method commonAppendTest:
/**
 * @param platform            runtime platform to execute on
 * @param rows1               number of rows in frame A
 * @param rows2               number of rows in frame B
 * @param cols1               number of columns in frame A
 * @param cols2               number of columns in frame B
 * @param sparse              whether to generate sparse input data
 * @param forcedAppendMethod  append method to force, or null for automatic selection
 * @param rbind               true for rbind (row append), false for cbind (column append)
 */
public void commonAppendTest(RUNTIME_PLATFORM platform, int rows1, int rows2, int cols1, int cols2, boolean sparse, AppendMethod forcedAppendMethod, boolean rbind) {
    TestConfiguration config = getAndLoadTestConfiguration(TEST_NAME);
    RUNTIME_PLATFORM prevPlfm = rtplatform;
    double sparsity = sparse ? sparsity2 : sparsity1;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    try {
        if (forcedAppendMethod != null) {
            BinaryOp.FORCED_APPEND_METHOD = forcedAppendMethod;
        }
        rtplatform = platform;
        if (rtplatform == RUNTIME_PLATFORM.SPARK)
            DMLScript.USE_LOCAL_SPARK_CONFIG = true;
        config.addVariable("rows", rows1);
        config.addVariable("cols", cols1);
        // construct the test arguments directly
        String RI_HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
        programArgs = new String[] { "-explain", "-args", input("A"), Long.toString(rows1), Long.toString(cols1),
            input("B"), Long.toString(rows2), Long.toString(cols2), output("C"), (rbind ? "rbind" : "cbind") };
        fullRScriptName = RI_HOME + TEST_NAME + ".R";
        rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir() + " " + (rbind ? "rbind" : "cbind");
        // initialize the frame data
        ValueType[] lschemaA = genMixSchema(cols1);
        double[][] A = getRandomMatrix(rows1, cols1, min, max, sparsity, 1111);
        writeInputFrameWithMTD("A", A, true, lschemaA, OutputInfo.BinaryBlockOutputInfo);
        ValueType[] lschemaB = genMixSchema(cols2);
        double[][] B = getRandomMatrix(rows2, cols2, min, max, sparsity, 2345);
        writeInputFrameWithMTD("B", B, true, lschemaB, OutputInfo.BinaryBlockOutputInfo);
        boolean exceptionExpected = false;
        int expectedNumberOfJobs = -1;
        runTest(true, exceptionExpected, null, expectedNumberOfJobs);
        runRScript(true);
        // the appended frame's schema is the concatenation of both input schemas
        ValueType[] lschemaAB = UtilFunctions.copyOf(lschemaA, lschemaB);
        for (String file : config.getOutputFiles()) {
            FrameBlock frameBlock = readDMLFrameFromHDFS(file, InputInfo.BinaryBlockInputInfo);
            MatrixCharacteristics md = new MatrixCharacteristics(frameBlock.getNumRows(), frameBlock.getNumColumns(), -1, -1);
            FrameBlock frameRBlock = readRFrameFromHDFS(file + ".csv", InputInfo.CSVInputInfo, md);
            verifyFrameData(frameBlock, frameRBlock, lschemaAB);
            System.out.println("Processed output file: " + file);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        // reset execution platform and append configuration
        rtplatform = prevPlfm;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
        BinaryOp.FORCED_APPEND_METHOD = null;
    }
}
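The setup/teardown around the test body follows a guard pattern shared by every snippet on this page: mutate the global runtime platform and local-Spark flag, then restore both in finally so later tests are unaffected. Distilled below as a sketch (the body is elided; it assumes the test class's usual imports of DMLScript and its RUNTIME_PLATFORM enum):

// Hedged sketch of the platform guard pattern used throughout these tests.
RUNTIME_PLATFORM prevPlatform = DMLScript.rtplatform;
boolean prevLocalSpark = DMLScript.USE_LOCAL_SPARK_CONFIG;
try {
    DMLScript.rtplatform = RUNTIME_PLATFORM.SPARK;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true; // avoid requiring a cluster in unit tests
    // ... generate inputs, run the DML script, verify outputs ...
} finally {
    // always restore the globals so subsequent tests see the default configuration
    DMLScript.rtplatform = prevPlatform;
    DMLScript.USE_LOCAL_SPARK_CONFIG = prevLocalSpark;
}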
Use of org.apache.sysml.parser.Expression.ValueType in project incubator-systemml by apache.
The class FrameAppendTest, method runFrameAppendTest:
/**
 * @param schema1  schema of the first frame
 * @param schema2  schema of the second frame
 * @param atype    append type (CBIND or RBIND)
 */
private void runFrameAppendTest(ValueType[] schema1, ValueType[] schema2, AppendType atype) {
    try {
        // data generation
        double[][] A = getRandomMatrix(rows, schema1.length, -10, 10, 0.9, 2373);
        double[][] B = getRandomMatrix(rows, schema2.length, -10, 10, 0.9, 129);
        // init data frame 1: convert each value to its schema type and back,
        // so A holds exactly the values representable in that type
        FrameBlock frame1 = new FrameBlock(schema1);
        Object[] row1 = new Object[schema1.length];
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < schema1.length; j++) {
                row1[j] = UtilFunctions.doubleToObject(schema1[j], A[i][j]);
                A[i][j] = UtilFunctions.objectToDouble(schema1[j], row1[j]);
            }
            frame1.appendRow(row1);
        }
        // init data frame 2
        FrameBlock frame2 = new FrameBlock(schema2);
        Object[] row2 = new Object[schema2.length];
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < schema2.length; j++) {
                row2[j] = UtilFunctions.doubleToObject(schema2[j], B[i][j]);
                B[i][j] = UtilFunctions.objectToDouble(schema2[j], row2[j]);
            }
            frame2.appendRow(row2);
        }
        // core append operation on matrix blocks
        MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
        MatrixBlock mbB = DataConverter.convertToMatrixBlock(B);
        MatrixBlock mbC = mbA.append(mbB, new MatrixBlock(), atype == AppendType.CBIND);
        // core append operation on frame blocks
        FrameBlock frame3 = frame1.append(frame2, new FrameBlock(), atype == AppendType.CBIND);
        // check basic meta data
        if (frame3.getNumRows() != mbC.getNumRows())
            Assert.fail("Wrong number of rows: " + frame3.getNumRows() + ", expected: " + mbC.getNumRows());
        // check correct values
        ValueType[] lschema = frame3.getSchema();
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < lschema.length; j++) {
                double tmp = UtilFunctions.objectToDouble(lschema[j], frame3.get(i, j));
                if (tmp != mbC.quickGetValue(i, j))
                    Assert.fail("Wrong value for cell (" + i + "," + j + "): " + tmp + ", expected: " + mbC.quickGetValue(i, j));
            }
    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    }
}
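The block-level append itself is a single call: cbind concatenates columns, rbind stacks rows. A minimal sketch using only the FrameBlock.append call verified above (class and helper names are ours):

// Hedged sketch: the two append flavors checked by the test above.
import org.apache.sysml.runtime.matrix.data.FrameBlock;

public class FrameAppendSketch {
    // cbind: result has left's columns followed by right's columns (row counts must match)
    public static FrameBlock cbind(FrameBlock left, FrameBlock right) {
        return left.append(right, new FrameBlock(), true);
    }
    // rbind: result stacks bottom's rows under top's (schemas must be compatible)
    public static FrameBlock rbind(FrameBlock top, FrameBlock bottom) {
        return top.append(bottom, new FrameBlock(), false);
    }
}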
Use of org.apache.sysml.parser.Expression.ValueType in project incubator-systemml by apache.
The class FrameConverterTest, method runConverter:
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch (type) {
        case CSV2BIN: {
            InputInfo iinfo = InputInfo.CSVInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2CSV: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
            JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
            CSVFileFormatProperties fprop = new CSVFileFormatProperties();
            JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
            rddOut.saveAsTextFile(fnameOut);
            break;
        }
        case TXTCELL2BIN: {
            InputInfo iinfo = InputInfo.TextCellInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .textCellToBinaryBlock(sc, rddIn, mc, lschema)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2TXTCELL: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
            JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
            JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
            rddOut.saveAsTextFile(fnameOut);
            break;
        }
        case MAT2BIN: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2MAT: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<Long, FrameBlock> rddIn = sc
                .hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class)
                .mapToPair(new LongWritableFrameToLongFrameFunction());
            JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
            rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
            break;
        }
        case DFRM2BIN: {
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            // create a DataFrame from the csv input
            SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
            StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
            JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
            Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .dataFrameToBinaryBlock(sc, df, mc, false)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2DFRM: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<Long, FrameBlock> rddIn = sc
                .hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class)
                .mapToPair(new LongWritableFrameToLongFrameFunction());
            SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
            Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
            // convert the DataFrame back to binary blocks so the binary -> DataFrame -> binary round trip can be compared
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .dataFrameToBinaryBlock(sc, df, mc, true)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        default:
            throw new RuntimeException("Unsupported converter type: " + type);
    }
    sec.close();
}
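Each case above follows the same shape: read an RDD in the source format, apply one FrameRDDConverterUtils method, write the result. Isolating the CSV2BIN leg as a standalone helper makes the shape explicit; the sketch below is ours (class and method names are invented), but the calls are copied from the case above and assume the same SystemML version.

// Hedged sketch: CSV text -> frame binary blocks, as in the CSV2BIN case.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;

public class CsvToBinarySketch {
    @SuppressWarnings("unchecked")
    public static void convert(JavaSparkContext sc, String fnameIn, String fnameOut,
            MatrixCharacteristics mc, String delim) {
        InputInfo iinfo = InputInfo.CSVInputInfo;
        JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>)
            sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
        // null schema, no header, no fill -- mirroring the CSV2BIN case above
        JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
            .csvToBinaryBlock(sc, rddIn, mc, null, false, delim, false, 0)
            .mapToPair(new LongFrameToLongWritableFrameFunction());
        rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class,
            OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
    }
}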