Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
From the class FrameConverterTest, method runConverter.
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix,
    List<ValueType> schema, String fnameIn, String fnameOut) throws IOException
{
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);

    switch (type) {
        case CSV2BIN: {
            // read CSV lines and convert them to binary frame blocks
            // ('separator' is the CSV delimiter, a field of the test class)
            InputInfo iinfo = InputInfo.CSVInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>)
                sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2CSV: {
            // read binary frame blocks and serialize them as CSV text
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            JavaPairRDD<LongWritable, FrameBlock> rddIn =
                sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
            JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
            CSVFileFormatProperties fprop = new CSVFileFormatProperties();
            JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
            rddOut.saveAsTextFile(fnameOut);
            break;
        }
        case TXTCELL2BIN: {
            // read text-cell (i, j, v) tuples and convert them to binary frame blocks
            InputInfo iinfo = InputInfo.TextCellInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>)
                sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .textCellToBinaryBlock(sc, rddIn, mc, lschema)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2TXTCELL: {
            // read binary frame blocks and serialize them as text-cell tuples
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            JavaPairRDD<LongWritable, FrameBlock> rddIn =
                sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
            JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
            JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
            rddOut.saveAsTextFile(fnameOut);
            break;
        }
        case MAT2BIN: {
            // convert binary matrix blocks to binary frame blocks
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
                sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut =
                FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2MAT: {
            // convert binary frame blocks back to binary matrix blocks
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<Long, FrameBlock> rddIn = sc
                .hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class)
                .mapToPair(new LongWritableFrameToLongFrameFunction());
            JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut =
                FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
            rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
            break;
        }
        case DFRM2BIN: {
            // create a DataFrame from the CSV input and convert it to binary frame blocks
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
            StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
            JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
            Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .dataFrameToBinaryBlock(sc, df, mc, false)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2DFRM: {
            // convert binary frame blocks to a DataFrame and back to binary blocks,
            // so the round-tripped result can be compared against the original binary input
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<Long, FrameBlock> rddIn = sc
                .hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class)
                .mapToPair(new LongWritableFrameToLongFrameFunction());
            SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
            Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils
                .dataFrameToBinaryBlock(sc, df, mc, true)
                .mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        default:
            throw new RuntimeException("Unsupported converter type: " + type.toString());
    }
    sec.close();
}
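For context, a minimal sketch of how such a conversion might be driven from within the same test class (the schema, dimensions, and file names below are illustrative assumptions, not values from the original test):

// Hypothetical driver fragment, assumed to live inside FrameConverterTest so the
// private runConverter(...) above is visible; requires java.util.Arrays.
List<ValueType> schema = Arrays.asList(ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN);
int rows = 1000, cols = schema.size();
// frame characteristics (block sizes unused for frames here) and matrix characteristics
MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, -1, -1, -1);
MatrixCharacteristics mcMatrix = new MatrixCharacteristics(rows, cols, 1000, 1000, -1);
// CSV -> binary block, writing the result under a made-up scratch path
runConverter(ConvType.CSV2BIN, mc, mcMatrix, schema, "in/frame.csv", "out/frame.bin");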
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
From the class FrameObject, method writeBlobToHDFS.
@Override
protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
    throws IOException, DMLRuntimeException
{
    // resolve the requested format and delegate to a matching frame writer
    OutputInfo oinfo = OutputInfo.stringToOutputInfo(ofmt);
    FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo, fprop);
    writer.writeFrameToHDFS(_data, fname, getNumRows(), getNumColumns());
}
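The format string is resolved through OutputInfo.stringToOutputInfo; a minimal sketch of the lookup, where the exact set of accepted names is an assumption based on the formats that appear in these snippets:

// Illustrative only: resolving OutputInfo constants from format names.
// "textcell", "binaryblock", and "csv" are assumed to be accepted names;
// an unknown name would need to be handled by the caller.
OutputInfo text = OutputInfo.stringToOutputInfo("textcell");
OutputInfo bin  = OutputInfo.stringToOutputInfo("binaryblock");
OutputInfo csv  = OutputInfo.stringToOutputInfo("csv");
assert text == OutputInfo.TextCellOutputInfo;    // constants are canonical, so == comparison works
assert bin  == OutputInfo.BinaryBlockOutputInfo;
assert csv  == OutputInfo.CSVOutputInfo;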
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
From the class MatrixObject, method writeBlobFromRDDtoHDFS.
@Override
protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String outputFormat)
    throws IOException, DMLRuntimeException
{
    // prepare output info: use the requested format if given, otherwise
    // derive it from the input format recorded in the metadata
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    OutputInfo oinfo = (outputFormat != null ?
        OutputInfo.stringToOutputInfo(outputFormat) :
        InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

    // note: the write of an RDD to HDFS might trigger
    // lazy evaluation of pending transformations
    long newnnz = SparkExecutionContext.writeRDDtoHDFS(rdd, fname, oinfo);
    _metaData.getMatrixCharacteristics().setNonZeros(newnnz);
}
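The same resolve-or-fall-back pattern appears in both write paths of MatrixObject; a hypothetical helper that factors it out might look like this (the helper itself is an assumption, not SystemML API):

// Hypothetical utility, not part of SystemML: pick an explicit output
// format when one is given, else mirror the format the data was read in.
private static OutputInfo resolveOutputInfo(String ofmt, MetaDataFormat iimd) {
    return (ofmt != null)
        ? OutputInfo.stringToOutputInfo(ofmt)
        : InputInfo.getMatchingOutputInfo(iimd.getInputInfo());
}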
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
From the class MatrixObject, method writeBlobToHDFS.
/**
 * Writes an in-memory matrix to HDFS in the specified format.
 */
@Override
protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
    throws IOException, DMLRuntimeException
{
    long begin = 0;
    if (LOG.isTraceEnabled()) {
        LOG.trace("Writing matrix to HDFS... " + hashCode() + " Path: " + fname
            + ", Format: " + (ofmt != null ? ofmt : "inferred from metadata"));
        begin = System.currentTimeMillis();
    }

    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    if (_data != null) {
        // get the dimension information from the metadata stored in the MatrixObject
        MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
        // write the matrix to HDFS in the requested format
        OutputInfo oinfo = (ofmt != null ? OutputInfo.stringToOutputInfo(ofmt)
            : InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));
        // note: re-blocking on write is only required in singlenode execution
        // (due to the binarycell default) when the metadata block size differs
        // from the configured block size
        if (oinfo == OutputInfo.BinaryBlockOutputInfo
            && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE
            && (mc.getRowsPerBlock() != ConfigurationManager.getBlocksize()
                || mc.getColsPerBlock() != ConfigurationManager.getBlocksize())) {
            DataConverter.writeMatrixToHDFS(_data, fname, oinfo,
                new MatrixCharacteristics(mc.getRows(), mc.getCols(),
                    ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(),
                    mc.getNonZeros()), rep, fprop);
        } else {
            DataConverter.writeMatrixToHDFS(_data, fname, oinfo, mc, rep, fprop);
        }

        if (LOG.isTraceEnabled())
            LOG.trace("Writing matrix to HDFS (" + fname + ") - COMPLETED... "
                + (System.currentTimeMillis() - begin) + " msec.");
    } else if (LOG.isTraceEnabled()) {
        // the matrix was already evicted or never materialized in memory
        LOG.trace("Writing matrix to HDFS (" + fname + ") - NOTHING TO WRITE (_data == null).");
    }

    if (DMLScript.STATISTICS)
        CacheStatistics.incrementHDFSWrites();
}
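The condition guarding the re-block branch reads more clearly as a small predicate; a hypothetical extraction for illustration (the helper name is made up, the logic is copied from the method above):

// Hypothetical helper, not SystemML API: a binary-block write in singlenode
// mode must be re-blocked if the metadata block size deviates from the
// configured default block size.
private static boolean requiresReblockOnWrite(OutputInfo oinfo, MatrixCharacteristics mc) {
    return oinfo == OutputInfo.BinaryBlockOutputInfo
        && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE
        && (mc.getRowsPerBlock() != ConfigurationManager.getBlocksize()
            || mc.getColsPerBlock() != ConfigurationManager.getBlocksize());
}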
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
From the class DataPartitionerRemoteReducer, method configure.
public void configure(JobConf job) {
    String fnameNew = MRJobConfiguration.getPartitioningFilename(job);
    OutputInfo oi = MRJobConfiguration.getPartitioningOutputInfo(job);

    // dispatch to the reducer implementation matching the partitioning output format
    if (oi == OutputInfo.TextCellOutputInfo)
        _reducer = new DataPartitionerReducerTextcell(job, fnameNew);
    else if (oi == OutputInfo.BinaryCellOutputInfo)
        _reducer = new DataPartitionerReducerBinarycell(job, fnameNew);
    else if (oi == OutputInfo.BinaryBlockOutputInfo)
        _reducer = new DataPartitionerReducerBinaryblock(job, fnameNew);
    else
        throw new RuntimeException("Unable to configure reducer with unknown output info: " + oi.toString());
}
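Note that the dispatch uses reference equality (==) rather than equals(), which is safe only because the OutputInfo constants are canonical shared instances. A minimal sketch of that property, assuming the lookup returns the shared constant and that "binaryblock" is among the accepted format names:

// Illustrative check, not from the project: two references to the same
// format should be identity-equal, so the reducer dispatch above can use ==.
OutputInfo a = OutputInfo.BinaryBlockOutputInfo;
OutputInfo b = OutputInfo.stringToOutputInfo("binaryblock");
System.out.println(a == b); // expected: true, if the lookup returns the shared constant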