Use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.
Class MLContextConversionUtil, method javaRDDStringCSVToFrameObject.
/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 */
public static FrameObject javaRDDStringCSVToFrameObject(String variableName, JavaRDD<String> javaRDD,
        FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ?
        frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    //note: the schema lookup below dereferences frameMetadata unconditionally, so despite
    //the null guard above, callers must supply metadata that carries a frame schema
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(),
        new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo),
        frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(),
            false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        //note: the exception is swallowed and null returned, which hides conversion failures from callers
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd, variableName));
    return frameObject;
}
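Below is a minimal, hypothetical caller-side sketch of this conversion, not taken from the project source. The FrameSchema(List<ValueType>) and FrameMetadata(FrameSchema, Long, Long) constructors are assumptions about the MLContext API; as noted in the comments above, schema-bearing metadata is effectively required.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.api.mlcontext.FrameMetadata;
import org.apache.sysml.api.mlcontext.FrameSchema;
import org.apache.sysml.api.mlcontext.MLContextConversionUtil;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.controlprogram.caching.FrameObject;

public class CsvFrameExample {
    public static FrameObject convert(JavaSparkContext jsc) {
        //two CSV rows: an INT column and a STRING column
        JavaRDD<String> csv = jsc.parallelize(Arrays.asList("1,a", "2,b"));
        FrameSchema schema = new FrameSchema(Arrays.asList(ValueType.INT, ValueType.STRING)); //assumed ctor
        FrameMetadata md = new FrameMetadata(schema, 2L, 2L); //assumed ctor
        //schema-bearing metadata is effectively required (see note above)
        return MLContextConversionUtil.javaRDDStringCSVToFrameObject("F", csv, md);
    }
}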
Use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.
Class SparkExecutionContext, method getRDDHandleForMatrixObject.
/**
 * This call returns an RDD handle for a given matrix object. This includes
 * the creation of RDDs for in-memory or binary-block HDFS data.
 *
 * @param mo matrix object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a matrix object
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo)
        throws DMLRuntimeException {
    //NOTE: MB this logic should be integrated into MatrixObject.
    //However, for now we cannot assume that spark libraries are
    //always available and hence only store generic references in
    //the matrix object while all the logic is in the SparkExecutionContext.
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    //CASE 1: rdd operations already executed and cached
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        //return existing rdd handle (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    }
    //CASE 2: dirty in-memory data or cached result of rdd operations
    else if (mo.isDirty() || mo.isCached(false)) {
        //get in-memory matrix block and parallelize it
        //w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0)
                || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            //write if necessary
            if (mo.isDirty() || !mo.isHDFSFileExists())
                mo.exportData();
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass,
                inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            //default case: pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
            //unpin matrix
            mo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(fromFile);
        mo.setRDDHandle(rddhandle);
    }
    //CASE 3: non-dirty (file exists on HDFS)
    else {
        //for binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass,
                inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //note: this copy is still required in Spark 1.4 because spark hands out whatever the
            //inputformat recordreader returns; the javadoc explicitly recommends to copy all
            //key/value pairs (cp is workaround for read bug)
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo
                || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass,
                inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass,
                inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
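The recurring "cp is workaround for read bug" copies exist because Hadoop record readers reuse their Writable instances, and sc.hadoopFile hands those shared objects straight through; without a deep copy, caching or shuffling would observe mutated keys and values. A sketch of what such a text-copy function plausibly looks like (the project's actual CopyTextInputFunction may differ in detail):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

//sketch of a defensive-copy pair function for text input: each key/value
//pair is deep-copied so downstream operators never see the record reader's
//reused Writable instances
public class CopyTextSketch implements PairFunction<Tuple2<LongWritable, Text>, LongWritable, Text> {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<LongWritable, Text> call(Tuple2<LongWritable, Text> kv) throws Exception {
        return new Tuple2<>(new LongWritable(kv._1().get()), new Text(kv._2()));
    }
}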
Use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.
Class SparkExecutionContext, method getRDDHandleForFrameObject.
/**
 * FIXME: currently this implementation assumes matrix representations but frame signature
 * in order to support the old transform implementation.
 *
 * @param fo frame object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a frame object
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo)
        throws DMLRuntimeException {
    //NOTE: MB this logic should be integrated into FrameObject.
    //However, for now we cannot assume that spark libraries are
    //always available and hence only store generic references in
    //the frame object while all the logic is in the SparkExecutionContext.
    InputInfo inputInfo2 = (inputInfo == InputInfo.BinaryBlockInputInfo) ?
        InputInfo.BinaryBlockFrameInputInfo : inputInfo;
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    //CASE 1: rdd operations already executed and cached
    if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
        //return existing rdd handle (w/o input format change)
        rdd = fo.getRDDHandle().getRDD();
    }
    //CASE 2: dirty in-memory data or cached result of rdd operations
    else if (fo.isDirty() || fo.isCached(false)) {
        //get in-memory frame block and parallelize it
        //w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = fo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0)
                || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            //write only if necessary
            if (fo.isDirty())
                fo.exportData();
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass,
                inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
            fromFile = true;
        } else {
            //default case: pin frame in memory
            FrameBlock fb = fo.acquireRead();
            rdd = toFrameJavaPairRDD(sc, fb);
            //unpin frame
            fo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, fo.getVarName());
        rddhandle.setHDFSFile(fromFile);
        fo.setRDDHandle(rddhandle);
    }
    //CASE 3: non-dirty (file exists on HDFS)
    else {
        //for binary-block frames, these are: SequenceFileInputFormat.class, LongWritable.class, FrameBlock.class
        if (inputInfo2 == InputInfo.BinaryBlockFrameInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass,
                inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            //note: this copy is still required in Spark 1.4 because spark hands out whatever the
            //inputformat recordreader returns; the javadoc explicitly recommends to copy all
            //key/value pairs (cp is workaround for read bug)
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
        } else if (inputInfo2 == InputInfo.TextCellInputInfo || inputInfo2 == InputInfo.CSVInputInfo
                || inputInfo2 == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass,
                inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo2 == InputInfo.BinaryCellInputInfo) {
            throw new DMLRuntimeException("Binarycell not supported for frames.");
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, fo.getVarName());
        rddhandle.setHDFSFile(true);
        fo.setRDDHandle(rddhandle);
    }
    return rdd;
}
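On the caller side, the generic handle is typically cast to the concrete pair type. A hedged sketch, assuming the Long key type of binary-block frames implied by the csvToBinaryBlock signature shown earlier:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.FrameObject;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.InputInfo;

public class FrameHandleExample {
    @SuppressWarnings("unchecked")
    public static JavaPairRDD<Long, FrameBlock> binaryBlockHandle(SparkExecutionContext sec, FrameObject fo)
            throws DMLRuntimeException {
        //callers pass the generic binary-block info; the method maps it to
        //BinaryBlockFrameInputInfo internally (see inputInfo2 above)
        return (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
    }
}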
Use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.
Class MLContextConversionUtil, method javaRDDStringIJVToMatrixObject.
/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code MatrixObject}.
 * Note that metadata is required for IJV format.
 *
 * @param variableName
 *            name of the variable associated with the matrix
 * @param javaRDD
 *            the Java RDD of strings
 * @param matrixMetadata
 *            matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringIJVToMatrixObject(String variableName, JavaRDD<String> javaRDD,
        MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ?
        matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(),
        new MatrixFormatMetaData(mc, OutputInfo.TextCellOutputInfo, InputInfo.TextCellInputInfo));
    //cp is workaround for read bug (hadoop record readers reuse writables)
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2, variableName));
    return matrixObject;
}
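Since IJV triples (row, column, value) carry no dimension information, the metadata argument is what tells SystemML the matrix shape. A hypothetical usage sketch; the MatrixMetadata(Long, Long) constructor is an assumption about the MLContext API:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.api.mlcontext.MLContextConversionUtil;
import org.apache.sysml.api.mlcontext.MatrixMetadata;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;

public class IjvMatrixExample {
    public static MatrixObject convert(JavaSparkContext jsc) {
        //IJV lines: 1-based row index, column index, cell value
        JavaRDD<String> ijv = jsc.parallelize(Arrays.asList("1 1 5.0", "2 2 7.0"));
        MatrixMetadata md = new MatrixMetadata(2L, 2L); //assumed ctor; metadata required for IJV
        return MLContextConversionUtil.javaRDDStringIJVToMatrixObject("M", ijv, md);
    }
}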
Use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.
Class MLContextConversionUtil, method javaRDDStringCSVToMatrixObject.
/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}.
 *
 * @param variableName
 *            name of the variable associated with the matrix
 * @param javaRDD
 *            the Java RDD of strings
 * @param matrixMetadata
 *            matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringCSVToMatrixObject(String variableName, JavaRDD<String> javaRDD,
        MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ?
        matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(),
        new MatrixFormatMetaData(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2, variableName));
    return matrixObject;
}
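In contrast to the IJV case, the null guard above means CSV metadata may be omitted, with dimensions determined when the data is read. A minimal sketch under that assumption, not taken from the project source:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.api.mlcontext.MLContextConversionUtil;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;

public class CsvMatrixExample {
    public static MatrixObject convert(JavaSparkContext jsc) {
        //a 2x2 dense matrix as CSV lines
        JavaRDD<String> csv = jsc.parallelize(Arrays.asList("1.0,2.0", "3.0,4.0"));
        //metadata omitted: the method falls back to empty MatrixCharacteristics
        return MLContextConversionUtil.javaRDDStringCSVToMatrixObject("M", csv, null);
    }
}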