Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project incubator-systemml by apache.
The class MLContextConversionUtil, method dataFrameToMatrixObject.
/**
 * Convert a {@code DataFrame} to a {@code MatrixObject}.
 *
 * @param variableName
 *            name of the variable associated with the matrix
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata
 * @return the {@code DataFrame} matrix converted to a
 *         {@code MatrixObject}
 */
public static MatrixObject dataFrameToMatrixObject(String variableName, Dataset<Row> dataFrame,
    MatrixMetadata matrixMetadata)
{
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(variableName, binaryBlock, matrixMetadata, false);
    // keep lineage of original dataset to allow bypassing binary block conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame, variableName,
        isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}
Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project incubator-systemml by apache.
The class RemoteDPParForSpark, method getPartitionedInput.
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec,
    String matrixvar, OutputInfo oi, PartitionFormat dpf)
{
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();

    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle()
            .getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        // construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(
                in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            in.javaRDD().zipWithIndex(); // zip row index
        // convert each row to a row in matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(
            mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    }
    // binary block input rdd without grouping
    else if (!requiresGrouping(dpf, mo)) {
        // get input rdd and data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
    // default: binary block input rdd with grouping
    else {
        // get input rdd; avoid unnecessary caching if the input is a checkpoint rdd and
        // not cached yet, to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle()
                .getLineageChilds().get(0)).getRDD();
        // data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
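The ternary in the dataset branch either reuses the DataFrame's existing ID column or zips a fresh row index. Below is a standalone sketch of the same two strategies in plain Spark; it assumes a 1-based, double-typed "__INDEX" column (the normalization that DataFrameExtractIDFunction performs internally), and the class and method names are illustrative:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import scala.Tuple2;

public class RowIdSketch {
    /** Either reuse an existing ID column or zip a fresh row index (sketch). */
    public static JavaPairRDD<Row, Long> attachRowIds(Dataset<Row> df, boolean hasIdColumn) {
        JavaRDD<Row> rows = df.javaRDD();
        if (hasIdColumn) {
            // reuse the 1-based double ID column, shifting to a 0-based key
            // (assumed to mirror what DataFrameExtractIDFunction does internally)
            final int idIdx = df.schema().fieldIndex("__INDEX"); // RDDConverterUtils.DF_ID_COLUMN
            return rows.mapToPair(r -> new Tuple2<>(r, (long) r.getDouble(idIdx) - 1));
        }
        // no ID column: assign 0-based row indexes by position
        return rows.zipWithIndex();
    }
}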
Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project systemml by apache.
The class MLContextConversionUtil, method dataFrameToMatrixObject.
/**
 * Convert a {@code DataFrame} to a {@code MatrixObject}.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata
 * @return the {@code DataFrame} matrix converted to a
 *         {@code MatrixObject}
 */
public static MatrixObject dataFrameToMatrixObject(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(binaryBlock, matrixMetadata, false);
    // keep lineage of original dataset to allow bypassing binary block conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame,
        isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}