Example 1 with DatasetObject

Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project incubator-systemml by apache.

Class MLContextConversionUtil, method dataFrameToMatrixObject.

/**
	 * Convert a {@code DataFrame} to a {@code MatrixObject}.
	 * 
	 * @param variableName
	 *            name of the variable associated with the matrix
	 * @param dataFrame
	 *            the Spark {@code DataFrame}
	 * @param matrixMetadata
	 *            the matrix metadata
	 * @return the {@code DataFrame} converted to a {@code MatrixObject}
	 */
public static MatrixObject dataFrameToMatrixObject(String variableName, Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(variableName, binaryBlock, matrixMetadata, false);
    //keep lineage of original dataset to allow bypassing binary block conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame, variableName, isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject)
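
A minimal caller sketch for this overload; it assumes MLContextConversionUtil and MatrixMetadata live in org.apache.sysml.api.mlcontext, that a Spark session (and, depending on setup, an active MLContext) is available, and a hypothetical parquet input path. The variable name "X" is arbitrary.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.sysml.api.mlcontext.MLContextConversionUtil;
import org.apache.sysml.api.mlcontext.MatrixMetadata;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;

public class DataFrameToMatrixObjectSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("df-to-mo-sketch").master("local[*]").getOrCreate();
        // Hypothetical numeric input; any DataFrame of double columns fits this conversion path.
        Dataset<Row> df = spark.read().parquet("/path/to/numeric_data.parquet");
        // Passing null is also fine; the method substitutes an empty MatrixMetadata itself.
        MatrixMetadata meta = new MatrixMetadata();
        // "X" is the variable name the resulting matrix is associated with.
        MatrixObject mo = MLContextConversionUtil.dataFrameToMatrixObject("X", df, meta);
        System.out.println(mo.getMatrixCharacteristics());
        spark.stop();
    }
}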

Example 2 with DatasetObject

Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project incubator-systemml by apache.

Class RemoteDPParForSpark, method getPartitionedInput.

@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        // construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            // zip row index
            in.javaRDD().zipWithIndex();
        // convert row to row in matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    } else if (!requiresGrouping(dpf, mo)) { // binary block input rdd without grouping
        // get input rdd and data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    } else { // default binary block input rdd with grouping
        // get input rdd; avoid unnecessary caching if input is a checkpoint and not cached yet,
        // to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject) DataFrameExtractIDFunction(org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Row(org.apache.spark.sql.Row)
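
The hard-coded getLineageChilds().get(0).getLineageChilds().get(0) hop relies on the layering noted in the comment above (a checkpoint RDD on top of the converted binary-block RDD, with the original dataset as its child). A more defensive lookup could walk the lineage tree until a DatasetObject is found; the helper below is only an illustrative sketch, assuming the lineage base class LineageObject in the same package exposes getLineageChilds().

import org.apache.sysml.runtime.instructions.spark.data.DatasetObject;
import org.apache.sysml.runtime.instructions.spark.data.LineageObject;

// Hypothetical helper (not part of SystemML): depth-first search for the original dataset handle.
public static DatasetObject findDatasetObject(LineageObject root) {
    if (root instanceof DatasetObject)
        return (DatasetObject) root;
    if (root.getLineageChilds() != null) {
        for (LineageObject child : root.getLineageChilds()) {
            DatasetObject found = findDatasetObject(child);
            if (found != null)
                return found;
        }
    }
    return null;
}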

Example 3 with DatasetObject

Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project systemml by apache.

Class MLContextConversionUtil, method dataFrameToMatrixObject.

/**
 * Convert a {@code DataFrame} to a {@code MatrixObject}.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata
 * @return the {@code DataFrame} converted to a {@code MatrixObject}
 */
public static MatrixObject dataFrameToMatrixObject(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(binaryBlock, matrixMetadata, false);
    // keep lineage of original dataset to allow bypassing binary block
    // conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame, isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject)

Example 4 with DatasetObject

Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project incubator-systemml by apache.

Class MLContextConversionUtil, method dataFrameToMatrixObject.

/**
 * Convert a {@code DataFrame} to a {@code MatrixObject}.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata
 * @return the {@code DataFrame} converted to a {@code MatrixObject}
 */
public static MatrixObject dataFrameToMatrixObject(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(binaryBlock, matrixMetadata, false);
    // keep lineage of original dataset to allow bypassing binary block
    // conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame, isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject)
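
The three-argument constructor used here pairs with the getters seen in the RemoteDPParForSpark examples (getDataset, containsID, isVectorBased). Below is a small sketch of wrapping a plain DataFrame, assuming it has no ID column and one double column per matrix column.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.sysml.runtime.instructions.spark.data.DatasetObject;

// Sketch: wrap an existing DataFrame for lineage tracking.
public static DatasetObject wrapPlainDataFrame(Dataset<Row> df) {
    // containsID=false: rows get zipped with an index when later partitioned (see Examples 2 and 5);
    // isVectorBased=false: one double column per matrix column rather than a single Vector column.
    return new DatasetObject(df, false, false);
}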

Example 5 with DatasetObject

Use of org.apache.sysml.runtime.instructions.spark.data.DatasetObject in project systemml by apache.

Class RemoteDPParForSpark, method getPartitionedInput.

@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        // construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            // zip row index
            in.javaRDD().zipWithIndex();
        // convert row to row in matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    } else if (!requiresGrouping(dpf, mo)) { // binary block input rdd without grouping
        // get input rdd and data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    } else { // default binary block input rdd with grouping
        // get input rdd; avoid unnecessary caching if input is a checkpoint and not cached yet,
        // to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject) DataFrameExtractIDFunction(org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Row(org.apache.spark.sql.Row)
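
The row-id branch in getPartitionedInput either reuses the DataFrame's explicit ID column (via DataFrameExtractIDFunction) or falls back to Spark's zipWithIndex. A standalone sketch of the fallback path, using only core Spark APIs:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Sketch: key each row by its global 0-based position when no ID column is present.
public static JavaPairRDD<Row, Long> zipRowsWithIndex(Dataset<Row> df) {
    JavaRDD<Row> rows = df.javaRDD();
    return rows.zipWithIndex();
}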

Aggregations

MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)5 DatasetObject (org.apache.sysml.runtime.instructions.spark.data.DatasetObject)5 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)5 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)5 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 Row (org.apache.spark.sql.Row)2 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)2 DataFrameExtractIDFunction (org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction)2 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)2 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)2