Search in sources :

Example 11 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.

the class MLContextConversionUtil method matrixObjectToDataFrame.

/**
 * Convert a {@code MatrixObject} to a {@code DataFrame}.
 *
 * @param matrixObject
 *            the {@code MatrixObject}
 * @param sparkExecutionContext
 *            the Spark execution context
 * @param isVectorDF
 *            is the DataFrame a vector DataFrame?
 * @return the {@code MatrixObject} converted to a {@code DataFrame}
 */
public static Dataset<Row> matrixObjectToDataFrame(MatrixObject matrixObject, SparkExecutionContext sparkExecutionContext, boolean isVectorDF) {
    try {
        @SuppressWarnings("unchecked") JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sparkExecutionContext.getRDDHandleForMatrixObject(matrixObject, InputInfo.BinaryBlockInputInfo);
        MatrixCharacteristics mc = matrixObject.getMatrixCharacteristics();
        return RDDConverterUtils.binaryBlockToDataFrame(spark(), binaryBlocks, mc, isVectorDF);
    } catch (DMLRuntimeException e) {
        throw new MLContextException("DMLRuntimeException while converting matrix object to DataFrame", e);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 12 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.

the class FrameIndexingSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();
    // get indexing range
    long rl = ec.getScalarInput(rowLower.getName(), rowLower.getValueType(), rowLower.isLiteral()).getLongValue();
    long ru = ec.getScalarInput(rowUpper.getName(), rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue();
    long cl = ec.getScalarInput(colLower.getName(), colLower.getValueType(), colLower.isLiteral()).getLongValue();
    long cu = ec.getScalarInput(colUpper.getName(), colUpper.getValueType(), colUpper.isLiteral()).getLongValue();
    IndexRange ixrange = new IndexRange(rl, ru, cl, cu);
    // right indexing
    if (opcode.equalsIgnoreCase(RightIndex.OPCODE)) {
        // update and check output dimensions
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(ru - rl + 1, cu - cl + 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        checkValidOutputDimensions(mcOut);
        // execute right indexing operation (partitioning-preserving if possible)
        JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
        JavaPairRDD<Long, FrameBlock> out = null;
        if (isPartitioningPreservingRightIndexing(mcIn, ixrange)) {
            out = in1.mapPartitionsToPair(new SliceBlockPartitionFunction(ixrange, mcOut), true);
        } else {
            out = in1.filter(new IsFrameBlockInRange(rl, ru, mcOut)).mapToPair(new SliceBlock(ixrange, mcOut));
        }
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
        // update schema of output with subset of input schema
        sec.getFrameObject(output.getName()).setSchema(sec.getFrameObject(input1.getName()).getSchema((int) cl, (int) cu));
    } else // left indexing
    if (opcode.equalsIgnoreCase(LeftIndex.OPCODE) || opcode.equalsIgnoreCase("mapLeftIndex")) {
        JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
        PartitionedBroadcast<FrameBlock> broadcastIn2 = null;
        JavaPairRDD<Long, FrameBlock> in2 = null;
        JavaPairRDD<Long, FrameBlock> out = null;
        // update and check output dimensions
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        MatrixCharacteristics mcLeft = ec.getMatrixCharacteristics(input1.getName());
        mcOut.set(mcLeft.getRows(), mcLeft.getCols(), mcLeft.getRowsPerBlock(), mcLeft.getColsPerBlock());
        checkValidOutputDimensions(mcOut);
        // note: always frame rhs, scalars are preprocessed via cast to 1x1 frame
        MatrixCharacteristics mcRight = ec.getMatrixCharacteristics(input2.getName());
        // sanity check matching index range and rhs dimensions
        if (!mcRight.dimsKnown()) {
            throw new DMLRuntimeException("The right input frame dimensions are not specified for FrameIndexingSPInstruction");
        }
        if (!(ru - rl + 1 == mcRight.getRows() && cu - cl + 1 == mcRight.getCols())) {
            throw new DMLRuntimeException("Invalid index range of leftindexing: [" + rl + ":" + ru + "," + cl + ":" + cu + "] vs [" + mcRight.getRows() + "x" + mcRight.getCols() + "].");
        }
        if (opcode.equalsIgnoreCase("mapLeftIndex")) {
            broadcastIn2 = sec.getBroadcastForFrameVariable(input2.getName());
            // partitioning-preserving mappartitions (key access required for broadcast loopkup)
            out = in1.mapPartitionsToPair(new LeftIndexPartitionFunction(broadcastIn2, ixrange, mcOut), true);
        } else {
            // general case
            // zero-out lhs
            in1 = in1.flatMapToPair(new ZeroOutLHS(false, ixrange, mcLeft));
            // slice rhs, shift and merge with lhs
            in2 = sec.getFrameBinaryBlockRDDHandleForVariable(input2.getName()).flatMapToPair(new SliceRHSForLeftIndexing(ixrange, mcLeft));
            out = FrameRDDAggregateUtils.mergeByKey(in1.union(in2));
        }
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
        if (broadcastIn2 != null)
            sec.addLineageBroadcast(output.getName(), input2.getName());
        if (in2 != null)
            sec.addLineageRDD(output.getName(), input2.getName());
    } else
        throw new DMLRuntimeException("Invalid opcode (" + opcode + ") encountered in FrameIndexingSPInstruction.");
}
Also used : IsFrameBlockInRange(org.apache.sysml.runtime.instructions.spark.functions.IsFrameBlockInRange) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IndexRange(org.apache.sysml.runtime.util.IndexRange) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 13 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.

the class MatrixIndexingSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();
    // get indexing range
    long rl = ec.getScalarInput(rowLower.getName(), rowLower.getValueType(), rowLower.isLiteral()).getLongValue();
    long ru = ec.getScalarInput(rowUpper.getName(), rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue();
    long cl = ec.getScalarInput(colLower.getName(), colLower.getValueType(), colLower.isLiteral()).getLongValue();
    long cu = ec.getScalarInput(colUpper.getName(), colUpper.getValueType(), colUpper.isLiteral()).getLongValue();
    IndexRange ixrange = new IndexRange(rl, ru, cl, cu);
    // right indexing
    if (opcode.equalsIgnoreCase(RightIndex.OPCODE)) {
        // update and check output dimensions
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(ru - rl + 1, cu - cl + 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        mcOut.setNonZerosBound(Math.min(mcOut.getLength(), mcIn.getNonZerosBound()));
        checkValidOutputDimensions(mcOut);
        // execute right indexing operation (partitioning-preserving if possible)
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
        if (isSingleBlockLookup(mcIn, ixrange)) {
            sec.setMatrixOutput(output.getName(), singleBlockIndexing(in1, mcIn, mcOut, ixrange), getExtendedOpcode());
        } else if (isMultiBlockLookup(in1, mcIn, mcOut, ixrange)) {
            sec.setMatrixOutput(output.getName(), multiBlockIndexing(in1, mcIn, mcOut, ixrange), getExtendedOpcode());
        } else {
            // rdd output for general case
            JavaPairRDD<MatrixIndexes, MatrixBlock> out = generalCaseRightIndexing(in1, mcIn, mcOut, ixrange, _aggType);
            // put output RDD handle into symbol table
            sec.setRDDHandleForVariable(output.getName(), out);
            sec.addLineageRDD(output.getName(), input1.getName());
        }
    } else // left indexing
    if (opcode.equalsIgnoreCase(LeftIndex.OPCODE) || opcode.equalsIgnoreCase("mapLeftIndex")) {
        String rddVar = (_type == LixCacheType.LEFT) ? input2.getName() : input1.getName();
        String bcVar = (_type == LixCacheType.LEFT) ? input1.getName() : input2.getName();
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
        PartitionedBroadcast<MatrixBlock> broadcastIn2 = null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
        // update and check output dimensions
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        MatrixCharacteristics mcLeft = ec.getMatrixCharacteristics(input1.getName());
        mcOut.set(mcLeft.getRows(), mcLeft.getCols(), mcLeft.getRowsPerBlock(), mcLeft.getColsPerBlock());
        checkValidOutputDimensions(mcOut);
        // note: always matrix rhs, scalars are preprocessed via cast to 1x1 matrix
        MatrixCharacteristics mcRight = ec.getMatrixCharacteristics(input2.getName());
        // sanity check matching index range and rhs dimensions
        if (!mcRight.dimsKnown()) {
            throw new DMLRuntimeException("The right input matrix dimensions are not specified for MatrixIndexingSPInstruction");
        }
        if (!(ru - rl + 1 == mcRight.getRows() && cu - cl + 1 == mcRight.getCols())) {
            throw new DMLRuntimeException("Invalid index range of leftindexing: [" + rl + ":" + ru + "," + cl + ":" + cu + "] vs [" + mcRight.getRows() + "x" + mcRight.getCols() + "].");
        }
        if (opcode.equalsIgnoreCase("mapLeftIndex")) {
            broadcastIn2 = sec.getBroadcastForVariable(bcVar);
            // partitioning-preserving mappartitions (key access required for broadcast loopkup)
            out = in1.mapPartitionsToPair(new LeftIndexPartitionFunction(broadcastIn2, ixrange, _type, mcOut), true);
        } else {
            // general case
            // zero-out lhs
            in1 = in1.mapToPair(new ZeroOutLHS(false, ixrange, mcLeft));
            // slice rhs, shift and merge with lhs
            in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName()).flatMapToPair(new SliceRHSForLeftIndexing(ixrange, mcLeft));
            out = RDDAggregateUtils.mergeByKey(in1.union(in2));
        }
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddVar);
        if (broadcastIn2 != null)
            sec.addLineageBroadcast(output.getName(), bcVar);
        if (in2 != null)
            sec.addLineageRDD(output.getName(), input2.getName());
    } else
        throw new DMLRuntimeException("Invalid opcode (" + opcode + ") encountered in MatrixIndexingSPInstruction.");
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IndexRange(org.apache.sysml.runtime.util.IndexRange) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 14 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.

the class MatrixIndexingSPInstruction method createPartitionPruningRDD.

/**
 * Wraps the input RDD into a PartitionPruningRDD, which acts as a filter
 * of required partitions. The distinct set of required partitions is determined
 * via the partitioner of the input RDD.
 *
 * @param in input matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param filter partition filter
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
private static JavaPairRDD<MatrixIndexes, MatrixBlock> createPartitionPruningRDD(JavaPairRDD<MatrixIndexes, MatrixBlock> in, List<MatrixIndexes> filter) {
    // build hashset of required partition ids
    HashSet<Integer> flags = new HashSet<>();
    Partitioner partitioner = in.rdd().partitioner().get();
    for (MatrixIndexes key : filter) flags.add(partitioner.getPartition(key));
    // create partition pruning rdd
    Function1<Object, Object> f = new PartitionPruningFunction(flags);
    PartitionPruningRDD<Tuple2<MatrixIndexes, MatrixBlock>> ppRDD = PartitionPruningRDD.create(in.rdd(), f);
    // wrap output into java pair rdd
    return new JavaPairRDD<>(ppRDD, ClassManifestFactory.fromClass(MatrixIndexes.class), ClassManifestFactory.fromClass(MatrixBlock.class));
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Partitioner(org.apache.spark.Partitioner) HashSet(java.util.HashSet)

Example 15 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.

the class MultiReturnParameterizedBuiltinSPInstruction method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    try {
        // get input RDD and meta data
        FrameObject fo = sec.getFrameObject(input1.getName());
        FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
        JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue();
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null;
        // step 1: build transform meta data
        Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), null);
        MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
        JavaRDD<String> rcMaps = in.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)).distinct().groupByKey().flatMap(new TransformEncodeGroupFunction(accMax));
        if (containsMVImputeEncoder(encoderBuild)) {
            EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
            rcMaps = rcMaps.union(in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)).groupByKey().flatMap(new TransformEncodeGroup2Function(mva)));
        }
        // trigger eval
        rcMaps.saveAsTextFile(fometa.getFileName());
        // consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
        FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
        FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
        // recompute num distinct items per column
        meta.recomputeColumnCardinality();
        meta.setColumnNames((colnames != null) ? colnames : meta.getColumnNames());
        // step 2: transform apply (similar to spark transformapply)
        // compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if (TfMetaUtils.containsOmitSpec(spec, colnames)) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
        }
        // create encoder broadcast (avoiding replication per task)
        Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
        // execute transform apply
        JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
        // set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
        sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
        sec.setFrameOutput(_outputs.get(1).getName(), meta);
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) Encoder(org.apache.sysml.runtime.transform.encode.Encoder) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) EncoderMVImpute(org.apache.sysml.runtime.transform.encode.EncoderMVImpute) RDDTransformApplyOffsetFunction(org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyOffsetFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) RDDTransformApplyFunction(org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction.RDDTransformApplyFunction) TfOffsetMap(org.apache.sysml.runtime.transform.meta.TfOffsetMap) FrameReader(org.apache.sysml.runtime.io.FrameReader)

Aggregations

JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)99 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)44 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)42 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)42 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)41 Tuple2 (scala.Tuple2)35 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)33 JavaRDD (org.apache.spark.api.java.JavaRDD)28 List (java.util.List)27 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)24 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)23 Collectors (java.util.stream.Collectors)22 IOException (java.io.IOException)17 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)16 LongWritable (org.apache.hadoop.io.LongWritable)15 Broadcast (org.apache.spark.broadcast.Broadcast)15 Text (org.apache.hadoop.io.Text)12 UserException (org.broadinstitute.hellbender.exceptions.UserException)12 Function (org.apache.spark.api.java.function.Function)11 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)11