Examples with JavaPairRDD - org.apache.spark.api.java.JavaPairRDD

Example 26 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in the Apache SystemML project.

From the class SpoofSPInstruction, method createJoinedInputRDD.

/**
 * Joins the main input RDD with all non-broadcast matrix side inputs into a single
 * RDD whose values are arrays of matrix blocks.
 *
 * <p>The main input (as selected by {@code getMainInputIndex}) is first mapped through
 * {@code MapInputSignature}; each qualifying side input is then optionally replicated
 * to match the main input's block grid, joined by block index, and folded into the
 * array signature through {@code MapJoinSignature}.
 *
 * @param sec    spark execution context used to look up RDD handles and matrix characteristics
 * @param inputs all operands of the instruction
 * @param bcVect per-operand flags; operands flagged {@code true} are skipped here
 *               (presumably because they are passed as broadcasts - confirm against caller)
 * @param outer  if {@code true}, the operand at position 2 is replicated through
 *               {@code ReplicateRightFactorFunction} instead of the generic replication
 * @return the joined RDD keyed by matrix block index
 */
private static JavaPairRDD<MatrixIndexes, MatrixBlock[]> createJoinedInputRDD(SparkExecutionContext sec, CPOperand[] inputs, boolean[] bcVect, boolean outer) {
    // the main input seeds the joined result
    int mainPos = getMainInputIndex(inputs, bcVect);
    String mainName = inputs[mainPos].getName();
    MatrixCharacteristics mcMain = sec.getMatrixCharacteristics(mainName);
    JavaPairRDD<MatrixIndexes, MatrixBlock> mainRdd = sec.getBinaryBlockRDDHandleForVariable(mainName);
    JavaPairRDD<MatrixIndexes, MatrixBlock[]> joined = mainRdd.mapValues(new MapInputSignature());

    for (int pos = 0; pos < inputs.length; pos++) {
        // only matrix operands other than the main input that are not flagged in bcVect are joined
        boolean joinable = pos != mainPos && inputs[pos].getDataType().isMatrix() && !bcVect[pos];
        if (!joinable) {
            continue;
        }

        // fetch the side input rdd and its characteristics
        String sideName = inputs[pos].getName();
        JavaPairRDD<MatrixIndexes, MatrixBlock> side = sec.getBinaryBlockRDDHandleForVariable(sideName);
        MatrixCharacteristics mcSide = sec.getMatrixCharacteristics(sideName);

        // replicate side blocks whenever the block grid is smaller than that of the main input
        if (outer && pos == 2) {
            side = side.flatMapToPair(new ReplicateRightFactorFunction(mcMain.getRows(), mcMain.getRowsPerBlock()));
        } else if (mcMain.getNumRowBlocks() > mcSide.getNumRowBlocks()) {
            side = side.flatMapToPair(new ReplicateBlockFunction(mcMain.getRows(), mcMain.getRowsPerBlock(), false));
        } else if (mcMain.getNumColBlocks() > mcSide.getNumColBlocks()) {
            side = side.flatMapToPair(new ReplicateBlockFunction(mcMain.getCols(), mcMain.getColsPerBlock(), true));
        }

        // join on block index and append the side block to the signature
        joined = joined.join(side).mapValues(new MapJoinSignature());
    }
    return joined;
}

Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) ReplicateBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.ReplicateBlockFunction)

Example 27 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in the Apache SystemML project.

From the class WriteSPInstruction, method processFrameWriteInstruction.

/**
 * Writes the frame bound to {@code input1} to {@code fname} in the requested output
 * format and afterwards writes the accompanying {@code .mtd} meta data file.
 *
 * <p>Supported formats are text cell, CSV and binary block; any other format is rejected.
 *
 * @param sec    spark execution context providing the input RDD and its characteristics
 * @param fname  target file name; the meta data file is written to {@code fname + ".mtd"}
 * @param oi     requested output format
 * @param schema frame schema recorded in the meta data file
 * @throws IOException declared by the signature; raised by the underlying write helpers
 * @throws DMLRuntimeException if {@code oi} is none of the supported formats
 */
@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema) throws IOException {
    // look up the binary-block frame rdd and its characteristics
    String inputName = input1.getName();
    JavaPairRDD<Long, FrameBlock> frameRdd = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForVariable(inputName, InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(inputName);

    boolean asTextCell = (oi == OutputInfo.TextCellOutputInfo);
    boolean asCsv = (oi == OutputInfo.CSVOutputInfo);
    boolean asBinaryBlock = (oi == OutputInfo.BinaryBlockOutputInfo);
    if (!asTextCell && !asCsv && !asBinaryBlock) {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }

    if (asTextCell) {
        JavaRDD<String> cells = FrameRDDConverterUtils.binaryBlockToTextCell(frameRdd, mc);
        customSaveTextFile(cells, fname, false);
    } else if (asCsv) {
        // a cast of null yields null, so absent format properties stay null
        CSVFileFormatProperties csvProps = (CSVFileFormatProperties) formatProperties;
        JavaRDD<String> rows = FrameRDDConverterUtils.binaryBlockToCsv(frameRdd, mc, csvProps, true);
        customSaveTextFile(rows, fname, false);
    } else {
        // binary block: convert the Long keys to LongWritable and store as sequence file
        JavaPairRDD<LongWritable, FrameBlock> writable = frameRdd.mapToPair(new LongFrameToLongWritableFrameFunction());
        writable.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
    }

    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, mc, oi, formatProperties);
}

Also used : CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JavaRDD(org.apache.spark.api.java.JavaRDD) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction)

Example 28 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in the Apache SystemML project.

From the class CheckpointSPInstruction, method processInstruction.

/**
 * Checkpoints (persists) the RDD bound to {@code input1} at storage level {@code _level}
 * and binds the result to the output variable.
 *
 * <p>The method proceeds in three steps: (1) early abort if the input variable is missing,
 * is a {@code BooleanObject} placeholder, or is reported as cached; (2) persist the RDD at
 * {@code _level} after an optional coalesce or copy, unless it already has that storage
 * level; (3) replace the RDD handle of the input's cacheable data in place and set that
 * object as the output variable.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext} without a type check
 */
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // Step 1: Early abort on non-existing or in-memory (cached) inputs
    // -------
    // (the input variable may be absent from the symbol table at runtime;
    // per the original note this is valid if the relevant branches are never entered)
    if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
        // add a dummy entry to the input, which will be immediately overwritten by the null output.
        sec.setVariable(input1.getName(), new BooleanObject(false));
        sec.setVariable(output.getName(), new BooleanObject(false));
        return;
    }
    // -------
    // (for csv input files with unknown dimensions, we might have generated a checkpoint after
    // csvreblock although not necessary because the csvreblock was subject to in-memory reblock)
    CacheableData<?> obj = sec.getCacheableData(input1.getName());
    // NOTE(review): meaning of the 'true' argument of isCached is not visible here - confirm
    if (obj.isCached(true)) {
        // available in memory
        sec.setVariable(output.getName(), obj);
        return;
    }
    // get input rdd handle (for matrix or frame)
    JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
    // Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
    // -------
    // Note that persist is a transformation which will be triggered on-demand with the next rdd operations.
    // This prevents unnecessary overhead if the dataset is only consumed by cp operations.
    JavaPairRDD<?, ?> out = null;
    if (!in.getStorageLevel().equals(_level)) {
        // (trigger coalesce if intended number of partitions exceeded by 20%
        // and not hash partitioned to avoid losing the existing partitioner)
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
        // checkpoint pre-processing rdd operations
        if (coalesce) {
            // merge partitions without shuffle if too many partitions
            out = in.coalesce(numPartitions);
        } else {
            // apply a narrow shallow copy to allow for short-circuit collects
            // NOTE(review): if the data type is neither MATRIX nor FRAME, 'out' stays null
            // and the persist call below would throw a NullPointerException - confirm
            // that callers only pass matrices or frames
            if (input1.getDataType() == DataType.MATRIX)
                out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
            else if (input1.getDataType() == DataType.FRAME)
                out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
        }
        // convert mcsr into memory-efficient csr if potentially sparse
        // (skipped when the target level is the serialized storage level)
        if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
            out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
        }
        // actual checkpoint into given storage level
        out = out.persist(_level);
        // explicitly compute and set the number of non-zeros for matrices whose dimensions
        // are known, whose nnz is not yet known, and which fail the valid-CP-dimensions check;
        // otherwise their nnz would never be evaluated due to lazy evaluation in spark
        if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
            mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
        }
    } else {
        // pass-through
        out = in;
    }
    // Step 3: In-place update of input matrix/frame rdd handle and set as output
    // -------
    // We use this in-place approach for two reasons. First, it is correct because our checkpoint
    // injection rewrites guarantee that after checkpoint instructions there are no consumers on the
    // given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
    // filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
    // caching and subsequent collects. Note that in-place update requires us to explicitly handle
    // lineage information in order to prevent cycles on cleanup.
    CacheableData<?> cd = sec.getCacheableData(input1.getName());
    if (out != in) {
        // prevent unnecessary lineage info
        // existing rdd handle of the input; guaranteed to exist (see above)
        RDDObject inro = cd.getRDDHandle();
        // create new rdd object
        RDDObject outro = new RDDObject(out);
        // mark as checkpointed
        outro.setCheckpointRDD(true);
        // keep lineage to prevent cycles on cleanup
        outro.addLineageChild(inro);
        cd.setRDDHandle(outro);
    }
    sec.setVariable(output.getName(), cd);
}

Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Checkpoint(org.apache.sysml.lops.Checkpoint) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) CreateSparseBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) CopyFrameBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockFunction) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)

Example 29 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in the Apache SystemML project.

From the class CastSPInstruction, method processInstruction.

/**
 * Converts the binary-block input between frame and matrix representation, depending on
 * the instruction opcode, and registers the converted RDD as the output variable.
 *
 * <p>For a cast to matrix, the output characteristics are a copy of the input
 * characteristics with the configured block size. For a cast to frame, the output frame
 * schema is set to {@code ValueType.DOUBLE} for every column.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext} without a type check
 * @throws DMLRuntimeException if the opcode is neither the cast-as-matrix nor the
 *         cast-as-frame opcode
 */
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();

    // fetch the input rdd together with its characteristics
    JavaPairRDD<?, ?> source = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());

    boolean castToMatrix = opcode.equals(UnaryCP.CAST_AS_MATRIX_OPCODE);
    boolean castToFrame = opcode.equals(UnaryCP.CAST_AS_FRAME_OPCODE);
    if (!castToMatrix && !castToFrame) {
        throw new DMLRuntimeException("Unsupported spark cast operation: " + opcode);
    }

    // frame-to-matrix or matrix-to-frame conversion
    JavaPairRDD<?, ?> converted;
    if (castToMatrix) {
        MatrixCharacteristics mcOut = new MatrixCharacteristics(mcIn);
        mcOut.setBlockSize(ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
        JavaPairRDD<Long, FrameBlock> frameIn = (JavaPairRDD<Long, FrameBlock>) source;
        converted = FrameRDDConverterUtils.binaryBlockToMatrixBlock(frameIn, mcIn, mcOut);
    } else {
        JavaPairRDD<MatrixIndexes, MatrixBlock> matrixIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) source;
        converted = FrameRDDConverterUtils.matrixBlockToBinaryBlockLongIndex(sec.getSparkContext(), matrixIn, mcIn);
    }

    // bind the output rdd, propagate statistics and record lineage to the input
    String outputName = output.getName();
    sec.setRDDHandleForVariable(outputName, converted);
    updateUnaryOutputMatrixCharacteristics(sec, input1.getName(), outputName);
    sec.addLineageRDD(outputName, input1.getName());

    // an output frame gets an all-double schema with one entry per input column
    if (castToFrame) {
        sec.getFrameObject(outputName).setSchema(UtilFunctions.nCopies((int) mcIn.getCols(), ValueType.DOUBLE));
    }
}

Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 30 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in the Apache SystemML project.

From the class MatrixIndexingSPInstruction, method processInstruction.

/**
 * Executes a matrix indexing instruction on Spark.
 *
 * <p>The four range bounds are read as scalar inputs. Depending on the opcode the method
 * performs either right indexing (single-block lookup, multi-block lookup, or the general
 * RDD-based case) or left indexing (broadcast-based for {@code mapLeftIndex}, otherwise
 * zero-out of the target range followed by a merge with the shifted right-hand-side slices).
 *
 * @param ec execution context; cast to {@code SparkExecutionContext} without a type check
 * @throws DMLRuntimeException if the opcode is not a supported indexing opcode, if the
 *         right-hand-side dimensions are unknown, or if they do not match the index range
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();

    // resolve the indexing range from the four scalar operands
    long rowLo = ec.getScalarInput(rowLower.getName(), rowLower.getValueType(), rowLower.isLiteral()).getLongValue();
    long rowHi = ec.getScalarInput(rowUpper.getName(), rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue();
    long colLo = ec.getScalarInput(colLower.getName(), colLower.getValueType(), colLower.isLiteral()).getLongValue();
    long colHi = ec.getScalarInput(colUpper.getName(), colUpper.getValueType(), colUpper.isLiteral()).getLongValue();
    IndexRange ixrange = new IndexRange(rowLo, rowHi, colLo, colHi);

    // classify the opcode once; right indexing takes precedence over left indexing
    boolean rightIndexing = opcode.equalsIgnoreCase(RightIndex.OPCODE);
    boolean mapLeftIndexing = opcode.equalsIgnoreCase("mapLeftIndex");
    boolean leftIndexing = opcode.equalsIgnoreCase(LeftIndex.OPCODE) || mapLeftIndexing;
    if (!rightIndexing && !leftIndexing) {
        throw new DMLRuntimeException("Invalid opcode (" + opcode + ") encountered in MatrixIndexingSPInstruction.");
    }

    if (rightIndexing) {
        // update and check output dimensions
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(rowHi - rowLo + 1, colHi - colLo + 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        mcOut.setNonZerosBound(Math.min(mcOut.getLength(), mcIn.getNonZerosBound()));
        checkValidOutputDimensions(mcOut);

        // execute right indexing operation (partitioning-preserving if possible)
        JavaPairRDD<MatrixIndexes, MatrixBlock> source = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
        if (isSingleBlockLookup(mcIn, ixrange)) {
            sec.setMatrixOutput(output.getName(), singleBlockIndexing(source, mcIn, mcOut, ixrange), getExtendedOpcode());
            return;
        }
        if (isMultiBlockLookup(source, mcIn, mcOut, ixrange)) {
            sec.setMatrixOutput(output.getName(), multiBlockIndexing(source, mcIn, mcOut, ixrange), getExtendedOpcode());
            return;
        }
        // general case: the result stays an rdd that is registered in the symbol table
        JavaPairRDD<MatrixIndexes, MatrixBlock> sliced = generalCaseRightIndexing(source, mcIn, mcOut, ixrange, _aggType);
        sec.setRDDHandleForVariable(output.getName(), sliced);
        sec.addLineageRDD(output.getName(), input1.getName());
        return;
    }

    // left indexing: _type decides which operand is the rdd and which one the broadcast
    boolean cacheLeft = (_type == LixCacheType.LEFT);
    String rddVar = cacheLeft ? input2.getName() : input1.getName();
    String bcVar = cacheLeft ? input1.getName() : input2.getName();
    JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(rddVar);

    // update and check output dimensions (same shape and blocking as the left-hand side)
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    MatrixCharacteristics mcLeft = ec.getMatrixCharacteristics(input1.getName());
    mcOut.set(mcLeft.getRows(), mcLeft.getCols(), mcLeft.getRowsPerBlock(), mcLeft.getColsPerBlock());
    checkValidOutputDimensions(mcOut);

    // note: always matrix rhs, scalars are preprocessed via cast to 1x1 matrix
    MatrixCharacteristics mcRight = ec.getMatrixCharacteristics(input2.getName());

    // sanity check matching index range and rhs dimensions
    if (!mcRight.dimsKnown()) {
        throw new DMLRuntimeException("The right input matrix dimensions are not specified for MatrixIndexingSPInstruction");
    }
    if (rowHi - rowLo + 1 != mcRight.getRows() || colHi - colLo + 1 != mcRight.getCols()) {
        throw new DMLRuntimeException("Invalid index range of leftindexing: [" + rowLo + ":" + rowHi + "," + colLo + ":" + colHi + "] vs [" + mcRight.getRows() + "x" + mcRight.getCols() + "].");
    }

    PartitionedBroadcast<MatrixBlock> rhsBroadcast = null;
    JavaPairRDD<MatrixIndexes, MatrixBlock> rhsSlices = null;
    JavaPairRDD<MatrixIndexes, MatrixBlock> result;
    if (mapLeftIndexing) {
        rhsBroadcast = sec.getBroadcastForVariable(bcVar);
        // partitioning-preserving mappartitions (key access required for broadcast lookup)
        result = target.mapPartitionsToPair(new LeftIndexPartitionFunction(rhsBroadcast, ixrange, _type, mcOut), true);
    } else {
        // general case: zero-out lhs, then slice rhs, shift and merge with lhs
        JavaPairRDD<MatrixIndexes, MatrixBlock> zeroedLhs = target.mapToPair(new ZeroOutLHS(false, ixrange, mcLeft));
        rhsSlices = sec.getBinaryBlockRDDHandleForVariable(input2.getName()).flatMapToPair(new SliceRHSForLeftIndexing(ixrange, mcLeft));
        result = RDDAggregateUtils.mergeByKey(zeroedLhs.union(rhsSlices));
    }

    // register the output rdd and the lineage of everything it was derived from
    sec.setRDDHandleForVariable(output.getName(), result);
    sec.addLineageRDD(output.getName(), rddVar);
    if (rhsBroadcast != null) {
        sec.addLineageBroadcast(output.getName(), bcVar);
    }
    if (rhsSlices != null) {
        sec.addLineageRDD(output.getName(), input2.getName());
    }
}

Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IndexRange(org.apache.sysml.runtime.util.IndexRange) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Aggregations

JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 99; MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 44; JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 42; MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 42; MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 41; Tuple2 (scala.Tuple2): 35; DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 33; JavaRDD (org.apache.spark.api.java.JavaRDD): 28; List (java.util.List): 27; SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 24; FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 23; Collectors (java.util.stream.Collectors): 22; IOException (java.io.IOException): 17; RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 16; LongWritable (org.apache.hadoop.io.LongWritable): 15; Broadcast (org.apache.spark.broadcast.Broadcast): 15; Text (org.apache.hadoop.io.Text): 12; UserException (org.broadinstitute.hellbender.exceptions.UserException): 12; Function (org.apache.spark.api.java.function.Function): 11; MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 11