Search in sources:

Example 1 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by Apache.

The class DataTransform, method spDataTransform.

public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
    JobConf job = new JobConf();
    FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
    checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
    // find the first part file in alphabetical order within directory inputPath
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names and construct output header
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    int numColumns = colNamesToIds.size();
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    // Construct RDD for input data
    @SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
    JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
    long numRowsTf = 0, numColumnsTf = 0;
    JavaPairRDD<Long, String> tfPairRDD = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // release colNamesToIds so it can be garbage collected
        colNamesToIds = null;
        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
        // store the specFileWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // release colNamesToIds so it can be garbage collected
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // Apply transformation metadata, and perform actual transformation 
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
    }
    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // convert to csv output format (serialized longwritable/text)
    JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
    if (outtfPairRDD != null) {
        MatrixObject outMO = outputs[0];
        String outVar = outMO.getVarName();
        outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
        sec.addLineageRDD(outVar, inst.getParams().get("target"));
        // update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
        mcOut.setDimension(numRowsTf, numColumnsTf);
        mcOut.setNonZeros(-1);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), Text (org.apache.hadoop.io.Text), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), Tuple2 (scala.Tuple2), FileSystem (org.apache.hadoop.fs.FileSystem), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf)
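
Every example on this page begins with the same unconditional downcast from the generic ExecutionContext to SparkExecutionContext. A minimal defensive variant of that first step is sketched below; the class and helper names are hypothetical, not part of SystemML:

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;

public class ContextCasts {
    // Hypothetical helper: fail with a descriptive error instead of a bare
    // ClassCastException when an instruction is dispatched on the wrong backend.
    public static SparkExecutionContext asSparkContext(ExecutionContext ec) {
        if (!(ec instanceof SparkExecutionContext))
            throw new DMLRuntimeException("Expected a SparkExecutionContext but got "
                + (ec == null ? "null" : ec.getClass().getName()));
        return (SparkExecutionContext) ec;
    }
}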

Example 2 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by Apache.

The class CumulativeAggregateSPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    long rlen = mc.getRows();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    // execute unary aggregate (w/ implicit drop correction)
    AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapToPair(new RDDCumAggFunction(auop, rlen, brlen, bclen));
    out = RDDAggregateUtils.mergeByKey(out, false);
    // put output handle in symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), AggregateUnaryOperator (org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
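
The shape of this instruction, a per-block mapToPair followed by a merge of the partial results that share a key, can be shown without the SystemML block types. A minimal plain-Spark sketch, with reduceByKey standing in for RDDAggregateUtils.mergeByKey and all names illustrative:

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

public class MapThenMergeSketch {
    // Compute a per-block partial aggregate, then merge partials that share a
    // key; reduceByKey plays the role of RDDAggregateUtils.mergeByKey above.
    public static JavaPairRDD<Long, Double> sumPerKey(JavaPairRDD<Long, double[]> in) {
        return in
            .mapToPair(kv -> new Tuple2<>(kv._1(), Arrays.stream(kv._2()).sum()))
            .reduceByKey(Double::sum);
    }
}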

Example 3 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by Apache.

The class CumulativeOffsetSPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input2.getName());
    long rlen = mc.getRows();
    int brlen = mc.getRowsPerBlock();
    // get inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> inData = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> inAgg = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
    // prepare aggregates (cumsplit of offsets)
    inAgg = inAgg.flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, brlen));
    // execute cumulative offset (apply cumulative op w/ offsets)
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = inData.join(inAgg).mapValues(new RDDCumOffsetFunction(_uop, _bop));
    updateUnaryOutputMatrixCharacteristics(sec);
    // put output handle in symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
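
The core step here is the join of data blocks with their matching offset blocks, followed by mapValues to apply each offset. A minimal plain-Spark sketch of that step, with double arrays standing in for matrix blocks and all names illustrative:

import org.apache.spark.api.java.JavaPairRDD;

public class CumOffsetSketch {
    // Analogue of inData.join(inAgg).mapValues(new RDDCumOffsetFunction(...)):
    // each block is joined with its incoming offset, then a cumulative sum is
    // computed within the block starting from that offset.
    public static JavaPairRDD<Long, double[]> cumsumWithOffsets(
            JavaPairRDD<Long, double[]> data, JavaPairRDD<Long, Double> offsets) {
        return data.join(offsets).mapValues(pair -> {
            double[] block = pair._1().clone(); // never mutate RDD values in place
            double run = pair._2();             // offset carried in from earlier blocks
            for (int i = 0; i < block.length; i++) {
                run += block[i];
                block[i] = run;
            }
            return block;
        });
    }
}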

Example 4 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by Apache.

The class FrameAppendMSPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    // map-only append (rhs must be vector and fit in mapper mem)
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
    JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
    PartitionedBroadcast<FrameBlock> in2 = sec.getBroadcastForFrameVariable(input2.getName());
    // execute map-append operations (partitioning preserving if keys for blocks not changing)
    JavaPairRDD<Long, FrameBlock> out = null;
    if (preservesPartitioning(_cbind)) {
        out = in1.mapPartitionsToPair(new MapSideAppendPartitionFunction(in2), true);
    } else
        throw new DMLRuntimeException("Append type rbind not supported for frame mappend, instead use rappend");
    // put output RDD handle into symbol table
    updateBinaryAppendOutputMatrixCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageBroadcast(output.getName(), input2.getName());
    // update schema of output with merged input schemas
    sec.getFrameObject(output.getName()).setSchema(sec.getFrameObject(input1.getName()).mergeSchemas(sec.getFrameObject(input2.getName())));
}
Also used: FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
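
The map-side append avoids a shuffle by broadcasting the small right-hand side and extending each left-hand block locally; preservesPartitioning can be set because the keys never change. A minimal plain-Spark sketch of the same idea, using an ordinary Broadcast of a java.util.Map in place of PartitionedBroadcast; all names are illustrative:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class MapAppendSketch {
    // Analogue of MapSideAppendPartitionFunction: for every left block, look
    // up the matching right block in the broadcast and concatenate columns.
    public static JavaPairRDD<Long, double[]> mapCbind(
            JavaPairRDD<Long, double[]> left, Broadcast<Map<Long, double[]>> right) {
        return left.mapPartitionsToPair(it -> {
            Map<Long, double[]> rhs = right.value();
            List<Tuple2<Long, double[]>> out = new ArrayList<>();
            while (it.hasNext()) {
                Tuple2<Long, double[]> kv = it.next();
                double[] l = kv._2();
                double[] r = rhs.getOrDefault(kv._1(), new double[0]);
                double[] merged = new double[l.length + r.length];
                System.arraycopy(l, 0, merged, 0, l.length);
                System.arraycopy(r, 0, merged, l.length, r.length);
                out.add(new Tuple2<>(kv._1(), merged));
            }
            return out.iterator();
        }, true); // keys unchanged, so partitioning is preserved
    }
}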

Example 5 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by Apache.

The class FrameIndexingSPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();
    // get indexing range
    long rl = ec.getScalarInput(rowLower.getName(), rowLower.getValueType(), rowLower.isLiteral()).getLongValue();
    long ru = ec.getScalarInput(rowUpper.getName(), rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue();
    long cl = ec.getScalarInput(colLower.getName(), colLower.getValueType(), colLower.isLiteral()).getLongValue();
    long cu = ec.getScalarInput(colUpper.getName(), colUpper.getValueType(), colUpper.isLiteral()).getLongValue();
    IndexRange ixrange = new IndexRange(rl, ru, cl, cu);
    // right indexing
    if (opcode.equalsIgnoreCase(RightIndex.OPCODE)) {
        // update and check output dimensions
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(ru - rl + 1, cu - cl + 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        checkValidOutputDimensions(mcOut);
        // execute right indexing operation (partitioning-preserving if possible)
        JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
        JavaPairRDD<Long, FrameBlock> out = null;
        if (isPartitioningPreservingRightIndexing(mcIn, ixrange)) {
            out = in1.mapPartitionsToPair(new SliceBlockPartitionFunction(ixrange, mcOut), true);
        } else {
            out = in1.filter(new IsFrameBlockInRange(rl, ru, mcOut)).mapToPair(new SliceBlock(ixrange, mcOut));
        }
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
        // update schema of output with subset of input schema
        sec.getFrameObject(output.getName()).setSchema(sec.getFrameObject(input1.getName()).getSchema((int) cl, (int) cu));
    } else if (opcode.equalsIgnoreCase(LeftIndex.OPCODE) || opcode.equalsIgnoreCase("mapLeftIndex")) {
        // left indexing
        JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
        PartitionedBroadcast<FrameBlock> broadcastIn2 = null;
        JavaPairRDD<Long, FrameBlock> in2 = null;
        JavaPairRDD<Long, FrameBlock> out = null;
        // update and check output dimensions
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        MatrixCharacteristics mcLeft = ec.getMatrixCharacteristics(input1.getName());
        mcOut.set(mcLeft.getRows(), mcLeft.getCols(), mcLeft.getRowsPerBlock(), mcLeft.getColsPerBlock());
        checkValidOutputDimensions(mcOut);
        // note: always frame rhs, scalars are preprocessed via cast to 1x1 frame
        MatrixCharacteristics mcRight = ec.getMatrixCharacteristics(input2.getName());
        // sanity check matching index range and rhs dimensions
        if (!mcRight.dimsKnown()) {
            throw new DMLRuntimeException("The right input frame dimensions are not specified for FrameIndexingSPInstruction");
        }
        if (!(ru - rl + 1 == mcRight.getRows() && cu - cl + 1 == mcRight.getCols())) {
            throw new DMLRuntimeException("Invalid index range of leftindexing: [" + rl + ":" + ru + "," + cl + ":" + cu + "] vs [" + mcRight.getRows() + "x" + mcRight.getCols() + "].");
        }
        if (opcode.equalsIgnoreCase("mapLeftIndex")) {
            broadcastIn2 = sec.getBroadcastForFrameVariable(input2.getName());
            // partitioning-preserving mappartitions (key access required for broadcast lookup)
            out = in1.mapPartitionsToPair(new LeftIndexPartitionFunction(broadcastIn2, ixrange, mcOut), true);
        } else {
            // general case
            // zero-out lhs
            in1 = in1.flatMapToPair(new ZeroOutLHS(false, ixrange, mcLeft));
            // slice rhs, shift and merge with lhs
            in2 = sec.getFrameBinaryBlockRDDHandleForVariable(input2.getName()).flatMapToPair(new SliceRHSForLeftIndexing(ixrange, mcLeft));
            out = FrameRDDAggregateUtils.mergeByKey(in1.union(in2));
        }
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
        if (broadcastIn2 != null)
            sec.addLineageBroadcast(output.getName(), input2.getName());
        if (in2 != null)
            sec.addLineageRDD(output.getName(), input2.getName());
    } else
        throw new DMLRuntimeException("Invalid opcode (" + opcode + ") encountered in FrameIndexingSPInstruction.");
}
Also used: IsFrameBlockInRange (org.apache.sysml.runtime.instructions.spark.functions.IsFrameBlockInRange), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), IndexRange (org.apache.sysml.runtime.util.IndexRange), PartitionedBroadcast (org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)
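
The right-indexing path above has a simple filter-then-slice structure: blocks overlapping the requested row range survive the filter (the role of IsFrameBlockInRange), and each survivor is cut down and re-keyed into the output coordinate space (the role of SliceBlock). A minimal plain-Spark sketch with double[][] blocks keyed by their 1-based starting row; all names are illustrative:

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

public class RightIndexSketch {
    // Analogue of in1.filter(new IsFrameBlockInRange(...)).mapToPair(new SliceBlock(...)).
    // rl/ru/cl/cu are 1-based inclusive bounds, as in the instruction above.
    public static JavaPairRDD<Long, double[][]> slice(
            JavaPairRDD<Long, double[][]> in, long rl, long ru, int cl, int cu) {
        return in
            // keep only blocks whose row range [key, key + rows - 1] overlaps [rl, ru]
            .filter(kv -> kv._1() <= ru && kv._1() + kv._2().length - 1 >= rl)
            .mapToPair(kv -> {
                double[][] b = kv._2();
                int rOff = (int) Math.max(rl - kv._1(), 0);
                int rEnd = (int) Math.min(ru - kv._1(), b.length - 1);
                double[][] sliced = new double[rEnd - rOff + 1][];
                for (int i = rOff; i <= rEnd; i++)
                    sliced[i - rOff] = Arrays.copyOfRange(b[i], cl - 1, cu);
                // shift the key into the coordinate space of the output
                return new Tuple2<>(kv._1() + rOff - rl + 1, sliced);
            });
    }
}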

Aggregations

SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 112 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 92 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 92 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 71 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 39 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 22 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 14 usages
DoubleObject (org.apache.sysml.runtime.instructions.cp.DoubleObject): 12 usages
ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject): 9 usages
PartitionedBroadcast (org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast): 8 usages
FilterNonEmptyBlocksFunction (org.apache.sysml.runtime.instructions.spark.functions.FilterNonEmptyBlocksFunction): 7 usages
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 7 usages
ArrayList (java.util.ArrayList): 6 usages
CPOperand (org.apache.sysml.runtime.instructions.cp.CPOperand): 6 usages
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 6 usages
AggregateDropCorrectionFunction (org.apache.sysml.runtime.instructions.spark.functions.AggregateDropCorrectionFunction): 6 usages
AggregateOperator (org.apache.sysml.runtime.matrix.operators.AggregateOperator): 6 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 5 usages
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 5 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 4 usages