Search in sources :

Example 11 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class ResultMergeRemoteSpark method setRDDHandleForMerge.

/**
 * Creates an RDD by reading the file of the given matrix object in
 * binary-block input format and attaches it to that matrix object as
 * its RDD handle.
 *
 * @param mo matrix object whose file name is read and which receives the handle
 * @param sec spark execution context that supplies the JavaSparkContext
 */
@SuppressWarnings("unchecked")
private static void setRDDHandleForMerge(MatrixObject mo, SparkExecutionContext sec) {
    final InputInfo binaryBlock = InputInfo.BinaryBlockInputInfo;
    JavaSparkContext context = sec.getSparkContext();
    String fname = mo.getFileName();
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = (JavaPairRDD<MatrixIndexes, MatrixBlock>) context.hadoopFile(fname, binaryBlock.inputFormatClass, binaryBlock.inputKeyClass, binaryBlock.inputValueClass);
    // mark the handle as hdfs-file backed before publishing it on the matrix object
    RDDObject handle = new RDDObject(blocks);
    handle.setHDFSFile(true);
    mo.setRDDHandle(handle);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)

Example 12 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class ResultMergeRemoteSpark method executeParallelMerge.

/**
 * Merges the worker results in {@code _inputs} with the existing output and
 * returns a newly created matrix object that carries the merged RDD handle.
 * If there are no inputs, the existing output object is returned unchanged.
 *
 * @param par requested degree of parallelism (not read by this implementation)
 * @return the new merged matrix object, or {@code _output} when there are no inputs
 * @throws DMLRuntimeException wrapping any exception raised during the merge
 */
@Override
public MatrixObject executeParallelMerge(int par) {
    // always create new matrix object (required for nested parallelism)
    MatrixObject moNew = null;
    if (LOG.isTraceEnabled())
        LOG.trace("ResultMerge (remote, spark): Execute parallel merge for output " + _output.hashCode() + " (fname=" + _output.getFileName() + ")");
    try {
        if (_inputs != null && _inputs.length > 0) {
            // prepare compare: only needed when the old output reports non-zero values
            MetaDataFormat metadata = (MetaDataFormat) _output.getMetaData();
            MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
            MatrixObject compare = (mcOld.getNonZeros() == 0) ? null : _output;
            // actual merge
            RDDObject ro = executeMerge(compare, _inputs, mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
            // create new output matrix (e.g., to prevent potential export<->read file access conflict)
            moNew = new MatrixObject(_output.getValueType(), _outputFName);
            OutputInfo oiOld = metadata.getOutputInfo();
            InputInfo iiOld = metadata.getInputInfo();
            MatrixCharacteristics mc = new MatrixCharacteristics(mcOld);
            // for accumulation the nnz is set to -1 (presumably "unknown" -- confirm); otherwise it is computed from output and inputs
            mc.setNonZeros(_isAccum ? -1 : computeNonZeros(_output, Arrays.asList(_inputs)));
            MetaDataFormat meta = new MetaDataFormat(mc, oiOld, iiOld);
            moNew.setMetaData(meta);
            moNew.setRDDHandle(ro);
        } else {
            // return old matrix, to prevent copy
            moNew = _output;
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return moNew;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 13 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class OptimizerRuleBased method rewriteSetSparkEagerRDDCaching.

// /////
// REWRITE set spark eager rdd caching
// /
/**
 * Collects the variables read by the root parfor that qualify for eager
 * RDD caching and registers them on the parfor program block. The number
 * of evaluated plans is incremented and the outcome is logged regardless
 * of whether the rewrite applied.
 *
 * @param n opt node of the root parfor
 * @param vars symbol table used to look up the candidate variables
 */
protected void rewriteSetSparkEagerRDDCaching(OptNode n, LocalVariableMap vars) {
    // get program blocks of root parfor
    Object[] mapped = OptTreeConverter.getAbstractPlanMapping().getMappedProg(n.getID());
    ParForStatementBlock pfsb = (ParForStatementBlock) mapped[0];
    ParForProgramBlock pfpb = (ParForProgramBlock) mapped[1];
    ArrayList<String> ret = new ArrayList<>();
    // preconditions: spark exec mode, local parfor, at least 2 iterations
    boolean applicable = OptimizerUtils.isSparkExecutionMode()
        && n.getExecType() == ExecType.CP
        && _N > 1;
    if (applicable) {
        Set<String> candidates = pfsb.variablesRead().getVariableNames();
        Collection<String> repartitionVars = pfpb.getSparkRepartitionVariables();
        for (String name : candidates) {
            Data dat = vars.get(name);
            // only matrix objects are considered (instanceof also rejects null)
            if (!(dat instanceof MatrixObject))
                continue;
            MatrixObject mo = (MatrixObject) dat;
            // only matrices that already carry an rdd handle
            if (mo.getRDDHandle() == null)
                continue;
            MatrixCharacteristics mc = mo.getMatrixCharacteristics();
            RDDObject rdd = mo.getRDDHandle();
            // not a repartition var
            boolean notRepartitioned = repartitionVars == null || !repartitionVars.contains(name);
            // is cached rdd and is out-of-core dataset, i.e., _lm / k is below the estimated size
            // NOTE(review): _lm presumably holds the local memory budget -- confirm
            if (notRepartitioned
                && rdd.rHasCheckpointRDDChilds()
                && _lm / n.getK() < OptimizerUtils.estimateSizeExactSparsity(mc)) {
                ret.add(name);
            }
        }
        // apply rewrite to parfor pb
        if (!ret.isEmpty())
            pfpb.setSparkEagerCacheVariables(ret);
    }
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set spark eager rdd caching' - result=" + ret.size() + " (" + ProgramConverter.serializeStringCollection(ret) + ")");
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ArrayList(java.util.ArrayList) Data(org.apache.sysml.runtime.instructions.cp.Data) ParForProgramBlock(org.apache.sysml.runtime.controlprogram.ParForProgramBlock) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject)

Example 14 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class SparkExecutionContext method rCleanupLineageObject.

/**
 * Recursively cleans up the given lineage object and its lineage children.
 * Nothing happens for an object that still has references or a back
 * reference; otherwise its RDD or broadcasts are cleaned up, and each
 * child has its reference count decremented before being processed
 * in turn.
 *
 * @param lob lineage object to clean up
 * @throws IOException declared for the hdfs file removal
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void rCleanupLineageObject(LineageObject lob) throws IOException {
    // abort recursive cleanup if there are still consumers or the object
    // is back-referenced
    if (lob.getNumReferences() > 0 || lob.hasBackReference())
        return;
    // cleanup current lineage object (from driver/executors),
    // incl deferred hdfs file removal
    if (lob instanceof RDDObject) {
        RDDObject rddObj = (RDDObject) lob;
        // read the id first, before the rdd itself is cleaned up
        int id = rddObj.getRDD().id();
        cleanupRDDVariable(rddObj.getRDD());
        if (rddObj.getHDFSFilename() != null) {
            // deferred file removal
            MapReduceTool.deleteFileWithMTDIfExistOnHDFS(rddObj.getHDFSFilename());
        }
        if (rddObj.isParallelizedRDD()) {
            _parRDDs.deregisterRDD(id);
        }
    } else if (lob instanceof BroadcastObject) {
        BroadcastObject bcObj = (BroadcastObject) lob;
        PartitionedBroadcast pbm = bcObj.getBroadcast();
        // robustness for evictions: the partitioned broadcast may be gone
        if (pbm != null) {
            for (Broadcast<PartitionedBlock> bc : pbm.getBroadcasts()) {
                cleanupBroadcastVariable(bc);
            }
        }
        // the broadcast size is subtracted from the accounting in either case
        CacheableData.addBroadcastSize(-bcObj.getSize());
    }
    // recursively process lineage children
    for (LineageObject child : lob.getLineageChilds()) {
        child.decrementNumReferences();
        rCleanupLineageObject(child);
    }
}
Also used : PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LineageObject(org.apache.sysml.runtime.instructions.spark.data.LineageObject) Checkpoint(org.apache.sysml.lops.Checkpoint) BroadcastObject(org.apache.sysml.runtime.instructions.spark.data.BroadcastObject)

Example 15 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class SparkExecutionContext method setRDDHandleForVariable.

/**
 * Wraps the given rdd in a new RDDObject and stores it as the RDD handle
 * of the cacheable data object (matrix/frame) registered under the given
 * variable name.
 *
 * @param varname variable name
 * @param rdd JavaPairRDD handle for variable
 */
public void setRDDHandleForVariable(String varname, JavaPairRDD<?, ?> rdd) {
    getCacheableData(varname).setRDDHandle(new RDDObject(rdd));
}
Also used : RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject)

Aggregations

RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject) 31, MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics) 22, DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 13, LongWritable (org.apache.hadoop.io.LongWritable) 11, Text (org.apache.hadoop.io.Text) 11, MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject) 11, CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) 10, MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock) 10, JavaPairRDD (org.apache.spark.api.java.JavaPairRDD) 9, MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat) 9, MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes) 9, ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) 8, ValueType (org.apache.sysml.parser.Expression.ValueType) 7, FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock) 7, InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo) 7, FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject) 6, MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData) 6, IOException (java.io.IOException) 4, SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) 4, Path (org.apache.hadoop.fs.Path) 3