Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.
Class ResultMergeRemoteSpark, method setRDDHandleForMerge.
@SuppressWarnings("unchecked")
private static void setRDDHandleForMerge(MatrixObject mo, SparkExecutionContext sec) {
    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
    JavaSparkContext sc = sec.getSparkContext();
    // wrap the existing binary-block file on HDFS as a lazily evaluated hadoop-file RDD
    JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        sc.hadoopFile(mo.getFileName(), iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
    // register the RDD handle on the matrix object and mark it as backed by an HDFS file
    RDDObject rddhandle = new RDDObject(rdd);
    rddhandle.setHDFSFile(true);
    mo.setRDDHandle(rddhandle);
}
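Note that setRDDHandleForMerge does not read any data itself; it merely wraps the existing binary-block file as a lazy Hadoop-file RDD and records on the handle that the data is file-backed. The standalone sketch below illustrates the same lazy wrapping pattern with plain Spark and a plain text file via TextInputFormat; the file path and class name are illustrative only and not part of SystemML.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class LazyFileHandleSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("LazyFileHandleSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Wrap an existing file as a lazy RDD; analogous to setRDDHandleForMerge,
            // no data is read until a Spark action is executed.
            JavaPairRDD<LongWritable, Text> rdd = sc.hadoopFile(
                "/tmp/example-input.txt", // hypothetical path
                TextInputFormat.class, LongWritable.class, Text.class);
            // A handle object would normally keep this RDD plus a flag that the data
            // is file-backed; here we simply trigger the first (and only) job.
            System.out.println("lines: " + rdd.count());
        }
    }
}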
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.
Class ResultMergeRemoteSpark, method executeParallelMerge.
@Override
public MatrixObject executeParallelMerge(int par) {
    // always create new matrix object (required for nested parallelism)
    MatrixObject moNew = null;
    if (LOG.isTraceEnabled())
        LOG.trace("ResultMerge (remote, spark): Execute serial merge for output "
            + _output.hashCode() + " (fname=" + _output.getFileName() + ")");
    try {
        if (_inputs != null && _inputs.length > 0) {
            // prepare compare matrix (only needed if the output already contains non-zeros)
            MetaDataFormat metadata = (MetaDataFormat) _output.getMetaData();
            MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
            MatrixObject compare = (mcOld.getNonZeros() == 0) ? null : _output;
            // actual merge
            RDDObject ro = executeMerge(compare, _inputs, mcOld.getRows(), mcOld.getCols(),
                mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
            // create new output matrix (e.g., to prevent potential export<->read file access conflicts)
            moNew = new MatrixObject(_output.getValueType(), _outputFName);
            OutputInfo oiOld = metadata.getOutputInfo();
            InputInfo iiOld = metadata.getInputInfo();
            MatrixCharacteristics mc = new MatrixCharacteristics(mcOld);
            // non-zeros are unknown (-1) for accumulation, otherwise recomputed over all inputs
            mc.setNonZeros(_isAccum ? -1 : computeNonZeros(_output, Arrays.asList(_inputs)));
            MetaDataFormat meta = new MetaDataFormat(mc, oiOld, iiOld);
            moNew.setMetaData(meta);
            moNew.setRDDHandle(ro);
        } else {
            // return old matrix, to prevent copy
            moNew = _output;
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return moNew;
}
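The heavy lifting happens in executeMerge, which combines the per-worker results with the old output (the compare matrix) into a single RDD. The sketch below is a deliberately simplified, hypothetical illustration of that merge idea over generic (cellId, value) pairs, not SystemML's actual executeMerge: per key, the worker's value replaces the old value only if it actually changed, so cells untouched by the worker survive.

import org.apache.spark.api.java.JavaPairRDD;

// Hypothetical, simplified result-merge over generic (cellId, value) pairs.
class MergeSketch {
    static JavaPairRDD<Long, Double> merge(JavaPairRDD<Long, Double> oldOutput,
                                           JavaPairRDD<Long, Double> workerOutput) {
        // keep the worker's value for a cell if it differs from the old value,
        // otherwise keep the old value
        return oldOutput.leftOuterJoin(workerOutput)
            .mapValues(v -> v._2().isPresent() && !v._2().get().equals(v._1())
                ? v._2().get() : v._1());
    }
}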
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.
Class OptimizerRuleBased, method rewriteSetSparkEagerRDDCaching.
///////
// REWRITE set spark eager rdd caching
///
protected void rewriteSetSparkEagerRDDCaching(OptNode n, LocalVariableMap vars) {
    // get program blocks of root parfor
    Object[] progobj = OptTreeConverter.getAbstractPlanMapping().getMappedProg(n.getID());
    ParForStatementBlock pfsb = (ParForStatementBlock) progobj[0];
    ParForProgramBlock pfpb = (ParForProgramBlock) progobj[1];
    ArrayList<String> ret = new ArrayList<>();
    if (OptimizerUtils.isSparkExecutionMode()    // spark exec mode
        && n.getExecType() == ExecType.CP        // local parfor
        && _N > 1)                               // at least 2 iterations
    {
        Set<String> cand = pfsb.variablesRead().getVariableNames();
        Collection<String> rpVars = pfpb.getSparkRepartitionVariables();
        for (String var : cand) {
            Data dat = vars.get(var);
            if (dat != null && dat instanceof MatrixObject && ((MatrixObject) dat).getRDDHandle() != null) {
                MatrixObject mo = (MatrixObject) dat;
                MatrixCharacteristics mc = mo.getMatrixCharacteristics();
                RDDObject rdd = mo.getRDDHandle();
                if ((rpVars == null || !rpVars.contains(var))                          // not a repartition var
                    && rdd.rHasCheckpointRDDChilds()                                   // is cached rdd
                    && _lm / n.getK() < OptimizerUtils.estimateSizeExactSparsity(mc))  // is out-of-core dataset
                {
                    ret.add(var);
                }
            }
        }
        // apply rewrite to parfor pb
        if (!ret.isEmpty()) {
            pfpb.setSparkEagerCacheVariables(ret);
        }
    }
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set spark eager rdd caching' - result=" + ret.size()
        + " (" + ProgramConverter.serializeStringCollection(ret) + ")");
}
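The decisive condition is the last one: a variable qualifies for eager RDD caching only if its estimated in-memory size exceeds the local memory budget divided by the parfor degree of parallelism (_lm / n.getK()), i.e., the dataset would not fit into a single worker's share of local memory and is therefore worth materializing in the cluster up front. A minimal sketch of that check follows; the real rewrite uses OptimizerUtils.estimateSizeExactSparsity(mc), whereas the size formula here is a crude stand-in (8 bytes per expected non-zero, ignoring block and sparse-row overheads), and all names are illustrative.

// Simplified sketch of the out-of-core test in rewriteSetSparkEagerRDDCaching.
class EagerCachingCheckSketch {
    static boolean qualifiesForEagerCaching(long rows, long cols, double sparsity,
                                            double localMemBudget, int degreeOfParallelism) {
        // crude size estimate: 8 bytes per expected non-zero value
        double estimatedSizeBytes = (double) rows * cols * sparsity * 8d;
        // eager caching pays off when the dataset exceeds the per-worker
        // share of the local memory budget (_lm / k in the rewrite)
        return localMemBudget / degreeOfParallelism < estimatedSizeBytes;
    }
}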
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.
Class SparkExecutionContext, method rCleanupLineageObject.
@SuppressWarnings({ "rawtypes", "unchecked" })
private void rCleanupLineageObject(LineageObject lob) throws IOException {
    // abort recursive cleanup if there are still consumers
    if (lob.getNumReferences() > 0)
        return;
    // abort if still reachable via back references (for robustness in function calls
    // and to prevent repeated scans of the symbol table)
    if (lob.hasBackReference())
        return;
    // cleanup current lineage object from driver/executors,
    // incl deferred hdfs file removal (only if metadata set by cleanup call)
    if (lob instanceof RDDObject) {
        RDDObject rdd = (RDDObject) lob;
        int rddID = rdd.getRDD().id();
        cleanupRDDVariable(rdd.getRDD());
        if (rdd.getHDFSFilename() != null) {
            // deferred file removal
            MapReduceTool.deleteFileWithMTDIfExistOnHDFS(rdd.getHDFSFilename());
        }
        if (rdd.isParallelizedRDD())
            _parRDDs.deregisterRDD(rddID);
    } else if (lob instanceof BroadcastObject) {
        PartitionedBroadcast pbm = ((BroadcastObject) lob).getBroadcast();
        if (pbm != null) // robustness for evictions
            for (Broadcast<PartitionedBlock> bc : pbm.getBroadcasts())
                cleanupBroadcastVariable(bc);
        CacheableData.addBroadcastSize(-((BroadcastObject) lob).getSize());
    }
    // recursively process lineage children
    for (LineageObject c : lob.getLineageChilds()) {
        c.decrementNumReferences();
        rCleanupLineageObject(c);
    }
}
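The cleanup is driven by reference counting over the lineage graph: an object is released only once no consumer references it, and releasing it decrements the counts of its children, which may then cascade further down. The following self-contained sketch models only that recursion pattern with a hypothetical LineageNode class; it is not SystemML's LineageObject API.

import java.util.ArrayList;
import java.util.List;

// Minimal, hypothetical model of reference-counted lineage cleanup,
// mirroring the recursion pattern in rCleanupLineageObject.
class LineageNode {
    final String name;
    final List<LineageNode> children = new ArrayList<>();
    int numReferences = 0;

    LineageNode(String name) { this.name = name; }

    void addChild(LineageNode c) { children.add(c); c.numReferences++; }
}

public class LineageCleanupSketch {
    static void cleanup(LineageNode node) {
        // abort if the node is still referenced by another consumer
        if (node.numReferences > 0)
            return;
        // release the node's own resources here (RDD unpersist, file delete, ...)
        System.out.println("cleaning " + node.name);
        // then decrement and recursively clean each child
        for (LineageNode c : node.children) {
            c.numReferences--;
            cleanup(c);
        }
    }

    public static void main(String[] args) {
        LineageNode root = new LineageNode("result");
        LineageNode shared = new LineageNode("sharedInput");
        root.addChild(shared);
        LineageNode other = new LineageNode("otherConsumer");
        other.addChild(shared);  // shared is still referenced elsewhere, so it survives
        cleanup(root);           // cleans "result" but not "sharedInput"
    }
}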
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.
Class SparkExecutionContext, method setRDDHandleForVariable.
/**
* Keep the output rdd of spark rdd operations as meta data of matrix/frame
* objects in the symbol table.
*
* @param varname variable name
* @param rdd JavaPairRDD handle for variable
*/
public void setRDDHandleForVariable(String varname, JavaPairRDD<?, ?> rdd) {
    CacheableData<?> obj = getCacheableData(varname);
    RDDObject rddhandle = new RDDObject(rdd);
    obj.setRDDHandle(rddhandle);
}
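In practice this setter is called at the end of a Spark instruction: the instruction fetches the input RDD from the symbol table, applies its transformation, and registers the result as the output's RDD handle, usually together with a lineage edge so the cleanup shown above can reach the parent. The fragment below sketches that flow; apart from setRDDHandleForVariable itself, the method names (getBinaryBlockRDDHandleForVariable, addLineageRDD) and the variable names "X" and "Y" are assumptions that may differ across SystemML versions.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

// Hedged sketch: how a Spark instruction might register its output RDD.
class SetRDDHandleSketch {
    static void process(SparkExecutionContext sec) {
        JavaPairRDD<MatrixIndexes, MatrixBlock> in =
            sec.getBinaryBlockRDDHandleForVariable("X");  // assumed accessor, hypothetical input var
        JavaPairRDD<MatrixIndexes, MatrixBlock> out =
            in.mapValues(block -> block);                 // placeholder transformation
        sec.setRDDHandleForVariable("Y", out);            // register output handle
        sec.addLineageRDD("Y", "X");                      // assumed lineage hook for deferred cleanup
    }
}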