Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.
The class FrameObject, method readBlobFromRDD.
@Override
protected FrameBlock readBlobFromRDD(RDDObject rdd, MutableBoolean status) throws IOException {
    // note: the read of a frame block from an RDD might trigger
    // lazy evaluation of pending transformations.
    RDDObject lrdd = rdd;
    // prepare return status (by default only collect)
    status.setValue(false);
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
    int rlen = (int) mc.getRows();
    int clen = (int) mc.getCols();
    // handle missing schema if necessary
    ValueType[] lschema = (_schema != null) ? _schema :
        UtilFunctions.nCopies(clen >= 1 ? clen : 1, ValueType.STRING);
    FrameBlock fb = null;
    try {
        // prevent unnecessary collect through rdd checkpoint
        if (rdd.allowsShortCircuitCollect()) {
            lrdd = (RDDObject) rdd.getLineageChilds().get(0);
        }
        // collect frame block from binary block RDD
        fb = SparkExecutionContext.toFrameBlock(lrdd, lschema, rlen, clen);
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    // sanity check correct output
    if (fb == null)
        throw new IOException("Unable to load frame from rdd.");
    return fb;
}
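The short-circuit above avoids materializing a checkpoint's cached copy: when the RDD handle merely checkpoints a single lineage child, collecting from that child yields the same frame. A minimal standalone sketch of this idea (LineageNode and collectTarget are hypothetical names, not SystemML API):

import java.util.ArrayList;
import java.util.List;

class LineageNode {
    final String name;
    final boolean checkpoint; // true if this node only caches its single input
    final List<LineageNode> children = new ArrayList<>();

    LineageNode(String name, boolean checkpoint) {
        this.name = name;
        this.checkpoint = checkpoint;
    }

    // node to collect from: a checkpoint's single input, else the node itself
    LineageNode collectTarget() {
        return (checkpoint && children.size() == 1) ? children.get(0) : this;
    }
}

public class ShortCircuitDemo {
    public static void main(String[] args) {
        LineageNode input = new LineageNode("binary-block input", false);
        LineageNode cp = new LineageNode("checkpoint", true);
        cp.children.add(input);
        // prints "binary-block input": the collect bypasses the cached copy
        System.out.println(cp.collectTarget().name);
    }
}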
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.
The class MatrixObject, method readBlobFromRDD.
@Override
protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus) throws IOException {
    // note: the read of a matrix block from an RDD might trigger
    // lazy evaluation of pending transformations.
    RDDObject lrdd = rdd;
    // prepare return status (by default only collect)
    writeStatus.setValue(false);
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
    InputInfo ii = iimd.getInputInfo();
    MatrixBlock mb = null;
    try {
        // prevent unnecessary collect through rdd checkpoint
        if (rdd.allowsShortCircuitCollect()) {
            lrdd = (RDDObject) rdd.getLineageChilds().get(0);
        }
        // obtain matrix block from RDD
        int rlen = (int) mc.getRows();
        int clen = (int) mc.getCols();
        int brlen = (int) mc.getRowsPerBlock();
        int bclen = (int) mc.getColsPerBlock();
        long nnz = mc.getNonZerosBound();
        // guarded rdd collect (the guard applies to binary block only, not binary cell)
        if (ii == InputInfo.BinaryBlockInputInfo
            && !OptimizerUtils.checkSparkCollectMemoryBudget(mc, getPinnedSize() + getBroadcastSize(), true)) {
            // note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
            if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) { // prevent overwriting an existing file
                long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
                _metaData.getMatrixCharacteristics().setNonZeros(newnnz);
                // mark rdd as non-pending (for export)
                ((RDDObject) rdd).setPending(false);
                // mark rdd as hdfs file (for restore)
                ((RDDObject) rdd).setHDFSFile(true);
                // mark for no cache-write on read
                writeStatus.setValue(true);
                // note: the hdfsFile flag is not entirely accurate because we still hold an rdd
                // reference to the input, not to an rdd of the hdfs file, but the resulting behavior is correct
            }
            mb = readBlobFromHDFS(_hdfsFileName);
        } else if (ii == InputInfo.BinaryCellInputInfo) {
            // collect matrix block from binary cell RDD
            mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, nnz);
        } else {
            // collect matrix block from binary block RDD
            mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
        }
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    // sanity check correct output
    if (mb == null)
        throw new IOException("Unable to load matrix from rdd.");
    return mb;
}
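The guard at the top of the try block collects to the driver only when the estimated result, together with already pinned and broadcast data, fits the memory budget; otherwise the binary-block RDD is written to HDFS and re-read. A hedged sketch of that decision with an illustrative worst-case dense estimate (all names below are hypothetical; SystemML's actual OptimizerUtils check accounts for sparsity and block overheads):

public class GuardedCollectDemo {
    // worst-case dense size of an rlen x clen double matrix (assumption: 8 bytes per value)
    static long estimateSize(long rlen, long clen) {
        return rlen * clen * 8L;
    }

    // collect only if the estimate plus already-pinned data fits the budget
    static boolean fitsInBudget(long rlen, long clen, long pinnedBytes, long budgetBytes) {
        return estimateSize(rlen, clen) + pinnedBytes <= budgetBytes;
    }

    public static void main(String[] args) {
        long budget = 512L * 1024 * 1024; // e.g., a fixed fraction of the driver heap
        System.out.println(fitsInBudget(1000, 1000, 0, budget));     // true  -> collect directly
        System.out.println(fitsInBudget(100000, 100000, 0, budget)); // false -> write to HDFS, then re-read
    }
}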
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.
The class SparkExecutionContext, method getRDDHandleForFrameObject.
/**
* FIXME: currently this implementation assumes matrix representations but frame signature
* in order to support the old transform implementation.
*
* @param fo frame object
* @param inputInfo input info
* @return JavaPairRDD handle for a frame object
*/
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo) {
    // NOTE: MB this logic should be integrated into FrameObject.
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // the frame object while all the logic is in the SparkExecutionContext.
    InputInfo inputInfo2 = (inputInfo == InputInfo.BinaryBlockInputInfo) ? InputInfo.BinaryBlockFrameInputInfo : inputInfo;
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    // CASE 1: rdd already existing (reuse if checkpoint, or trigger pending rdd operations if not yet cached)
    if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
        // return existing rdd handle (w/o input format change)
        rdd = fo.getRDDHandle().getRDD();
    }
    // CASE 2: dirty in-memory data or cached result of rdd operations
    else if (fo.isDirty() || fo.isCached(false)) {
        // get in-memory frame block and parallelize it,
        // w/ guarded parallelize (fallback to export and rdd from file if too large)
        MatrixCharacteristics mc = fo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (fo.isDirty()) {
                // write only if necessary
                fo.exportData();
            }
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // deep copy is a workaround for a read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
            fromFile = true;
        } else {
            // default case: pin frame in memory
            FrameBlock fb = fo.acquireRead();
            rdd = toFrameJavaPairRDD(sc, fb);
            // unpin frame
            fo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        fo.setRDDHandle(rddhandle);
    }
    // CASE 3: non-dirty (file exists on HDFS)
    else {
        // for binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo2 == InputInfo.BinaryBlockFrameInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommends to copy all key/value pairs
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
        } else if (inputInfo2 == InputInfo.TextCellInputInfo || inputInfo2 == InputInfo.CSVInputInfo || inputInfo2 == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // deep copy is a workaround for a read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo2 == InputInfo.BinaryCellInputInfo) {
            throw new DMLRuntimeException("Binarycell not supported for frames.");
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        fo.setRDDHandle(rddhandle);
    }
    return rdd;
}
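Both hadoopFile branches immediately map a copy function over the raw pairs. As the Spark 1.4 note above explains, Hadoop record readers reuse their key/value objects, so caching or collecting uncopied pairs would leave many references aliasing one mutable instance. A hedged sketch of the same copy-on-read pattern using only standard Spark and Hadoop APIs (readText is a hypothetical helper, not part of SystemML):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CopyOnReadDemo {
    public static JavaPairRDD<LongWritable, Text> readText(JavaSparkContext sc, String path) {
        JavaPairRDD<LongWritable, Text> raw =
            sc.hadoopFile(path, TextInputFormat.class, LongWritable.class, Text.class);
        // deep-copy each pair so cached partitions do not alias the reused reader objects
        return raw.mapToPair(kv ->
            new Tuple2<>(new LongWritable(kv._1().get()), new Text(kv._2())));
    }
}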
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.
The class SparkExecutionContext, method addLineageRDD.
// /////////////////////////////////////////
// Cleanup of RDDs and Broadcast variables
// /////
/**
* Adds a child rdd object to the lineage of a parent rdd.
*
* @param varParent parent variable
* @param varChild child variable
*/
public void addLineageRDD(String varParent, String varChild) {
    RDDObject parent = getCacheableData(varParent).getRDDHandle();
    RDDObject child = getCacheableData(varChild).getRDDHandle();
    parent.addLineageChild(child);
}
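This wiring lets reference counts track consumers of each handle: the cleanup method in the next snippet decrements each child's count before recursing. A minimal sketch of such counted lineage wiring (CountedNode is a hypothetical stand-in, not SystemML's LineageObject):

import java.util.ArrayList;
import java.util.List;

class CountedNode {
    private int numReferences = 0;
    private final List<CountedNode> children = new ArrayList<>();

    void addLineageChild(CountedNode c) {
        c.numReferences++; // child gains one consumer
        children.add(c);
    }
    int getNumReferences() { return numReferences; }
    List<CountedNode> getChildren() { return children; }
    void decrementNumReferences() { numReferences--; }
}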
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.
The class SparkExecutionContext, method rCleanupLineageObject.
@SuppressWarnings({ "rawtypes", "unchecked" })
private void rCleanupLineageObject(LineageObject lob) throws IOException {
    // abort recursive cleanup if there are still consumers
    if (lob.getNumReferences() > 0)
        return;
    // abort if still reachable through a back reference (for robustness
    // in function calls and to prevent repeated scans of the symbol table)
    if (lob.hasBackReference())
        return;
    // cleanup current lineage object (from driver/executors),
    // incl deferred hdfs file removal (only if metadata set by cleanup call)
    if (lob instanceof RDDObject) {
        RDDObject rdd = (RDDObject) lob;
        int rddID = rdd.getRDD().id();
        cleanupRDDVariable(rdd.getRDD());
        if (rdd.getHDFSFilename() != null) {
            // deferred file removal
            MapReduceTool.deleteFileWithMTDIfExistOnHDFS(rdd.getHDFSFilename());
        }
        if (rdd.isParallelizedRDD())
            _parRDDs.deregisterRDD(rddID);
    } else if (lob instanceof BroadcastObject) {
        PartitionedBroadcast pbm = ((BroadcastObject) lob).getBroadcast();
        if (pbm != null) {
            // robustness for evictions
            for (Broadcast<PartitionedBlock> bc : pbm.getBroadcasts())
                cleanupBroadcastVariable(bc);
        }
        CacheableData.addBroadcastSize(-((BroadcastObject) lob).getSize());
    }
    // recursively process lineage children
    for (LineageObject c : lob.getLineageChilds()) {
        c.decrementNumReferences();
        rCleanupLineageObject(c);
    }
}
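Reusing the hypothetical CountedNode sketch from above, the recursive shape of this cleanup can be shown in isolation: a node is freed only once its reference count reaches zero, and each child's count is decremented before the recursion revisits it (resource release is elided):

class LineageCleanup {
    // hedged sketch of the recursive cleanup: free a node only when no
    // consumers remain, then release its children transitively
    static void cleanup(CountedNode node) {
        if (node.getNumReferences() > 0)
            return; // still referenced by another consumer
        // ... release node-local resources here (e.g., unpersist rdd, delete file) ...
        for (CountedNode c : node.getChildren()) {
            c.decrementNumReferences(); // this consumer is gone
            cleanup(c); // child may now be unreferenced as well
        }
    }
}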