Search in sources :

Example 21 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class SparkExecutionContext method cleanupCacheableData.

@Override
public void cleanupCacheableData(CacheableData<?> mo) {
    try {
        if (mo.isCleanupEnabled()) {
            // compute ref count only if matrix cleanup actually necessary
            if (!getVariables().hasReferences(mo)) {
                // clean cached data
                mo.clearData();
                // clean hdfs data if no pending rdd operations on it
                if (mo.isHDFSFileExists() && mo.getFileName() != null) {
                    if (mo.getRDDHandle() == null) {
                        MapReduceTool.deleteFileWithMTDIfExistOnHDFS(mo.getFileName());
                    } else {
                        // deferred file removal
                        RDDObject rdd = mo.getRDDHandle();
                        rdd.setHDFSFilename(mo.getFileName());
                    }
                }
                // note: requires that mo.clearData already removed back references
                if (mo.getRDDHandle() != null) {
                    rCleanupLineageObject(mo.getRDDHandle());
                }
                if (mo.getBroadcastHandle() != null) {
                    rCleanupLineageObject(mo.getBroadcastHandle());
                }
            }
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
}
Also used : RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 22 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class SparkExecutionContext method getRDDHandleForMatrixObject.

/**
 * This call returns an RDD handle for a given matrix object. This includes
 * the creation of RDDs for in-memory or binary-block HDFS data.
 *
 * @param mo matrix object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a matrix object
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo) {
    // NOTE: MB this logic should be integrated into MatrixObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    // rdd operations if already executed and cached
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        // return existing rdd handling (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    } else // CASE 2: dirty in memory data or cached result of rdd operations
    if (mo.isDirty() || mo.isCached(false)) {
        // get in-memory matrix block and parallelize it
        // w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (// write if necessary
            mo.isDirty() || !mo.isHDFSFileExists())
                mo.exportData();
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            // default case
            // pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
            // unpin matrix
            mo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        rddhandle.setParallelizedRDD(!fromFile);
        mo.setRDDHandle(rddhandle);
    } else // CASE 3: non-dirty (file exists on HDFS)
    {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            // cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyBinaryCellFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 23 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class SparkExecutionContext method repartitionAndCacheMatrixObject.

@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject(String var) {
    MatrixObject mo = getMatrixObject(var);
    MatrixCharacteristics mcIn = mo.getMatrixCharacteristics();
    // double check size to avoid unnecessary spark context creation
    if (!OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), (double) OptimizerUtils.estimateSizeExactSparsity(mcIn)))
        return;
    // get input rdd and default storage level
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
    // avoid unnecessary caching of input in order to reduce memory pressure
    if (mo.getRDDHandle().allowsShortCircuitRead() && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id())) {
        in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        if (numPartitions < in.getNumPartitions())
            in = in.coalesce(numPartitions);
    }
    // repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
    // executed on the original data, because there will be no merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);
    // convert mcsr into memory-efficient csr if potentially sparse
    if (OptimizerUtils.checkSparseBlockCSRConversion(mcIn)) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }
    // persist rdd in default storage level
    out.persist(Checkpoint.DEFAULT_STORAGE_LEVEL).count();
    // create new rdd handle, in-place of current matrix object
    // guaranteed to exist (see above)
    RDDObject inro = mo.getRDDHandle();
    // create new rdd object
    RDDObject outro = new RDDObject(out);
    // mark as checkpointed
    outro.setCheckpointRDD(true);
    // keep lineage to prevent cycles on cleanup
    outro.addLineageChild(inro);
    mo.setRDDHandle(outro);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CreateSparseBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Checkpoint(org.apache.sysml.lops.Checkpoint) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 24 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class SparkExecutionContext method getRDDHandleForFrameObject.

/**
 * FIXME: currently this implementation assumes matrix representations but frame signature
 * in order to support the old transform implementation.
 *
 * @param fo frame object
 * @param inputInfo input info
 * @return JavaPairRDD handle for a frame object
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo) {
    // NOTE: MB this logic should be integrated into FrameObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    InputInfo inputInfo2 = (inputInfo == InputInfo.BinaryBlockInputInfo) ? InputInfo.BinaryBlockFrameInputInfo : inputInfo;
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    // rdd operations if already executed and cached
    if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
        // return existing rdd handling (w/o input format change)
        rdd = fo.getRDDHandle().getRDD();
    } else // CASE 2: dirty in memory data or cached result of rdd operations
    if (fo.isDirty() || fo.isCached(false)) {
        // get in-memory matrix block and parallelize it
        // w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = fo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (fo.isDirty()) {
                // write only if necessary
                fo.exportData();
            }
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
            fromFile = true;
        } else {
            // default case
            // pin frame in memory
            FrameBlock fb = fo.acquireRead();
            rdd = toFrameJavaPairRDD(sc, fb);
            // unpin frame
            fo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        fo.setRDDHandle(rddhandle);
    } else // CASE 3: non-dirty (file exists on HDFS)
    {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo2 == InputInfo.BinaryBlockFrameInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
        } else if (inputInfo2 == InputInfo.TextCellInputInfo || inputInfo2 == InputInfo.CSVInputInfo || inputInfo2 == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo2 == InputInfo.BinaryCellInputInfo) {
            throw new DMLRuntimeException("Binarycell not supported for frames.");
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        fo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 25 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class ResultMergeRemoteSpark method executeMerge.

@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int brlen, int bclen) {
    String jobname = "ParFor-RMSP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) _ec;
    boolean withCompare = (compare != null);
    RDDObject ret = null;
    // determine degree of parallelism
    int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
    // sanity check for empty src files
    if (inputs == null || inputs.length == 0)
        throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
    try {
        // note: initial implementation via union over all result rdds discarded due to
        // stack overflow errors with many parfor tasks, and thus many rdds
        // Step 1: construct input rdd from all result files of parfor workers
        // a) construct job conf with all files
        InputInfo ii = InputInfo.BinaryBlockInputInfo;
        JobConf job = new JobConf(ResultMergeRemoteMR.class);
        job.setJobName(jobname);
        job.setInputFormat(ii.inputFormatClass);
        Path[] paths = new Path[inputs.length];
        for (int i = 0; i < paths.length; i++) {
            // ensure input exists on hdfs (e.g., if in-memory or RDD)
            inputs[i].exportData();
            paths[i] = new Path(inputs[i].getFileName());
            // update rdd handle to allow lazy evaluation by guarding
            // against cleanup of temporary result files
            setRDDHandleForMerge(inputs[i], sec);
        }
        FileInputFormat.setInputPaths(job, paths);
        // b) create rdd from input files w/ deep copy of keys and blocks
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext().hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass).mapPartitionsToPair(new CopyBlockPairFunction(true), true);
        // Step 2a: merge with compare
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
        if (withCompare) {
            JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
            // merge values which differ from compare values
            ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
            out = // group all result blocks per key
            rdd.groupByKey(numRed).join(// join compare block and result blocks
            compareRdd).mapToPair(// merge result blocks w/ compare
            cfun);
        } else // Step 2b: merge without compare
        {
            // direct merge in any order (disjointness guaranteed)
            out = _isAccum ? RDDAggregateUtils.sumByKeyStable(rdd, false) : RDDAggregateUtils.mergeByKey(rdd, false);
        }
        // Step 3: create output rdd handle w/ lineage
        ret = new RDDObject(out);
        for (int i = 0; i < paths.length; i++) ret.addLineageChild(inputs[i].getRDDHandle());
        if (withCompare)
            ret.addLineageChild(compare.getRDDHandle());
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) CopyBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JobConf(org.apache.hadoop.mapred.JobConf)

Aggregations

RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)31 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)22 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)13 LongWritable (org.apache.hadoop.io.LongWritable)11 Text (org.apache.hadoop.io.Text)11 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)11 CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction)10 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)10 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)9 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)9 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)9 ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair)8 ValueType (org.apache.sysml.parser.Expression.ValueType)7 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)7 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)7 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)6 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)6 IOException (java.io.IOException)4 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)4 Path (org.apache.hadoop.fs.Path)3