
Example 56 with InputInfo

Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

The configure method of the ResultMergeRemoteMapper class.

public void configure(JobConf job) {
    InputInfo ii = MRJobConfiguration.getResultMergeInputInfo(job);
    long[] tmp = MRJobConfiguration.getResultMergeMatrixCharacteristics(job);
    String compareFname = MRJobConfiguration.getResultMergeInfoCompareFilename(job);
    String currentFname = job.get(MRConfigurationNames.MR_MAP_INPUT_FILE);
    byte tag = 0;
    // startsWith comparison in order to account for part names in currentFname
    if (currentFname.startsWith(compareFname))
        tag = ResultMergeRemoteMR.COMPARE_TAG;
    else
        tag = ResultMergeRemoteMR.DATA_TAG;
    if (ii == InputInfo.TextCellInputInfo)
        _mapper = new ResultMergeMapperTextCell(tag);
    else if (ii == InputInfo.BinaryCellInputInfo)
        _mapper = new ResultMergeMapperBinaryCell(tag);
    else if (ii == InputInfo.BinaryBlockInputInfo)
        _mapper = new ResultMergeMapperBinaryBlock(tag, tmp[0], tmp[1], tmp[2], tmp[3]);
    else
        throw new RuntimeException("Unable to configure mapper with unknown input info: " + ii.toString());
}
Also used: InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)
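
A note on the startsWith check above: the compare input is a directory whose part files all share its path as a prefix. Below is a minimal, self-contained sketch of that tag assignment; the tag values and paths are illustrative stand-ins, not the real ResultMergeRemoteMR constants.

public class TagDemo {
    // illustrative tag values; the real ones come from ResultMergeRemoteMR
    static final byte COMPARE_TAG = 0, DATA_TAG = 1;

    static byte tagFor(String currentFname, String compareFname) {
        // part files live under the compare directory (e.g. ".../compare/part-00000"),
        // so a prefix check matches the directory as well as its part files
        return currentFname.startsWith(compareFname) ? COMPARE_TAG : DATA_TAG;
    }

    public static void main(String[] args) {
        System.out.println(tagFor("hdfs:/tmp/compare/part-00000", "hdfs:/tmp/compare")); // 0
        System.out.println(tagFor("hdfs:/tmp/out7/part-00000", "hdfs:/tmp/compare"));    // 1
    }
}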

Example 57 with InputInfo

Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

The setRDDHandleForMerge method of the ResultMergeRemoteSpark class.

@SuppressWarnings("unchecked")
private static void setRDDHandleForMerge(MatrixObject mo, SparkExecutionContext sec) {
    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
    JavaSparkContext sc = sec.getSparkContext();
    JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(mo.getFileName(), iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
    RDDObject rddhandle = new RDDObject(rdd);
    rddhandle.setHDFSFile(true);
    mo.setRDDHandle(rddhandle);
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
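
The one-liner works because InputInfo exposes the Hadoop input format plus key and value classes as public fields, as the code above accesses directly. A hedged sketch of the same read outside an execution context, assuming a local Spark master and a hypothetical HDFS path:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

public class BinaryBlockReadSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("binblock-read-sketch").setMaster("local[*]"));
        InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
        // input format and key/value classes come straight from the InputInfo fields
        @SuppressWarnings("unchecked")
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
            sc.hadoopFile("hdfs:/tmp/A", iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
        System.out.println("partitions: " + rdd.getNumPartitions()); // the hypothetical path must exist
        sc.stop();
    }
}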

Example 58 with InputInfo

Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

The executeMerge method of the ResultMergeRemoteSpark class.

@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int brlen, int bclen) {
    String jobname = "ParFor-RMSP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) _ec;
    boolean withCompare = (compare != null);
    RDDObject ret = null;
    // determine degree of parallelism
    int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
    // sanity check for empty src files
    if (inputs == null || inputs.length == 0)
        throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
    try {
        // note: initial implementation via union over all result rdds discarded due to
        // stack overflow errors with many parfor tasks, and thus many rdds
        // Step 1: construct input rdd from all result files of parfor workers
        // a) construct job conf with all files
        InputInfo ii = InputInfo.BinaryBlockInputInfo;
        JobConf job = new JobConf(ResultMergeRemoteMR.class);
        job.setJobName(jobname);
        job.setInputFormat(ii.inputFormatClass);
        Path[] paths = new Path[inputs.length];
        for (int i = 0; i < paths.length; i++) {
            // ensure input exists on hdfs (e.g., if in-memory or RDD)
            inputs[i].exportData();
            paths[i] = new Path(inputs[i].getFileName());
            // update rdd handle to allow lazy evaluation by guarding
            // against cleanup of temporary result files
            setRDDHandleForMerge(inputs[i], sec);
        }
        FileInputFormat.setInputPaths(job, paths);
        // b) create rdd from input files w/ deep copy of keys and blocks
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext().hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass).mapPartitionsToPair(new CopyBlockPairFunction(true), true);
        // Step 2a: merge with compare
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
        if (withCompare) {
            JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
            // merge values which differ from compare values
            ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
            out = rdd.groupByKey(numRed) // group all result blocks per key
                .join(compareRdd)        // join compare block and result blocks
                .mapToPair(cfun);        // merge result blocks w/ compare
        } else { // Step 2b: merge without compare
            // direct merge in any order (disjointness guaranteed)
            out = _isAccum ? RDDAggregateUtils.sumByKeyStable(rdd, false) : RDDAggregateUtils.mergeByKey(rdd, false);
        }
        // Step 3: create output rdd handle w/ lineage
        ret = new RDDObject(out);
        for (int i = 0; i < paths.length; i++)
            ret.addLineageChild(inputs[i].getRDDHandle());
        if (withCompare)
            ret.addLineageChild(compare.getRDDHandle());
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }
    return ret;
}
Also used: Path (org.apache.hadoop.fs.Path), MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), CopyBlockPairFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), JobConf (org.apache.hadoop.mapred.JobConf)
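
The compare-free branch hinges on _isAccum: accumulation sums colliding blocks, while the plain merge relies on disjoint keys. A hedged, self-contained stand-in for that Step 2b logic, with integers in place of MatrixBlocks and reduceByKey approximating sumByKeyStable/mergeByKey:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class MergeSemanticsSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("merge-sketch").setMaster("local[*]"));
        JavaPairRDD<Long, Integer> rdd = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>(1L, 10), new Tuple2<>(2L, 20), new Tuple2<>(1L, 5)));
        // accumulative merge: colliding values are summed (cf. sumByKeyStable)
        System.out.println(rdd.reduceByKey((a, b) -> a + b).collectAsMap()); // {1=15, 2=20}
        // plain merge: disjointness guaranteed, so one value per key survives (cf. mergeByKey)
        System.out.println(rdd.reduceByKey((a, b) -> a).collectAsMap());     // {1=10, 2=20}
        sc.stop();
    }
}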

Example 59 with InputInfo

Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

The runTransformEncodeDecodeTest method of the TransformEncodeDecodeTest class.

private void runTransformEncodeDecodeTest(ExecType et, boolean sparse, String fmt) {
    RUNTIME_PLATFORM platformOld = rtplatform;
    // only CP supported
    rtplatform = RUNTIME_PLATFORM.HYBRID;
    try {
        getAndLoadTestConfiguration(TEST_NAME1);
        // get input/output info
        InputInfo iinfo = InputInfo.stringExternalToInputInfo(fmt);
        OutputInfo oinfo = InputInfo.getMatchingOutputInfo(iinfo);
        // generate and write input data
        double[][] A = TestUtils.round(getRandomMatrix(rows, cols, 1, 15, sparse ? sparsity2 : sparsity1, 7));
        FrameBlock FA = DataConverter.convertToFrameBlock(DataConverter.convertToMatrixBlock(A));
        FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo);
        writer.writeFrameToHDFS(FA, input("F"), rows, cols);
        fullDMLScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME1 + ".dml";
        programArgs = new String[] { "-explain", "-args", input("F"), fmt, String.valueOf(rows), String.valueOf(cols), SCRIPT_DIR + TEST_DIR + SPEC, output("FO") };
        // run test
        runTest(true, false, null, -1);
        // compare matrices (values recoded to identical codes)
        FrameReader reader = FrameReaderFactory.createFrameReader(iinfo);
        FrameBlock FO = reader.readFrameFromHDFS(output("FO"), 16, 2);
        HashMap<String, Long> cFA = getCounts(FA, 1);
        Iterator<String[]> iterFO = FO.getStringRowIterator();
        while (iterFO.hasNext()) {
            String[] row = iterFO.next();
            Double expected = (double) cFA.get(row[1]);
            Double val = (row[0] != null) ? Double.valueOf(row[0]) : 0;
            Assert.assertEquals("Output aggregates don't match: " + expected + " vs " + val, expected, val);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    } finally {
        rtplatform = platformOld;
    }
}
Also used: FrameWriter (org.apache.sysml.runtime.io.FrameWriter), RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM), OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo), InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), FrameReader (org.apache.sysml.runtime.io.FrameReader)
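
getCounts is a helper local to the test class; below is a hedged stand-in for the idea it implements, counting how often each distinct value occurs in one column of a String table:

import java.util.HashMap;

public class ColumnCountsSketch {
    // illustrative equivalent of the test-local getCounts(frame, col) helper
    static HashMap<String, Long> countsOfColumn(String[][] data, int col) {
        HashMap<String, Long> counts = new HashMap<>();
        for (String[] row : data)
            counts.merge(row[col], 1L, Long::sum); // insert 1 or increment existing count
        return counts;
    }

    public static void main(String[] args) {
        String[][] data = { { "a", "x" }, { "b", "x" }, { "c", "y" } };
        System.out.println(countsOfColumn(data, 1)); // {x=2, y=1}
    }
}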

Example 60 with InputInfo

Use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.

The getPartitionedInput method of the RemoteDPParForSpark class.

@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        // construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(
                in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            in.javaRDD().zipWithIndex(); // zip row index
        // convert row to row in matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    } else if (!requiresGrouping(dpf, mo)) { // binary block input rdd without grouping
        // get input rdd and data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    } else { // default binary block input rdd with grouping
        // get input rdd; avoid unnecessary caching if the input is a checkpoint and not cached yet,
        // to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), DatasetObject (org.apache.sysml.runtime.instructions.spark.data.DatasetObject), DataFrameExtractIDFunction (org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject), Row (org.apache.spark.sql.Row)
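
When the dataset carries no ID column, the code falls back to zipWithIndex for row ids. A minimal sketch of that fallback, with strings standing in for Spark SQL Rows:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RowIdSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("rowid-sketch").setMaster("local[*]"));
        // zipWithIndex assigns stable, 0-based indices, mirroring the no-ID branch above
        JavaPairRDD<String, Long> withIds =
            sc.parallelize(Arrays.asList("row-a", "row-b", "row-c")).zipWithIndex();
        withIds.collect().forEach(t -> System.out.println(t._2() + " -> " + t._1()));
        sc.stop();
    }
}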

Aggregations

InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 74 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 38 usages
OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 30 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 26 usages
MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 20 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 20 usages
IOException (java.io.IOException): 17 usages
JobConf (org.apache.hadoop.mapred.JobConf): 13 usages
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 13 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 12 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 11 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 10 usages
Path (org.apache.hadoop.fs.Path): 9 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 9 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 9 usages
DMLConfig (org.apache.sysml.conf.DMLConfig): 8 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 8 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 7 usages
CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties): 7 usages
Group (org.apache.hadoop.mapred.Counters.Group): 6 usages