use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class ResultMergeRemoteMapper method configure.
public void configure(JobConf job) {
InputInfo ii = MRJobConfiguration.getResultMergeInputInfo(job);
long[] tmp = MRJobConfiguration.getResultMergeMatrixCharacteristics(job);
String compareFname = MRJobConfiguration.getResultMergeInfoCompareFilename(job);
String currentFname = job.get(MRConfigurationNames.MR_MAP_INPUT_FILE);
byte tag = 0;
// startsWith comparison in order to account for part names in currentFname
if (currentFname.startsWith(compareFname))
tag = ResultMergeRemoteMR.COMPARE_TAG;
else
tag = ResultMergeRemoteMR.DATA_TAG;
if (ii == InputInfo.TextCellInputInfo)
_mapper = new ResultMergeMapperTextCell(tag);
else if (ii == InputInfo.BinaryCellInputInfo)
_mapper = new ResultMergeMapperBinaryCell(tag);
else if (ii == InputInfo.BinaryBlockInputInfo)
_mapper = new ResultMergeMapperBinaryBlock(tag, tmp[0], tmp[1], tmp[2], tmp[3]);
else
throw new RuntimeException("Unable to configure mapper with unknown input info: " + ii.toString());
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class ResultMergeRemoteSpark method setRDDHandleForMerge.
@SuppressWarnings("unchecked")
private static void setRDDHandleForMerge(MatrixObject mo, SparkExecutionContext sec) {
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaSparkContext sc = sec.getSparkContext();
JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(mo.getFileName(), iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
RDDObject rddhandle = new RDDObject(rdd);
rddhandle.setHDFSFile(true);
mo.setRDDHandle(rddhandle);
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class ResultMergeRemoteSpark method executeMerge.
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int brlen, int bclen) {
String jobname = "ParFor-RMSP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) _ec;
boolean withCompare = (compare != null);
RDDObject ret = null;
// determine degree of parallelism
int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
// sanity check for empty src files
if (inputs == null || inputs.length == 0)
throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
try {
// note: initial implementation via union over all result rdds discarded due to
// stack overflow errors with many parfor tasks, and thus many rdds
// Step 1: construct input rdd from all result files of parfor workers
// a) construct job conf with all files
InputInfo ii = InputInfo.BinaryBlockInputInfo;
JobConf job = new JobConf(ResultMergeRemoteMR.class);
job.setJobName(jobname);
job.setInputFormat(ii.inputFormatClass);
Path[] paths = new Path[inputs.length];
for (int i = 0; i < paths.length; i++) {
// ensure input exists on hdfs (e.g., if in-memory or RDD)
inputs[i].exportData();
paths[i] = new Path(inputs[i].getFileName());
// update rdd handle to allow lazy evaluation by guarding
// against cleanup of temporary result files
setRDDHandleForMerge(inputs[i], sec);
}
FileInputFormat.setInputPaths(job, paths);
// b) create rdd from input files w/ deep copy of keys and blocks
JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext().hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass).mapPartitionsToPair(new CopyBlockPairFunction(true), true);
// Step 2a: merge with compare
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
if (withCompare) {
JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
// merge values which differ from compare values
ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
out = // group all result blocks per key
rdd.groupByKey(numRed).join(// join compare block and result blocks
compareRdd).mapToPair(// merge result blocks w/ compare
cfun);
} else // Step 2b: merge without compare
{
// direct merge in any order (disjointness guaranteed)
out = _isAccum ? RDDAggregateUtils.sumByKeyStable(rdd, false) : RDDAggregateUtils.mergeByKey(rdd, false);
}
// Step 3: create output rdd handle w/ lineage
ret = new RDDObject(out);
for (int i = 0; i < paths.length; i++) ret.addLineageChild(inputs[i].getRDDHandle());
if (withCompare)
ret.addLineageChild(compare.getRDDHandle());
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS) {
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
}
return ret;
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class TransformEncodeDecodeTest method runTransformEncodeDecodeTest.
private void runTransformEncodeDecodeTest(ExecType et, boolean sparse, String fmt) {
RUNTIME_PLATFORM platformOld = rtplatform;
// only CP supported
rtplatform = RUNTIME_PLATFORM.HYBRID;
try {
getAndLoadTestConfiguration(TEST_NAME1);
// get input/output info
InputInfo iinfo = InputInfo.stringExternalToInputInfo(fmt);
OutputInfo oinfo = InputInfo.getMatchingOutputInfo(iinfo);
// generate and write input data
double[][] A = TestUtils.round(getRandomMatrix(rows, cols, 1, 15, sparse ? sparsity2 : sparsity1, 7));
FrameBlock FA = DataConverter.convertToFrameBlock(DataConverter.convertToMatrixBlock(A));
FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo);
writer.writeFrameToHDFS(FA, input("F"), rows, cols);
fullDMLScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME1 + ".dml";
programArgs = new String[] { "-explain", "-args", input("F"), fmt, String.valueOf(rows), String.valueOf(cols), SCRIPT_DIR + TEST_DIR + SPEC, output("FO") };
// run test
runTest(true, false, null, -1);
// compare matrices (values recoded to identical codes)
FrameReader reader = FrameReaderFactory.createFrameReader(iinfo);
FrameBlock FO = reader.readFrameFromHDFS(output("FO"), 16, 2);
HashMap<String, Long> cFA = getCounts(FA, 1);
Iterator<String[]> iterFO = FO.getStringRowIterator();
while (iterFO.hasNext()) {
String[] row = iterFO.next();
Double expected = (double) cFA.get(row[1]);
Double val = (row[0] != null) ? Double.valueOf(row[0]) : 0;
Assert.assertEquals("Output aggregates don't match: " + expected + " vs " + val, expected, val);
}
} catch (Exception ex) {
ex.printStackTrace();
Assert.fail(ex.getMessage());
} finally {
rtplatform = platformOld;
}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class RemoteDPParForSpark method getPartitionedInput.
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
InputInfo ii = InputInfo.BinaryBlockInputInfo;
MatrixObject mo = sec.getMatrixObject(matrixvar);
MatrixCharacteristics mc = mo.getMatrixCharacteristics();
// NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
if (hasInputDataSet(dpf, mo)) {
DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
Dataset<Row> in = dsObj.getDataset();
// construct or reuse row ids
JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ? in.javaRDD().mapToPair(new DataFrameExtractIDFunction(in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) : // zip row index
in.javaRDD().zipWithIndex();
// convert row to row in matrix block format
return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
} else // binary block input rdd without grouping
if (!requiresGrouping(dpf, mo)) {
// get input rdd and data partitioning
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
return in.flatMapToPair(dpfun);
} else // default binary block input rdd with grouping
{
// get input rdd, avoid unnecessary caching if input is checkpoint and not cached yet
// to reduce memory pressure for shuffle and subsequent
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
// data partitioning of input rdd
DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
return in.flatMapToPair(dpfun);
}
}
Aggregations