use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParWorkerReducer method configure.
@Override
public void configure(JobConf job) {
    //Step 1: configure data partitioning information
    _rlen = (int) MRJobConfiguration.getPartitioningNumRows(job);
    _clen = (int) MRJobConfiguration.getPartitioningNumCols(job);
    _brlen = MRJobConfiguration.getPartitioningBlockNumRows(job);
    _bclen = MRJobConfiguration.getPartitioningBlockNumCols(job);
    _iterVar = MRJobConfiguration.getPartitioningItervar(job);
    _inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
    _dpf = MRJobConfiguration.getPartitioningFormat(job);
    //create matrix partition for reuse
    switch (_dpf) {
        case ROW_WISE:
            _rlen = 1;
            break;
        case COLUMN_WISE:
            _clen = 1;
            break;
        default:
            throw new RuntimeException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    _info = MRJobConfiguration.getPartitioningOutputInfo(job);
    _tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
    if (_tSparseCol)
        _partition = new MatrixBlock((int) _clen, _rlen, true);
    else
        _partition = new MatrixBlock((int) _rlen, _clen, false);
    //Step 2: configure parworker
    String taskID = job.get(MRConfigurationNames.MR_TASK_ID);
    LOG.trace("configure RemoteDPParWorkerReducer " + taskID);
    try {
        _stringID = taskID;
        _workerID = IDHandler.extractIntID(_stringID); //int task ID
        //use the given job conf as the cached config, because in the context of mr jobs
        //the default config may, for example, point to local fs instead of hdfs.
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }
        //create local runtime program
        String in = MRJobConfiguration.getProgramBlocks(job);
        ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec = body.getEc();
        _resultVars = body.getResultVarNames();
        //init local cache manager
        if (!CacheableData.isCachingActive()) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            //incl activation and cache dir creation (each map task gets its own dir for simplified cleanup)
            CacheableData.initCaching(uuid);
        }
        if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) {
            //account for local mode
            CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }
        //ensure that resultvar files are not removed
        super.pinResultVariables();
        //enable/disable caching (if required)
        boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
        if (!cpCaching)
            CacheableData.disableCaching();
        _numTasks = 0;
        _numIters = 0;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    //disable parfor stat monitoring; reporting execution times via counters is not useful
    StatisticMonitor.disableStatMonitoring();
    //always reset stats because counters are per map task (relevant in case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job)) {
        CacheStatistics.reset();
        Statistics.reset();
    }
}
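For intuition, here is a minimal standalone sketch (all names hypothetical, no SystemML dependencies) of how the reuse-block dimensions computed above collapse per partition format: ROW_WISE keeps a single row of clen columns, COLUMN_WISE a single column of rlen rows, and with the transposed sparse-column option the column partition is stored transposed as a sparse 1 x rlen row.

//Illustrative sketch only; class and method names are hypothetical and mirror
//the dimension logic of RemoteDPParWorkerReducer.configure shown above.
public class PartitionBlockDims {
    enum PDataPartitionFormat { ROW_WISE, COLUMN_WISE }

    //returns {rows, cols, sparse} of the reusable partition block
    static Object[] dims(PDataPartitionFormat dpf, int rlen, int clen, boolean tSparseCol) {
        switch (dpf) {
            case ROW_WISE:    rlen = 1; break; //partition is a single row
            case COLUMN_WISE: clen = 1; break; //partition is a single column
        }
        //transposed sparse column: store the partition transposed in sparse format
        return tSparseCol ? new Object[]{ clen, rlen, true } : new Object[]{ rlen, clen, false };
    }

    public static void main(String[] args) {
        //e.g., a 1000 x 10 matrix partitioned COLUMN_WISE with transposed sparse columns
        Object[] d = dims(PDataPartitionFormat.COLUMN_WISE, 1000, 10, true);
        System.out.println(d[0] + " x " + d[1] + ", sparse=" + d[2]); //prints: 1 x 1000, sparse=true
    }
}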
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParForSparkWorker method call.
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
    ArrayList<Tuple2<Long, String>> ret = new ArrayList<Tuple2<Long, String>>();
    //lazy parworker initialization
    configureWorker(TaskContext.get().taskAttemptId());
    //process all matrix partitions of this data partition
    MatrixBlock partition = null;
    while (arg0.hasNext()) {
        Tuple2<Long, Iterable<Writable>> larg = arg0.next();
        //collect input partition (check via equals because oinfo deserialized instance)
        if (_oinfo.equals(OutputInfo.BinaryBlockOutputInfo))
            partition = collectBinaryBlock(larg._2(), partition);
        else
            partition = collectBinaryCellInput(larg._2());
        //update in-memory matrix partition
        MatrixObject mo = _ec.getMatrixObject(_inputVar);
        mo.setInMemoryPartition(partition);
        //create tasks for input data
        Task lTask = new Task(TaskType.SET);
        lTask.addIteration(new IntObject(_iterVar, larg._1()));
        //execute program
        long numIter = getExecutedIterations();
        super.executeTask(lTask);
        //maintain accumulators
        _aTasks.add(1);
        _aIters.add((int) (getExecutedIterations() - numIter));
    }
    //write output if required (matrix indexed write)
    ArrayList<String> tmp = RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars);
    for (String val : tmp)
        ret.add(new Tuple2<Long, String>(_workerID, val));
    return ret.iterator();
}
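The accumulator maintenance at the end of the loop follows a delta pattern: snapshot the executed-iterations counter before running the task and accumulate only the difference afterwards, so per-task iteration counts stay correct even though the counter is cumulative. A minimal sketch of that pattern, with plain longs standing in for the Spark accumulators _aTasks and _aIters (all names hypothetical):

//Minimal sketch of the delta-accumulation pattern used above (hypothetical names).
public class DeltaAccumulation {
    static long executedIterations = 0; //stands in for getExecutedIterations()

    static void executeTask(int iters) { executedIterations += iters; } //stand-in for task execution

    public static void main(String[] args) {
        long tasks = 0, itersAcc = 0; //stand-ins for the _aTasks/_aIters accumulators
        int[] taskSizes = { 3, 5, 2 };
        for (int sz : taskSizes) {
            long before = executedIterations; //snapshot before execution
            executeTask(sz);
            tasks += 1;                               //one task executed
            itersAcc += executedIterations - before;  //accumulate only the delta
        }
        System.out.println(tasks + " tasks, " + itersAcc + " iterations"); //3 tasks, 10 iterations
    }
}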
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParForSparkWorker method collectBinaryCellInput.
/**
 * Collects a matrixblock partition from a given input iterator over
 * binary cells.
 *
 * Note that a fresh partition block is allocated on every call (keeping
 * the configured representation), so multiple calls do not share state.
 *
 * @param valueList iterable writables
 * @return matrix block
 * @throws IOException if IOException occurs
 */
private MatrixBlock collectBinaryCellInput(Iterable<Writable> valueList) throws IOException {
    //allocate the partition block, keep configured representation
    MatrixBlock partition = null;
    if (_tSparseCol)
        partition = new MatrixBlock(_clen, _rlen, true);
    else
        partition = new MatrixBlock(_rlen, _clen, false);
    //obtain the iterator once; calling valueList.iterator() per hasNext/next would restart the scan
    Iterator<Writable> iter = valueList.iterator();
    switch (_dpf) {
        case ROW_WISE:
            while (iter.hasNext()) {
                PairWritableCell pairValue = (PairWritableCell) iter.next();
                if (pairValue.indexes.getColumnIndex() < 0)
                    continue; //padding cells used to ensure empty partitions
                partition.quickSetValue(0, (int) pairValue.indexes.getColumnIndex() - 1, pairValue.cell.getValue());
            }
            break;
        case COLUMN_WISE:
            while (iter.hasNext()) {
                PairWritableCell pairValue = (PairWritableCell) iter.next();
                if (pairValue.indexes.getRowIndex() < 0)
                    continue; //padding cells used to ensure empty partitions
                if (_tSparseCol)
                    partition.appendValue(0, (int) pairValue.indexes.getRowIndex() - 1, pairValue.cell.getValue());
                else
                    partition.quickSetValue((int) pairValue.indexes.getRowIndex() - 1, 0, pairValue.cell.getValue());
            }
            break;
        default:
            throw new IOException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    //post-processing: cleanups if required
    try {
        if (partition.isInSparseFormat() && _tSparseCol)
            partition.sortSparseRows();
        partition.recomputeNonZeros();
        partition.examSparsity();
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    return partition;
}
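Two conventions drive the collection loop above: SystemML cell indexes are 1-based while MatrixBlock positions are 0-based (hence the - 1 shifts), and a negative index marks a padding cell whose only purpose is to make otherwise empty partitions appear in the input. A standalone sketch of the filtering and index shifting, using a hypothetical Cell class instead of PairWritableCell:

import java.util.Arrays;
import java.util.List;

//Sketch of the cell-collection conventions above; Cell is hypothetical.
public class CellCollection {
    static class Cell {
        final long rowIndex; final double value; //1-based index, or negative for padding
        Cell(long r, double v) { rowIndex = r; value = v; }
    }

    public static void main(String[] args) {
        double[] column = new double[5]; //dense 5 x 1 column partition
        List<Cell> cells = Arrays.asList(new Cell(-1, 0), new Cell(1, 7.0), new Cell(4, 2.5));
        for (Cell c : cells) {
            if (c.rowIndex < 0)
                continue; //padding cell: only ensures empty partitions are emitted
            column[(int) c.rowIndex - 1] = c.value; //shift 1-based index to 0-based position
        }
        System.out.println(Arrays.toString(column)); //[7.0, 0.0, 0.0, 2.5, 0.0]
    }
}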
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class ResultMergeLocalFile method createBinaryBlockResultFile.
@SuppressWarnings("deprecation")
private void createBinaryBlockResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew, MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //beware: writer creation takes ca 50ms
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class);
    try {
        MatrixIndexes indexes = new MatrixIndexes();
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
            for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;
                if (dir.exists()) {
                    if (withCompare && dir2.exists()) { //WITH COMPARE BLOCK
                        //copy only values that are different from the original
                        String[] lnames2 = dir2.list();
                        if (lnames2.length != 1) //there should be exactly 1 compare block
                            throw new DMLRuntimeException("Unable to merge results because multiple compare blocks found.");
                        mb = LocalFileUtils.readMatrixBlockFromLocal(dir2 + "/" + lnames2[0]);
                        boolean appendOnly = mb.isInSparseFormat();
                        double[][] compare = DataConverter.convertToDoubleMatrix(mb);
                        String[] lnames = dir.list();
                        for (String lname : lnames) {
                            MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                            mergeWithComp(mb, tmp, compare);
                        }
                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();
                        //change sparsity if required after merge
                        mb.examSparsity();
                    } else { //WITHOUT COMPARE BLOCK
                        //copy all non-zeros from all workers
                        String[] lnames = dir.list();
                        boolean appendOnly = false;
                        for (String lname : lnames) {
                            if (mb == null) {
                                mb = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                                appendOnly = mb.isInSparseFormat();
                            } else {
                                MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }
                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();
                        //change sparsity if required after merge
                        mb.examSparsity();
                    }
                } else {
                    //NOTE: whenever the runtime does not need all blocks anymore, this can be removed
                    int maxRow = (int) (((brow - 1) * brlen + brlen < rlen) ? brlen : rlen - (brow - 1) * brlen);
                    int maxCol = (int) (((bcol - 1) * bclen + bclen < clen) ? bclen : clen - (bcol - 1) * bclen);
                    mb = new MatrixBlock(maxRow, maxCol, true);
                }
                //mb.examSparsity(); //done on write anyway and mb not reused
                indexes.setIndexes(brow, bcol);
                writer.append(indexes, mb);
            }
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}
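The loop bounds and the maxRow/maxCol computation implement standard blocking arithmetic: ceil(rlen/brlen) blocks per dimension, with the last block truncated at the matrix boundary. A small self-contained sketch of that arithmetic (hypothetical names):

//Sketch of the blocking arithmetic used above (hypothetical names).
public class BlockDims {
    //rows of block brow (1-based) for a matrix with rlen rows and block size brlen
    static int blockRows(long brow, long rlen, int brlen) {
        return (int) (((brow - 1) * brlen + brlen < rlen) ? brlen : rlen - (brow - 1) * brlen);
    }

    public static void main(String[] args) {
        long rlen = 2500; int brlen = 1000;
        long numBlocks = (long) Math.ceil(rlen / (double) brlen); //3 row blocks
        for (long brow = 1; brow <= numBlocks; brow++)
            System.out.println("block " + brow + ": " + blockRows(brow, rlen, brlen) + " rows");
        //prints: block 1: 1000 rows, block 2: 1000 rows, block 3: 500 rows
    }
}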
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParForSpark method getPartitionedInput.
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) throws DMLRuntimeException {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    //NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        //construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            in.javaRDD().zipWithIndex(); //zip row index
        //convert row to row in matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    } else if (!requiresGrouping(dpf, mo)) {
        //binary block input rdd without grouping
        //get input rdd and data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    } else {
        //default: binary block input rdd with grouping
        //get input rdd; avoid unnecessary caching if the input is a checkpoint and not cached yet,
        //to reduce memory pressure for shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        //data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
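Read as a whole, the method is a three-way dispatch: reuse a backing Dataset<Row> when one exists, partition without grouping when the format permits, and otherwise fall back to the default grouped partitioning. A compact sketch of just that dispatch logic (all types and predicates hypothetical):

//Sketch of the three-way input dispatch above; all names are hypothetical.
public class PartitionInputDispatch {
    enum Path { DATASET_CONVERT, PARTITION_NO_GROUPING, PARTITION_WITH_GROUPING }

    static Path choose(boolean hasDataset, boolean requiresGrouping) {
        if (hasDataset)
            return Path.DATASET_CONVERT;          //convert Dataset<Row> rows directly
        else if (!requiresGrouping)
            return Path.PARTITION_NO_GROUPING;    //flatMap only, no shuffle-side grouping
        else
            return Path.PARTITION_WITH_GROUPING;  //default: partition, then group
    }

    public static void main(String[] args) {
        System.out.println(choose(false, true)); //prints: PARTITION_WITH_GROUPING
    }
}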