use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParForSpark method getPartitionedInput.
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec,
    String matrixvar, OutputInfo oi, PartitionFormat dpf)
{
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();

    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle()
            .getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();

        // construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(
                in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            in.javaRDD().zipWithIndex(); // zip row index

        // convert each row into matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(
            mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    }
    else if (!requiresGrouping(dpf, mo)) { // binary block input rdd without grouping
        // get input rdd and apply data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
    else { // default: binary block input rdd with grouping
        // get input rdd; avoid unnecessary caching if the input is a checkpoint and not cached yet,
        // to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle()
                .getLineageChilds().get(0)).getRDD();

        // data partitioning of the input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
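For context, the returned pairs are typically grouped by partition id and handed to a partition-wise worker function; a hedged driver sketch (numReducers and workerFunction are illustrative names, not from the source):

JavaPairRDD<Long, Writable> partitioned = getPartitionedInput(sec, "X", oi, dpf);
JavaPairRDD<Long, String> results = partitioned
    .groupByKey(numReducers)              // one group of cells/rows per partition
    .mapPartitionsToPair(workerFunction); // e.g., a RemoteDPParForSparkWorker, whose call() is shown below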
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParForSparkWorker method collectBinaryCellInput.
/**
 * Collects a matrixblock partition from a given input iterator over
 * binary cells.
 *
 * Note that a fresh partition block is allocated on every call; the
 * result of a previous call is not retained.
 *
 * @param valueList iterable writables
 * @return matrix block
 * @throws IOException if an IOException occurs
 */
private MatrixBlock collectBinaryCellInput(Iterable<Writable> valueList) throws IOException {
    MatrixBlock partition = null;

    // create the target block in the configured representation
    if (_tSparseCol)
        partition = new MatrixBlock(_clen, _rlen, true);
    else
        partition = new MatrixBlock(_rlen, _clen, false);

    switch (_dpf) {
        case ROW_WISE: {
            // obtain the iterator once (calling valueList.iterator() in the loop
            // condition would restart the iteration on every check)
            Iterator<Writable> iter = valueList.iterator();
            while (iter.hasNext()) {
                PairWritableCell pairValue = (PairWritableCell) iter.next();
                if (pairValue.indexes.getColumnIndex() < 0)
                    continue; // cells used to ensure empty partitions
                partition.quickSetValue(0, (int) pairValue.indexes.getColumnIndex() - 1,
                    pairValue.cell.getValue());
            }
            break;
        }
        case COLUMN_WISE: {
            Iterator<Writable> iter = valueList.iterator();
            while (iter.hasNext()) {
                PairWritableCell pairValue = (PairWritableCell) iter.next();
                if (pairValue.indexes.getRowIndex() < 0)
                    continue; // cells used to ensure empty partitions
                if (_tSparseCol)
                    partition.appendValue(0, (int) pairValue.indexes.getRowIndex() - 1,
                        pairValue.cell.getValue());
                else
                    partition.quickSetValue((int) pairValue.indexes.getRowIndex() - 1, 0,
                        pairValue.cell.getValue());
            }
            break;
        }
        default:
            throw new IOException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }

    // post-processing: cleanups if required
    try {
        if (partition.isInSparseFormat() && _tSparseCol)
            partition.sortSparseRows();
        partition.recomputeNonZeros();
        partition.examSparsity();
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    return partition;
}
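To make the index mapping concrete: a ROW_WISE partition is a 1 x _clen block, so an input cell with 1-based columnIndex j lands at (0, j-1). A minimal standalone sketch (made-up dimensions and values, using the MatrixBlock API as above):

// place a cell with 1-based columnIndex=3, value=7.0 into a 1 x 10 dense row partition
MatrixBlock partition = new MatrixBlock(1, 10, false);
partition.quickSetValue(0, 3 - 1, 7.0);
partition.recomputeNonZeros(); // maintain nnz metadata
partition.examSparsity();      // switch representation if beneficial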
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParForSparkWorker method call.
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0)
    throws Exception
{
    ArrayList<Tuple2<Long, String>> ret = new ArrayList<>();

    // lazy parworker initialization
    configureWorker(TaskContext.get().taskAttemptId());

    // process all matrix partitions of this data partition
    MatrixBlock partition = null;
    while (arg0.hasNext()) {
        Tuple2<Long, Iterable<Writable>> larg = arg0.next();

        // collect input partition (check via equals because _oinfo is a deserialized instance)
        if (_oinfo.equals(OutputInfo.BinaryBlockOutputInfo))
            partition = collectBinaryBlock(larg._2(), partition);
        else
            partition = collectBinaryCellInput(larg._2());

        // update in-memory matrix partition
        MatrixObject mo = _ec.getMatrixObject(_inputVar);
        mo.setInMemoryPartition(partition);

        // create task for the current partition index
        Task lTask = new Task(_iterVar, TaskType.SET);
        lTask.addIteration(new IntObject(larg._1()));

        // execute program
        long numIter = getExecutedIterations();
        super.executeTask(lTask);

        // maintain accumulators
        _aTasks.add(1);
        _aIters.add((int) (getExecutedIterations() - numIter));
    }

    // write output if required (matrix indexed write)
    ArrayList<String> tmp = RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars);
    for (String val : tmp)
        ret.add(new Tuple2<>(_workerID, val));

    return ret.iterator();
}
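The _aTasks/_aIters fields follow Spark's standard accumulator pattern for reporting task and iteration counts back to the driver; a minimal sketch with plain Spark 2.x APIs (names are illustrative):

SparkSession spark = SparkSession.builder().appName("acc-demo").master("local[*]").getOrCreate();
LongAccumulator aTasks = spark.sparkContext().longAccumulator("tasks");
LongAccumulator aIters = spark.sparkContext().longAccumulator("iterations");
// inside an executor task: aTasks.add(1); aIters.add(executedIters);
// on the driver after the job: aTasks.value(), aIters.value()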
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RemoteDPParWorkerReducer method configure.
@Override
public void configure(JobConf job) {
    // Step 1: configure data partitioning information
    _dpf = MRJobConfiguration.getPartitioningFormat(job);
    MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
    PartitionFormat pf = new PartitionFormat(_dpf, MRJobConfiguration.getPartitioningSizeN(job));
    _rlen = (int) pf.getNumRows(mc);
    _clen = (int) pf.getNumColumns(mc);
    _brlen = mc.getRowsPerBlock();
    _bclen = mc.getColsPerBlock();
    _iterVar = MRJobConfiguration.getPartitioningItervar(job);
    _inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
    _info = MRJobConfiguration.getPartitioningOutputInfo(job);
    _tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
    if (_tSparseCol)
        _partition = new MatrixBlock((int) _clen, _rlen, true);
    else
        _partition = new MatrixBlock((int) _rlen, _clen, false);

    // Step 2: configure parworker
    String taskID = job.get(MRConfigurationNames.MR_TASK_ID);
    LOG.trace("configure RemoteDPParWorkerReducer " + taskID);
    try {
        _stringID = taskID;
        // parse int worker ID from the string task ID
        _workerID = IDHandler.extractIntID(_stringID);

        // cache the job conf, but not in the context of local-mode mr jobs
        // (for example, there this config points to the local fs instead of hdfs by default)
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        // create local runtime program
        String in = MRJobConfiguration.getProgramBlocks(job);
        ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec = body.getEc();
        _resultVars = body.getResultVariables();

        // init local cache manager
        if (!CacheableData.isCachingActive()) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            // incl activation and cache dir creation (each map task gets its own dir for simplified cleanup)
            CacheableData.initCaching(uuid);
        }
        if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) {
            // account for local mode
            CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }

        // ensure that resultvar files are not removed
        super.pinResultVariables();

        // enable/disable caching (if required)
        boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
        if (!cpCaching)
            CacheableData.disableCaching();

        _numTasks = 0;
        _numIters = 0;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }

    // disable parfor stat monitoring; reporting execution times via counters is not useful here
    StatisticMonitor.disableStatMonitoring();

    // always reset stats because counters are maintained per map task (relevant in case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job))
        Statistics.reset();
}
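To make the Step 1 sizing concrete, a hedged example (the expected values reflect my reading of PartitionFormat for ROW_WISE, not output from the source):

// ROW_WISE partitioning of a 10,000 x 100 matrix with 1,000 x 1,000 blocks
MatrixCharacteristics mc = new MatrixCharacteristics(10000, 100, 1000, 1000);
PartitionFormat pf = new PartitionFormat(PDataPartitionFormat.ROW_WISE, -1);
int rlen = (int) pf.getNumRows(mc);    // expected 1: each partition holds a single row
int clen = (int) pf.getNumColumns(mc); // expected 100: all columns of that row
MatrixBlock partition = new MatrixBlock(rlen, clen, false); // dense 1 x 100 block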
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class ResultMergeLocalFile method createTextCellResultFile.
private void createTextCellResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew,
    MetaDataFormat metadata, boolean withCompare) throws IOException, DMLRuntimeException
{
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)))) {
        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();
        boolean written = false;

        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
            for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;

                long row_offset = (brow - 1) * brlen + 1;
                long col_offset = (bcol - 1) * bclen + 1;

                if (dir.exists()) {
                    if (withCompare && dir2.exists()) { // WITH COMPARE BLOCK
                        // copy only values that are different from the original
                        String[] lnames2 = dir2.list();
                        if (lnames2.length != 1) // there should be exactly 1 compare block
                            throw new DMLRuntimeException("Unable to merge results because multiple compare blocks found.");
                        mb = StagingFileUtils.readCellList2BlockFromLocal(dir2 + "/" + lnames2[0], brlen, bclen);
                        boolean appendOnly = mb.isInSparseFormat();
                        DenseBlock compare = DataConverter.convertToDenseBlock(mb, false);
                        for (String lname : dir.list()) {
                            MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                            mergeWithComp(mb, tmp, compare);
                        }
                        // sort sparse rows due to append-only
                        if (appendOnly && !_isAccum)
                            mb.sortSparseRows();
                        // change sparsity if required after merge
                        mb.examSparsity();
                    }
                    else { // WITHOUT COMPARE BLOCK
                        // copy all non-zeros from all workers
                        boolean appendOnly = false;
                        for (String lname : dir.list()) {
                            if (mb == null) {
                                mb = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                                appendOnly = mb.isInSparseFormat();
                            }
                            else {
                                MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }
                        // sort sparse rows due to append-only
                        if (appendOnly && !_isAccum)
                            mb.sortSparseRows();
                        // change sparsity if required after merge
                        mb.examSparsity();
                    }
                }

                // write the block in text cell format
                if (mb != null) {
                    if (mb.isInSparseFormat()) {
                        Iterator<IJV> iter = mb.getSparseBlockIterator();
                        while (iter.hasNext()) {
                            IJV lcell = iter.next();
                            sb.append(row_offset + lcell.getI());
                            sb.append(' ');
                            sb.append(col_offset + lcell.getJ());
                            sb.append(' ');
                            sb.append(lcell.getV());
                            sb.append('\n');
                            out.write(sb.toString());
                            sb.setLength(0);
                            written = true;
                        }
                    }
                    else {
                        for (int i = 0; i < brlen; i++)
                            for (int j = 0; j < bclen; j++) {
                                double lvalue = mb.getValueDenseUnsafe(i, j);
                                if (lvalue != 0) { // only write non-zeros
                                    sb.append(row_offset + i);
                                    sb.append(' ');
                                    sb.append(col_offset + j);
                                    sb.append(' ');
                                    sb.append(lvalue);
                                    sb.append('\n');
                                    out.write(sb.toString());
                                    sb.setLength(0);
                                    written = true;
                                }
                            }
                    }
                }
            }

        if (!written)
            out.write(IOUtilFunctions.EMPTY_TEXT_LINE);
    }
}
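For reference, text cell format writes one "<row> <col> <value>" triple per line with 1-based global indices; a minimal sketch of the offset arithmetic used above (block coordinates and values are made up):

// block (brow=2, bcol=1) with brlen=1000, bclen=1000
long row_offset = (2 - 1) * 1000 + 1; // 1001
long col_offset = (1 - 1) * 1000 + 1; // 1
// block-local cell (i=4, j=2) with value 3.7 becomes the global line "1005 3 3.7"
StringBuilder sb = new StringBuilder();
sb.append(row_offset + 4).append(' ').append(col_offset + 2).append(' ').append(3.7).append('\n');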