Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
Class Tsmm2SPInstruction, method processInstruction.
/**
 * Executes the Spark tsmm2 instruction on the binary-block RDD bound to {@code input1}.
 *
 * <p>Visible flow: (1) a first pass filters the input blocks to an index range chosen by
 * {@code _type.isLeft()}, shifts their indexes, collects them into a partitioned block and
 * broadcasts it; (2) a second pass computes result blocks and aggregates them. When the
 * estimated size of the {@code outputDim x outputDim} result is at most
 * {@code 32 * 1024 * 1024}, the result is summed into a single block and bound as the matrix
 * output; otherwise the result blocks are summed by key and registered as an RDD handle with
 * a lineage edge to the input variable.
 *
 * <p>NOTE(review): this listing appears to have lost text during extraction (no closing
 * braces are present, and the first {@code tmp2} assignment has unbalanced parentheses) --
 * confirm against the upstream source before relying on it.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext} without a type check
 */
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// get input
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
// execute tsmm2 instruction
// step 1: first pass of X, filter-collect-broadcast excess blocks
// left type: rows 1..getRows(), columns (getColsPerBlock()+1)..getCols();
// otherwise: rows (getRowsPerBlock()+1)..getRows(), columns 1..getCols()
// (presumably everything beyond the first block column/row -- TODO confirm IsBlockInRange semantics)
JavaPairRDD<MatrixIndexes, MatrixBlock> tmp1 = in.filter(new IsBlockInRange(_type.isLeft() ? 1 : mc.getRowsPerBlock() + 1, mc.getRows(), _type.isLeft() ? mc.getColsPerBlock() + 1 : 1, mc.getCols(), mc)).mapToPair(new ShiftTSMMIndexesFunction(_type));
// dimensions of the collected excess: one block of columns (left) or one block of rows (otherwise)
// is subtracted from the input dimensions; note the long-to-int narrowing casts
PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(tmp1, (int) (_type.isLeft() ? mc.getRows() : mc.getRows() - mc.getRowsPerBlock()), (int) (_type.isLeft() ? mc.getCols() - mc.getColsPerBlock() : mc.getCols()), mc.getRowsPerBlock(), mc.getColsPerBlock(), -1L);
Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
// step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
// the result is square: number of columns for the left type, number of rows otherwise
int outputDim = (int) (_type.isLeft() ? mc.getCols() : mc.getRows());
if (OptimizerUtils.estimateSize(outputDim, outputDim) <= 32 * 1024 * 1024) {
// default: <=32MB
// output large blocks and reduceAll to avoid skew on combineByKey
// NOTE(review): the next line has an extra ')' and no visible RDD operation on 'in';
// part of the expression seems to be missing -- verify against upstream
JavaRDD<MatrixBlock> tmp2 = RDDTSMM2ExtFunction(bpmb, _type, outputDim, (int) mc.getRowsPerBlock()));
MatrixBlock out = RDDAggregateUtils.sumStable(tmp2);
// put output block into symbol table (no lineage because single block)
// this also includes implicit maintenance of matrix characteristics
sec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
} else {
// output individual output blocks and aggregate by key (no action)
JavaPairRDD<MatrixIndexes, MatrixBlock> tmp2 = in.flatMapToPair(new RDDTSMM2Function(bpmb, _type));
JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.sumByKeyStable(tmp2, false);
// put output RDD handle into symbol table
// output characteristics are set explicitly here, reusing the input block sizes
sec.getMatrixCharacteristics(output.getName()).set(outputDim, outputDim, mc.getRowsPerBlock(), mc.getColsPerBlock());
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
Class WriteSPInstruction, method processMatrixWriteInstruction.
/**
 * Writes the binary-block RDD bound to {@code input1} to {@code fname} in the format selected
 * by {@code oi}, then writes a {@code fname + ".mtd"} metadata file.
 *
 * <p>Visible branches: MatrixMarket/TextCell (converted to text cells, MatrixMarket with a
 * header), CSV, and binary block (saved as a Hadoop sequence file). Any other format raises a
 * {@code DMLRuntimeException}. In every branch, when the number of non-zeros is not yet known,
 * a {@code ComputeBinaryBlockNnzFunction} with a {@code LongAccumulator} is mapped over the
 * block values before the write; the CSV and binary-block branches then store the accumulator
 * value into the matrix characteristics.
 *
 * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces,
 * at least one {@code else}, and some statement bodies are not visible) -- confirm against
 * the upstream source before relying on it.
 *
 * @param sec Spark execution context used to look up the input RDD and its characteristics
 * @param fname target file name; the metadata file is written to {@code fname + ".mtd"}
 * @param oi requested output format
 * @throws IOException declared by the signature; thrown visibly for CSV output of a matrix
 *         with zero rows or zero columns
 */
protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi) throws IOException {
// get input rdd
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
// piggyback nnz maintenance on write
LongAccumulator aNnz = null;
if (!mc.nnzKnown()) {
aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
JavaRDD<String> header = null;
if (oi == OutputInfo.MatrixMarketOutputInfo) {
ArrayList<String> headerContainer = new ArrayList<>(1);
// First output MM header
String headerStr = "%%MatrixMarket matrix coordinate real general\n" + // output number of rows, number of columns and number of nnz
mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
// NOTE(review): headerStr is never added to headerContainer in the visible lines, so the
// parallelized list would be empty -- confirm whether an add(...) call was lost
header = sec.getSparkContext().parallelize(headerContainer);
JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
// NOTE(review): no 'else' is visible between the two save calls below -- presumably the
// second call is the no-header alternative; verify against upstream
if (header != null)
customSaveTextFile(header.union(ijv), fname, true);
customSaveTextFile(ijv, fname, false);
// NOTE(review): the statement guarded by the next 'if' is not visible in this listing
if (!mc.nnzKnown())
} else if (oi == OutputInfo.CSVOutputInfo) {
// reject empty matrices up front
if (mc.getRows() == 0 || mc.getCols() == 0) {
throw new IOException("Write of matrices with zero rows or columns" + " not supported (" + mc.getRows() + "x" + mc.getCols() + ").");
LongAccumulator aNnz = null;
// piggyback nnz computation on actual write
if (!mc.nnzKnown()) {
aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
JavaRDD<String> out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
customSaveTextFile(out, fname, false);
// the accumulator is read only after the save call above
if (!mc.nnzKnown())
mc.setNonZeros((long) aNnz.value().longValue());
} else if (oi == OutputInfo.BinaryBlockOutputInfo) {
// piggyback nnz computation on actual write
LongAccumulator aNnz = null;
if (!mc.nnzKnown()) {
aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
// save binary block rdd on hdfs
in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
// the accumulator is read only after the save call above
if (!mc.nnzKnown())
mc.setNonZeros((long) aNnz.value().longValue());
} else {
// unsupported formats: binarycell (not externalized)
throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
// write meta data file
// the value type is fixed to DOUBLE here
MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
Class ProgramConverter, method serializeDataObject.
/**
 * Serializes a single data object into a string and returns it.
 *
 * <p>Visible preparation: for a scalar the string value is taken; for a matrix the file name
 * is taken together with nine metadata entries (rows, columns, rows per block, columns per
 * block, non-zeros, input info, output info, partition format, update type). A
 * {@code DMLRuntimeException} with the message "Unable to serialize datatype ..." is thrown
 * in the switch.
 *
 * <p>NOTE(review): this listing appears to have lost lines during extraction: no
 * {@code break} statements or {@code default} label are visible in the switch, and the code
 * that appends to the {@code StringBuilder} (including the body of the final loop) is not
 * shown -- confirm against the upstream source.
 *
 * @param key variable name used as the serialized name
 * @param dat data object to serialize; its data type selects the switch branch
 * @return the content of the string builder
 */
public static String serializeDataObject(String key, Data dat) {
// SCHEMA: <name>|<datatype>|<valuetype>|value
// (scalars are serialized by value, matrices by filename)
StringBuilder sb = new StringBuilder();
// prepare data for serialization
String name = key;
DataType datatype = dat.getDataType();
ValueType valuetype = dat.getValueType();
String value = null;
// stays null unless the MATRIX branch fills it
String[] matrixMetaData = null;
switch(datatype) {
case SCALAR:
ScalarObject so = (ScalarObject) dat;
// name = so.getName();
value = so.getStringValue();
// NOTE(review): no 'break' is visible here -- presumably lost in extraction; verify
case MATRIX:
MatrixObject mo = (MatrixObject) dat;
MetaDataFormat md = (MetaDataFormat) dat.getMetaData();
MatrixCharacteristics mc = md.getMatrixCharacteristics();
value = mo.getFileName();
// fall back to PartitionFormat.NONE when the matrix object reports no partition format
PartitionFormat partFormat = (mo.getPartitionFormat() != null) ? new PartitionFormat(mo.getPartitionFormat(), mo.getPartitionSize()) : PartitionFormat.NONE;
// fixed layout of nine entries, filled by index below
matrixMetaData = new String[9];
matrixMetaData[0] = String.valueOf(mc.getRows());
matrixMetaData[1] = String.valueOf(mc.getCols());
matrixMetaData[2] = String.valueOf(mc.getRowsPerBlock());
matrixMetaData[3] = String.valueOf(mc.getColsPerBlock());
matrixMetaData[4] = String.valueOf(mc.getNonZeros());
matrixMetaData[5] = InputInfo.inputInfoToString(md.getInputInfo());
matrixMetaData[6] = OutputInfo.outputInfoToString(md.getOutputInfo());
matrixMetaData[7] = String.valueOf(partFormat);
matrixMetaData[8] = String.valueOf(mo.getUpdateType());
// NOTE(review): presumably the 'default' branch; the label itself is not visible
throw new DMLRuntimeException("Unable to serialize datatype " + datatype);
// serialize data
// NOTE(review): the appends of name/datatype/valuetype/value and the loop body are not
// visible in this listing
if (matrixMetaData != null)
for (int i = 0; i < matrixMetaData.length; i++) {
return sb.toString();
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
Class RemoteDPParForSpark, method getPartitionedInput.
/**
 * Builds the partitioned input pair RDD for the given matrix variable.
 *
 * <p>Three paths are visible: (1) if {@code hasInputDataSet(dpf, mo)} holds, the underlying
 * dataset is read from the lineage of the matrix object's RDD handle and its rows are mapped
 * with {@code DataFrameToRowBinaryBlockFunction}; (2) otherwise, if
 * {@code requiresGrouping(dpf, mo)} is false, the binary-block RDD is flat-mapped through a
 * {@code DataPartitionerRemoteSparkMapper}; (3) otherwise the same mapper is applied, after
 * optionally swapping the input for the first lineage child of a checkpoint RDD.
 *
 * <p>NOTE(review): this listing appears to have lost text during extraction (the ternary
 * building {@code prepinput} has no visible else-operand, the {@code isRDDCached(} call is
 * truncated, closing braces are absent, and no grouping step is visible in the third path)
 * -- confirm against the upstream source.
 *
 * @param sec Spark execution context used to resolve the matrix object and its RDD
 * @param matrixvar name of the matrix variable to partition
 * @param oi output info handed to the partitioner mapper
 * @param dpf partition format; its {@code _dpf} and {@code _N} fields are handed to the mapper
 * @return the pair RDD produced by the selected path
 */
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
// the input info passed to the mapper is always binary block
InputInfo ii = InputInfo.BinaryBlockInputInfo;
MatrixObject mo = sec.getMatrixObject(matrixvar);
MatrixCharacteristics mc = mo.getMatrixCharacteristics();
// NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
if (hasInputDataSet(dpf, mo)) {
// the dataset object is expected two lineage levels below the RDD handle (unchecked cast)
DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
Dataset<Row> in = dsObj.getDataset();
// construct or reuse row ids
// NOTE(review): the else-operand of this ternary (the "zip row index" case) is not visible
JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ? in.javaRDD().mapToPair(new DataFrameExtractIDFunction(in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) : // zip row index
// convert row to row in matrix block format
return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
} else // binary block input rdd without grouping
if (!requiresGrouping(dpf, mo)) {
// get input rdd and data partitioning
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
return in.flatMapToPair(dpfun);
} else // default binary block input rdd with grouping
// get input rdd, avoid unnecessary caching if input is checkpoint and not cached yet
// to reduce memory pressure for shuffle and subsequent
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
// NOTE(review): the argument list of isRDDCached( is cut off in this listing
if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(
// replace the input with the RDD of the first lineage child (unchecked generic cast)
in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
// data partitioning of input rdd
DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
return in.flatMapToPair(dpfun);
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
Class RemoteDPParWorkerReducer, method configure.
/**
 * Configures this reducer from the job configuration.
 *
 * <p>Visible steps: (1) read the partitioning format, partitioned matrix size, iteration
 * variable, input variable, output info and transposed-column flag from the job, derive the
 * partition dimensions and allocate the reusable partition block; (2) read the task id,
 * derive the worker id, and -- when not in local mode -- parse the serialized parfor body to
 * obtain child blocks, execution context and result variables, then prepare local caching
 * and reset the task/iteration counters. Any exception raised in step 2 is rethrown wrapped
 * in a {@code RuntimeException}.
 *
 * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces,
 * at least one {@code else}, and the bodies of several {@code if} statements are not visible,
 * including the final one) -- confirm against the upstream source.
 *
 * @param job job configuration holding the partitioning settings and serialized program blocks
 */
public void configure(JobConf job) {
// Step 1: configure data partitioning information
_dpf = MRJobConfiguration.getPartitioningFormat(job);
MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
PartitionFormat pf = new PartitionFormat(_dpf, MRJobConfiguration.getPartitioningSizeN(job));
// partition dimensions are narrowed to int
_rlen = (int) pf.getNumRows(mc);
_clen = (int) pf.getNumColumns(mc);
_brlen = mc.getRowsPerBlock();
_bclen = mc.getColsPerBlock();
_iterVar = MRJobConfiguration.getPartitioningItervar(job);
_inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
_info = MRJobConfiguration.getPartitioningOutputInfo(job);
_tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
// with the transposed-column flag the block is allocated with swapped dimensions and 'true'
// as third argument (presumably the sparse flag -- TODO confirm)
// NOTE(review): no 'else' is visible between the two allocations below; verify upstream
if (_tSparseCol)
_partition = new MatrixBlock((int) _clen, _rlen, true);
_partition = new MatrixBlock((int) _rlen, _clen, false);
// Step 2: configure parworker
String taskID = job.get(MRConfigurationNames.MR_TASK_ID);
LOG.trace("configure RemoteDPParWorkerReducer " + taskID);
try {
_stringID = taskID;
// int task ID
_workerID = IDHandler.extractIntID(_stringID);
// in the context of mr jobs (for example this config points to local fs instead of hdfs by default).
if (!InfrastructureAnalyzer.isLocalMode(job)) {
// create local runtime program
String in = MRJobConfiguration.getProgramBlocks(job);
ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
_childBlocks = body.getChildBlocks();
_ec = body.getEc();
_resultVars = body.getResultVariables();
// init local cache manager
if (!CacheableData.isCachingActive()) {
String uuid = IDHandler.createDistributedUniqueID();
// incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
// the worker id suffix is appended only if the prefix does not already contain "_"
if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) {
// account for local mode
CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
// ensure that resultvar files are not removed
// enable/disable caching (if required)
boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
// NOTE(review): the statement guarded by the next 'if' is not visible in this listing
if (!cpCaching)
_numTasks = 0;
_numIters = 0;
} catch (Exception ex) {
// wrap and rethrow; the original exception is kept as the cause
throw new RuntimeException(ex);
// disable parfor stat monitoring, reporting execution times via counters not useful
// always reset stats because counters per map task (for case of JVM reuse)
// NOTE(review): the body of this final 'if' is not visible in this listing
if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job))