Use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.
The class DataPartitioner, method createPartitionedMatrixObject.
/**
* Creates a partitioned matrix object based on the given input matrix object,
* according to the specified split format. The input matrix can be in-memory
* or still on HDFS and the partitioned output matrix is written to HDFS. The
* created matrix object can be used transparently for obtaining the full matrix
* or reading one or more partitions based on given index ranges.
*
* @param in input matrix object
* @param out output matrix object
* @param force if false, allow the partitioner to optimize the requested format
* @return partitioned matrix object
*/
public MatrixObject createPartitionedMatrixObject(MatrixObject in, MatrixObject out, boolean force) {
// check for naive partitioning
if (_format == PDataPartitionFormat.NONE)
return in;
// analyze input matrix object
MetaDataFormat meta = (MetaDataFormat) in.getMetaData();
MatrixCharacteristics mc = meta.getMatrixCharacteristics();
InputInfo ii = meta.getInputInfo();
OutputInfo oi = meta.getOutputInfo();
long rows = mc.getRows();
long cols = mc.getCols();
int brlen = mc.getRowsPerBlock();
int bclen = mc.getColsPerBlock();
long nonZeros = mc.getNonZeros();
double sparsity = mc.dimsKnown(true) ? ((double) nonZeros) / (rows * cols) : 1.0;
if (!force) { // try to optimize, if format not forced
// check lower bound of useful data partitioning
if (rows < Hop.CPThreshold && cols < Hop.CPThreshold) { // or matrix already fits in mem
return in;
}
// check for changing to blockwise representations
if (_format == PDataPartitionFormat.ROW_WISE && cols < Hop.CPThreshold) {
LOG.debug("Changing format from " + PDataPartitionFormat.ROW_WISE + " to " + PDataPartitionFormat.ROW_BLOCK_WISE + ".");
_format = PDataPartitionFormat.ROW_BLOCK_WISE;
}
if (_format == PDataPartitionFormat.COLUMN_WISE && rows < Hop.CPThreshold) {
LOG.debug("Changing format from " + PDataPartitionFormat.COLUMN_WISE + " to " + PDataPartitionFormat.ROW_BLOCK_WISE + ".");
_format = PDataPartitionFormat.COLUMN_BLOCK_WISE;
}
// _format = PDataPartitionFormat.ROW_BLOCK_WISE_N;
}
// check changing to binarycell in case of sparse cols (robustness)
boolean convertBlock2Cell = false;
if (ii == InputInfo.BinaryBlockInputInfo && _allowBinarycell && _format == PDataPartitionFormat.COLUMN_WISE && sparsity < SPARSITY_CELL_THRESHOLD) {
LOG.debug("Changing partition outputinfo from binaryblock to binarycell due to sparsity=" + sparsity);
oi = OutputInfo.BinaryCellOutputInfo;
convertBlock2Cell = true;
}
// prepare filenames and cleanup if required
String fnameNew = out.getFileName();
try {
MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// core partitioning (depending on subclass)
partitionMatrix(in, fnameNew, ii, oi, rows, cols, brlen, bclen);
// create output matrix object
out.setPartitioned(_format, _n);
MatrixCharacteristics mcNew = new MatrixCharacteristics(rows, cols, brlen, bclen);
mcNew.setNonZeros(nonZeros);
if (convertBlock2Cell)
ii = InputInfo.BinaryCellInputInfo;
MetaDataFormat metaNew = new MetaDataFormat(mcNew, oi, ii);
out.setMetaData(metaNew);
return out;
}
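To make the fallback rules above easier to follow, here is a minimal standalone sketch of the same decision logic. The enum, method name, and threshold value are illustrative stand-ins, not SystemML API; the local constant replaces Hop.CPThreshold.

enum PFormat { NONE, ROW_WISE, ROW_BLOCK_WISE, COLUMN_WISE, COLUMN_BLOCK_WISE }

static PFormat choosePartitionFormat(PFormat requested, long rows, long cols) {
    final long CP_THRESHOLD = 1024; // stand-in for Hop.CPThreshold, illustrative value only
    // below the threshold in both dimensions the matrix fits in memory, so partitioning is not useful
    if (rows < CP_THRESHOLD && cols < CP_THRESHOLD)
        return PFormat.NONE;
    // single rows are small, so group several rows per partition
    if (requested == PFormat.ROW_WISE && cols < CP_THRESHOLD)
        return PFormat.ROW_BLOCK_WISE;
    // single columns are small, so group several columns per partition
    if (requested == PFormat.COLUMN_WISE && rows < CP_THRESHOLD)
        return PFormat.COLUMN_BLOCK_WISE;
    return requested;
}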
Use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.
The class CSVReblockSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// sanity check input info
CacheableData<?> obj = sec.getCacheableData(input1.getName());
MetaDataFormat iimd = (MetaDataFormat) obj.getMetaData();
if (iimd.getInputInfo() != InputInfo.CSVInputInfo) {
throw new DMLRuntimeException("The given InputInfo is not implemented for " + "CSVReblockSPInstruction:" + iimd.getInputInfo());
}
// set output characteristics
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
mcOut.set(mcIn.getRows(), mcIn.getCols(), _brlen, _bclen);
// check for in-memory reblock (w/ lazy spark context, potential for latency reduction)
if (Recompiler.checkCPReblock(sec, input1.getName())) {
if (input1.getDataType() == DataType.MATRIX)
Recompiler.executeInMemoryMatrixReblock(sec, input1.getName(), output.getName());
else if (input1.getDataType() == DataType.FRAME)
Recompiler.executeInMemoryFrameReblock(sec, input1.getName(), output.getName());
return;
}
// execute matrix/frame csvreblock
JavaPairRDD<?, ?> out = null;
if (input1.getDataType() == DataType.MATRIX)
out = processMatrixCSVReblockInstruction(sec, mcOut);
else if (input1.getDataType() == DataType.FRAME)
out = processFrameCSVReblockInstruction(sec, mcOut, ((FrameObject) obj).getSchema());
// put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
}
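As a small illustration of the characteristics handoff above, the sketch below derives output characteristics from known input dimensions and new target block sizes. The helper name is invented; the import path matches incubator-systemml but should be verified for your version.

// assumed import: org.apache.sysml.runtime.matrix.MatrixCharacteristics
static MatrixCharacteristics reblockCharacteristics(MatrixCharacteristics mcIn, int brlen, int bclen) {
    MatrixCharacteristics mcOut = new MatrixCharacteristics();
    // dimensions carry over unchanged; only the blocking changes
    mcOut.set(mcIn.getRows(), mcIn.getCols(), brlen, bclen);
    return mcOut;
}

For example, a CSV input with known dimensions 10000 x 500 and target block sizes 1000 x 1000 yields 10000 x 500 output characteristics in 1000 x 1000 blocks.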
Use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.
The class InterProceduralAnalysis, method createOutputMatrix.
private static MatrixObject createOutputMatrix(long dim1, long dim2, long nnz) {
MatrixObject moOut = new MatrixObject(ValueType.DOUBLE, null);
MatrixCharacteristics mc = new MatrixCharacteristics(dim1, dim2, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), nnz);
MetaDataFormat meta = new MetaDataFormat(mc, null, null);
moOut.setMetaData(meta);
return moOut;
}
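A hypothetical call for orientation (the dimensions and nnz are invented): interprocedural analysis uses such an object purely to carry inferred size information, which is why both the InputInfo and OutputInfo of the MetaDataFormat are left null.

// hypothetical usage; values for illustration only
MatrixObject moOut = createOutputMatrix(10000, 10, 10000);
// only sizes are propagated: no physical format is implied for the inferred output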
Use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.
The class Recompiler, method checkCPReblock.
/**
 * Returns true iff (1) all instructions are reblock instructions and (2) all
 * individual reblock operations fit in the current memory budget.
 *
 * @param inst instruction
 * @param inputs the inputs
 * @return true if and only if all instructions are reblock instructions and all
 * individual reblock operations fit in the current memory budget
 * @throws IOException if an IOException occurs
 */
public static boolean checkCPReblock(MRJobInstruction inst, MatrixObject[] inputs) throws IOException {
boolean ret = true;
boolean localMode = InfrastructureAnalyzer.isLocalMode();
// check only shuffle inst
String rdInst = inst.getIv_randInstructions();
String rrInst = inst.getIv_recordReaderInstructions();
String mapInst = inst.getIv_instructionsInMapper();
String aggInst = inst.getIv_aggInstructions();
String otherInst = inst.getIv_otherInstructions();
if ((rdInst != null && rdInst.length() > 0) || (rrInst != null && rrInst.length() > 0) || (mapInst != null && mapInst.length() > 0) || (aggInst != null && aggInst.length() > 0) || (otherInst != null && otherInst.length() > 0)) {
ret = false;
}
// check only reblock inst
if (ret) {
String shuffleInst = inst.getIv_shuffleInstructions();
String[] instParts = shuffleInst.split(Lop.INSTRUCTION_DELIMITOR);
for (String rblk : instParts) {
if (!InstructionUtils.getOpCode(rblk).equals(ReBlock.OPCODE) && !InstructionUtils.getOpCode(rblk).equals(CSVReBlock.OPCODE)) {
ret = false;
break;
}
}
}
// check only reblock instructions that output empty blocks (otherwise a CP
// reblock would be counter-productive because any export from CP would
// reintroduce the empty blocks)
if (ret) {
String shuffleInst = inst.getIv_shuffleInstructions();
String[] instParts = shuffleInst.split(Lop.INSTRUCTION_DELIMITOR);
for (String rblk : instParts) {
if (InstructionUtils.getOpCode(rblk).equals(ReBlock.OPCODE) && rblk.endsWith("false")) { // no output of empty blocks
ret = false;
break;
}
}
}
// check recompile memory budget
if (ret) {
for (MatrixObject mo : inputs) {
long rows = mo.getNumRows();
long cols = mo.getNumColumns();
// unknown dimensions: fall back to a conservative check on the CSV file size
if (rows == -1 || cols == -1) {
Path path = new Path(mo.getFileName());
long size = MapReduceTool.getFilesizeOnHDFS(path);
if (size > CP_CSV_REBLOCK_UNKNOWN_THRESHOLD_SIZE || CP_CSV_REBLOCK_UNKNOWN_THRESHOLD_SIZE > OptimizerUtils.getLocalMemBudget()) {
ret = false;
break;
}
} else {
// default case (known dimensions)
long nnz = mo.getNnz();
double sp = OptimizerUtils.getSparsity(rows, cols, nnz);
double mem = MatrixBlock.estimateSizeInMemory(rows, cols, sp);
if (!OptimizerUtils.isValidCPDimensions(rows, cols) || !OptimizerUtils.isValidCPMatrixSize(rows, cols, sp) || mem >= OptimizerUtils.getLocalMemBudget()) {
ret = false;
break;
}
}
}
}
// NOTE: this does not apply to local mode because the text read is single-threaded there as well
if (ret && !localMode) {
for (MatrixObject mo : inputs) {
MetaDataFormat iimd = (MetaDataFormat) mo.getMetaData();
if ((iimd.getInputInfo() == InputInfo.TextCellInputInfo || iimd.getInputInfo() == InputInfo.MatrixMarketInputInfo || iimd.getInputInfo() == InputInfo.CSVInputInfo || iimd.getInputInfo() == InputInfo.BinaryCellInputInfo) && !mo.isDirty()) {
// get file size on hdfs (as indicator for estimated read time)
Path path = new Path(mo.getFileName());
long fileSize = MapReduceTool.getFilesizeOnHDFS(path);
// compute cp reblock size threshold based on available parallelism
long cpThreshold = CP_REBLOCK_THRESHOLD_SIZE * OptimizerUtils.getParallelTextReadParallelism();
if (fileSize > cpThreshold) {
ret = false;
break;
}
}
}
}
return ret;
}
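The known-dimensions branch above compares a worst-case in-memory estimate against the local memory budget. The following is a rough standalone sketch of that comparison; the per-cell byte costs are simplifications, not the real formula of MatrixBlock.estimateSizeInMemory.

static boolean fitsInMemoryBudget(long rows, long cols, long nnz, double budgetBytes) {
    // unknown nnz (-1) is treated as fully dense
    double sp = (nnz >= 0) ? Math.min(1.0, (double) nnz / rows / cols) : 1.0;
    double denseBytes = (double) rows * cols * 8; // 8 bytes per dense double cell
    double sparseBytes = (double) rows * cols * sp * 16; // ~16 bytes per sparse cell, rough
    // the runtime keeps the cheaper of the two representations
    return Math.min(denseBytes, sparseBytes) < budgetBytes;
}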
Use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.
The class CacheableData, method toString.
@Override
public String toString() {
StringBuilder str = new StringBuilder();
str.append(getClass().getSimpleName());
str.append(": ");
str.append(_hdfsFileName + ", ");
if (_metaData instanceof MetaDataNumItemsByEachReducer) {
str.append("NumItemsByEachReducerMetaData");
} else {
try {
MetaDataFormat md = (MetaDataFormat) _metaData;
if (md != null) {
MatrixCharacteristics mc = md.getMatrixCharacteristics();
str.append(mc.toString());
InputInfo ii = md.getInputInfo();
if (ii == null)
str.append("null");
else {
str.append(", ");
str.append(InputInfo.inputInfoToString(ii));
}
} else {
str.append("null, null");
}
} catch (Exception ex) {
LOG.error(ex);
}
}
str.append(", ");
str.append(isDirty() ? "dirty" : "not-dirty");
return str.toString();
}
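For orientation, an invented (not captured) example of the resulting string for a binary-block matrix; the exact MatrixCharacteristics layout may differ between versions:

MatrixObject: scratch_space/_p1_node1/mVar3, [1000 x 1000, nnz=5000, blocks (1000 x 1000)], binaryblock, not-dirty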