use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class ResultMergeLocalFile method createBinaryCellResultFile.
@SuppressWarnings("deprecation")
private void createBinaryCellResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew, MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
Path path = new Path(fnameNew);
FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
long rlen = mc.getRows();
long clen = mc.getCols();
int brlen = mc.getRowsPerBlock();
int bclen = mc.getColsPerBlock();
MatrixIndexes indexes = new MatrixIndexes(1, 1);
MatrixCell cell = new MatrixCell(0);
//beware ca 50ms
SequenceFile.Writer out = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixCell.class);
try {
boolean written = false;
for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++) for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
MatrixBlock mb = null;
long row_offset = (brow - 1) * brlen + 1;
long col_offset = (bcol - 1) * bclen + 1;
if (dir.exists()) {
if (//WITH COMPARE BLOCK
withCompare && dir2.exists()) {
//copy only values that are different from the original
String[] lnames2 = dir2.list();
if (//there should be exactly 1 compare block
lnames2.length != 1)
throw new DMLRuntimeException("Unable to merge results because multiple compare blocks found.");
mb = StagingFileUtils.readCellList2BlockFromLocal(dir2 + "/" + lnames2[0], brlen, bclen);
boolean appendOnly = mb.isInSparseFormat();
double[][] compare = DataConverter.convertToDoubleMatrix(mb);
String[] lnames = dir.list();
for (String lname : lnames) {
MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
mergeWithComp(mb, tmp, compare);
}
//sort sparse due to append-only
if (appendOnly)
mb.sortSparseRows();
//change sparsity if required after
mb.examSparsity();
} else //WITHOUT COMPARE BLOCK
{
//copy all non-zeros from all workers
String[] lnames = dir.list();
boolean appendOnly = false;
for (String lname : lnames) {
if (mb == null) {
mb = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
appendOnly = mb.isInSparseFormat();
} else {
MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
mergeWithoutComp(mb, tmp, appendOnly);
}
}
//sort sparse due to append-only
if (appendOnly)
mb.sortSparseRows();
//change sparsity if required after
mb.examSparsity();
}
}
//write the block to binary cell
if (mb != null) {
if (mb.isInSparseFormat()) {
Iterator<IJV> iter = mb.getSparseBlockIterator();
while (iter.hasNext()) {
IJV lcell = iter.next();
indexes.setIndexes(row_offset + lcell.getI(), col_offset + lcell.getJ());
cell.setValue(lcell.getV());
out.append(indexes, cell);
written = true;
}
} else {
for (int i = 0; i < brlen; i++) for (int j = 0; j < bclen; j++) {
double lvalue = mb.getValueDenseUnsafe(i, j);
if (//for nnz
lvalue != 0) {
indexes.setIndexes(row_offset + i, col_offset + j);
cell.setValue(lvalue);
out.append(indexes, cell);
written = true;
}
}
}
}
}
if (!written)
out.append(indexes, cell);
} finally {
IOUtilFunctions.closeSilently(out);
}
}
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class ReblockSPInstruction method processMatrixReblockInstruction.
@SuppressWarnings("unchecked")
protected void processMatrixReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) throws DMLRuntimeException {
MatrixObject mo = sec.getMatrixObject(input1.getName());
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
if (iinfo == InputInfo.TextCellInputInfo || iinfo == InputInfo.MatrixMarketInputInfo) {
//check jdk version (prevent double.parseDouble contention on <jdk8)
sec.checkAndRaiseValidationWarningJDKVersion();
//get the input textcell rdd
JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
//convert textcell to binary block
JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, outputEmptyBlocks);
//put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
} else if (iinfo == InputInfo.CSVInputInfo) {
// HACK ALERT: Until we introduces the rewrite to insert csvrblock for non-persistent read
// throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
CSVReblockSPInstruction csvInstruction = null;
boolean hasHeader = false;
String delim = ",";
boolean fill = false;
double fillValue = 0;
if (mo.getFileFormatProperties() instanceof CSVFileFormatProperties && mo.getFileFormatProperties() != null) {
CSVFileFormatProperties props = (CSVFileFormatProperties) mo.getFileFormatProperties();
hasHeader = props.hasHeader();
delim = props.getDelim();
fill = props.isFill();
fillValue = props.getFillValue();
}
csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
csvInstruction.processInstruction(sec);
return;
} else if (iinfo == InputInfo.BinaryCellInputInfo) {
JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = (JavaPairRDD<MatrixIndexes, MatrixCell>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), binaryCells, mcOut, outputEmptyBlocks);
//put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
} else if (iinfo == InputInfo.BinaryBlockInputInfo) {
//BINARY BLOCK <- BINARY BLOCK (different sizes)
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.flatMapToPair(new ExtractBlockForBinaryReblock(mc, mcOut));
out = RDDAggregateUtils.mergeByKey(out, false);
//put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
} else {
throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction:" + InputInfo.inputInfoToString(iinfo));
}
}
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class ReorgSPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
SparkExecutionContext sec = (SparkExecutionContext) ec;
String opcode = getOpcode();
//get input rdd handle
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
if (//TRANSPOSE
opcode.equalsIgnoreCase("r'")) {
//execute transpose reorg operation
out = in1.mapToPair(new ReorgMapFunction(opcode));
} else if (//REVERSE
opcode.equalsIgnoreCase("rev")) {
//execute reverse reorg operation
out = in1.flatMapToPair(new RDDRevFunction(mcIn));
if (mcIn.getRows() % mcIn.getRowsPerBlock() != 0)
out = RDDAggregateUtils.mergeByKey(out, false);
} else if (// DIAG
opcode.equalsIgnoreCase("rdiag")) {
if (mcIn.getCols() == 1) {
// diagV2M
out = in1.flatMapToPair(new RDDDiagV2MFunction(mcIn));
} else {
// diagM2V
//execute diagM2V operation
out = in1.filter(new FilterDiagBlocksFunction()).mapToPair(new ReorgMapFunction(opcode));
}
} else if (//ORDER
opcode.equalsIgnoreCase("rsort")) {
// Sort by column 'col' in ascending/descending order and return either index/value
//get parameters
long col = ec.getScalarInput(_col.getName(), _col.getValueType(), _col.isLiteral()).getLongValue();
boolean desc = ec.getScalarInput(_desc.getName(), _desc.getValueType(), _desc.isLiteral()).getBooleanValue();
boolean ixret = ec.getScalarInput(_ixret.getName(), _ixret.getValueType(), _ixret.isLiteral()).getBooleanValue();
boolean singleCol = (mcIn.getCols() == 1);
// extract column (if necessary) and sort
out = in1;
if (!singleCol) {
out = out.filter(new IsBlockInRange(1, mcIn.getRows(), col, col, mcIn)).mapValues(new ExtractColumn((int) UtilFunctions.computeCellInBlock(col, mcIn.getColsPerBlock())));
}
//actual index/data sort operation
if (ixret) {
//sort indexes
out = RDDSortUtils.sortIndexesByVal(out, !desc, mcIn.getRows(), mcIn.getRowsPerBlock());
} else if (singleCol && !desc) {
//sort single-column matrix
out = RDDSortUtils.sortByVal(out, mcIn.getRows(), mcIn.getRowsPerBlock());
} else {
//sort multi-column matrix
if (!_bSortIndInMem)
out = RDDSortUtils.sortDataByVal(out, in1, !desc, mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
else
out = RDDSortUtils.sortDataByValMemSort(out, in1, !desc, mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock(), sec, (ReorgOperator) _optr);
}
} else {
throw new DMLRuntimeException("Error: Incorrect opcode in ReorgSPInstruction:" + opcode);
}
//store output rdd handle
updateReorgMatrixCharacteristics(sec);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
}
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class QuaternarySPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
SparkExecutionContext sec = (SparkExecutionContext) ec;
QuaternaryOperator qop = (QuaternaryOperator) _optr;
//tracking of rdds and broadcasts (for lineage maintenance)
ArrayList<String> rddVars = new ArrayList<String>();
ArrayList<String> bcVars = new ArrayList<String>();
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
MatrixCharacteristics inMc = sec.getMatrixCharacteristics(input1.getName());
long rlen = inMc.getRows();
long clen = inMc.getCols();
int brlen = inMc.getRowsPerBlock();
int bclen = inMc.getColsPerBlock();
//(map/redwsloss, map/redwcemm); safe because theses ops produce a scalar
if (qop.wtype1 != null || qop.wtype4 != null) {
in = in.filter(new FilterNonEmptyBlocksFunction());
}
//map-side only operation (one rdd input, two broadcasts)
if (WeightedSquaredLoss.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedSigmoid.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedDivMM.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedCrossEntropy.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedUnaryMM.OPCODE.equalsIgnoreCase(getOpcode())) {
PartitionedBroadcast<MatrixBlock> bc1 = sec.getBroadcastForVariable(input2.getName());
PartitionedBroadcast<MatrixBlock> bc2 = sec.getBroadcastForVariable(input3.getName());
//partitioning-preserving mappartitions (key access required for broadcast loopkup)
//only wdivmm changes keys
boolean noKeyChange = (qop.wtype3 == null || qop.wtype3.isBasic());
out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), noKeyChange);
rddVars.add(input1.getName());
bcVars.add(input2.getName());
bcVars.add(input3.getName());
} else //reduce-side operation (two/three/four rdd inputs, zero/one/two broadcasts)
{
PartitionedBroadcast<MatrixBlock> bc1 = _cacheU ? sec.getBroadcastForVariable(input2.getName()) : null;
PartitionedBroadcast<MatrixBlock> bc2 = _cacheV ? sec.getBroadcastForVariable(input3.getName()) : null;
JavaPairRDD<MatrixIndexes, MatrixBlock> inU = (!_cacheU) ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
JavaPairRDD<MatrixIndexes, MatrixBlock> inV = (!_cacheV) ? sec.getBinaryBlockRDDHandleForVariable(input3.getName()) : null;
JavaPairRDD<MatrixIndexes, MatrixBlock> inW = (qop.hasFourInputs() && !_input4.isLiteral()) ? sec.getBinaryBlockRDDHandleForVariable(_input4.getName()) : null;
//preparation of transposed and replicated U
if (inU != null)
inU = inU.flatMapToPair(new ReplicateBlocksFunction(clen, bclen, true));
//preparation of transposed and replicated V
if (inV != null)
inV = inV.mapToPair(new TransposeFactorIndexesFunction()).flatMapToPair(new ReplicateBlocksFunction(rlen, brlen, false));
//functions calls w/ two rdd inputs
if (inU != null && inV == null && inW == null)
out = in.join(inU).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
else if (inU == null && inV != null && inW == null)
out = in.join(inV).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
else if (inU == null && inV == null && inW != null)
out = in.join(inW).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
else //function calls w/ three rdd inputs
if (inU != null && inV != null && inW == null)
out = in.join(inU).join(inV).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
else if (inU != null && inV == null && inW != null)
out = in.join(inU).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
else if (inU == null && inV != null && inW != null)
out = in.join(inV).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
else if (inU == null && inV == null && inW == null) {
out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), false);
} else
//function call w/ four rdd inputs
//need keys in case of wdivmm
out = in.join(inU).join(inV).join(inW).mapToPair(new RDDQuaternaryFunction4(qop));
//keep variable names for lineage maintenance
if (inU == null)
bcVars.add(input2.getName());
else
rddVars.add(input2.getName());
if (inV == null)
bcVars.add(input3.getName());
else
rddVars.add(input3.getName());
if (inW != null)
rddVars.add(_input4.getName());
}
//output handling, incl aggregation
if (//map/redwsloss, map/redwcemm
qop.wtype1 != null || qop.wtype4 != null) {
//full aggregate and cast to scalar
MatrixBlock tmp = RDDAggregateUtils.sumStable(out);
DoubleObject ret = new DoubleObject(tmp.getValue(0, 0));
sec.setVariable(output.getName(), ret);
} else //map/redwsigmoid, map/redwdivmm, map/redwumm
{
//aggregation if required (map/redwdivmm)
if (qop.wtype3 != null && !qop.wtype3.isBasic())
out = RDDAggregateUtils.sumByKeyStable(out, false);
//put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
//maintain lineage information for output rdd
for (String rddVar : rddVars) sec.addLineageRDD(output.getName(), rddVar);
for (String bcVar : bcVars) sec.addLineageBroadcast(output.getName(), bcVar);
//update matrix characteristics
updateOutputMatrixCharacteristics(sec, qop);
}
}
use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
the class RandSPInstruction method generateSequence.
private void generateSequence(SparkExecutionContext sec) throws DMLRuntimeException {
//sanity check valid increment
if (seq_incr == 0) {
throw new DMLRuntimeException("ERROR: While performing seq(" + seq_from + "," + seq_to + "," + seq_incr + ")");
}
//handle default 1 to -1 for special case of from>to
seq_incr = LibMatrixDatagen.updateSeqIncr(seq_from, seq_to, seq_incr);
if (LOG.isTraceEnabled())
LOG.trace("Process RandSPInstruction seq with seqFrom=" + seq_from + ", seqTo=" + seq_to + ", seqIncr" + seq_incr);
//step 1: offset generation
JavaRDD<Double> offsetsRDD = null;
long nnz = (long) Math.abs(Math.round((seq_to - seq_from) / seq_incr)) + 1;
double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(nnz, 1, rowsInBlock, colsInBlock, //overestimate for on disk, ensures hdfs block per partition
nnz);
double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock);
//a) in-memory offset rdd construction
if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
ArrayList<Double> offsets = new ArrayList<Double>();
for (long i = 0; i < numBlocks; i++) {
double off = seq_from + seq_incr * i * rowsInBlock;
offsets.add(off);
}
//for load balancing: degree of parallelism such that ~128MB per partition
int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
//create offset rdd
offsetsRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
} else //b) file-based offset rdd construction (for robustness wrt large number of blocks)
{
Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
PrintWriter pw = null;
try {
FileSystem fs = IOUtilFunctions.getFileSystem(path);
pw = new PrintWriter(fs.create(path));
for (long i = 0; i < numBlocks; i++) {
double off = seq_from + seq_incr * i * rowsInBlock;
pw.println(off);
}
} catch (IOException ex) {
throw new DMLRuntimeException(ex);
} finally {
IOUtilFunctions.closeSilently(pw);
}
//for load balancing: degree of parallelism such that ~128MB per partition
int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
//create seeds rdd
offsetsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).map(new ExtractOffsetTuple());
}
//sanity check number of non-zeros
if (nnz != rows && rows != -1) {
throw new DMLRuntimeException("Incorrect number of non-zeros: " + nnz + " != " + rows);
}
//step 2: execute seq instruction over offset input
JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD.mapToPair(new GenerateSequenceBlock(rowsInBlock, seq_from, seq_to, seq_incr));
//step 3: output handling
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
if (!mcOut.dimsKnown()) {
mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz);
}
sec.setRDDHandleForVariable(output.getName(), out);
}
Aggregations