Use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
The class VariableCPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
switch(opcode) {
case CreateVariable:
if (getInput1().getDataType() == DataType.MATRIX) {
// create new variable for symbol table and cache
// (existing objects get cleared through rmvar instructions)
String fname = getInput2().getName();
// check if unique filename needs to be generated
if (Boolean.parseBoolean(getInput3().getName())) {
fname = new StringBuilder(fname.length() + 16).append(fname).append('_').append(_uniqueVarID.getNextID()).toString();
}
MatrixObject mobj = new MatrixObject(getInput1().getValueType(), fname);
// clone metadata because it is updated on copy-on-write; otherwise there
// is potential for hidden side effects between variables.
mobj.setMetaData((MetaData) metadata.clone());
mobj.setFileFormatProperties(_formatProperties);
mobj.setUpdateType(_updateType);
ec.setVariable(getInput1().getName(), mobj);
if (DMLScript.STATISTICS && _updateType.isInPlace())
Statistics.incrementTotalUIPVar();
} else if (getInput1().getDataType() == DataType.FRAME) {
String fname = getInput2().getName();
FrameObject fobj = new FrameObject(fname);
fobj.setMetaData((MetaData) metadata.clone());
fobj.setFileFormatProperties(_formatProperties);
if (_schema != null)
fobj.setSchema(_schema); // set schema after metadata
ec.setVariable(getInput1().getName(), fobj);
} else if (getInput1().getDataType() == DataType.SCALAR) {
// for scalars, createvar only registers the name; no data object is created
ec.setScalarOutput(getInput1().getName(), null);
} else {
throw new DMLRuntimeException("Unexpected data type: " + getInput1().getDataType());
}
break;
case AssignVariable:
// assign the value of one variable to the other
ec.setScalarOutput(getInput2().getName(), ec.getScalarInput(getInput1()));
break;
case CopyVariable:
processCopyInstruction(ec);
break;
case MoveVariable:
processMoveInstruction(ec);
break;
case RemoveVariable:
for (CPOperand input : inputs) processRemoveVariableInstruction(ec, input.getName());
break;
case RemoveVariableAndFile:
// Remove the variable from HashMap _variables, and possibly delete the data on disk.
boolean del = ((BooleanObject) ec.getScalarInput(getInput2().getName(), getInput2().getValueType(), true)).getBooleanValue();
MatrixObject m = (MatrixObject) ec.removeVariable(getInput1().getName());
if (!del) {
// the HDFS file is retained, therefore data must be exported if the dirty flag is set
if (m.isDirty())
m.exportData();
} else {
// throw new DMLRuntimeException("rmfilevar w/ true is not expected! " + instString);
// cleanDataOnHDFS(pb, input1.getName());
cleanDataOnHDFS(m);
}
// check if in-memory object can be cleaned up
if (!ec.getVariables().hasReferences(m)) {
// no other variable in the symbol table points to the same Data object as that of input1.getName()
// remove matrix object from cache
m.clearData();
}
break;
case CastAsScalarVariable:
if (getInput1().getDataType() == DataType.FRAME) {
FrameBlock fBlock = ec.getFrameInput(getInput1().getName());
if (fBlock.getNumRows() != 1 || fBlock.getNumColumns() != 1)
throw new DMLRuntimeException("Dimension mismatch - unable to cast frame '" + getInput1().getName() + "' of dimension (" + fBlock.getNumRows() + " x " + fBlock.getNumColumns() + ") to scalar.");
Object value = fBlock.get(0, 0);
ec.releaseFrameInput(getInput1().getName());
ec.setScalarOutput(output.getName(), ScalarObjectFactory.createScalarObject(fBlock.getSchema()[0], value));
} else {
// assume DataType.MATRIX otherwise
MatrixBlock mBlock = ec.getMatrixInput(getInput1().getName(), getExtendedOpcode());
if (mBlock.getNumRows() != 1 || mBlock.getNumColumns() != 1)
throw new DMLRuntimeException("Dimension mismatch - unable to cast matrix '" + getInput1().getName() + "' of dimension (" + mBlock.getNumRows() + " x " + mBlock.getNumColumns() + ") to scalar.");
double value = mBlock.getValue(0, 0);
ec.releaseMatrixInput(getInput1().getName(), getExtendedOpcode());
ec.setScalarOutput(output.getName(), new DoubleObject(value));
}
break;
case CastAsMatrixVariable:
{
MatrixBlock out = null;
if (getInput1().getDataType() == DataType.FRAME) {
FrameBlock fin = ec.getFrameInput(getInput1().getName());
out = DataConverter.convertToMatrixBlock(fin);
ec.releaseFrameInput(getInput1().getName());
} else {
// assume DataType.SCALAR otherwise
ScalarObject scalarInput = ec.getScalarInput(getInput1().getName(), getInput1().getValueType(), getInput1().isLiteral());
out = new MatrixBlock(1, 1, false);
out.quickSetValue(0, 0, scalarInput.getDoubleValue());
}
ec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
break;
}
case CastAsFrameVariable:
{
FrameBlock out = null;
if (getInput1().getDataType() == DataType.SCALAR) {
ScalarObject scalarInput = ec.getScalarInput(getInput1());
out = new FrameBlock(1, getInput1().getValueType());
out.ensureAllocatedColumns(1);
out.set(0, 0, scalarInput.getStringValue());
} else {
// assume DataType.MATRIX otherwise
MatrixBlock min = ec.getMatrixInput(getInput1().getName(), getExtendedOpcode());
out = DataConverter.convertToFrameBlock(min);
ec.releaseMatrixInput(getInput1().getName(), getExtendedOpcode());
}
ec.setFrameOutput(output.getName(), out);
break;
}
case CastAsDoubleVariable:
{
ScalarObject scalarInput = ec.getScalarInput(getInput1());
ec.setScalarOutput(output.getName(), new DoubleObject(scalarInput.getDoubleValue()));
break;
}
case CastAsIntegerVariable:
{
ScalarObject scalarInput = ec.getScalarInput(getInput1());
ec.setScalarOutput(output.getName(), new IntObject(scalarInput.getLongValue()));
break;
}
case CastAsBooleanVariable:
{
ScalarObject scalarInput = ec.getScalarInput(getInput1());
ec.setScalarOutput(output.getName(), new BooleanObject(scalarInput.getBooleanValue()));
break;
}
case Read:
ScalarObject res = null;
try {
switch(getInput1().getValueType()) {
case DOUBLE:
double d = MapReduceTool.readDoubleFromHDFSFile(getInput2().getName());
res = new DoubleObject(d);
break;
case INT:
long i = MapReduceTool.readIntegerFromHDFSFile(getInput2().getName());
res = new IntObject(i);
break;
case BOOLEAN:
boolean b = MapReduceTool.readBooleanFromHDFSFile(getInput2().getName());
res = new BooleanObject(b);
break;
case STRING:
String s = MapReduceTool.readStringFromHDFSFile(getInput2().getName());
res = new StringObject(s);
break;
default:
throw new DMLRuntimeException("Invalid value type (" + getInput1().getValueType() + ") while processing readScalar instruction.");
}
} catch (IOException e) {
throw new DMLRuntimeException(e);
}
ec.setScalarOutput(getInput1().getName(), res);
break;
case Write:
processWriteInstruction(ec);
break;
case SetFileName:
Data data = ec.getVariable(getInput1().getName());
if (data.getDataType() == DataType.MATRIX) {
if (getInput3().getName().equalsIgnoreCase("remote")) {
((MatrixObject) data).setFileName(getInput2().getName());
} else {
throw new DMLRuntimeException("Invalid location (" + getInput3().getName() + ") in SetFileName instruction: " + instString);
}
} else {
throw new DMLRuntimeException("Invalid data type (" + getInput1().getDataType() + ") in SetFileName instruction: " + instString);
}
break;
default:
throw new DMLRuntimeException("Unknown opcode: " + opcode);
}
}
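For reference, the frame cast paths above can be exercised directly against the FrameBlock API. The following is a minimal standalone sketch (the class name and sample value are illustrative; the FrameBlock calls are the ones used in the snippet):

import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.matrix.data.FrameBlock;

public class FrameCastSketch {
    public static void main(String[] args) {
        // build a 1x1 frame around a single value, as in the CastAsFrameVariable branch
        FrameBlock fb = new FrameBlock(1, ValueType.STRING);
        fb.ensureAllocatedColumns(1);
        fb.set(0, 0, "7.5");
        // read the cell back as a scalar after the same dimension check as CastAsScalarVariable
        if (fb.getNumRows() == 1 && fb.getNumColumns() == 1)
            System.out.println(fb.get(0, 0));
    }
}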
Use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
The class CheckpointSPInstruction, method processInstruction.
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// early abort on non-existing inputs: checkpoints are added for all read-only variables in loops,
// and due to unbounded scoping some of them might never exist in the symbol table
// (this is valid if the relevant branches are never entered)
if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
// add a dummy entry to the input, which will be immediately overwritten by the null output.
sec.setVariable(input1.getName(), new BooleanObject(false));
sec.setVariable(output.getName(), new BooleanObject(false));
return;
}
// Step 1: Early abort if the input is already cached in memory
// -------
// (for csv input files with unknown dimensions, we might have generated a checkpoint after
// csvreblock although not necessary because the csvreblock was subject to in-memory reblock)
CacheableData<?> obj = sec.getCacheableData(input1.getName());
if (obj.isCached(true)) {
// available in memory
sec.setVariable(output.getName(), obj);
return;
}
// get input rdd handle (for matrix or frame)
JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
// Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
// -------
// Note that persist is a transformation that is only triggered on demand by subsequent rdd operations.
// This prevents unnecessary overhead if the dataset is only consumed by cp operations.
JavaPairRDD<?, ?> out = null;
if (!in.getStorageLevel().equals(_level)) {
// (trigger coalesce if the actual number of partitions exceeds the intended number by more than 20%
// and the rdd is not hash partitioned, to avoid losing an existing partitioner)
int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
// checkpoint pre-processing rdd operations
if (coalesce) {
// merge partitions without shuffle if too many partitions
out = in.coalesce(numPartitions);
} else {
// apply a narrow shallow copy to allow for short-circuit collects
if (input1.getDataType() == DataType.MATRIX)
out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
else if (input1.getDataType() == DataType.FRAME)
out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
}
// convert mcsr into memory-efficient csr if potentially sparse
if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
}
// actual checkpoint into given storage level
out = out.persist(_level);
// explicitly compute the nnz of matrices that remain distributed in spark, because otherwise
// their nnz would never be evaluated due to lazy evaluation in spark
if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
}
} else {
// pass-through
out = in;
}
// Step 3: In-place update of input matrix/frame rdd handle and set as output
// -------
// We use this in-place approach for two reasons. First, it is correct because our checkpoint
// injection rewrites guarantee that after checkpoint instructions there are no consumers on the
// given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
// filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
// caching and subsequent collects. Note that in-place update requires us to explicitly handle
// lineage information in order to prevent cycles on cleanup.
CacheableData<?> cd = sec.getCacheableData(input1.getName());
// prevent unnecessary lineage info
if (out != in) {
RDDObject inro = cd.getRDDHandle(); // guaranteed to exist (see above)
// create new rdd object
RDDObject outro = new RDDObject(out);
// mark as checkpointed
outro.setCheckpointRDD(true);
// keep lineage to prevent cycles on cleanup
outro.addLineageChild(inro);
cd.setRDDHandle(outro);
}
sec.setVariable(output.getName(), cd);
}
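The storage-level guard above generalizes beyond SystemML: persist an RDD only when it is not already held at the target level, and rely on Spark's lazy evaluation to materialize it with the next action. A minimal sketch of that pattern (class and method names are illustrative):

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;

public class CheckpointSketch {
    // persist only if the rdd is not already stored at the requested level (mirrors Step 2 above)
    public static <K, V> JavaPairRDD<K, V> persistIfNeeded(JavaPairRDD<K, V> in, StorageLevel level) {
        if (!in.getStorageLevel().equals(level))
            return in.persist(level); // lazy: materialized by the next action on this rdd
        return in; // pass-through avoids redundant caching
    }
}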
Use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
The class FrameAppendRSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<Long, FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable(input2.getName());
JavaPairRDD<Long, FrameBlock> out = null;
long leftRows = sec.getMatrixCharacteristics(input1.getName()).getRows();
if (_cbind) {
JavaPairRDD<Long, FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
JavaPairRDD<Long, FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
} else {
// rbind
JavaPairRDD<Long, FrameBlock> right = in2.mapToPair(new ReduceSideAppendRowsFunction(leftRows));
out = in1.union(right);
}
// put output RDD handle into symbol table
updateBinaryAppendOutputMatrixCharacteristics(sec, _cbind);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
sec.addLineageRDD(output.getName(), input2.getName());
// update schema of output with merged input schemas
sec.getFrameObject(output.getName()).setSchema(sec.getFrameObject(input1.getName()).mergeSchemas(sec.getFrameObject(input2.getName())));
}
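Conceptually, the rbind branch shifts the row keys of the second input by the row count of the first and unions the two RDDs. A simplified sketch of that key shift (it ignores the block re-alignment that ReduceSideAppendRowsFunction additionally performs; names are illustrative):

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import scala.Tuple2;

public class RbindSketch {
    // naive rbind: offset the bottom input's row keys, then union with the top input
    public static JavaPairRDD<Long, FrameBlock> naiveRbind(JavaPairRDD<Long, FrameBlock> top,
            JavaPairRDD<Long, FrameBlock> bottom, long topRows) {
        JavaPairRDD<Long, FrameBlock> shifted =
            bottom.mapToPair(kv -> new Tuple2<>(kv._1() + topRows, kv._2()));
        return top.union(shifted);
    }
}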
Use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
The class ReblockSPInstruction, method processFrameReblockInstruction.
@SuppressWarnings("unchecked")
protected void processFrameReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
FrameObject fo = sec.getFrameObject(input1.getName());
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
if (iinfo == InputInfo.TextCellInputInfo) {
// get the input textcell rdd
JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
// convert textcell to binary block
JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, fo.getSchema());
// put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
} else if (iinfo == InputInfo.CSVInputInfo) {
// HACK ALERT: until we introduce the rewrite to insert csvrblock for non-persistent reads
// throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
CSVReblockSPInstruction csvInstruction = null;
boolean hasHeader = false;
String delim = ",";
boolean fill = false;
double fillValue = 0;
if (fo.getFileFormatProperties() instanceof CSVFileFormatProperties) {
CSVFileFormatProperties props = (CSVFileFormatProperties) fo.getFileFormatProperties();
hasHeader = props.hasHeader();
delim = props.getDelim();
fill = props.isFill();
fillValue = props.getFillValue();
}
csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
csvInstruction.processInstruction(sec);
} else {
throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction: " + InputInfo.inputInfoToString(iinfo));
}
}
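For reference, the text-cell representation consumed by textCellToBinaryBlock stores one cell per line as a 1-based row-column-value triple, following the IJV text-cell convention (the sample data below is made up):

1 1 hello
1 2 7.5
2 1 world
2 2 3.0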
Use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
The class WriteSPInstruction, method processFrameWriteInstruction.
@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema) throws IOException {
// get input rdd
JavaPairRDD<Long, FrameBlock> in1 = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
if (oi == OutputInfo.TextCellOutputInfo) {
JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToTextCell(in1, mc);
customSaveTextFile(out, fname, false);
} else if (oi == OutputInfo.CSVOutputInfo) {
CSVFileFormatProperties props = (formatProperties != null) ? (CSVFileFormatProperties) formatProperties : null;
JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToCsv(in1, mc, props, true);
customSaveTextFile(out, fname, false);
} else if (oi == OutputInfo.BinaryBlockOutputInfo) {
JavaPairRDD<LongWritable, FrameBlock> out = in1.mapToPair(new LongFrameToLongWritableFrameFunction());
out.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
} else {
// unsupported formats: binarycell (not externalized)
throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
}
// write meta data file
MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, mc, oi, formatProperties);
}
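The binary-block branch reduces to a key conversion plus saveAsHadoopFile. In isolation it might look like the sketch below (class and method names are illustrative; LongFrameToLongWritableFrameFunction above performs the same key mapping):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import scala.Tuple2;

public class FrameWriteSketch {
    // wrap Long keys as hadoop LongWritable and store frame blocks in a sequence file
    public static void writeBinaryBlock(JavaPairRDD<Long, FrameBlock> frame, String fname) {
        frame.mapToPair(kv -> new Tuple2<>(new LongWritable(kv._1()), kv._2()))
            .saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
    }
}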