use of org.apache.sysml.runtime.transform.encode.Encoder in project systemml by apache.
the class TransformApplyEmptyRecodeMapTest method testTransformApplyEmptyRecodeMap.
@Test
public void testTransformApplyEmptyRecodeMap() {
try {
// generate input data
FrameBlock data = DataConverter.convertToFrameBlock(DataConverter.convertToMatrixBlock(getRandomMatrix(rows, cols, 1, 1, 1, 7)));
FrameBlock meta = new FrameBlock(new ValueType[] { ValueType.STRING }, new String[] { "C1" });
// execute transform apply
Encoder encoder = EncoderFactory.createEncoder("{ids:true, recode:[1]}", data.getColumnNames(), meta.getSchema(), meta);
MatrixBlock out = encoder.apply(data, new MatrixBlock(rows, cols, true));
// check outputs
Assert.assertEquals(rows, out.getNumRows());
Assert.assertEquals(cols, out.getNumColumns());
for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) Assert.assertTrue(Double.isNaN(out.quickGetValue(i, j)));
} catch (DMLRuntimeException e) {
throw new RuntimeException(e);
}
}
use of org.apache.sysml.runtime.transform.encode.Encoder in project systemml by apache.
the class ParameterizedBuiltinSPInstruction method processInstruction.
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
String opcode = getOpcode();
// opcode guaranteed to be a valid opcode (see parsing)
if (opcode.equalsIgnoreCase("mapgroupedagg")) {
// get input rdd handle
String targetVar = params.get(Statement.GAGG_TARGET);
String groupsVar = params.get(Statement.GAGG_GROUPS);
JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(targetVar);
PartitionedBroadcast<MatrixBlock> groups = sec.getBroadcastForVariable(groupsVar);
MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(targetVar);
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
CPOperand ngrpOp = new CPOperand(params.get(Statement.GAGG_NUM_GROUPS));
int ngroups = (int) sec.getScalarInput(ngrpOp.getName(), ngrpOp.getValueType(), ngrpOp.isLiteral()).getLongValue();
// single-block aggregation
if (ngroups <= mc1.getRowsPerBlock() && mc1.getCols() <= mc1.getColsPerBlock()) {
// execute map grouped aggregate
JavaRDD<MatrixBlock> out = target.map(new RDDMapGroupedAggFunction2(groups, _optr, ngroups));
MatrixBlock out2 = RDDAggregateUtils.sumStable(out);
// put output block into symbol table (no lineage because single block)
// this also includes implicit maintenance of matrix characteristics
sec.setMatrixOutput(output.getName(), out2, getExtendedOpcode());
} else // multi-block aggregation
{
// execute map grouped aggregate
JavaPairRDD<MatrixIndexes, MatrixBlock> out = target.flatMapToPair(new RDDMapGroupedAggFunction(groups, _optr, ngroups, mc1.getRowsPerBlock(), mc1.getColsPerBlock()));
out = RDDAggregateUtils.sumByKeyStable(out, false);
// updated characteristics and handle outputs
mcOut.set(ngroups, mc1.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock(), -1);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), targetVar);
sec.addLineageBroadcast(output.getName(), groupsVar);
}
} else if (opcode.equalsIgnoreCase("groupedagg")) {
boolean broadcastGroups = Boolean.parseBoolean(params.get("broadcast"));
// get input rdd handle
String groupsVar = params.get(Statement.GAGG_GROUPS);
JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(params.get(Statement.GAGG_TARGET));
JavaPairRDD<MatrixIndexes, MatrixBlock> groups = broadcastGroups ? null : sec.getBinaryBlockRDDHandleForVariable(groupsVar);
JavaPairRDD<MatrixIndexes, MatrixBlock> weights = null;
MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(params.get(Statement.GAGG_TARGET));
MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(groupsVar);
if (mc1.dimsKnown() && mc2.dimsKnown() && (mc1.getRows() != mc2.getRows() || mc2.getCols() != 1)) {
throw new DMLRuntimeException("Grouped Aggregate dimension mismatch between target and groups.");
}
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
JavaPairRDD<MatrixIndexes, WeightedCell> groupWeightedCells = null;
// Step 1: First extract groupWeightedCells from group, target and weights
if (params.get(Statement.GAGG_WEIGHTS) != null) {
weights = sec.getBinaryBlockRDDHandleForVariable(params.get(Statement.GAGG_WEIGHTS));
MatrixCharacteristics mc3 = sec.getMatrixCharacteristics(params.get(Statement.GAGG_WEIGHTS));
if (mc1.dimsKnown() && mc3.dimsKnown() && (mc1.getRows() != mc3.getRows() || mc1.getCols() != mc3.getCols())) {
throw new DMLRuntimeException("Grouped Aggregate dimension mismatch between target, groups, and weights.");
}
groupWeightedCells = groups.join(target).join(weights).flatMapToPair(new ExtractGroupNWeights());
} else // input vector or matrix
{
String ngroupsStr = params.get(Statement.GAGG_NUM_GROUPS);
long ngroups = (ngroupsStr != null) ? (long) Double.parseDouble(ngroupsStr) : -1;
// execute basic grouped aggregate (extract and preagg)
if (broadcastGroups) {
PartitionedBroadcast<MatrixBlock> pbm = sec.getBroadcastForVariable(groupsVar);
groupWeightedCells = target.flatMapToPair(new ExtractGroupBroadcast(pbm, mc1.getColsPerBlock(), ngroups, _optr));
} else {
// replicate groups if necessary
if (mc1.getNumColBlocks() > 1) {
groups = groups.flatMapToPair(new ReplicateVectorFunction(false, mc1.getNumColBlocks()));
}
groupWeightedCells = groups.join(target).flatMapToPair(new ExtractGroupJoin(mc1.getColsPerBlock(), ngroups, _optr));
}
}
// Step 2: Make sure we have brlen required while creating <MatrixIndexes, MatrixCell>
if (mc1.getRowsPerBlock() == -1) {
throw new DMLRuntimeException("The block sizes are not specified for grouped aggregate");
}
int brlen = mc1.getRowsPerBlock();
// Step 3: Now perform grouped aggregate operation (either on combiner side or reducer side)
JavaPairRDD<MatrixIndexes, MatrixCell> out = null;
if (_optr instanceof CMOperator && ((CMOperator) _optr).isPartialAggregateOperator() || _optr instanceof AggregateOperator) {
out = groupWeightedCells.reduceByKey(new PerformGroupByAggInCombiner(_optr)).mapValues(new CreateMatrixCell(brlen, _optr));
} else {
// Use groupby key because partial aggregation is not supported
out = groupWeightedCells.groupByKey().mapValues(new PerformGroupByAggInReducer(_optr)).mapValues(new CreateMatrixCell(brlen, _optr));
}
// Step 4: Set output characteristics and rdd handle
setOutputCharacteristicsForGroupedAgg(mc1, mcOut, out);
// store output rdd handle
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), params.get(Statement.GAGG_TARGET));
sec.addLineage(output.getName(), groupsVar, broadcastGroups);
if (params.get(Statement.GAGG_WEIGHTS) != null) {
sec.addLineageRDD(output.getName(), params.get(Statement.GAGG_WEIGHTS));
}
} else if (opcode.equalsIgnoreCase("rmempty")) {
String rddInVar = params.get("target");
String rddOffVar = params.get("offset");
boolean rows = sec.getScalarInput(params.get("margin"), ValueType.STRING, true).getStringValue().equals("rows");
boolean emptyReturn = Boolean.parseBoolean(params.get("empty.return").toLowerCase());
long maxDim = sec.getScalarInput(params.get("maxdim"), ValueType.DOUBLE, false).getLongValue();
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddInVar);
if (// default case
maxDim > 0) {
// get input rdd handle
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(rddInVar);
JavaPairRDD<MatrixIndexes, MatrixBlock> off;
PartitionedBroadcast<MatrixBlock> broadcastOff;
long brlen = mcIn.getRowsPerBlock();
long bclen = mcIn.getColsPerBlock();
long numRep = (long) Math.ceil(rows ? (double) mcIn.getCols() / bclen : (double) mcIn.getRows() / brlen);
// execute remove empty rows/cols operation
JavaPairRDD<MatrixIndexes, MatrixBlock> out;
if (_bRmEmptyBC) {
broadcastOff = sec.getBroadcastForVariable(rddOffVar);
// Broadcast offset vector
out = in.flatMapToPair(new RDDRemoveEmptyFunctionInMem(rows, maxDim, brlen, bclen, broadcastOff));
} else {
off = sec.getBinaryBlockRDDHandleForVariable(rddOffVar);
out = in.join(off.flatMapToPair(new ReplicateVectorFunction(!rows, numRep))).flatMapToPair(new RDDRemoveEmptyFunction(rows, maxDim, brlen, bclen));
}
out = RDDAggregateUtils.mergeByKey(out, false);
// store output rdd handle
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), rddInVar);
if (!_bRmEmptyBC)
sec.addLineageRDD(output.getName(), rddOffVar);
else
sec.addLineageBroadcast(output.getName(), rddOffVar);
// update output statistics (required for correctness)
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
mcOut.set(rows ? maxDim : mcIn.getRows(), rows ? mcIn.getCols() : maxDim, (int) brlen, (int) bclen, mcIn.getNonZeros());
} else // special case: empty output (ensure valid dims)
{
int n = emptyReturn ? 1 : 0;
MatrixBlock out = new MatrixBlock(rows ? n : (int) mcIn.getRows(), rows ? (int) mcIn.getCols() : n, true);
sec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
}
} else if (opcode.equalsIgnoreCase("replace")) {
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(params.get("target"));
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target"));
// execute replace operation
double pattern = Double.parseDouble(params.get("pattern"));
double replacement = Double.parseDouble(params.get("replacement"));
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.mapValues(new RDDReplaceFunction(pattern, replacement));
// store output rdd handle
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), params.get("target"));
// update output statistics (required for correctness)
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
mcOut.set(mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock(), (pattern != 0 && replacement != 0) ? mcIn.getNonZeros() : -1);
} else if (opcode.equalsIgnoreCase("lowertri") || opcode.equalsIgnoreCase("uppertri")) {
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(params.get("target"));
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target"));
boolean lower = opcode.equalsIgnoreCase("lowertri");
boolean diag = Boolean.parseBoolean(params.get("diag"));
boolean values = Boolean.parseBoolean(params.get("values"));
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.mapPartitionsToPair(new RDDExtractTriangularFunction(lower, diag, values), true);
// store output rdd handle
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), params.get("target"));
// update output statistics (required for correctness)
sec.getMatrixCharacteristics(output.getName()).setDimension(mcIn.getRows(), mcIn.getCols());
} else if (opcode.equalsIgnoreCase("rexpand")) {
String rddInVar = params.get("target");
// get input rdd handle
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(rddInVar);
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddInVar);
double maxVal = Double.parseDouble(params.get("max"));
long lmaxVal = UtilFunctions.toLong(maxVal);
boolean dirRows = params.get("dir").equals("rows");
boolean cast = Boolean.parseBoolean(params.get("cast"));
boolean ignore = Boolean.parseBoolean(params.get("ignore"));
long brlen = mcIn.getRowsPerBlock();
long bclen = mcIn.getColsPerBlock();
// repartition input vector for higher degree of parallelism
// (avoid scenarios where few input partitions create huge outputs)
MatrixCharacteristics mcTmp = new MatrixCharacteristics(dirRows ? lmaxVal : mcIn.getRows(), dirRows ? mcIn.getRows() : lmaxVal, (int) brlen, (int) bclen, mcIn.getRows());
int numParts = (int) Math.min(SparkUtils.getNumPreferredPartitions(mcTmp, in), mcIn.getNumBlocks());
if (numParts > in.getNumPartitions() * 2)
in = in.repartition(numParts);
// execute rexpand rows/cols operation (no shuffle required because outputs are
// block-aligned with the input, i.e., one input block generates n output blocks)
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.flatMapToPair(new RDDRExpandFunction(maxVal, dirRows, cast, ignore, brlen, bclen));
// store output rdd handle
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), rddInVar);
// update output statistics (required for correctness)
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
mcOut.set(dirRows ? lmaxVal : mcIn.getRows(), dirRows ? mcIn.getRows() : lmaxVal, (int) brlen, (int) bclen, -1);
} else if (opcode.equalsIgnoreCase("transformapply")) {
// get input RDD and meta data
FrameObject fo = sec.getFrameObject(params.get("target"));
JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
FrameBlock meta = sec.getFrameInput(params.get("meta"));
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target"));
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
String[] colnames = !TfMetaUtils.isIDSpec(params.get("spec")) ? in.lookup(1L).get(0).getColumnNames() : null;
// compute omit offset map for block shifts
TfOffsetMap omap = null;
if (TfMetaUtils.containsOmitSpec(params.get("spec"), colnames)) {
omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(params.get("spec"), colnames)).collect()));
}
// create encoder broadcast (avoiding replication per task)
Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
// execute transform apply
JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
// set output and maintain lineage/output characteristics
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), params.get("target"));
ec.releaseFrameInput(params.get("meta"));
} else if (opcode.equalsIgnoreCase("transformdecode")) {
// get input RDD and meta data
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(params.get("target"));
MatrixCharacteristics mc = sec.getMatrixCharacteristics(params.get("target"));
FrameBlock meta = sec.getFrameInput(params.get("meta"));
String[] colnames = meta.getColumnNames();
// reblock if necessary (clen > bclen)
if (mc.getCols() > mc.getNumColBlocks()) {
in = in.mapToPair(new RDDTransformDecodeExpandFunction((int) mc.getCols(), mc.getColsPerBlock()));
in = RDDAggregateUtils.mergeByKey(in, false);
}
// construct decoder and decode individual matrix blocks
Decoder decoder = DecoderFactory.createDecoder(params.get("spec"), colnames, null, meta);
JavaPairRDD<Long, FrameBlock> out = in.mapToPair(new RDDTransformDecodeFunction(decoder, mc.getRowsPerBlock()));
// set output and maintain lineage/output characteristics
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), params.get("target"));
ec.releaseFrameInput(params.get("meta"));
sec.getMatrixCharacteristics(output.getName()).set(mc.getRows(), meta.getNumColumns(), mc.getRowsPerBlock(), mc.getColsPerBlock(), -1);
sec.getFrameObject(output.getName()).setSchema(decoder.getSchema());
} else {
throw new DMLRuntimeException("Unknown parameterized builtin opcode: " + opcode);
}
}
use of org.apache.sysml.runtime.transform.encode.Encoder in project incubator-systemml by apache.
the class ParameterizedBuiltinCPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
String opcode = getOpcode();
ScalarObject sores = null;
if (opcode.equalsIgnoreCase("cdf")) {
SimpleOperator op = (SimpleOperator) _optr;
double result = op.fn.execute(params);
sores = new DoubleObject(result);
ec.setScalarOutput(output.getName(), sores);
} else if (opcode.equalsIgnoreCase("invcdf")) {
SimpleOperator op = (SimpleOperator) _optr;
double result = op.fn.execute(params);
sores = new DoubleObject(result);
ec.setScalarOutput(output.getName(), sores);
} else if (opcode.equalsIgnoreCase("groupedagg")) {
// acquire locks
MatrixBlock target = ec.getMatrixInput(params.get(Statement.GAGG_TARGET), getExtendedOpcode());
MatrixBlock groups = ec.getMatrixInput(params.get(Statement.GAGG_GROUPS), getExtendedOpcode());
MatrixBlock weights = null;
if (params.get(Statement.GAGG_WEIGHTS) != null)
weights = ec.getMatrixInput(params.get(Statement.GAGG_WEIGHTS), getExtendedOpcode());
int ngroups = -1;
if (params.get(Statement.GAGG_NUM_GROUPS) != null) {
ngroups = (int) Double.parseDouble(params.get(Statement.GAGG_NUM_GROUPS));
}
// compute the result
// num threads
int k = Integer.parseInt(params.get("k"));
MatrixBlock soresBlock = groups.groupedAggOperations(target, weights, new MatrixBlock(), ngroups, _optr, k);
ec.setMatrixOutput(output.getName(), soresBlock, getExtendedOpcode());
// release locks
target = groups = weights = null;
ec.releaseMatrixInput(params.get(Statement.GAGG_TARGET), getExtendedOpcode());
ec.releaseMatrixInput(params.get(Statement.GAGG_GROUPS), getExtendedOpcode());
if (params.get(Statement.GAGG_WEIGHTS) != null)
ec.releaseMatrixInput(params.get(Statement.GAGG_WEIGHTS), getExtendedOpcode());
} else if (opcode.equalsIgnoreCase("rmempty")) {
String margin = params.get("margin");
if (!(margin.equals("rows") || margin.equals("cols")))
throw new DMLRuntimeException("Unspupported margin identifier '" + margin + "'.");
// acquire locks
MatrixBlock target = ec.getMatrixInput(params.get("target"), getExtendedOpcode());
MatrixBlock select = params.containsKey("select") ? ec.getMatrixInput(params.get("select"), getExtendedOpcode()) : null;
// compute the result
boolean emptyReturn = Boolean.parseBoolean(params.get("empty.return").toLowerCase());
MatrixBlock soresBlock = target.removeEmptyOperations(new MatrixBlock(), margin.equals("rows"), emptyReturn, select);
// release locks
ec.setMatrixOutput(output.getName(), soresBlock, getExtendedOpcode());
ec.releaseMatrixInput(params.get("target"), getExtendedOpcode());
if (params.containsKey("select"))
ec.releaseMatrixInput(params.get("select"), getExtendedOpcode());
} else if (opcode.equalsIgnoreCase("replace")) {
// acquire locks
MatrixBlock target = ec.getMatrixInput(params.get("target"), getExtendedOpcode());
// compute the result
double pattern = Double.parseDouble(params.get("pattern"));
double replacement = Double.parseDouble(params.get("replacement"));
MatrixBlock ret = (MatrixBlock) target.replaceOperations(new MatrixBlock(), pattern, replacement);
// release locks
ec.setMatrixOutput(output.getName(), ret, getExtendedOpcode());
ec.releaseMatrixInput(params.get("target"), getExtendedOpcode());
} else if (opcode.equalsIgnoreCase("rexpand")) {
// acquire locks
MatrixBlock target = ec.getMatrixInput(params.get("target"), getExtendedOpcode());
// compute the result
double maxVal = Double.parseDouble(params.get("max"));
boolean dirVal = params.get("dir").equals("rows");
boolean cast = Boolean.parseBoolean(params.get("cast"));
boolean ignore = Boolean.parseBoolean(params.get("ignore"));
int numThreads = Integer.parseInt(params.get("k"));
MatrixBlock ret = (MatrixBlock) target.rexpandOperations(new MatrixBlock(), maxVal, dirVal, cast, ignore, numThreads);
// release locks
ec.setMatrixOutput(output.getName(), ret, getExtendedOpcode());
ec.releaseMatrixInput(params.get("target"), getExtendedOpcode());
} else if (opcode.equalsIgnoreCase("transformapply")) {
// acquire locks
FrameBlock data = ec.getFrameInput(params.get("target"));
FrameBlock meta = ec.getFrameInput(params.get("meta"));
String[] colNames = data.getColumnNames();
// compute transformapply
Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colNames, data.getNumColumns(), meta);
MatrixBlock mbout = encoder.apply(data, new MatrixBlock(data.getNumRows(), data.getNumColumns(), false));
// release locks
ec.setMatrixOutput(output.getName(), mbout, getExtendedOpcode());
ec.releaseFrameInput(params.get("target"));
ec.releaseFrameInput(params.get("meta"));
} else if (opcode.equalsIgnoreCase("transformdecode")) {
// acquire locks
MatrixBlock data = ec.getMatrixInput(params.get("target"), getExtendedOpcode());
FrameBlock meta = ec.getFrameInput(params.get("meta"));
String[] colnames = meta.getColumnNames();
// compute transformdecode
Decoder decoder = DecoderFactory.createDecoder(getParameterMap().get("spec"), colnames, null, meta);
FrameBlock fbout = decoder.decode(data, new FrameBlock(decoder.getSchema()));
fbout.setColumnNames(Arrays.copyOfRange(colnames, 0, fbout.getNumColumns()));
// release locks
ec.setFrameOutput(output.getName(), fbout);
ec.releaseMatrixInput(params.get("target"), getExtendedOpcode());
ec.releaseFrameInput(params.get("meta"));
} else if (opcode.equalsIgnoreCase("transformcolmap")) {
// acquire locks
FrameBlock meta = ec.getFrameInput(params.get("target"));
String[] colNames = meta.getColumnNames();
// compute transformapply
Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colNames, meta.getNumColumns(), null);
MatrixBlock mbout = encoder.getColMapping(meta, new MatrixBlock(meta.getNumColumns(), 3, false));
// release locks
ec.setMatrixOutput(output.getName(), mbout, getExtendedOpcode());
ec.releaseFrameInput(params.get("target"));
} else if (opcode.equalsIgnoreCase("transformmeta")) {
// get input spec and path
String spec = getParameterMap().get("spec");
String path = getParameterMap().get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_MTD);
String delim = getParameterMap().containsKey("sep") ? getParameterMap().get("sep") : TfUtils.TXMTD_SEP;
// execute transform meta data read
FrameBlock meta = null;
try {
meta = TfMetaUtils.readTransformMetaDataFromFile(spec, path, delim);
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// release locks
ec.setFrameOutput(output.getName(), meta);
} else if (opcode.equalsIgnoreCase("toString")) {
// handle input parameters
int rows = (getParam("rows") != null) ? Integer.parseInt(getParam("rows")) : TOSTRING_MAXROWS;
int cols = (getParam("cols") != null) ? Integer.parseInt(getParam("cols")) : TOSTRING_MAXCOLS;
int decimal = (getParam("decimal") != null) ? Integer.parseInt(getParam("decimal")) : TOSTRING_DECIMAL;
boolean sparse = (getParam("sparse") != null) ? Boolean.parseBoolean(getParam("sparse")) : TOSTRING_SPARSE;
String separator = (getParam("sep") != null) ? getParam("sep") : TOSTRING_SEPARATOR;
String lineseparator = (getParam("linesep") != null) ? getParam("linesep") : TOSTRING_LINESEPARATOR;
// get input matrix/frame and convert to string
CacheableData<?> data = ec.getCacheableData(getParam("target"));
String out = null;
if (data instanceof MatrixObject) {
MatrixBlock matrix = (MatrixBlock) data.acquireRead();
warnOnTrunction(matrix, rows, cols);
out = DataConverter.toString(matrix, sparse, separator, lineseparator, rows, cols, decimal);
} else if (data instanceof FrameObject) {
FrameBlock frame = (FrameBlock) data.acquireRead();
warnOnTrunction(frame, rows, cols);
out = DataConverter.toString(frame, sparse, separator, lineseparator, rows, cols, decimal);
} else {
throw new DMLRuntimeException("toString only converts matrix or frames to string");
}
ec.releaseCacheableData(getParam("target"));
ec.setScalarOutput(output.getName(), new StringObject(out));
} else {
throw new DMLRuntimeException("Unknown opcode : " + opcode);
}
}
use of org.apache.sysml.runtime.transform.encode.Encoder in project systemml by apache.
the class MultiReturnParameterizedBuiltinCPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
// obtain and pin input frame
FrameBlock fin = ec.getFrameInput(input1.getName());
String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue();
String[] colnames = fin.getColumnNames();
// execute block transform encode
Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fin.getNumColumns(), null);
// build and apply
MatrixBlock data = encoder.encode(fin, new MatrixBlock(fin.getNumRows(), fin.getNumColumns(), false));
FrameBlock meta = encoder.getMetaData(new FrameBlock(fin.getNumColumns(), ValueType.STRING));
meta.setColumnNames(colnames);
// release input and outputs
ec.releaseFrameInput(input1.getName());
ec.setMatrixOutput(getOutput(0).getName(), data, getExtendedOpcode());
ec.setFrameOutput(getOutput(1).getName(), meta);
}
use of org.apache.sysml.runtime.transform.encode.Encoder in project systemml by apache.
the class MultiReturnParameterizedBuiltinSPInstruction method processInstruction.
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
try {
// get input RDD and meta data
FrameObject fo = sec.getFrameObject(input1.getName());
FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue();
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null;
// step 1: build transform meta data
Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), null);
MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext());
JavaRDD<String> rcMaps = in.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)).distinct().groupByKey().flatMap(new TransformEncodeGroupFunction(accMax));
if (containsMVImputeEncoder(encoderBuild)) {
EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
rcMaps = rcMaps.union(in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)).groupByKey().flatMap(new TransformEncodeGroup2Function(mva)));
}
// trigger eval
rcMaps.saveAsTextFile(fometa.getFileName());
// consolidate meta data frame (reuse multi-threaded reader, special handling missing values)
FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
// recompute num distinct items per column
meta.recomputeColumnCardinality();
meta.setColumnNames((colnames != null) ? colnames : meta.getColumnNames());
// step 2: transform apply (similar to spark transformapply)
// compute omit offset map for block shifts
TfOffsetMap omap = null;
if (TfMetaUtils.containsOmitSpec(spec, colnames)) {
omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
}
// create encoder broadcast (avoiding replication per task)
Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
// execute transform apply
JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
// set output and maintain lineage/output characteristics
sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
sec.setFrameOutput(_outputs.get(1).getName(), meta);
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
Aggregations