Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
The class DataTransform, method spDataTransform.
public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
    JobConf job = new JobConf();
    FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
    checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
    // find the first file in alphabetical ordering of part files in directory inputPath
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names and construct output header
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    int numColumns = colNamesToIds.size();
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    // construct RDD for input data
    @SuppressWarnings("unchecked")
    JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
    JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
    long numRowsTf = 0, numColumnsTf = 0;
    JavaPairRDD<Long, String> tfPairRDD = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // build transformation metadata, including recode maps, bin definitions, etc.;
        // also generate the part-offsets file (counters file), used in csv-reblock (if needed)
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
        // store specWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to the specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // apply transformation metadata and perform the actual transformation
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
    }
    // copy auxiliary data (old and new header lines) from the temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // convert to csv output format (serialized LongWritable/Text)
    JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
    if (outtfPairRDD != null) {
        MatrixObject outMO = outputs[0];
        String outVar = outMO.getVarName();
        outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
        sec.addLineageRDD(outVar, inst.getParams().get("target"));
        // update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
        mcOut.setDimension(numRowsTf, numColumnsTf);
        mcOut.setNonZeros(-1);
    }
}
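Note that the header is read from a single file only: CSVReblockMR.findSmallestFile returns the alphabetically first part file under the input path, and that file supplies the column names. A minimal local-filesystem analogue of that selection (the class and its main are hypothetical, purely for illustration):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;

public class SmallestPartFile {
    // analogue of CSVReblockMR.findSmallestFile on a local directory:
    // return the name of the lexicographically first regular file
    static String findSmallestFile(Path dir) throws IOException {
        try (Stream<Path> files = Files.list(dir)) {
            return files.filter(Files::isRegularFile)
                .map(p -> p.getFileName().toString())
                .sorted()
                .findFirst()
                .orElseThrow(() -> new IOException("empty directory: " + dir));
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(findSmallestFile(Paths.get(args[0])));
    }
}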
Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
The class DataPartitioner, method createPartitionedMatrixObject.
public MatrixObject createPartitionedMatrixObject(MatrixObject in, String fnameNew, boolean force) throws DMLRuntimeException {
    ValueType vt = in.getValueType();
    String varname = in.getVarName();
    // create the output matrix object with the same value type, backed by the new file name
    MatrixObject out = new MatrixObject(vt, fnameNew);
    out.setDataType(DataType.MATRIX);
    out.setVarName(varname + NAME_SUFFIX);
    return createPartitionedMatrixObject(in, out, force);
}
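A hedged caller sketch for the method above: deriving a partitioned MatrixObject from an existing one. The wrapper class, the target file name, and how 'partitioner' and 'in' are obtained are assumptions; in SystemML this is driven by the parfor runtime with a concrete DataPartitioner implementation:

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.parfor.DataPartitioner;

public class PartitionSketch {
    // 'partitioner' and 'in' must be supplied by the surrounding runtime code
    static MatrixObject partition(DataPartitioner partitioner, MatrixObject in) throws DMLRuntimeException {
        String fnameNew = in.getFileName() + "_partitioned"; // hypothetical target file name
        // per the method above: the result reuses in's value type and
        // gets the variable name in.getVarName() + NAME_SUFFIX
        return partitioner.createPartitionedMatrixObject(in, fnameNew, false);
    }
}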
Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
The class MLContextConversionUtil, method dataFrameToMatrixObject.
/**
* Convert a {@code DataFrame} to a {@code MatrixObject}.
*
* @param variableName
* name of the variable associated with the matrix
* @param dataFrame
* the Spark {@code DataFrame}
* @param matrixMetadata
* the matrix metadata
* @return the {@code DataFrame} converted to a {@code MatrixObject}
*/
public static MatrixObject dataFrameToMatrixObject(String variableName, Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(variableName, binaryBlock, matrixMetadata, false);
    // keep lineage of the original dataset to allow bypassing the binary-block conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame, variableName, isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}
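For context, the public MLContext API reaches this conversion when a DataFrame is bound as a script input. A minimal sketch, assuming a local Spark session and the SystemML jar on the classpath (the 2x2 data and the one-line DML script are made up for illustration):

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.sysml.api.mlcontext.MLContext;
import org.apache.sysml.api.mlcontext.MatrixMetadata;
import org.apache.sysml.api.mlcontext.Script;
import org.apache.sysml.api.mlcontext.ScriptFactory;

public class DataFrameInputSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("df-to-mo").master("local").getOrCreate();
        // build a tiny 2x2 DataFrame of doubles
        List<Row> rows = Arrays.asList(RowFactory.create(1.0, 2.0), RowFactory.create(3.0, 4.0));
        StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("C1", DataTypes.DoubleType, false),
            DataTypes.createStructField("C2", DataTypes.DoubleType, false) });
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        MLContext ml = new MLContext(spark);
        // known dimensions let SystemML skip a counting pass over the data
        MatrixMetadata mm = new MatrixMetadata(2, 2);
        Script s = ScriptFactory.dml("print(sum(X));").in("X", df, mm);
        ml.execute(s); // dataFrameToMatrixObject runs internally during input binding
    }
}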
Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
The class MLContextConversionUtil, method binaryBlocksToMatrixObject.
private static MatrixObject binaryBlocksToMatrixObject(String variableName, JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks, MatrixMetadata matrixMetadata, boolean copy) {
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<MatrixIndexes, MatrixBlock> javaPairRdd = SparkUtils.copyBinaryBlockMatrix(binaryBlocks, copy);
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(),
        new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
    matrixObject.setRDDHandle(new RDDObject(javaPairRdd, variableName));
    return matrixObject;
}
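The metadata triple constructed above determines how the MatrixObject is read and written if it ever spills to HDFS. A standalone sketch of building that triple (the 10000x100 dimensions and 1000x1000 block size are made-up values; nnz = -1 means the number of non-zeros is unknown):

import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;

public class BinaryBlockMetaSketch {
    public static void main(String[] args) {
        // 10000 x 100 matrix in 1000 x 1000 blocks, non-zero count unknown
        MatrixCharacteristics mc = new MatrixCharacteristics(10000, 100, 1000, 1000, -1);
        MatrixFormatMetaData meta = new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
        System.out.println(meta.getMatrixCharacteristics());
    }
}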
Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.
The class ExternalFunctionProgramBlock, method verifyAndAttachOutputs.
/**
 * Method to verify that function outputs match the declared outputs
 *
 * @param ec execution context
 * @param returnFunc package function
 * @param outputParams output parameters
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
protected void verifyAndAttachOutputs(ExecutionContext ec, PackageFunction returnFunc, String outputParams) throws DMLRuntimeException {
    ArrayList<String> outputs = getParameters(outputParams);
    if (outputs.size() != returnFunc.getNumFunctionOutputs()) {
        throw new DMLRuntimeException("Number of function outputs (" + returnFunc.getNumFunctionOutputs() + ") " + "does not match with declaration (" + outputs.size() + ").");
    }
    // iterate over each output and verify that the type matches
    for (int i = 0; i < outputs.size(); i++) {
        StringTokenizer tk = new StringTokenizer(outputs.get(i), ":");
        ArrayList<String> tokens = new ArrayList<String>();
        while (tk.hasMoreTokens()) {
            tokens.add(tk.nextToken());
        }
        if (returnFunc.getFunctionOutput(i).getType() == FunctionParameterType.Matrix) {
            Matrix m = (Matrix) returnFunc.getFunctionOutput(i);
            if (!(tokens.get(0).equals(getFunctionParameterDataTypeString(FunctionParameterType.Matrix))) || !(tokens.get(2).equals(getMatrixValueTypeString(m.getValueType())))) {
                throw new DMLRuntimeException("Function output '" + outputs.get(i) + "' does not match with declaration.");
            }
            // add result to variable mapping
            String varName = tokens.get(1);
            MatrixObject newVar = createOutputMatrixObject(m);
            newVar.setVarName(varName);
            //getVariables().put(varName, newVar); //put/override in local symbol table
            ec.setVariable(varName, newVar);
            continue;
        }
        if (returnFunc.getFunctionOutput(i).getType() == FunctionParameterType.Scalar) {
            Scalar s = (Scalar) returnFunc.getFunctionOutput(i);
            if (!tokens.get(0).equals(getFunctionParameterDataTypeString(FunctionParameterType.Scalar)) || !tokens.get(2).equals(getScalarValueTypeString(s.getScalarType()))) {
                throw new DMLRuntimeException("Function output '" + outputs.get(i) + "' does not match with declaration.");
            }
            // allocate and set the appropriate object based on type
            ScalarObject scalarObject = null;
            ScalarValueType type = s.getScalarType();
            switch (type) {
                case Integer:
                    scalarObject = new IntObject(tokens.get(1), Long.parseLong(s.getValue()));
                    break;
                case Double:
                    scalarObject = new DoubleObject(tokens.get(1), Double.parseDouble(s.getValue()));
                    break;
                case Boolean:
                    scalarObject = new BooleanObject(tokens.get(1), Boolean.parseBoolean(s.getValue()));
                    break;
                case Text:
                    scalarObject = new StringObject(tokens.get(1), s.getValue());
                    break;
                default:
                    throw new DMLRuntimeException("Unknown scalar value type '" + type + "' of output '" + outputs.get(i) + "'.");
            }
            //this.getVariables().put(tokens.get(1), scalarObject);
            ec.setVariable(tokens.get(1), scalarObject);
            continue;
        }
        if (returnFunc.getFunctionOutput(i).getType() == FunctionParameterType.Object) {
            if (!tokens.get(0).equals(getFunctionParameterDataTypeString(FunctionParameterType.Object))) {
                throw new DMLRuntimeException("Function output '" + outputs.get(i) + "' does not match with declaration.");
            }
            throw new DMLRuntimeException("Object types not yet supported");
            // continue;
        }
        throw new DMLRuntimeException("Unknown data type '" + returnFunc.getFunctionOutput(i).getType() + "' " + "of output '" + outputs.get(i) + "'.");
    }
}
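Each declared output arrives as a colon-separated triple that the tokenizer above splits into data type, variable name, and value type. A self-contained illustration of that parsing (the literal "Matrix:B:Double" is a hypothetical declaration; the exact type strings come from getFunctionParameterDataTypeString and getMatrixValueTypeString, which are not shown here):

import java.util.StringTokenizer;

public class OutputSpecParsing {
    public static void main(String[] args) {
        // hypothetical declaration: <data type>:<variable name>:<value type>
        String decl = "Matrix:B:Double";
        StringTokenizer tk = new StringTokenizer(decl, ":");
        String dataType = tk.nextToken();  // compared against FunctionParameterType strings
        String varName = tk.nextToken();   // the key later passed to ec.setVariable(...)
        String valueType = tk.nextToken(); // compared against the function output's value type
        System.out.println(dataType + " / " + varName + " / " + valueType);
    }
}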