Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache incubator-systemml project.
Class DataTransform, method spDataTransform.
/**
 * Spark variant of the data transform on a CSV frame input.
 *
 * Depending on {@code oprnds.isApply}, this either generates new transformation
 * metadata from the input and then applies it, or copies existing metadata from
 * {@code applyTxPath} to {@code txMtdPath} and applies that. The transformed RDD
 * is attached to {@code outputs[0]} and its matrix characteristics are updated.
 *
 * @param inst    transform instruction; its parameters configure the transform and
 *                its "target" parameter is passed to {@code addLineageRDD}
 * @param inputs  input frames; only {@code inputs[0]} is read
 * @param outputs output matrices; only {@code outputs[0]} is updated
 * @param ec      execution context; cast to {@code SparkExecutionContext}
 * @throws Exception propagated from any of the file system or Spark helper calls
 */
public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// Parse transform instruction (the first instruction) to obtain relevant fields
TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
JobConf job = new JobConf();
FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
// validate the output location against the transform metadata path before doing any work
checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
// find the first file in alphabetical ordering of partfiles in directory inputPath
String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
// find column names and construct output header
String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
// capture the column count now; the map itself is released in both branches below
int numColumns = colNamesToIds.size();
String outHeader = getOutputHeader(fs, headerLine, oprnds);
// temporary location that the apply job writes to; its files are moved to txMtdPath afterwards
String tmpPath = MRJobConfiguration.constructTempOutputFilename();
// Construct RDD for input data
@SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
// NOTE(review): numRowsTf is only assigned in the non-apply branch; in the apply branch
// it stays 0 and is passed to setDimension as-is -- confirm this is intended
long numRowsTf = 0, numColumnsTf = 0;
JavaPairRDD<Long, String> tfPairRDD = null;
if (!oprnds.isApply) {
// build specification file with column IDs instead of column names
String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
// enable GC on colNamesToIds
colNamesToIds = null;
// Build transformation metadata, including recode maps, bin definitions, etc.
// Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
// store the specFileWithIDs as transformation metadata
MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
// Apply the freshly generated transformation metadata to the same input lines
tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
// the part offsets file is temporary; remove it once the apply job has been set up
MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
} else {
// enable GC on colNamesToIds
colNamesToIds = null;
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// use the spec given in the operands if present, otherwise read spec.json from the copied metadata
String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
// Apply transformation metadata, and perform actual transformation
tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
}
// copy auxiliary data (old and new header lines) from temporary location to txMtdPath
moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
// convert to csv output format (serialized longwritable/text)
JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
// only attach an RDD handle and update statistics when the conversion returned an RDD
if (outtfPairRDD != null) {
MatrixObject outMO = outputs[0];
String outVar = outMO.getVarName();
outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
// link the output variable to the instruction's "target" input via addLineageRDD
sec.addLineageRDD(outVar, inst.getParams().get("target"));
//update output statistics (required for correctness)
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
mcOut.setDimension(numRowsTf, numColumnsTf);
// -1 is passed as the non-zero count (presumably meaning "unknown" -- confirm)
mcOut.setNonZeros(-1);
}
}
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache incubator-systemml project.
Class MLContextConversionUtil, method binaryBlocksToFrameObject.
/**
 * Convert a {@code JavaPairRDD<Long, FrameBlock>} to a {@code FrameObject}.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param binaryBlocks
 *            {@code JavaPairRDD<Long, FrameBlock>} representation of a
 *            binary-block frame
 * @param frameMetadata
 *            the frame metadata; may be {@code null}, in which case default
 *            matrix characteristics and an empty schema are used
 * @return the {@code JavaPairRDD<Long, FrameBlock>} frame converted to a
 *         {@code FrameObject}
 */
public static FrameObject binaryBlocksToFrameObject(String variableName, JavaPairRDD<Long, FrameBlock> binaryBlocks, FrameMetadata frameMetadata) {
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();

    // The metadata is optional (see the null check above), so it must not be
    // dereferenced unconditionally when deriving the schema; previously a null
    // frameMetadata always ended in a NullPointerException here.
    ValueType[] schema = (frameMetadata != null)
            ? frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0])
            : new ValueType[0];

    FrameObject frameObject = new FrameObject(
            OptimizerUtils.getUniqueTempFileName(),
            new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo),
            schema);
    frameObject.setRDDHandle(new RDDObject(binaryBlocks, variableName));
    return frameObject;
}
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache incubator-systemml project.
Class MLContextConversionUtil, method javaRDDStringCSVToFrameObject.
/**
 * Build a {@code FrameObject} from a {@code JavaRDD<String>} holding CSV lines.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param javaRDD
 *            the Java RDD of CSV strings
 * @param frameMetadata
 *            frame metadata supplying the matrix characteristics and the
 *            frame schema
 * @return a {@code FrameObject} backed by the binary-block RDD, or
 *         {@code null} if the CSV-to-binary-block conversion throws a
 *         {@code DMLRuntimeException}
 */
public static FrameObject javaRDDStringCSVToFrameObject(String variableName, JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> keyedLines = javaRDD.mapToPair(new ConvertStringToLongTextPair());

    MatrixCharacteristics dims;
    if (frameMetadata == null) {
        dims = new MatrixCharacteristics();
    } else {
        dims = frameMetadata.asMatrixCharacteristics();
    }

    JavaPairRDD<LongWritable, Text> copiedLines = keyedLines.mapToPair(new CopyTextInputFunction());

    String tempFileName = OptimizerUtils.getUniqueTempFileName();
    MatrixFormatMetaData formatMeta = new MatrixFormatMetaData(dims, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
    // NOTE(review): frameMetadata is dereferenced here without a null check even
    // though null is tolerated when computing the matrix characteristics above;
    // a null frameMetadata therefore fails at this line.
    ValueType[] schema = frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]);
    FrameObject result = new FrameObject(tempFileName, formatMeta, schema);

    JavaPairRDD<Long, FrameBlock> blocks;
    try {
        // trailing arguments are presumably: no header, "," delimiter, no fill, fill value -1 -- confirm
        blocks = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), copiedLines, dims, result.getSchema(), false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        // conversion failure is reported on stderr and signalled to the caller as null
        e.printStackTrace();
        return null;
    }

    result.setRDDHandle(new RDDObject(blocks, variableName));
    return result;
}
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache incubator-systemml project.
Class MLContextConversionUtil, method binaryBlocksToMatrixObject.
/**
 * Wrap a binary-block matrix RDD in a {@code MatrixObject} associated with
 * the given variable name.
 *
 * @param variableName   name passed to the {@code RDDObject} handle
 * @param binaryBlocks   binary-block representation of the matrix
 * @param matrixMetadata matrix metadata; when {@code null}, default matrix
 *                       characteristics are used
 * @param copy           forwarded to {@code SparkUtils.copyBinaryBlockMatrix}
 * @return a {@code MatrixObject} whose RDD handle refers to the blocks
 */
private static MatrixObject binaryBlocksToMatrixObject(String variableName, JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks, MatrixMetadata matrixMetadata, boolean copy) {
    MatrixCharacteristics dims;
    if (matrixMetadata == null) {
        dims = new MatrixCharacteristics();
    } else {
        dims = matrixMetadata.asMatrixCharacteristics();
    }

    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = SparkUtils.copyBinaryBlockMatrix(binaryBlocks, copy);

    String tempFileName = OptimizerUtils.getUniqueTempFileName();
    MatrixFormatMetaData formatMeta = new MatrixFormatMetaData(dims, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
    MatrixObject result = new MatrixObject(ValueType.DOUBLE, tempFileName, formatMeta);

    RDDObject handle = new RDDObject(blocks, variableName);
    result.setRDDHandle(handle);
    return result;
}
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache incubator-systemml project.
Class MLContextConversionUtil, method binaryBlocksToMatrixObject.
/**
 * Wrap a binary-block matrix RDD in a {@code MatrixObject}.
 *
 * @param binaryBlocks   binary-block representation of the matrix
 * @param matrixMetadata matrix metadata; when {@code null}, default matrix
 *                       characteristics are used
 * @param copy           forwarded to {@code SparkUtils.copyBinaryBlockMatrix}
 * @return a {@code MatrixObject} whose RDD handle refers to the blocks
 */
private static MatrixObject binaryBlocksToMatrixObject(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks, MatrixMetadata matrixMetadata, boolean copy) {
    MatrixCharacteristics dims;
    if (matrixMetadata == null) {
        dims = new MatrixCharacteristics();
    } else {
        dims = matrixMetadata.asMatrixCharacteristics();
    }

    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = SparkUtils.copyBinaryBlockMatrix(binaryBlocks, copy);

    String tempFileName = OptimizerUtils.getUniqueTempFileName();
    MetaDataFormat formatMeta = new MetaDataFormat(dims, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
    MatrixObject result = new MatrixObject(ValueType.DOUBLE, tempFileName, formatMeta);

    RDDObject handle = new RDDObject(blocks);
    result.setRDDHandle(handle);
    return result;
}
Aggregations