Search in sources :

Example 1 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class DataTransform method spDataTransform.

public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
    JobConf job = new JobConf();
    FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
    checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
    // find the first file in alphabetical ordering of partfiles in directory inputPath 
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names and construct output header
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    int numColumns = colNamesToIds.size();
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    // Construct RDD for input data
    @SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
    JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
    long numRowsTf = 0, numColumnsTf = 0;
    JavaPairRDD<Long, String> tfPairRDD = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs insteadof column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
        // store the specFileWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // Apply transformation metadata, and perform actual transformation 
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
    }
    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // convert to csv output format (serialized longwritable/text)
    JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
    if (outtfPairRDD != null) {
        MatrixObject outMO = outputs[0];
        String outVar = outMO.getVarName();
        outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
        sec.addLineageRDD(outVar, inst.getParams().get("target"));
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
        mcOut.setDimension(numRowsTf, numColumnsTf);
        mcOut.setNonZeros(-1);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf)

Example 2 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class MLContextConversionUtil method binaryBlocksToFrameObject.

/**
	 * Convert a {@code JavaPairRDD<Long, FrameBlock>} to a {@code FrameObject}.
	 * 
	 * @param variableName
	 *            name of the variable associated with the frame
	 * @param binaryBlocks
	 *            {@code JavaPairRDD<Long, FrameBlock>} representation of a
	 *            binary-block frame
	 * @param frameMetadata
	 *            the frame metadata
	 * @return the {@code JavaPairRDD<Long, FrameBlock>} frame converted to a
	 *         {@code FrameObject}
	 */
public static FrameObject binaryBlocksToFrameObject(String variableName, JavaPairRDD<Long, FrameBlock> binaryBlocks, FrameMetadata frameMetadata) {
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    frameObject.setRDDHandle(new RDDObject(binaryBlocks, variableName));
    return frameObject;
}
Also used : ValueType(org.apache.sysml.parser.Expression.ValueType) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 3 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class MLContextConversionUtil method javaRDDStringCSVToFrameObject.

/**
	 * Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}
	 * 
	 * @param variableName
	 *            name of the variable associated with the frame
	 * @param javaRDD
	 *            the Java RDD of strings
	 * @param frameMetadata
	 *            frame metadata
	 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
	 */
public static FrameObject javaRDDStringCSVToFrameObject(String variableName, JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(), false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd, variableName));
    return frameObject;
}
Also used : ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LongWritable(org.apache.hadoop.io.LongWritable)

Example 4 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class MLContextConversionUtil method binaryBlocksToMatrixObject.

private static MatrixObject binaryBlocksToMatrixObject(String variableName, JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks, MatrixMetadata matrixMetadata, boolean copy) {
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<MatrixIndexes, MatrixBlock> javaPairRdd = SparkUtils.copyBinaryBlockMatrix(binaryBlocks, copy);
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
    matrixObject.setRDDHandle(new RDDObject(javaPairRdd, variableName));
    return matrixObject;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 5 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

the class MLContextConversionUtil method binaryBlocksToMatrixObject.

private static MatrixObject binaryBlocksToMatrixObject(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks, MatrixMetadata matrixMetadata, boolean copy) {
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<MatrixIndexes, MatrixBlock> javaPairRdd = SparkUtils.copyBinaryBlockMatrix(binaryBlocks, copy);
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
    matrixObject.setRDDHandle(new RDDObject(javaPairRdd));
    return matrixObject;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Aggregations

RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)53 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)37 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)22 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)18 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)18 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)18 LongWritable (org.apache.hadoop.io.LongWritable)17 Text (org.apache.hadoop.io.Text)17 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)16 CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction)16 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)16 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)13 ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair)12 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)12 ValueType (org.apache.sysml.parser.Expression.ValueType)11 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)9 IOException (java.io.IOException)7 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)6 Checkpoint (org.apache.sysml.lops.Checkpoint)6 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)6