Search in sources :

Example 6 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class DataTransform method spDataTransform.

public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
    JobConf job = new JobConf();
    FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
    checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
    // find the first file in alphabetical ordering of partfiles in directory inputPath 
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names and construct output header
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    int numColumns = colNamesToIds.size();
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    // Construct RDD for input data
    @SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
    JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
    long numRowsTf = 0, numColumnsTf = 0;
    JavaPairRDD<Long, String> tfPairRDD = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs insteadof column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
        // store the specFileWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // Apply transformation metadata, and perform actual transformation 
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
    }
    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // convert to csv output format (serialized longwritable/text)
    JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
    if (outtfPairRDD != null) {
        MatrixObject outMO = outputs[0];
        String outVar = outMO.getVarName();
        outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
        sec.addLineageRDD(outVar, inst.getParams().get("target"));
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
        mcOut.setDimension(numRowsTf, numColumnsTf);
        mcOut.setNonZeros(-1);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf)

Example 7 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class DataPartitioner method createPartitionedMatrixObject.

public MatrixObject createPartitionedMatrixObject(MatrixObject in, String fnameNew, boolean force) throws DMLRuntimeException {
    ValueType vt = in.getValueType();
    String varname = in.getVarName();
    MatrixObject out = new MatrixObject(vt, fnameNew);
    out.setDataType(DataType.MATRIX);
    out.setVarName(varname + NAME_SUFFIX);
    return createPartitionedMatrixObject(in, out, force);
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ValueType(org.apache.sysml.parser.Expression.ValueType)

Example 8 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class MLContextConversionUtil method dataFrameToMatrixObject.

/**
	 * Convert a {@code DataFrame} to a {@code MatrixObject}.
	 * 
	 * @param variableName
	 *            name of the variable associated with the matrix
	 * @param dataFrame
	 *            the Spark {@code DataFrame}
	 * @param matrixMetadata
	 *            the matrix metadata
	 * @return the {@code DataFrame} matrix converted to a converted to a
	 *         {@code MatrixObject}
	 */
public static MatrixObject dataFrameToMatrixObject(String variableName, Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    matrixMetadata = (matrixMetadata != null) ? matrixMetadata : new MatrixMetadata();
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock = dataFrameToMatrixBinaryBlocks(dataFrame, matrixMetadata);
    MatrixObject mo = binaryBlocksToMatrixObject(variableName, binaryBlock, matrixMetadata, false);
    //keep lineage of original dataset to allow bypassing binary block conversion if possible
    mo.getRDDHandle().addLineageChild(new DatasetObject(dataFrame, variableName, isDataFrameWithIDColumn(matrixMetadata), isVectorBasedDataFrame(matrixMetadata)));
    return mo;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject)

Example 9 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class MLContextConversionUtil method binaryBlocksToMatrixObject.

private static MatrixObject binaryBlocksToMatrixObject(String variableName, JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks, MatrixMetadata matrixMetadata, boolean copy) {
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<MatrixIndexes, MatrixBlock> javaPairRdd = SparkUtils.copyBinaryBlockMatrix(binaryBlocks, copy);
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
    matrixObject.setRDDHandle(new RDDObject(javaPairRdd, variableName));
    return matrixObject;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 10 with MatrixObject

use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

the class ExternalFunctionProgramBlock method verifyAndAttachOutputs.

/**
	 * Method to verify that function outputs match with declared outputs
	 * 
	 * @param ec execution context
	 * @param returnFunc package function
	 * @param outputParams output parameters
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
protected void verifyAndAttachOutputs(ExecutionContext ec, PackageFunction returnFunc, String outputParams) throws DMLRuntimeException {
    ArrayList<String> outputs = getParameters(outputParams);
    if (outputs.size() != returnFunc.getNumFunctionOutputs()) {
        throw new DMLRuntimeException("Number of function outputs (" + returnFunc.getNumFunctionOutputs() + ") " + "does not match with declaration (" + outputs.size() + ").");
    }
    // iterate over each output and verify that type matches
    for (int i = 0; i < outputs.size(); i++) {
        StringTokenizer tk = new StringTokenizer(outputs.get(i), ":");
        ArrayList<String> tokens = new ArrayList<String>();
        while (tk.hasMoreTokens()) {
            tokens.add(tk.nextToken());
        }
        if (returnFunc.getFunctionOutput(i).getType() == FunctionParameterType.Matrix) {
            Matrix m = (Matrix) returnFunc.getFunctionOutput(i);
            if (!(tokens.get(0).equals(getFunctionParameterDataTypeString(FunctionParameterType.Matrix))) || !(tokens.get(2).equals(getMatrixValueTypeString(m.getValueType())))) {
                throw new DMLRuntimeException("Function output '" + outputs.get(i) + "' does not match with declaration.");
            }
            // add result to variableMapping
            String varName = tokens.get(1);
            MatrixObject newVar = createOutputMatrixObject(m);
            newVar.setVarName(varName);
            //getVariables().put(varName, newVar); //put/override in local symbol table
            ec.setVariable(varName, newVar);
            continue;
        }
        if (returnFunc.getFunctionOutput(i).getType() == FunctionParameterType.Scalar) {
            Scalar s = (Scalar) returnFunc.getFunctionOutput(i);
            if (!tokens.get(0).equals(getFunctionParameterDataTypeString(FunctionParameterType.Scalar)) || !tokens.get(2).equals(getScalarValueTypeString(s.getScalarType()))) {
                throw new DMLRuntimeException("Function output '" + outputs.get(i) + "' does not match with declaration.");
            }
            // allocate and set appropriate object based on type
            ScalarObject scalarObject = null;
            ScalarValueType type = s.getScalarType();
            switch(type) {
                case Integer:
                    scalarObject = new IntObject(tokens.get(1), Long.parseLong(s.getValue()));
                    break;
                case Double:
                    scalarObject = new DoubleObject(tokens.get(1), Double.parseDouble(s.getValue()));
                    break;
                case Boolean:
                    scalarObject = new BooleanObject(tokens.get(1), Boolean.parseBoolean(s.getValue()));
                    break;
                case Text:
                    scalarObject = new StringObject(tokens.get(1), s.getValue());
                    break;
                default:
                    throw new DMLRuntimeException("Unknown scalar value type '" + type + "' of output '" + outputs.get(i) + "'.");
            }
            //this.getVariables().put(tokens.get(1), scalarObject);
            ec.setVariable(tokens.get(1), scalarObject);
            continue;
        }
        if (returnFunc.getFunctionOutput(i).getType() == FunctionParameterType.Object) {
            if (!tokens.get(0).equals(getFunctionParameterDataTypeString(FunctionParameterType.Object))) {
                throw new DMLRuntimeException("Function output '" + outputs.get(i) + "' does not match with declaration.");
            }
            throw new DMLRuntimeException("Object types not yet supported");
        // continue;
        }
        throw new DMLRuntimeException("Unknown data type '" + returnFunc.getFunctionOutput(i).getType() + "' " + "of output '" + outputs.get(i) + "'.");
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) Scalar(org.apache.sysml.udf.Scalar) ScalarObject(org.apache.sysml.runtime.instructions.cp.ScalarObject) StringTokenizer(java.util.StringTokenizer) Matrix(org.apache.sysml.udf.Matrix) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) ScalarValueType(org.apache.sysml.udf.Scalar.ScalarValueType) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)

Aggregations

MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)201 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)74 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)45 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)39 Data (org.apache.sysml.runtime.instructions.cp.Data)37 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)26 Pointer (jcuda.Pointer)20 CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer)20 IOException (java.io.IOException)17 ArrayList (java.util.ArrayList)16 ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject)14 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)13 CacheableData (org.apache.sysml.runtime.controlprogram.caching.CacheableData)12 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)12 Hop (org.apache.sysml.hops.Hop)11 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)11 ParForProgramBlock (org.apache.sysml.runtime.controlprogram.ParForProgramBlock)10 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)10 Path (org.apache.hadoop.fs.Path)9 LongWritable (org.apache.hadoop.io.LongWritable)9