Search in sources :

Example 36 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project incubator-systemml by apache.

the class MLContextUtil method convertInputType.

/**
	 * Convert input types to internal SystemML representations
	 *
	 * @param parameterName
	 *            The name of the input parameter
	 * @param parameterValue
	 *            The value of the input parameter
	 * @param metadata
	 *            matrix/frame metadata
	 * @return input in SystemML data representation
	 */
public static Data convertInputType(String parameterName, Object parameterValue, Metadata metadata) {
    String name = parameterName;
    Object value = parameterValue;
    boolean hasMetadata = (metadata != null) ? true : false;
    boolean hasMatrixMetadata = hasMetadata && (metadata instanceof MatrixMetadata) ? true : false;
    boolean hasFrameMetadata = hasMetadata && (metadata instanceof FrameMetadata) ? true : false;
    if (name == null) {
        throw new MLContextException("Input parameter name is null");
    } else if (value == null) {
        throw new MLContextException("Input parameter value is null for: " + parameterName);
    } else if (value instanceof JavaRDD<?>) {
        @SuppressWarnings("unchecked") JavaRDD<String> javaRDD = (JavaRDD<String>) value;
        if (hasMatrixMetadata) {
            MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
            if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
                return MLContextConversionUtil.javaRDDStringIJVToMatrixObject(name, javaRDD, matrixMetadata);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(name, javaRDD, matrixMetadata);
            }
        } else if (hasFrameMetadata) {
            FrameMetadata frameMetadata = (FrameMetadata) metadata;
            if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
                return MLContextConversionUtil.javaRDDStringIJVToFrameObject(name, javaRDD, frameMetadata);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToFrameObject(name, javaRDD, frameMetadata);
            }
        } else if (!hasMetadata) {
            String firstLine = javaRDD.first();
            boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
            if (isAllNumbers) {
                return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(name, javaRDD);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToFrameObject(name, javaRDD);
            }
        }
    } else if (value instanceof RDD<?>) {
        @SuppressWarnings("unchecked") RDD<String> rdd = (RDD<String>) value;
        if (hasMatrixMetadata) {
            MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
            if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
                return MLContextConversionUtil.rddStringIJVToMatrixObject(name, rdd, matrixMetadata);
            } else {
                return MLContextConversionUtil.rddStringCSVToMatrixObject(name, rdd, matrixMetadata);
            }
        } else if (hasFrameMetadata) {
            FrameMetadata frameMetadata = (FrameMetadata) metadata;
            if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
                return MLContextConversionUtil.rddStringIJVToFrameObject(name, rdd, frameMetadata);
            } else {
                return MLContextConversionUtil.rddStringCSVToFrameObject(name, rdd, frameMetadata);
            }
        } else if (!hasMetadata) {
            String firstLine = rdd.first();
            boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
            if (isAllNumbers) {
                return MLContextConversionUtil.rddStringCSVToMatrixObject(name, rdd);
            } else {
                return MLContextConversionUtil.rddStringCSVToFrameObject(name, rdd);
            }
        }
    } else if (value instanceof MatrixBlock) {
        MatrixBlock matrixBlock = (MatrixBlock) value;
        return MLContextConversionUtil.matrixBlockToMatrixObject(name, matrixBlock, (MatrixMetadata) metadata);
    } else if (value instanceof FrameBlock) {
        FrameBlock frameBlock = (FrameBlock) value;
        return MLContextConversionUtil.frameBlockToFrameObject(name, frameBlock, (FrameMetadata) metadata);
    } else if (value instanceof Dataset<?>) {
        @SuppressWarnings("unchecked") Dataset<Row> dataFrame = (Dataset<Row>) value;
        dataFrame = MLUtils.convertVectorColumnsToML(dataFrame);
        if (hasMatrixMetadata) {
            return MLContextConversionUtil.dataFrameToMatrixObject(name, dataFrame, (MatrixMetadata) metadata);
        } else if (hasFrameMetadata) {
            return MLContextConversionUtil.dataFrameToFrameObject(name, dataFrame, (FrameMetadata) metadata);
        } else if (!hasMetadata) {
            boolean looksLikeMatrix = doesDataFrameLookLikeMatrix(dataFrame);
            if (looksLikeMatrix) {
                return MLContextConversionUtil.dataFrameToMatrixObject(name, dataFrame);
            } else {
                return MLContextConversionUtil.dataFrameToFrameObject(name, dataFrame);
            }
        }
    } else if (value instanceof BinaryBlockMatrix) {
        BinaryBlockMatrix binaryBlockMatrix = (BinaryBlockMatrix) value;
        if (metadata == null) {
            metadata = binaryBlockMatrix.getMatrixMetadata();
        }
        JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = binaryBlockMatrix.getBinaryBlocks();
        return MLContextConversionUtil.binaryBlocksToMatrixObject(name, binaryBlocks, (MatrixMetadata) metadata);
    } else if (value instanceof BinaryBlockFrame) {
        BinaryBlockFrame binaryBlockFrame = (BinaryBlockFrame) value;
        if (metadata == null) {
            metadata = binaryBlockFrame.getFrameMetadata();
        }
        JavaPairRDD<Long, FrameBlock> binaryBlocks = binaryBlockFrame.getBinaryBlocks();
        return MLContextConversionUtil.binaryBlocksToFrameObject(name, binaryBlocks, (FrameMetadata) metadata);
    } else if (value instanceof Matrix) {
        Matrix matrix = (Matrix) value;
        return matrix.toMatrixObject();
    } else if (value instanceof Frame) {
        Frame frame = (Frame) value;
        return frame.toFrameObject();
    } else if (value instanceof double[][]) {
        double[][] doubleMatrix = (double[][]) value;
        return MLContextConversionUtil.doubleMatrixToMatrixObject(name, doubleMatrix, (MatrixMetadata) metadata);
    } else if (value instanceof URL) {
        URL url = (URL) value;
        return MLContextConversionUtil.urlToMatrixObject(name, url, (MatrixMetadata) metadata);
    } else if (value instanceof Integer) {
        return new IntObject((Integer) value);
    } else if (value instanceof Double) {
        return new DoubleObject((Double) value);
    } else if (value instanceof String) {
        return new StringObject((String) value);
    } else if (value instanceof Boolean) {
        return new BooleanObject((Boolean) value);
    }
    return null;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) URL(java.net.URL) RDD(org.apache.spark.rdd.RDD) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) Row(org.apache.spark.sql.Row) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)

Example 37 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project incubator-systemml by apache.

the class MLContextUtil method displayInputs.

/**
	 * Obtain a display of script inputs.
	 *
	 * @param name
	 *            the title to display for the inputs
	 * @param map
	 *            the map of inputs
	 * @param symbolTable the symbol table
	 * @return the script inputs represented as a String
	 */
public static String displayInputs(String name, Map<String, Object> map, LocalVariableMap symbolTable) {
    StringBuilder sb = new StringBuilder();
    sb.append(name);
    sb.append(":\n");
    Set<String> keys = map.keySet();
    if (keys.isEmpty()) {
        sb.append("None\n");
    } else {
        int count = 0;
        for (String key : keys) {
            Object object = map.get(key);
            @SuppressWarnings("rawtypes") Class clazz = object.getClass();
            String type = clazz.getSimpleName();
            if (object instanceof JavaRDD<?>) {
                type = "JavaRDD";
            } else if (object instanceof RDD<?>) {
                type = "RDD";
            }
            sb.append("  [");
            sb.append(++count);
            sb.append("]");
            sb.append(" (");
            sb.append(type);
            if (doesSymbolTableContainMatrixObject(symbolTable, key)) {
                sb.append(" as Matrix");
            } else if (doesSymbolTableContainFrameObject(symbolTable, key)) {
                sb.append(" as Frame");
            }
            sb.append(") ");
            sb.append(key);
            sb.append(": ");
            String str = null;
            if (object instanceof MatrixBlock) {
                MatrixBlock mb = (MatrixBlock) object;
                str = "MatrixBlock [sparse? = " + mb.isInSparseFormat() + ", nonzeros = " + mb.getNonZeros() + ", size: " + mb.getNumRows() + " X " + mb.getNumColumns() + "]";
            } else
                // TODO: Deal with OOM for other objects such as Frame, etc
                str = object.toString();
            str = StringUtils.abbreviate(str, 100);
            sb.append(str);
            sb.append("\n");
        }
    }
    return sb.toString();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) JavaRDD(org.apache.spark.api.java.JavaRDD)

Example 38 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project incubator-systemml by apache.

the class FrameConverterTest method runConverter.

/**
	 * @param oinfo 
	 * @param frame1
	 * @param frame2
	 * @param fprop
	 * @param schema
	 * @return 
	 * @throws DMLRuntimeException, IOException
	 */
@SuppressWarnings("unchecked")
private void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws DMLRuntimeException, IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch(type) {
        case CSV2BIN:
            {
                InputInfo iinfo = InputInfo.CSVInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2CSV:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case TXTCELL2BIN:
            {
                InputInfo iinfo = InputInfo.TextCellInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2TXTCELL:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case MAT2BIN:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2MAT:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                break;
            }
        case DFRM2BIN:
            {
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                //Create DataFrame 
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2DFRM:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                //Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary 
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        default:
            throw new RuntimeException("Unsuported converter type: " + type.toString());
    }
    sec.close();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)

Example 39 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project incubator-systemml by apache.

the class MLContextFrameTest method testFrame.

public void testFrame(FrameFormat format, SCRIPT_TYPE script_type, IO_TYPE inputType, IO_TYPE outputType) {
    System.out.println("MLContextTest - Frame JavaRDD<String> for format: " + format + " Script: " + script_type);
    List<String> listA = new ArrayList<String>();
    List<String> listB = new ArrayList<String>();
    FrameMetadata fmA = null, fmB = null;
    Script script = null;
    ValueType[] schemaA = { ValueType.INT, ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaA = Arrays.asList(schemaA);
    FrameSchema fschemaA = new FrameSchema(lschemaA);
    ValueType[] schemaB = { ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    FrameSchema fschemaB = new FrameSchema(lschemaB);
    if (inputType != IO_TYPE.FILE) {
        if (format == FrameFormat.CSV) {
            listA.add("1,Str2,3.0,true");
            listA.add("4,Str5,6.0,false");
            listA.add("7,Str8,9.0,true");
            listB.add("Str12,13.0,true");
            listB.add("Str25,26.0,false");
            fmA = new FrameMetadata(FrameFormat.CSV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.CSV, fschemaB, 2, 3);
        } else if (format == FrameFormat.IJV) {
            listA.add("1 1 1");
            listA.add("1 2 Str2");
            listA.add("1 3 3.0");
            listA.add("1 4 true");
            listA.add("2 1 4");
            listA.add("2 2 Str5");
            listA.add("2 3 6.0");
            listA.add("2 4 false");
            listA.add("3 1 7");
            listA.add("3 2 Str8");
            listA.add("3 3 9.0");
            listA.add("3 4 true");
            listB.add("1 1 Str12");
            listB.add("1 2 13.0");
            listB.add("1 3 true");
            listB.add("2 1 Str25");
            listB.add("2 2 26.0");
            listB.add("2 3 false");
            fmA = new FrameMetadata(FrameFormat.IJV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.IJV, fschemaB, 2, 3);
        }
        JavaRDD<String> javaRDDA = sc.parallelize(listA);
        JavaRDD<String> javaRDDB = sc.parallelize(listB);
        if (inputType == IO_TYPE.DATAFRAME) {
            JavaRDD<Row> javaRddRowA = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDA, CSV_DELIM, schemaA);
            JavaRDD<Row> javaRddRowB = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDB, CSV_DELIM, schemaB);
            // Create DataFrame
            StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaA, false);
            Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, dfSchemaA);
            StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
            Dataset<Row> dataFrameB = spark.createDataFrame(javaRddRowB, dfSchemaB);
            if (script_type == SCRIPT_TYPE.DML)
                script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).out("A").out("C");
            else if (script_type == SCRIPT_TYPE.PYDML)
                // DO NOT USE ; at the end of any statment, it throws NPE
                script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
        } else {
            if (inputType == IO_TYPE.JAVA_RDD_STR_CSV || inputType == IO_TYPE.JAVA_RDD_STR_IJV) {
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ; at the end of any statment, it throws
                    // NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            } else if (inputType == IO_TYPE.RDD_STR_CSV || inputType == IO_TYPE.RDD_STR_IJV) {
                RDD<String> rddA = JavaRDD.toRDD(javaRDDA);
                RDD<String> rddB = JavaRDD.toRDD(javaRDDB);
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", rddA, fmA).in("B", rddB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ; at the end of any statment, it throws
                    // NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", rddA, fmA).in("B", rddB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            }
        }
    } else {
        // Input type is file
        String fileA = null, fileB = null;
        if (format == FrameFormat.CSV) {
            fileA = baseDirectory + File.separator + "FrameA.csv";
            fileB = baseDirectory + File.separator + "FrameB.csv";
        } else if (format == FrameFormat.IJV) {
            fileA = baseDirectory + File.separator + "FrameA.ijv";
            fileB = baseDirectory + File.separator + "FrameB.ijv";
        }
        if (script_type == SCRIPT_TYPE.DML)
            script = dml("A=read($A); B=read($B);A[2:3,2:4]=B;C=A[2:3,2:3];A[1,1]=234").in("$A", fileA, fmA).in("$B", fileB, fmB).out("A").out("C");
        else if (script_type == SCRIPT_TYPE.PYDML)
            // DO NOT USE ; at the end of any statment, it throws NPE
            script = pydml("A=load($A)\nB=load($B)\nA[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("$A", fileA).in("$B", fileB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
    }
    MLResults mlResults = ml.execute(script);
    //Validate output schema
    List<ValueType> lschemaOutA = Arrays.asList(mlResults.getFrameObject("A").getSchema());
    List<ValueType> lschemaOutC = Arrays.asList(mlResults.getFrameObject("C").getSchema());
    Assert.assertEquals(ValueType.INT, lschemaOutA.get(0));
    Assert.assertEquals(ValueType.STRING, lschemaOutA.get(1));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutA.get(2));
    Assert.assertEquals(ValueType.BOOLEAN, lschemaOutA.get(3));
    Assert.assertEquals(ValueType.STRING, lschemaOutC.get(0));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutC.get(1));
    if (outputType == IO_TYPE.JAVA_RDD_STR_CSV) {
        JavaRDD<String> javaRDDStringCSVA = mlResults.getJavaRDDStringCSV("A");
        List<String> linesA = javaRDDStringCSVA.collect();
        Assert.assertEquals("1,Str2,3.0,true", linesA.get(0));
        Assert.assertEquals("4,Str12,13.0,true", linesA.get(1));
        Assert.assertEquals("7,Str25,26.0,false", linesA.get(2));
        JavaRDD<String> javaRDDStringCSVC = mlResults.getJavaRDDStringCSV("C");
        List<String> linesC = javaRDDStringCSVC.collect();
        Assert.assertEquals("Str12,13.0", linesC.get(0));
        Assert.assertEquals("Str25,26.0", linesC.get(1));
    } else if (outputType == IO_TYPE.JAVA_RDD_STR_IJV) {
        JavaRDD<String> javaRDDStringIJVA = mlResults.getJavaRDDStringIJV("A");
        List<String> linesA = javaRDDStringIJVA.collect();
        Assert.assertEquals("1 1 1", linesA.get(0));
        Assert.assertEquals("1 2 Str2", linesA.get(1));
        Assert.assertEquals("1 3 3.0", linesA.get(2));
        Assert.assertEquals("1 4 true", linesA.get(3));
        Assert.assertEquals("2 1 4", linesA.get(4));
        Assert.assertEquals("2 2 Str12", linesA.get(5));
        Assert.assertEquals("2 3 13.0", linesA.get(6));
        Assert.assertEquals("2 4 true", linesA.get(7));
        JavaRDD<String> javaRDDStringIJVC = mlResults.getJavaRDDStringIJV("C");
        List<String> linesC = javaRDDStringIJVC.collect();
        Assert.assertEquals("1 1 Str12", linesC.get(0));
        Assert.assertEquals("1 2 13.0", linesC.get(1));
        Assert.assertEquals("2 1 Str25", linesC.get(2));
        Assert.assertEquals("2 2 26.0", linesC.get(3));
    } else if (outputType == IO_TYPE.RDD_STR_CSV) {
        RDD<String> rddStringCSVA = mlResults.getRDDStringCSV("A");
        Iterator<String> iteratorA = rddStringCSVA.toLocalIterator();
        Assert.assertEquals("1,Str2,3.0,true", iteratorA.next());
        Assert.assertEquals("4,Str12,13.0,true", iteratorA.next());
        Assert.assertEquals("7,Str25,26.0,false", iteratorA.next());
        RDD<String> rddStringCSVC = mlResults.getRDDStringCSV("C");
        Iterator<String> iteratorC = rddStringCSVC.toLocalIterator();
        Assert.assertEquals("Str12,13.0", iteratorC.next());
        Assert.assertEquals("Str25,26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.RDD_STR_IJV) {
        RDD<String> rddStringIJVA = mlResults.getRDDStringIJV("A");
        Iterator<String> iteratorA = rddStringIJVA.toLocalIterator();
        Assert.assertEquals("1 1 1", iteratorA.next());
        Assert.assertEquals("1 2 Str2", iteratorA.next());
        Assert.assertEquals("1 3 3.0", iteratorA.next());
        Assert.assertEquals("1 4 true", iteratorA.next());
        Assert.assertEquals("2 1 4", iteratorA.next());
        Assert.assertEquals("2 2 Str12", iteratorA.next());
        Assert.assertEquals("2 3 13.0", iteratorA.next());
        Assert.assertEquals("2 4 true", iteratorA.next());
        Assert.assertEquals("3 1 7", iteratorA.next());
        Assert.assertEquals("3 2 Str25", iteratorA.next());
        Assert.assertEquals("3 3 26.0", iteratorA.next());
        Assert.assertEquals("3 4 false", iteratorA.next());
        RDD<String> rddStringIJVC = mlResults.getRDDStringIJV("C");
        Iterator<String> iteratorC = rddStringIJVC.toLocalIterator();
        Assert.assertEquals("1 1 Str12", iteratorC.next());
        Assert.assertEquals("1 2 13.0", iteratorC.next());
        Assert.assertEquals("2 1 Str25", iteratorC.next());
        Assert.assertEquals("2 2 26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.DATAFRAME) {
        Dataset<Row> dataFrameA = mlResults.getDataFrame("A").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaA = dataFrameA.schema();
        StructField structTypeA = dfschemaA.apply(0);
        Assert.assertEquals(DataTypes.LongType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(1);
        Assert.assertEquals(DataTypes.StringType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(2);
        Assert.assertEquals(DataTypes.DoubleType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(3);
        Assert.assertEquals(DataTypes.BooleanType, structTypeA.dataType());
        List<Row> listAOut = dataFrameA.collectAsList();
        Row row1 = listAOut.get(0);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(1), row1.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str2", row1.get(1));
        Assert.assertEquals("Mismatch with expected value", 3.0, row1.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row1.get(3));
        Row row2 = listAOut.get(1);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(4), row2.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str12", row2.get(1));
        Assert.assertEquals("Mismatch with expected value", 13.0, row2.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row2.get(3));
        Dataset<Row> dataFrameC = mlResults.getDataFrame("C").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaC = dataFrameC.schema();
        StructField structTypeC = dfschemaC.apply(0);
        Assert.assertEquals(DataTypes.StringType, structTypeC.dataType());
        structTypeC = dfschemaC.apply(1);
        Assert.assertEquals(DataTypes.DoubleType, structTypeC.dataType());
        List<Row> listCOut = dataFrameC.collectAsList();
        Row row3 = listCOut.get(0);
        Assert.assertEquals("Mismatch with expected value", "Str12", row3.get(0));
        Assert.assertEquals("Mismatch with expected value", 13.0, row3.get(1));
        Row row4 = listCOut.get(1);
        Assert.assertEquals("Mismatch with expected value", "Str25", row4.get(0));
        Assert.assertEquals("Mismatch with expected value", 26.0, row4.get(1));
    } else {
        String[][] frameA = mlResults.getFrameAs2DStringArray("A");
        Assert.assertEquals("Str2", frameA[0][1]);
        Assert.assertEquals("3.0", frameA[0][2]);
        Assert.assertEquals("13.0", frameA[1][2]);
        Assert.assertEquals("true", frameA[1][3]);
        Assert.assertEquals("Str25", frameA[2][1]);
        String[][] frameC = mlResults.getFrameAs2DStringArray("C");
        Assert.assertEquals("Str12", frameC[0][0]);
        Assert.assertEquals("Str25", frameC[1][0]);
        Assert.assertEquals("13.0", frameC[0][1]);
        Assert.assertEquals("26.0", frameC[1][1]);
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) FrameSchema(org.apache.sysml.api.mlcontext.FrameSchema) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) RDD(org.apache.spark.rdd.RDD) StructField(org.apache.spark.sql.types.StructField) Iterator(scala.collection.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata)

Example 40 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project cdap by caskdata.

the class StringValueFilterCompute method initialize.

@Override
public void initialize(SparkExecutionPluginContext context) throws Exception {
    // should never happen, here to test app correctness in unit tests
    Schema inputSchema = context.getInputSchema();
    if (inputSchema != null && !inputSchema.equals(context.getOutputSchema())) {
        throw new IllegalStateException("runtime schema does not match what was set at configure time.");
    }
    interpreter = context.createSparkInterpreter();
    interpreter.compile("package test\n" + "import co.cask.cdap.api.data.format._\n" + "import org.apache.spark._\n" + "import org.apache.spark.api.java._\n" + "import org.apache.spark.rdd._\n" + "object Compute {\n" + "  def compute(rdd: RDD[StructuredRecord]): JavaRDD[StructuredRecord] = {\n" + "    val value = \"" + conf.value + "\"\n" + "    val field = \"" + conf.field + "\"\n" + "    JavaRDD.fromRDD(rdd.filter(r => !value.equals(r.get(field))))\n" + "  }\n" + "}");
    computeMethod = interpreter.getClassLoader().loadClass("test.Compute").getDeclaredMethod("compute", RDD.class);
}
Also used : RDD(org.apache.spark.rdd.RDD) JavaRDD(org.apache.spark.api.java.JavaRDD) Schema(co.cask.cdap.api.data.schema.Schema)

Aggregations

JavaRDD (org.apache.spark.api.java.JavaRDD)56 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)32 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)24 List (java.util.List)22 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)20 Collectors (java.util.stream.Collectors)19 Tuple2 (scala.Tuple2)19 Argument (org.broadinstitute.barclay.argparser.Argument)17 Broadcast (org.apache.spark.broadcast.Broadcast)15 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)15 SAMFileHeader (htsjdk.samtools.SAMFileHeader)14 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)14 UserException (org.broadinstitute.hellbender.exceptions.UserException)14 CommandLineProgramProperties (org.broadinstitute.barclay.argparser.CommandLineProgramProperties)13 GATKSparkTool (org.broadinstitute.hellbender.engine.spark.GATKSparkTool)13 IOException (java.io.IOException)12 Serializable (java.io.Serializable)12 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)12 java.util (java.util)11 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)11