Search in sources :

Example 11 with Dataset

use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin.

the class StreamingIngestionFileSystemTextFileToDataframeApp method start.

private void start() {
    // Create a local StreamingContext with two working thread and batch
    // interval of
    // 1 second
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {

                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    Row row = RowFactory.create(msg);
                    return row;
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 12 with Dataset

use of org.apache.spark.sql.Dataset in project incubator-systemml by apache.

the class MLContextUtil method convertInputType.

/**
 * Convert input types to internal SystemML representations
 *
 * @param parameterName
 *            The name of the input parameter
 * @param parameterValue
 *            The value of the input parameter
 * @param metadata
 *            matrix/frame metadata
 * @return input in SystemML data representation
 */
public static Data convertInputType(String parameterName, Object parameterValue, Metadata metadata) {
    String name = parameterName;
    Object value = parameterValue;
    boolean hasMetadata = (metadata != null) ? true : false;
    boolean hasMatrixMetadata = hasMetadata && (metadata instanceof MatrixMetadata) ? true : false;
    boolean hasFrameMetadata = hasMetadata && (metadata instanceof FrameMetadata) ? true : false;
    if (name == null) {
        throw new MLContextException("Input parameter name is null");
    } else if (value == null) {
        throw new MLContextException("Input parameter value is null for: " + parameterName);
    } else if (value instanceof JavaRDD<?>) {
        @SuppressWarnings("unchecked") JavaRDD<String> javaRDD = (JavaRDD<String>) value;
        if (hasMatrixMetadata) {
            MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
            if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
                return MLContextConversionUtil.javaRDDStringIJVToMatrixObject(javaRDD, matrixMetadata);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(javaRDD, matrixMetadata);
            }
        } else if (hasFrameMetadata) {
            FrameMetadata frameMetadata = (FrameMetadata) metadata;
            if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
                return MLContextConversionUtil.javaRDDStringIJVToFrameObject(javaRDD, frameMetadata);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToFrameObject(javaRDD, frameMetadata);
            }
        } else if (!hasMetadata) {
            String firstLine = javaRDD.first();
            boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
            if (isAllNumbers) {
                return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(javaRDD);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToFrameObject(javaRDD);
            }
        }
    } else if (value instanceof RDD<?>) {
        @SuppressWarnings("unchecked") RDD<String> rdd = (RDD<String>) value;
        if (hasMatrixMetadata) {
            MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
            if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
                return MLContextConversionUtil.rddStringIJVToMatrixObject(rdd, matrixMetadata);
            } else {
                return MLContextConversionUtil.rddStringCSVToMatrixObject(rdd, matrixMetadata);
            }
        } else if (hasFrameMetadata) {
            FrameMetadata frameMetadata = (FrameMetadata) metadata;
            if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
                return MLContextConversionUtil.rddStringIJVToFrameObject(rdd, frameMetadata);
            } else {
                return MLContextConversionUtil.rddStringCSVToFrameObject(rdd, frameMetadata);
            }
        } else if (!hasMetadata) {
            String firstLine = rdd.first();
            boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
            if (isAllNumbers) {
                return MLContextConversionUtil.rddStringCSVToMatrixObject(rdd);
            } else {
                return MLContextConversionUtil.rddStringCSVToFrameObject(rdd);
            }
        }
    } else if (value instanceof MatrixBlock) {
        MatrixBlock matrixBlock = (MatrixBlock) value;
        return MLContextConversionUtil.matrixBlockToMatrixObject(name, matrixBlock, (MatrixMetadata) metadata);
    } else if (value instanceof FrameBlock) {
        FrameBlock frameBlock = (FrameBlock) value;
        return MLContextConversionUtil.frameBlockToFrameObject(name, frameBlock, (FrameMetadata) metadata);
    } else if (value instanceof Dataset<?>) {
        @SuppressWarnings("unchecked") Dataset<Row> dataFrame = (Dataset<Row>) value;
        dataFrame = MLUtils.convertVectorColumnsToML(dataFrame);
        if (hasMatrixMetadata) {
            return MLContextConversionUtil.dataFrameToMatrixObject(dataFrame, (MatrixMetadata) metadata);
        } else if (hasFrameMetadata) {
            return MLContextConversionUtil.dataFrameToFrameObject(dataFrame, (FrameMetadata) metadata);
        } else if (!hasMetadata) {
            boolean looksLikeMatrix = doesDataFrameLookLikeMatrix(dataFrame);
            if (looksLikeMatrix) {
                return MLContextConversionUtil.dataFrameToMatrixObject(dataFrame);
            } else {
                return MLContextConversionUtil.dataFrameToFrameObject(dataFrame);
            }
        }
    } else if (value instanceof Matrix) {
        Matrix matrix = (Matrix) value;
        if ((matrix.hasBinaryBlocks()) && (!matrix.hasMatrixObject())) {
            if (metadata == null) {
                metadata = matrix.getMatrixMetadata();
            }
            JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = matrix.toBinaryBlocks();
            return MLContextConversionUtil.binaryBlocksToMatrixObject(binaryBlocks, (MatrixMetadata) metadata);
        } else {
            return matrix.toMatrixObject();
        }
    } else if (value instanceof Frame) {
        Frame frame = (Frame) value;
        if ((frame.hasBinaryBlocks()) && (!frame.hasFrameObject())) {
            if (metadata == null) {
                metadata = frame.getFrameMetadata();
            }
            JavaPairRDD<Long, FrameBlock> binaryBlocks = frame.toBinaryBlocks();
            return MLContextConversionUtil.binaryBlocksToFrameObject(binaryBlocks, (FrameMetadata) metadata);
        } else {
            return frame.toFrameObject();
        }
    } else if (value instanceof double[][]) {
        double[][] doubleMatrix = (double[][]) value;
        return MLContextConversionUtil.doubleMatrixToMatrixObject(name, doubleMatrix, (MatrixMetadata) metadata);
    } else if (value instanceof URL) {
        URL url = (URL) value;
        return MLContextConversionUtil.urlToMatrixObject(url, (MatrixMetadata) metadata);
    } else if (value instanceof Integer) {
        return new IntObject((Integer) value);
    } else if (value instanceof Double) {
        return new DoubleObject((Double) value);
    } else if (value instanceof String) {
        return new StringObject((String) value);
    } else if (value instanceof Boolean) {
        return new BooleanObject((Boolean) value);
    }
    return null;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) URL(java.net.URL) RDD(org.apache.spark.rdd.RDD) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) Row(org.apache.spark.sql.Row) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)

Example 13 with Dataset

use of org.apache.spark.sql.Dataset in project incubator-systemml by apache.

the class FrameConverterTest method runConverter.

@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch(type) {
        case CSV2BIN:
            {
                InputInfo iinfo = InputInfo.CSVInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2CSV:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case TXTCELL2BIN:
            {
                InputInfo iinfo = InputInfo.TextCellInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2TXTCELL:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case MAT2BIN:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2MAT:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                break;
            }
        case DFRM2BIN:
            {
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                // Create DataFrame
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2DFRM:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                // Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        default:
            throw new RuntimeException("Unsuported converter type: " + type.toString());
    }
    sec.close();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)

Example 14 with Dataset

use of org.apache.spark.sql.Dataset in project mmtf-spark by sbl-sdsc.

the class SparkRegressor method fit.

/**
 * Dataset must at least contain the following two columns:
 * label: the class labels
 * features: feature vector
 * @param data
 * @return map with metrics
 */
public Map<String, String> fit(Dataset<Row> data) {
    // Split the data into training and test sets (30% held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[] { 1.0 - testFraction, testFraction }, seed);
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];
    // Train a RandomForest model.
    predictor.setLabelCol(label).setFeaturesCol("features");
    // Chain indexer and forest in a Pipeline
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { predictor });
    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);
    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);
    // Display some sample predictions
    System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
    String primaryKey = predictions.columns()[0];
    predictions.select(primaryKey, label, "prediction").sample(false, 0.1, seed).show(50);
    Map<String, String> metrics = new LinkedHashMap<>();
    metrics.put("Method", predictor.getClass().getSimpleName());
    // Select (prediction, true label) and compute test error
    RegressionEvaluator evaluator = new RegressionEvaluator().setLabelCol(label).setPredictionCol("prediction").setMetricName("rmse");
    metrics.put("rmse", Double.toString(evaluator.evaluate(predictions)));
    return metrics;
}
Also used : Dataset(org.apache.spark.sql.Dataset) Row(org.apache.spark.sql.Row) RegressionEvaluator(org.apache.spark.ml.evaluation.RegressionEvaluator) Pipeline(org.apache.spark.ml.Pipeline) PipelineModel(org.apache.spark.ml.PipelineModel) LinkedHashMap(java.util.LinkedHashMap)

Example 15 with Dataset

use of org.apache.spark.sql.Dataset in project systemml by apache.

the class MLContextUtil method convertInputType.

/**
 * Convert input types to internal SystemML representations
 *
 * @param parameterName
 *            The name of the input parameter
 * @param parameterValue
 *            The value of the input parameter
 * @param metadata
 *            matrix/frame metadata
 * @return input in SystemML data representation
 */
public static Data convertInputType(String parameterName, Object parameterValue, Metadata metadata) {
    String name = parameterName;
    Object value = parameterValue;
    boolean hasMetadata = (metadata != null) ? true : false;
    boolean hasMatrixMetadata = hasMetadata && (metadata instanceof MatrixMetadata) ? true : false;
    boolean hasFrameMetadata = hasMetadata && (metadata instanceof FrameMetadata) ? true : false;
    if (name == null) {
        throw new MLContextException("Input parameter name is null");
    } else if (value == null) {
        throw new MLContextException("Input parameter value is null for: " + parameterName);
    } else if (value instanceof JavaRDD<?>) {
        @SuppressWarnings("unchecked") JavaRDD<String> javaRDD = (JavaRDD<String>) value;
        if (hasMatrixMetadata) {
            MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
            if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
                return MLContextConversionUtil.javaRDDStringIJVToMatrixObject(javaRDD, matrixMetadata);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(javaRDD, matrixMetadata);
            }
        } else if (hasFrameMetadata) {
            FrameMetadata frameMetadata = (FrameMetadata) metadata;
            if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
                return MLContextConversionUtil.javaRDDStringIJVToFrameObject(javaRDD, frameMetadata);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToFrameObject(javaRDD, frameMetadata);
            }
        } else if (!hasMetadata) {
            String firstLine = javaRDD.first();
            boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
            if (isAllNumbers) {
                return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(javaRDD);
            } else {
                return MLContextConversionUtil.javaRDDStringCSVToFrameObject(javaRDD);
            }
        }
    } else if (value instanceof RDD<?>) {
        @SuppressWarnings("unchecked") RDD<String> rdd = (RDD<String>) value;
        if (hasMatrixMetadata) {
            MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
            if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
                return MLContextConversionUtil.rddStringIJVToMatrixObject(rdd, matrixMetadata);
            } else {
                return MLContextConversionUtil.rddStringCSVToMatrixObject(rdd, matrixMetadata);
            }
        } else if (hasFrameMetadata) {
            FrameMetadata frameMetadata = (FrameMetadata) metadata;
            if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
                return MLContextConversionUtil.rddStringIJVToFrameObject(rdd, frameMetadata);
            } else {
                return MLContextConversionUtil.rddStringCSVToFrameObject(rdd, frameMetadata);
            }
        } else if (!hasMetadata) {
            String firstLine = rdd.first();
            boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
            if (isAllNumbers) {
                return MLContextConversionUtil.rddStringCSVToMatrixObject(rdd);
            } else {
                return MLContextConversionUtil.rddStringCSVToFrameObject(rdd);
            }
        }
    } else if (value instanceof MatrixBlock) {
        MatrixBlock matrixBlock = (MatrixBlock) value;
        return MLContextConversionUtil.matrixBlockToMatrixObject(name, matrixBlock, (MatrixMetadata) metadata);
    } else if (value instanceof FrameBlock) {
        FrameBlock frameBlock = (FrameBlock) value;
        return MLContextConversionUtil.frameBlockToFrameObject(name, frameBlock, (FrameMetadata) metadata);
    } else if (value instanceof Dataset<?>) {
        @SuppressWarnings("unchecked") Dataset<Row> dataFrame = (Dataset<Row>) value;
        dataFrame = MLUtils.convertVectorColumnsToML(dataFrame);
        if (hasMatrixMetadata) {
            return MLContextConversionUtil.dataFrameToMatrixObject(dataFrame, (MatrixMetadata) metadata);
        } else if (hasFrameMetadata) {
            return MLContextConversionUtil.dataFrameToFrameObject(dataFrame, (FrameMetadata) metadata);
        } else if (!hasMetadata) {
            boolean looksLikeMatrix = doesDataFrameLookLikeMatrix(dataFrame);
            if (looksLikeMatrix) {
                return MLContextConversionUtil.dataFrameToMatrixObject(dataFrame);
            } else {
                return MLContextConversionUtil.dataFrameToFrameObject(dataFrame);
            }
        }
    } else if (value instanceof Matrix) {
        Matrix matrix = (Matrix) value;
        if ((matrix.hasBinaryBlocks()) && (!matrix.hasMatrixObject())) {
            if (metadata == null) {
                metadata = matrix.getMatrixMetadata();
            }
            JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = matrix.toBinaryBlocks();
            return MLContextConversionUtil.binaryBlocksToMatrixObject(binaryBlocks, (MatrixMetadata) metadata);
        } else {
            return matrix.toMatrixObject();
        }
    } else if (value instanceof Frame) {
        Frame frame = (Frame) value;
        if ((frame.hasBinaryBlocks()) && (!frame.hasFrameObject())) {
            if (metadata == null) {
                metadata = frame.getFrameMetadata();
            }
            JavaPairRDD<Long, FrameBlock> binaryBlocks = frame.toBinaryBlocks();
            return MLContextConversionUtil.binaryBlocksToFrameObject(binaryBlocks, (FrameMetadata) metadata);
        } else {
            return frame.toFrameObject();
        }
    } else if (value instanceof double[][]) {
        double[][] doubleMatrix = (double[][]) value;
        return MLContextConversionUtil.doubleMatrixToMatrixObject(name, doubleMatrix, (MatrixMetadata) metadata);
    } else if (value instanceof URL) {
        URL url = (URL) value;
        return MLContextConversionUtil.urlToMatrixObject(url, (MatrixMetadata) metadata);
    } else if (value instanceof Integer) {
        return new IntObject((Integer) value);
    } else if (value instanceof Double) {
        return new DoubleObject((Double) value);
    } else if (value instanceof String) {
        return new StringObject((String) value);
    } else if (value instanceof Boolean) {
        return new BooleanObject((Boolean) value);
    }
    return null;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) URL(java.net.URL) RDD(org.apache.spark.rdd.RDD) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) StringObject(org.apache.sysml.runtime.instructions.cp.StringObject) Row(org.apache.spark.sql.Row) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)

Aggregations

Dataset (org.apache.spark.sql.Dataset)27 Row (org.apache.spark.sql.Row)16 SparkSession (org.apache.spark.sql.SparkSession)14 ArrayList (java.util.ArrayList)12 JavaRDD (org.apache.spark.api.java.JavaRDD)10 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)10 List (java.util.List)8 Map (java.util.Map)7 Tuple2 (scala.Tuple2)7 Collectors (java.util.stream.Collectors)6 Set (java.util.Set)5 Serializable (java.io.Serializable)4 HashMap (java.util.HashMap)4 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)4 MapFunction (org.apache.spark.api.java.function.MapFunction)4 Pipeline (org.apache.spark.ml.Pipeline)4 PipelineModel (org.apache.spark.ml.PipelineModel)4 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)4 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)4 Iterator (java.util.Iterator)3