
Example 6 with Dataset

use of org.apache.spark.sql.Dataset in project beam by apache.

the class AbstractTranslationContext method startPipeline.

// --------------------------------------------------------------------------------------------
// Pipeline methods
// --------------------------------------------------------------------------------------------
/**
 * Starts the pipeline.
 */
public void startPipeline() {
    SparkStructuredStreamingPipelineOptions options = serializablePipelineOptions.get().as(SparkStructuredStreamingPipelineOptions.class);
    int datasetIndex = 0;
    for (Dataset<?> dataset : leaves) {
        if (options.isStreaming()) {
            // TODO: deal with Beam Discarding, Accumulating and Accumulating & Retracting
            // output modes with DataStreamWriter.outputMode
            DataStreamWriter<?> dataStreamWriter = dataset.writeStream();
            // Spark sets a default checkpoint dir if none is set.
            if (options.getCheckpointDir() != null) {
                dataStreamWriter = dataStreamWriter.option("checkpointLocation", options.getCheckpointDir());
            }
            launchStreaming(dataStreamWriter.foreach(new NoOpForeachWriter<>()));
        } else {
            if (options.getTestMode()) {
                LOG.debug("**** dataset {} catalyst execution plans ****", ++datasetIndex);
                dataset.explain(true);
            }
            // Apply a no-op function so the foreach action triggers the pipeline run in
            // Spark.
            dataset.foreach((ForeachFunction) t -> {
            });
        }
    }
}
Also used : SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Dataset(org.apache.spark.sql.Dataset) ForeachFunction(org.apache.spark.api.java.function.ForeachFunction) LoggerFactory(org.slf4j.LoggerFactory) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) TransformInputs(org.apache.beam.runners.core.construction.TransformInputs) PTransform(org.apache.beam.sdk.transforms.PTransform) HashSet(java.util.HashSet) EncoderHelpers(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.EncoderHelpers) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) ForeachWriter(org.apache.spark.sql.ForeachWriter) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) SparkSession(org.apache.spark.sql.SparkSession) Logger(org.slf4j.Logger) SparkConf(org.apache.spark.SparkConf) Set(java.util.Set) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) DataStreamWriter(org.apache.spark.sql.streaming.DataStreamWriter) SparkStructuredStreamingPipelineOptions(org.apache.beam.runners.spark.structuredstreaming.SparkStructuredStreamingPipelineOptions) List(java.util.List) PValue(org.apache.beam.sdk.values.PValue) PCollectionView(org.apache.beam.sdk.values.PCollectionView) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings)
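
The NoOpForeachWriter referenced in the streaming branch is not shown in this snippet. A minimal sketch of what such a writer could look like, assuming it only needs to satisfy Spark's ForeachWriter contract so the streaming query has a sink to drive (the actual Beam class may differ):

import org.apache.spark.sql.ForeachWriter;

// Hypothetical stand-in for the NoOpForeachWriter used above: accepts every partition and
// discards every element, giving the streaming query a sink without producing any output.
class NoOpForeachWriter<T> extends ForeachWriter<T> {

    @Override
    public boolean open(long partitionId, long epochId) {
        // Accept the partition; there is no resource to open.
        return true;
    }

    @Override
    public void process(T value) {
        // Intentionally discard the element; the goal is only to keep the query running.
    }

    @Override
    public void close(Throwable errorOrNull) {
        // Nothing to clean up.
    }
}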

Example 7 with Dataset

use of org.apache.spark.sql.Dataset in project zeppelin by apache.

the class Spark2Shims method showDataFrame.

@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
    if (obj instanceof Dataset) {
        Dataset<Row> df = ((Dataset) obj).toDF();
        String[] columns = df.columns();
        // DDL statements produce a DataFrame with no columns
        if (columns.length == 0) {
            return "";
        }
        // fetch maxResult+1 rows so that we can check whether it is larger than zeppelin.spark.maxResult
        List<Row> rows = df.takeAsList(maxResult + 1);
        String template = context.getLocalProperties().get("template");
        if (!StringUtils.isBlank(template)) {
            if (rows.size() >= 1) {
                return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
            } else {
                return "";
            }
        }
        StringBuilder msg = new StringBuilder();
        msg.append("\n%table ");
        msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
        msg.append("\n");
        boolean isLargerThanMaxResult = rows.size() > maxResult;
        if (isLargerThanMaxResult) {
            rows = rows.subList(0, maxResult);
        }
        for (Row row : rows) {
            for (int i = 0; i < row.size(); ++i) {
                msg.append(TableDataUtils.normalizeColumn(row.get(i)));
                if (i != row.size() - 1) {
                    msg.append("\t");
                }
            }
            msg.append("\n");
        }
        if (isLargerThanMaxResult) {
            msg.append("\n");
            msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
        }
        // append %text at the end, otherwise the following output will be put in table as well.
        msg.append("\n%text ");
        return msg.toString();
    } else {
        return obj.toString();
    }
}
Also used : Dataset(org.apache.spark.sql.Dataset) SingleRowInterpreterResult(org.apache.zeppelin.interpreter.SingleRowInterpreterResult) GenericRow(org.apache.spark.sql.catalyst.expressions.GenericRow) Row(org.apache.spark.sql.Row)
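
The tab-separated %table output above only renders correctly if cell values contain no tabs or newlines, which is what TableDataUtils.normalizeColumn guards against. A hypothetical stand-in for that helper, assuming it only needs to sanitize the separator characters (Zeppelin's real implementation may differ):

// Hypothetical sketch of a normalizeColumn-style helper: Zeppelin's %table display uses
// tabs as column separators and newlines as row separators, so cell values must not
// contain either character. The actual TableDataUtils logic may differ.
static String normalizeColumn(Object value) {
    if (value == null) {
        return "null";
    }
    return value.toString().replace("\t", " ").replace("\n", " ");
}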

Example 8 with Dataset

use of org.apache.spark.sql.Dataset in project zeppelin by apache.

the class Spark3Shims method showDataFrame.

@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
    if (obj instanceof Dataset) {
        Dataset<Row> df = ((Dataset) obj).toDF();
        String[] columns = df.columns();
        // DDL statements produce a DataFrame with no columns
        if (columns.length == 0) {
            return "";
        }
        // fetch maxResult+1 rows so that we can check whether it is larger than zeppelin.spark.maxResult
        List<Row> rows = df.takeAsList(maxResult + 1);
        String template = context.getLocalProperties().get("template");
        if (!StringUtils.isBlank(template)) {
            if (rows.size() >= 1) {
                return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
            } else {
                return "";
            }
        }
        StringBuilder msg = new StringBuilder();
        msg.append("%table ");
        msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
        msg.append("\n");
        boolean isLargerThanMaxResult = rows.size() > maxResult;
        if (isLargerThanMaxResult) {
            rows = rows.subList(0, maxResult);
        }
        for (Row row : rows) {
            for (int i = 0; i < row.size(); ++i) {
                msg.append(TableDataUtils.normalizeColumn(row.get(i)));
                if (i != row.size() - 1) {
                    msg.append("\t");
                }
            }
            msg.append("\n");
        }
        if (isLargerThanMaxResult) {
            msg.append("\n");
            msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
        }
        // append %text at the end, otherwise the following output will be put in table as well.
        msg.append("\n%text ");
        return msg.toString();
    } else {
        return obj.toString();
    }
}
Also used : Dataset(org.apache.spark.sql.Dataset) SingleRowInterpreterResult(org.apache.zeppelin.interpreter.SingleRowInterpreterResult) GenericRow(org.apache.spark.sql.catalyst.expressions.GenericRow) Row(org.apache.spark.sql.Row)
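
The single-row template path above relies on sparkRowToList, which is not shown in this snippet. A minimal sketch of such a conversion, assuming it simply copies each column of the Row into a List (the actual Zeppelin helper may normalize values differently):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Row;

// Hypothetical sketch of sparkRowToList: copies every column of a Row into a List so the
// values can be substituted into the user-supplied template.
static List<Object> sparkRowToList(Row row) {
    List<Object> values = new ArrayList<>();
    for (int i = 0; i < row.size(); i++) {
        values.add(row.get(i));
    }
    return values;
}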

Example 9 with Dataset

use of org.apache.spark.sql.Dataset in project kylo by Teradata.

the class SparkMetadataExtractor method parse.

@Override
public List<FileMetadata> parse(String[] filePaths) {
    List<DataFrame> dataFrameList = new ArrayList<>();
    for (String path : filePaths) {
        DataFrame df = sqlContext.read().format("com.thinkbiganalytics.spark.file.metadata").load(path);
        dataFrameList.add(df);
    }
    DataFrame unionDf = SparkUtil.unionAll(dataFrameList);
    Encoder<FileMetadata> encoder = Encoders.bean(FileMetadata.class);
    Dataset<FileMetadata> dataset = unionDf.as(encoder);
    return dataset.collectAsList();
}
Also used : Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) FileMetadata(com.thinkbiganalytics.kylo.metadata.file.FileMetadata) DataFrame(org.apache.spark.sql.DataFrame)
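
SparkUtil.unionAll above folds the per-file DataFrames into one. A minimal sketch of such a helper on the Spark 1.x DataFrame API used here (the real Kylo utility may differ; on Spark 2.x Datasets the equivalent call is union):

import java.util.List;
import org.apache.spark.sql.DataFrame;

// Hypothetical sketch of a unionAll helper: folds a list of DataFrames with identical
// schemas into a single DataFrame by repeatedly calling DataFrame.unionAll.
static DataFrame unionAll(List<DataFrame> dataFrames) {
    DataFrame result = dataFrames.get(0);
    for (int i = 1; i < dataFrames.size(); i++) {
        result = result.unionAll(dataFrames.get(i));
    }
    return result;
}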

Example 10 with Dataset

use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin.

the class RandomForestRegressorInPipelineApp method main.

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("RandomForestRegressorApp").master("local[*]").getOrCreate();
    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
    df.show(20, false);
    // Automatically identify categorical features, and index them.
    // Set maxCategories so features with > 4 distinct values are treated as continuous.
    VectorIndexerModel featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(df);
    // Note: the data is not actually split here; the weights {1, 0} send every row to the
    // first split, and the full DataFrame is used for training below.
    Dataset<Row>[] splits = df.randomSplit(new double[] { 1, 0 });
    // Dataset<Row>[] splits = df.randomSplit(new double[] {0.7, 0.3});
    Dataset<Row> trainingData = df;
    // The test set is loaded from a separate file rather than taken from splits[1].
    Dataset<Row> testData = spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");
    // Train a RandomForest model.
    RandomForestRegressor rf = new RandomForestRegressor().setLabelCol("label").setFeaturesCol("indexedFeatures");
    // Chain indexer and forest in a Pipeline
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });
    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);
    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);
    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5);
    // Select (prediction, true label) and compute test error
    RegressionEvaluator evaluator = new RegressionEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("rmse");
    double rmse = evaluator.evaluate(predictions);
    System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
    RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
    System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
    // $example off$
    Double feature = 2.0;
    Vector features = Vectors.dense(feature);
    double p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    feature = 11.0;
    features = Vectors.dense(feature);
    p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    spark.stop();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Dataset(org.apache.spark.sql.Dataset) VectorIndexer(org.apache.spark.ml.feature.VectorIndexer) Pipeline(org.apache.spark.ml.Pipeline) PipelineModel(org.apache.spark.ml.PipelineModel) RandomForestRegressionModel(org.apache.spark.ml.regression.RandomForestRegressionModel) RandomForestRegressor(org.apache.spark.ml.regression.RandomForestRegressor) VectorIndexerModel(org.apache.spark.ml.feature.VectorIndexerModel) Row(org.apache.spark.sql.Row) RegressionEvaluator(org.apache.spark.ml.evaluation.RegressionEvaluator) Vector(org.apache.spark.ml.linalg.Vector)
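
A natural follow-up to the pipeline above is persisting the fitted model so the indexer and forest can be reused without retraining. The sketch below uses Spark ML's built-in model persistence; the helper class and path are illustrative and not part of the original example:

import java.io.IOException;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Illustrative helper, not part of the original example: saves a fitted pipeline to disk,
// reloads it, and scores a Dataset with the reloaded copy. The path is hypothetical.
public final class ModelPersistenceExample {

    public static Dataset<Row> saveAndScore(PipelineModel model, Dataset<Row> data, String path)
            throws IOException {
        // overwrite() allows re-running the example against an existing directory.
        model.write().overwrite().save(path);
        PipelineModel reloaded = PipelineModel.load(path);
        return reloaded.transform(data);
    }
}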

Aggregations

Dataset (org.apache.spark.sql.Dataset)27 Row (org.apache.spark.sql.Row)16 SparkSession (org.apache.spark.sql.SparkSession)14 ArrayList (java.util.ArrayList)12 JavaRDD (org.apache.spark.api.java.JavaRDD)10 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)10 List (java.util.List)8 Map (java.util.Map)7 Tuple2 (scala.Tuple2)7 Collectors (java.util.stream.Collectors)6 Set (java.util.Set)5 Serializable (java.io.Serializable)4 HashMap (java.util.HashMap)4 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)4 MapFunction (org.apache.spark.api.java.function.MapFunction)4 Pipeline (org.apache.spark.ml.Pipeline)4 PipelineModel (org.apache.spark.ml.PipelineModel)4 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)4 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)4 Iterator (java.util.Iterator)3