
Example 21 with Dataset

use of org.apache.spark.sql.Dataset in project kylo by Teradata.

the class SparkProvenance method run.

private void run(@Nonnull final PrintStream out, @Nonnull final String... args) {
    // Check how many arguments were passed in
    if (args.length < 5) {
        String msg = "Proper Usage is: <flowfile-id> <job-flowfile-id> <feed-name (category.feed)> <connection-url (url to connect to JMS or KAFAK)> <type (JMS, KAFKA)>" + "You provided " + args.length + " args which are (comma separated): " + StringUtils.join(args, ",");
        out.println(msg);
        throw new IllegalArgumentException(msg);
    }
    ProvenanceEventService provenanceEventService = null;
    final SparkContext sparkContext = SparkContext.getOrCreate();
    try {
        final SparkProvenanceConfiguration params = new SparkProvenanceConfiguration(args);
        // Get the proper ProvenanceService
        provenanceEventService = ProvenanceServiceFactory.getProvenanceEventService(params);
        // Collection of custom Provenance Events we will be sending to Kylo
        List<ProvenanceEventRecordDTO> events = new ArrayList<>();
        // Do some work: look up the database names in Hive
        final HiveContext hiveContext = new HiveContext(sparkContext);
        ProvenanceEventRecordDTO event = newEvent("Databases", params);
        Dataset df = hiveContext.sql("show databases");
        event.getAttributeMap().put("databases", df.toJSON().collectAsList().toString());
        event.setEventTime(System.currentTimeMillis());
        events.add(event);
        event = newEvent("Another Step", params);
        event.getAttributeMap().put("UUID 1", UUID.randomUUID().toString());
        event.setEventTime(System.currentTimeMillis());
        event.getAttributeMap().put("timestamp", String.valueOf(System.currentTimeMillis()));
        events.add(event);
        // Send the events off
        provenanceEventService.sendEvents(events);
        log.info("Spark app finished");
    } catch (Exception e) {
        log.error("Failed to run Spark Provenance Job: {}", e.toString(), e);
    } finally {
        if (provenanceEventService != null) {
            provenanceEventService.closeConnection();
        }
        sparkContext.stop();
        log.info("Exiting!!!!");
        System.exit(0);
    }
}
Also used : ProvenanceEventService(com.thinkbiganalytics.provenance.api.ProvenanceEventService) SparkContext(org.apache.spark.SparkContext) ProvenanceEventRecordDTO(com.thinkbiganalytics.nifi.provenance.model.ProvenanceEventRecordDTO) Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) HiveContext(org.apache.spark.sql.hive.HiveContext)
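
Note: HiveContext is the Spark 1.x entry point; on Spark 2.x the same database lookup is usually done through SparkSession. A minimal sketch of that variant (illustrative only, not part of the kylo source; class and application names are made up):

import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ShowDatabasesSketch {

    public static void main(String[] args) {
        // Build a session with Hive support (requires the Hive classes on the classpath).
        SparkSession spark = SparkSession.builder().appName("show-databases-sketch").enableHiveSupport().getOrCreate();
        // Same query as in the provenance example above.
        Dataset<Row> df = spark.sql("show databases");
        // Collect the result as JSON strings, as the event attribute does above.
        List<String> databases = df.toJSON().collectAsList();
        System.out.println(databases);
        spark.stop();
    }
}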

Example 22 with Dataset

use of org.apache.spark.sql.Dataset in project kylo by Teradata.

the class SparkFileMetadataExtractor method parse.

@Override
public List<FileMetadata> parse(String[] filePaths) {
    List<Dataset> dataFrameList = new ArrayList<>();
    for (String path : filePaths) {
        Dataset df = (Dataset) sqlContext.read().format("com.thinkbiganalytics.spark.file.metadata").load(path);
        dataFrameList.add(df);
    }
    Dataset unionDf = unionAll(dataFrameList);
    Encoder<FileMetadata> encoder = Encoders.bean(FileMetadata.class);
    Dataset fileData = unionDf.as(encoder);
    return fileData.collectAsList();
}
Also used : Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) FileMetadata(com.thinkbiganalytics.kylo.metadata.file.FileMetadata)
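
The unionAll helper called above is not part of the snippet. A plausible minimal implementation, assuming Spark 2.x where Dataset.union is available (the snippet itself uses the raw Dataset type), is simply a fold over the list:

// Hypothetical helper, not from the kylo source: combines the per-file
// metadata frames into a single Dataset by repeated union.
private Dataset<Row> unionAll(List<Dataset<Row>> dataFrames) {
    Dataset<Row> result = dataFrames.get(0);
    for (int i = 1; i < dataFrames.size(); i++) {
        result = result.union(dataFrames.get(i));
    }
    return result;
}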

Example 23 with Dataset

use of org.apache.spark.sql.Dataset in project ignite by apache.

the class SharedRDDExample method main.

/**
 * Executes the example.
 * @param args Command line arguments, none required.
 */
public static void main(String[] args) {
    // Spark Configuration.
    SparkConf sparkConf = new SparkConf().setAppName("JavaIgniteRDDExample").setMaster("local").set("spark.executor.instances", "2");
    // Spark context.
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // Adjust the logger to suppress log output that is not of interest.
    Logger.getRootLogger().setLevel(Level.ERROR);
    Logger.getLogger("org.apache.ignite").setLevel(Level.INFO);
    // Creates Ignite context with specific configuration and runs Ignite in the embedded mode.
    JavaIgniteContext<Integer, Integer> igniteContext = new JavaIgniteContext<Integer, Integer>(sparkContext, "examples/config/spark/example-shared-rdd.xml", false);
    // Create a JavaIgniteRDD of (Integer, Integer) pairs backed by the "sharedRDD" cache.
    JavaIgniteRDD<Integer, Integer> sharedRDD = igniteContext.<Integer, Integer>fromCache("sharedRDD");
    // Define data to be stored in the Ignite RDD (cache).
    List<Integer> data = new ArrayList<>(20);
    for (int i = 0; i < 20; i++) {
        data.add(i);
    }
    // Preparing a Java RDD.
    JavaRDD<Integer> javaRDD = sparkContext.<Integer>parallelize(data);
    // Fill the Ignite RDD with Integer pairs; pairs are represented as Scala Tuple2.
    sharedRDD.savePairs(javaRDD.<Integer, Integer>mapToPair(new PairFunction<Integer, Integer, Integer>() {

        @Override
        public Tuple2<Integer, Integer> call(Integer val) throws Exception {
            return new Tuple2<Integer, Integer>(val, val);
        }
    }));
    System.out.println(">>> Iterating over Ignite Shared RDD...");
    // Iterate over the Ignite RDD.
    sharedRDD.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Transforming values stored in Ignite Shared RDD...");
    // Keep only the pairs whose values are even, producing a transformed RDD.
    JavaPairRDD<Integer, Integer> transformedValues = sharedRDD.filter(new Function<Tuple2<Integer, Integer>, Boolean>() {

        @Override
        public Boolean call(Tuple2<Integer, Integer> tuple) throws Exception {
            return tuple._2() % 2 == 0;
        }
    });
    // Print out the transformed values.
    transformedValues.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Executing SQL query over Ignite Shared RDD...");
    // Execute SQL query over the Ignite RDD.
    Dataset df = sharedRDD.sql("select _val from Integer where _key < 9");
    // Show the result of the execution.
    df.show();
    // Close IgniteContext on all the workers.
    igniteContext.close(true);
}
Also used : Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) JavaIgniteContext(org.apache.ignite.spark.JavaIgniteContext) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
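
Since PairFunction, VoidFunction and Function are functional interfaces, the anonymous classes above can be collapsed into lambdas on Java 8+. A condensed sketch, reusing the igniteContext and javaRDD variables from the example (behavior unchanged):

// Save, iterate, filter and print with lambdas instead of anonymous classes.
JavaIgniteRDD<Integer, Integer> sharedRDD = igniteContext.fromCache("sharedRDD");
sharedRDD.savePairs(javaRDD.mapToPair(val -> new Tuple2<>(val, val)));
sharedRDD.foreach(tuple -> System.out.println("(" + tuple._1() + "," + tuple._2() + ")"));
JavaPairRDD<Integer, Integer> evenValues = sharedRDD.filter(tuple -> tuple._2() % 2 == 0);
evenValues.foreach(tuple -> System.out.println("(" + tuple._1() + "," + tuple._2() + ")"));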

Example 24 with Dataset

use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin.

the class PiComputeLambdaApp method start.

/**
 * The processing code.
 */
private void start(int slices) {
    int numberOfThrows = 100000 * slices;
    System.out.println("About to throw " + numberOfThrows + " darts, ready? Stay away from the target!");
    long t0 = System.currentTimeMillis();
    SparkSession spark = SparkSession.builder().appName("Spark Pi with lambdas").master("local[*]").getOrCreate();
    long t1 = System.currentTimeMillis();
    System.out.println("Session initialized in " + (t1 - t0) + " ms");
    List<Integer> l = new ArrayList<>(numberOfThrows);
    for (int i = 0; i < numberOfThrows; i++) {
        l.add(i);
    }
    Dataset<Row> incrementalDf = spark.createDataset(l, Encoders.INT()).toDF();
    long t2 = System.currentTimeMillis();
    System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");
    Dataset<Integer> dotsDs = incrementalDf.map((MapFunction<Row, Integer>) status -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        counter++;
        if (counter % 100000 == 0) {
            System.out.println("" + counter + " darts thrown so far");
        }
        return (x * x + y * y <= 1) ? 1 : 0;
    }, Encoders.INT());
    long t3 = System.currentTimeMillis();
    System.out.println("Throwing darts done in " + (t3 - t2) + " ms");
    int dartsInCircle = dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);
    long t4 = System.currentTimeMillis();
    System.out.println("Analyzing result in " + (t4 - t3) + " ms");
    System.out.println("Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);
    spark.stop();
}
Also used : List(java.util.List) Dataset(org.apache.spark.sql.Dataset) Row(org.apache.spark.sql.Row) ReduceFunction(org.apache.spark.api.java.function.ReduceFunction) MapFunction(org.apache.spark.api.java.function.MapFunction) Encoders(org.apache.spark.sql.Encoders) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) SparkSession(org.apache.spark.sql.SparkSession)
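
One caveat in the example above: counter is an instance field of the application class, so the progress messages are only meaningful when everything runs in-process with master local[*]. A hedged sketch of the same map step using a Spark LongAccumulator instead (spark and incrementalDf are assumed to be set up as in the example; the accumulator name and variable names are illustrative):

// Requires org.apache.spark.util.LongAccumulator (Spark 2.0+).
// Count throws with an accumulator so the total is also visible on a cluster.
LongAccumulator throwsCounter = spark.sparkContext().longAccumulator("throws");
Dataset<Integer> dots = incrementalDf.map((MapFunction<Row, Integer>) status -> {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    throwsCounter.add(1);
    return (x * x + y * y <= 1) ? 1 : 0;
}, Encoders.INT());

Accumulator updates made inside transformations can be over-counted on task retries, so this is a progress indicator rather than an exact count.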

Example 25 with Dataset

use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin.

the class RandomForestRegressorApp method main.

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("RandomForestRegressorApp").master("local[*]").getOrCreate();
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
    df.show(20, false);
    // Automatically identify categorical features, and index them.
    // Set maxCategories so features with > 4 distinct values are treated as continuous.
    VectorIndexerModel featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(df);
    // This example trains on the full dataset and loads a separate file as the test set.
    // A conventional 70/30 split would look like the commented-out lines below.
    // Dataset<Row>[] splits = df.randomSplit(new double[] { 0.7, 0.3 });
    // Dataset<Row> trainingData = splits[0];
    // Dataset<Row> testData = splits[1];
    Dataset<Row> trainingData = df;
    Dataset<Row> testData = spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");
    // Train a RandomForest model.
    RandomForestRegressor rf = new RandomForestRegressor().setLabelCol("label").setFeaturesCol("indexedFeatures");
    // Chain indexer and forest in a Pipeline
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });
    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);
    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);
    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5);
    // Select (prediction, true label) and compute test error
    RegressionEvaluator evaluator = new RegressionEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("rmse");
    double rmse = evaluator.evaluate(predictions);
    System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
    RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
    System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
    // $example off$
    Double feature = 2.0;
    Vector features = Vectors.dense(feature);
    double p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    feature = 11.0;
    features = Vectors.dense(feature);
    p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    spark.stop();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Dataset(org.apache.spark.sql.Dataset) VectorIndexer(org.apache.spark.ml.feature.VectorIndexer) Pipeline(org.apache.spark.ml.Pipeline) PipelineModel(org.apache.spark.ml.PipelineModel) RandomForestRegressionModel(org.apache.spark.ml.regression.RandomForestRegressionModel) RandomForestRegressor(org.apache.spark.ml.regression.RandomForestRegressor) VectorIndexerModel(org.apache.spark.ml.feature.VectorIndexerModel) Row(org.apache.spark.sql.Row) RegressionEvaluator(org.apache.spark.ml.evaluation.RegressionEvaluator) Vector(org.apache.spark.ml.linalg.Vector)
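
The fitted PipelineModel above can also be persisted and reloaded via the Spark ML writer/reader API. A short sketch, with a hypothetical output path:

// Persist the fitted pipeline (the path is illustrative) and load it back later.
model.write().overwrite().save("/tmp/simplegauss-rf-model");
PipelineModel reloaded = PipelineModel.load("/tmp/simplegauss-rf-model");
Dataset<Row> reloadedPredictions = reloaded.transform(testData);
reloadedPredictions.select("prediction", "label", "features").show(5);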

Aggregations

Dataset (org.apache.spark.sql.Dataset): 27
Row (org.apache.spark.sql.Row): 16
SparkSession (org.apache.spark.sql.SparkSession): 14
ArrayList (java.util.ArrayList): 12
JavaRDD (org.apache.spark.api.java.JavaRDD): 10
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 10
List (java.util.List): 8
Map (java.util.Map): 7
Tuple2 (scala.Tuple2): 7
Collectors (java.util.stream.Collectors): 6
Set (java.util.Set): 5
Serializable (java.io.Serializable): 4
HashMap (java.util.HashMap): 4
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 4
MapFunction (org.apache.spark.api.java.function.MapFunction): 4
Pipeline (org.apache.spark.ml.Pipeline): 4
PipelineModel (org.apache.spark.ml.PipelineModel): 4
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 4
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 4
Iterator (java.util.Iterator): 3