Use of org.apache.spark.sql.Dataset in project kylo by Teradata: class SparkProvenance, method run.
private void run(@Nonnull final PrintStream out, @Nonnull final String... args) {
    // Check how many arguments were passed in
    if (args.length < 5) {
        String msg = "Proper Usage is: <flowfile-id> <job-flowfile-id> <feed-name (category.feed)> <connection-url (url to connect to JMS or KAFKA)> <type (JMS, KAFKA)>. "
                     + "You provided " + args.length + " args which are (comma separated): " + StringUtils.join(args, ",");
        out.println(msg);
        throw new IllegalArgumentException(msg);
    }
    ProvenanceEventService provenanceEventService = null;
    final SparkContext sparkContext = SparkContext.getOrCreate();
    try {
        final SparkProvenanceConfiguration params = new SparkProvenanceConfiguration(args);
        // Get the proper ProvenanceService
        provenanceEventService = ProvenanceServiceFactory.getProvenanceEventService(params);
        // Collection of custom provenance events we will be sending to Kylo
        List<ProvenanceEventRecordDTO> events = new ArrayList<>();
        // Do some work, i.e. look up the database names in Hive
        final HiveContext hiveContext = new HiveContext(sparkContext);
        ProvenanceEventRecordDTO event = newEvent("Databases", params);
        Dataset<Row> df = hiveContext.sql("show databases");
        event.getAttributeMap().put("databases", df.toJSON().collectAsList().toString());
        event.setEventTime(System.currentTimeMillis());
        events.add(event);
        event = newEvent("Another Step", params);
        event.getAttributeMap().put("UUID 1", UUID.randomUUID().toString());
        event.setEventTime(System.currentTimeMillis());
        event.getAttributeMap().put("timestamp", String.valueOf(System.currentTimeMillis()));
        events.add(event);
        // Send the events off
        provenanceEventService.sendEvents(events);
        log.info("Spark app finished");
    } catch (Exception e) {
        log.error("Failed to run Spark Provenance Job: {}", e.toString(), e);
    } finally {
        // Guard against the factory call above having failed before the service was created
        if (provenanceEventService != null) {
            provenanceEventService.closeConnection();
        }
        sparkContext.stop();
        log.info("Exiting!!!!");
        System.exit(0);
    }
}
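The snippet above relies on the Spark 1.x style HiveContext, which is deprecated in Spark 2.x. As a point of comparison, here is a minimal, self-contained sketch of the same database lookup through SparkSession with Hive support enabled; the class name and app name are illustrative and not part of kylo.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ListHiveDatabases {
    public static void main(String[] args) {
        // SparkSession replaces SQLContext/HiveContext in Spark 2.x
        SparkSession spark = SparkSession.builder()
            .appName("list-hive-databases")
            .enableHiveSupport()
            .getOrCreate();
        // The same query the provenance job runs above
        Dataset<Row> databases = spark.sql("show databases");
        // Collect the result as JSON strings, as the event attribute map does above
        System.out.println(databases.toJSON().collectAsList());
        spark.stop();
    }
}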
Use of org.apache.spark.sql.Dataset in project kylo by Teradata: class SparkFileMetadataExtractor, method parse.
@Override
public List<FileMetadata> parse(String[] filePaths) {
    List<Dataset> dataFrameList = new ArrayList<>();
    for (String path : filePaths) {
        Dataset df = (Dataset) sqlContext.read().format("com.thinkbiganalytics.spark.file.metadata").load(path);
        dataFrameList.add(df);
    }
    Dataset unionDf = unionAll(dataFrameList);
    Encoder<FileMetadata> encoder = Encoders.bean(FileMetadata.class);
    Dataset fileData = unionDf.as(encoder);
    return fileData.collectAsList();
}
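The unionAll helper invoked above is not shown in the snippet. A minimal sketch of what such a helper could look like on Spark 2.x, assuming plain Dataset.union semantics (class and method names here are illustrative; the actual kylo helper may differ):

import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class DatasetUnions {
    // Hypothetical helper: folds DataFrames with identical schemas into a single one.
    static Dataset<Row> unionAll(List<Dataset<Row>> frames) {
        Dataset<Row> result = frames.get(0);
        for (int i = 1; i < frames.size(); i++) {
            // union() appends rows by column position, so every frame must share the schema
            result = result.union(frames.get(i));
        }
        return result;
    }
}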
Use of org.apache.spark.sql.Dataset in project ignite by apache: class SharedRDDExample, method main.
/**
* Executes the example.
* @param args Command line arguments, none required.
*/
public static void main(String[] args) {
    // Spark configuration.
    SparkConf sparkConf = new SparkConf()
        .setAppName("JavaIgniteRDDExample")
        .setMaster("local")
        .set("spark.executor.instances", "2");
    // Spark context.
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // Adjust the logger to exclude the logs of no interest.
    Logger.getRootLogger().setLevel(Level.ERROR);
    Logger.getLogger("org.apache.ignite").setLevel(Level.INFO);
    // Creates Ignite context with specific configuration and runs Ignite in the embedded mode.
    JavaIgniteContext<Integer, Integer> igniteContext = new JavaIgniteContext<Integer, Integer>(
        sparkContext, "examples/config/spark/example-shared-rdd.xml", false);
    // Create a Java Ignite RDD of (Integer, Integer) pairs.
    JavaIgniteRDD<Integer, Integer> sharedRDD = igniteContext.<Integer, Integer>fromCache("sharedRDD");
    // Define data to be stored in the Ignite RDD (cache).
    List<Integer> data = new ArrayList<>(20);
    for (int i = 0; i < 20; i++) {
        data.add(i);
    }
    // Prepare a Java RDD.
    JavaRDD<Integer> javaRDD = sparkContext.<Integer>parallelize(data);
    // Fill the Ignite RDD with integer pairs, represented as Scala Tuple2.
    sharedRDD.savePairs(javaRDD.<Integer, Integer>mapToPair(new PairFunction<Integer, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(Integer val) throws Exception {
            return new Tuple2<Integer, Integer>(val, val);
        }
    }));
    System.out.println(">>> Iterating over Ignite Shared RDD...");
    // Iterate over the Ignite RDD.
    sharedRDD.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {
        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Transforming values stored in Ignite Shared RDD...");
    // Keep only the pairs whose values are even, as a transformed RDD.
    JavaPairRDD<Integer, Integer> transformedValues = sharedRDD.filter(new Function<Tuple2<Integer, Integer>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Integer, Integer> tuple) throws Exception {
            return tuple._2() % 2 == 0;
        }
    });
    // Print out the transformed values.
    transformedValues.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {
        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Executing SQL query over Ignite Shared RDD...");
    // Execute an SQL query over the Ignite RDD.
    Dataset df = sharedRDD.sql("select _val from Integer where _key < 9");
    // Show the result of the execution.
    df.show();
    // Close IgniteContext on all the workers.
    igniteContext.close(true);
}
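Since sharedRDD.sql(...) returns an ordinary Spark Dataset of rows, the standard DataFrame API can be chained onto the result. A minimal sketch continuing the example above (the _key/_val column names follow the query already used; the threshold is arbitrary):

import static org.apache.spark.sql.functions.col;

// ...continuing inside main(), before igniteContext.close(true):
Dataset<Row> pairs = sharedRDD.sql("select _key, _val from Integer");
// The result of the Ignite SQL query is a regular DataFrame, so the usual API applies
pairs.filter(col("_val").gt(10)).orderBy(col("_key")).show();
System.out.println(">>> Rows with _val > 10: " + pairs.filter(col("_val").gt(10)).count());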
Use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin: class PiComputeLambdaApp, method start.
/**
* The processing code.
*/
private void start(int slices) {
    int numberOfThrows = 100000 * slices;
    System.out.println("About to throw " + numberOfThrows + " darts, ready? Stay away from the target!");
    long t0 = System.currentTimeMillis();
    SparkSession spark = SparkSession.builder()
        .appName("Spark Pi with lambdas")
        .master("local[*]")
        .getOrCreate();
    long t1 = System.currentTimeMillis();
    System.out.println("Session initialized in " + (t1 - t0) + " ms");
    List<Integer> l = new ArrayList<>(numberOfThrows);
    for (int i = 0; i < numberOfThrows; i++) {
        l.add(i);
    }
    Dataset<Row> incrementalDf = spark.createDataset(l, Encoders.INT()).toDF();
    long t2 = System.currentTimeMillis();
    System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");
    Dataset<Integer> dotsDs = incrementalDf.map((MapFunction<Row, Integer>) status -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        counter++;
        if (counter % 100000 == 0) {
            System.out.println("" + counter + " darts thrown so far");
        }
        return (x * x + y * y <= 1) ? 1 : 0;
    }, Encoders.INT());
    long t3 = System.currentTimeMillis();
    System.out.println("Throwing darts done in " + (t3 - t2) + " ms");
    int dartsInCircle = dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);
    long t4 = System.currentTimeMillis();
    System.out.println("Analyzing result in " + (t4 - t3) + " ms");
    System.out.println("Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);
    spark.stop();
}
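One caveat: the counter field incremented inside the map lambda only behaves as expected here because the master is local[*], so the driver and executors share a JVM. On a real cluster, a Spark accumulator is the usual way to count work done by executors. A minimal sketch, assuming the same spark session and incrementalDf as above (the accumulator name is illustrative):

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.util.LongAccumulator;

// ...inside start(), replacing the instance-field counter:
LongAccumulator throwsDone = spark.sparkContext().longAccumulator("darts thrown");
Dataset<Integer> dotsDs = incrementalDf.map((MapFunction<Row, Integer>) status -> {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    throwsDone.add(1); // merged back on the driver once tasks finish
    return (x * x + y * y <= 1) ? 1 : 0;
}, Encoders.INT());

After the reduce action runs, throwsDone.value() on the driver holds the total; note that accumulators updated inside transformations can over-count if tasks are retried.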
Use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin: class RandomForestRegressorApp, method main.
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RandomForestRegressorApp")
        .master("local[*]")
        .getOrCreate();
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
    df.show(20, false);
    // Automatically identify categorical features, and index them.
    // Set maxCategories so features with > 4 distinct values are treated as continuous.
    VectorIndexerModel featureIndexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4)
        .fit(df);
    // A 70/30 split could be used to hold out test data:
    // Dataset<Row>[] splits = df.randomSplit(new double[] { 0.7, 0.3 });
    // Dataset<Row> trainingData = splits[0];
    // Dataset<Row> testData = splits[1];
    // Here the full dataset is used for training and a separate file for testing.
    Dataset<Row> trainingData = df;
    Dataset<Row> testData = spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");
    // Train a RandomForest model.
    RandomForestRegressor rf = new RandomForestRegressor()
        .setLabelCol("label")
        .setFeaturesCol("indexedFeatures");
    // Chain indexer and forest in a Pipeline.
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });
    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);
    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);
    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5);
    // Select (prediction, true label) and compute test error.
    RegressionEvaluator evaluator = new RegressionEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("rmse");
    double rmse = evaluator.evaluate(predictions);
    System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
    RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
    System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
    Double feature = 2.0;
    Vector features = Vectors.dense(feature);
    double p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    feature = 11.0;
    features = Vectors.dense(feature);
    p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    spark.stop();
}
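If no separate test file were available, the 70/30 split hinted at in the comments could be used instead; a minimal sketch with an arbitrary fixed seed for reproducibility:

// ...instead of loading simplegauss_test.txt:
Dataset<Row>[] splits = df.randomSplit(new double[] { 0.7, 0.3 }, 42L);
Dataset<Row> trainingData = splits[0]; // 70% used to fit the pipeline
Dataset<Row> testData = splits[1];     // 30% held out for the RMSE evaluation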