Use of org.apache.spark.sql.Dataset in project beam by apache.
From the class AbstractTranslationContext, method startPipeline:
// --------------------------------------------------------------------------------------------
// Pipeline methods
// --------------------------------------------------------------------------------------------
/** Starts the pipeline. */
public void startPipeline() {
  SparkStructuredStreamingPipelineOptions options =
      serializablePipelineOptions.get().as(SparkStructuredStreamingPipelineOptions.class);
  int datasetIndex = 0;
  for (Dataset<?> dataset : leaves) {
    if (options.isStreaming()) {
      // TODO: handle the Beam Discarding, Accumulating, and Accumulating & Retracting output modes
      // via DataStreamWriter.outputMode
      DataStreamWriter<?> dataStreamWriter = dataset.writeStream();
      // Spark sets a default checkpoint dir if none is configured.
      if (options.getCheckpointDir() != null) {
        dataStreamWriter =
            dataStreamWriter.option("checkpointLocation", options.getCheckpointDir());
      }
      launchStreaming(dataStreamWriter.foreach(new NoOpForeachWriter<>()));
    } else {
      if (options.getTestMode()) {
        LOG.debug("**** dataset {} catalyst execution plans ****", ++datasetIndex);
        dataset.explain(true);
      }
      // Apply a no-op ForeachFunction: the foreach action is what triggers the pipeline run in Spark.
      dataset.foreach((ForeachFunction) t -> {
      });
    }
  }
}
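The batch branch relies on Spark's lazy evaluation: transformations only build a plan, and any action, even a do-nothing foreach, forces that plan to run. Below is a minimal, self-contained sketch of the same trick outside Beam; the class name NoOpActionSketch and the sample data are illustrative, not part of the Beam codebase.

import java.util.Arrays;

import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class NoOpActionSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("NoOpActionSketch").master("local[*]").getOrCreate();
    // Transformations only build an execution plan; nothing runs yet.
    Dataset<String> words = spark.createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING());
    Dataset<String> upper = words.map((MapFunction<String, String>) s -> s.toUpperCase(), Encoders.STRING());
    // A do-nothing foreach is still an action, so it forces the plan above to execute.
    upper.foreach((ForeachFunction<String>) s -> {
    });
    spark.stop();
  }
}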
Use of org.apache.spark.sql.Dataset in project zeppelin by apache.
From the class Spark2Shims, method showDataFrame:
@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
  if (obj instanceof Dataset) {
    Dataset<Row> df = ((Dataset) obj).toDF();
    String[] columns = df.columns();
    // A DDL statement yields an empty DataFrame.
    if (columns.length == 0) {
      return "";
    }
    // Fetch maxResult + 1 rows so that we can check whether the result is larger than zeppelin.spark.maxResult.
    List<Row> rows = df.takeAsList(maxResult + 1);
    String template = context.getLocalProperties().get("template");
    if (!StringUtils.isBlank(template)) {
      if (rows.size() >= 1) {
        return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
      } else {
        return "";
      }
    }
    StringBuilder msg = new StringBuilder();
    msg.append("\n%table ");
    msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
    msg.append("\n");
    boolean isLargerThanMaxResult = rows.size() > maxResult;
    if (isLargerThanMaxResult) {
      rows = rows.subList(0, maxResult);
    }
    for (Row row : rows) {
      for (int i = 0; i < row.size(); ++i) {
        msg.append(TableDataUtils.normalizeColumn(row.get(i)));
        if (i != row.size() - 1) {
          msg.append("\t");
        }
      }
      msg.append("\n");
    }
    if (isLargerThanMaxResult) {
      msg.append("\n");
      msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
    }
    // Append %text at the end; otherwise the following output would be put into the table as well.
    msg.append("\n%text ");
    return msg.toString();
  } else {
    return obj.toString();
  }
}
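The key idea in this shim is to fetch maxResult + 1 rows so it can tell whether the DataFrame holds more rows than it is allowed to display, without counting the whole dataset. Here is a standalone sketch of that truncation check; it replaces Zeppelin's TableDataUtils and ResultMessages helpers with plain string handling, and the class and method names are illustrative.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class TruncationCheckSketch {
  static String render(Dataset<Row> df, int maxResult) {
    // Take one row more than the display limit; a cheap way to detect overflow.
    List<Row> rows = df.takeAsList(maxResult + 1);
    boolean truncated = rows.size() > maxResult;
    if (truncated) {
      rows = rows.subList(0, maxResult);
    }
    StringBuilder out = new StringBuilder();
    for (Row row : rows) {
      out.append(row.mkString("\t")).append("\n");
    }
    if (truncated) {
      out.append("... output truncated at ").append(maxResult).append(" rows\n");
    }
    return out.toString();
  }
}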
Use of org.apache.spark.sql.Dataset in project zeppelin by apache.
From the class Spark3Shims, method showDataFrame:
@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
  if (obj instanceof Dataset) {
    Dataset<Row> df = ((Dataset) obj).toDF();
    String[] columns = df.columns();
    // A DDL statement yields an empty DataFrame.
    if (columns.length == 0) {
      return "";
    }
    // Fetch maxResult + 1 rows so that we can check whether the result is larger than zeppelin.spark.maxResult.
    List<Row> rows = df.takeAsList(maxResult + 1);
    String template = context.getLocalProperties().get("template");
    if (!StringUtils.isBlank(template)) {
      if (rows.size() >= 1) {
        return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
      } else {
        return "";
      }
    }
    StringBuilder msg = new StringBuilder();
    msg.append("%table ");
    msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
    msg.append("\n");
    boolean isLargerThanMaxResult = rows.size() > maxResult;
    if (isLargerThanMaxResult) {
      rows = rows.subList(0, maxResult);
    }
    for (Row row : rows) {
      for (int i = 0; i < row.size(); ++i) {
        msg.append(TableDataUtils.normalizeColumn(row.get(i)));
        if (i != row.size() - 1) {
          msg.append("\t");
        }
      }
      msg.append("\n");
    }
    if (isLargerThanMaxResult) {
      msg.append("\n");
      msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
    }
    // Append %text at the end; otherwise the following output would be put into the table as well.
    msg.append("\n%text ");
    return msg.toString();
  } else {
    return obj.toString();
  }
}
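Both shims build Zeppelin's %table format, which separates columns with tabs and rows with newlines, so every cell has to be sanitized before it is joined; the code delegates this to TableDataUtils.normalizeColumn. The sketch below is a rough, hypothetical equivalent of that normalization step, not Zeppelin's actual implementation.

public class CellNormalizerSketch {
  // Replace the characters that %table treats as delimiters; nulls become the string "null".
  static String normalizeCell(Object value) {
    String text = value == null ? "null" : value.toString();
    return text.replace("\t", " ").replace("\r\n", " ").replace("\n", " ");
  }

  public static void main(String[] args) {
    System.out.println(normalizeCell("a\tvalue\nwith breaks"));  // prints "a value with breaks"
  }
}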
Use of org.apache.spark.sql.Dataset in project kylo by Teradata.
From the class SparkMetadataExtractor, method parse:
@Override
public List<FileMetadata> parse(String[] filePaths) {
  List<DataFrame> dataFrameList = new ArrayList<>();
  for (String path : filePaths) {
    DataFrame df = sqlContext.read().format("com.thinkbiganalytics.spark.file.metadata").load(path);
    dataFrameList.add(df);
  }
  DataFrame unionDf = SparkUtil.unionAll(dataFrameList);
  Encoder<FileMetadata> encoder = Encoders.bean(FileMetadata.class);
  Dataset<FileMetadata> dataset = unionDf.as(encoder);
  return dataset.collectAsList();
}
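The interesting step here is mapping untyped rows into a typed bean list: Encoders.bean builds an encoder from the FileMetadata JavaBean, as(encoder) reinterprets the frame as a Dataset of that bean, and collectAsList brings the results back to the driver. A minimal sketch of the same pattern follows; the Person bean and the people.json input file are illustrative, not from Kylo.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BeanEncoderSketch {
  // A plain JavaBean: Spark maps columns to properties by getter/setter names.
  public static class Person {
    private String name;
    private long age;
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public long getAge() { return age; }
    public void setAge(long age) { this.age = age; }
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("BeanEncoderSketch").master("local[*]").getOrCreate();
    Dataset<Row> df = spark.read().json("people.json");  // assumes columns "name" and "age"
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> people = df.as(encoder);
    List<Person> collected = people.collectAsList();
    collected.forEach(p -> System.out.println(p.getName() + ": " + p.getAge()));
    spark.stop();
  }
}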
Use of org.apache.spark.sql.Dataset in project net.jgp.labs.spark by jgperrin.
From the class RandomForestRegressorInPipelineApp, method main:
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder()
      .appName("RandomForestRegressorApp")
      .master("local[*]")
      .getOrCreate();
  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
  df.show(20, false);
  // Automatically identify categorical features and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(df);
  // The usual 70/30 split is left commented out: the full dataset is used for training
  // and a separate file is loaded as the test set.
  Dataset<Row>[] splits = df.randomSplit(new double[] { 1, 0 });
  // Dataset<Row>[] splits = df.randomSplit(new double[] { 0.7, 0.3 });
  Dataset<Row> trainingData = df; // splits[0];
  // Dataset<Row> testData = splits[1];
  Dataset<Row> testData = spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");
  // Train a RandomForest model.
  RandomForestRegressor rf = new RandomForestRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures");
  // Chain indexer and forest in a Pipeline.
  Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });
  // Train the model. This also runs the indexer.
  PipelineModel model = pipeline.fit(trainingData);
  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);
  // Select example rows to display.
  predictions.select("prediction", "label", "features").show(5);
  // Select (prediction, true label) and compute the test error.
  RegressionEvaluator evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse");
  double rmse = evaluator.evaluate(predictions);
  System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
  RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
  System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
  // $example off$
  Double feature = 2.0;
  Vector features = Vectors.dense(feature);
  double p = rfModel.predict(features);
  System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
  feature = 11.0;
  features = Vectors.dense(feature);
  p = rfModel.predict(features);
  System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
  spark.stop();
}
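The commented-out lines in main hint at the more common setup, where a single dataset is split 70/30 into training and test sets rather than loading a separate test file. Below is a minimal sketch of that variant, factored into a helper; the class name, method name, and seed value are illustrative, and the pipeline is assumed to be built as in the example above.

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class RandomSplitSketch {
  // Fits the given pipeline on a 70/30 random split of a single libsvm file and returns predictions on the held-out part.
  static Dataset<Row> fitWithRandomSplit(SparkSession spark, Pipeline pipeline, String path) {
    Dataset<Row> data = spark.read().format("libsvm").load(path);
    // A fixed seed makes the split reproducible across runs.
    Dataset<Row>[] splits = data.randomSplit(new double[] { 0.7, 0.3 }, 42L);
    Dataset<Row> train = splits[0];
    Dataset<Row> test = splits[1];
    PipelineModel model = pipeline.fit(train);
    return model.transform(test);
  }
}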