Use of io.cdap.cdap.api.TxRunnable in project cdap by caskdata.
The class DynamicSparkCompute, method lazyInit.
// when checkpointing is enabled, and Spark is loading DStream operations from an existing checkpoint,
// delegate will be null and the initialize() method won't have been called. So we need to instantiate
// the delegate and initialize it.
private void lazyInit(final JavaSparkContext jsc) throws Exception {
  if (delegate == null) {
    PluginFunctionContext pluginFunctionContext = dynamicDriverContext.getPluginFunctionContext();
    delegate = pluginFunctionContext.createPlugin();
    final StageSpec stageSpec = pluginFunctionContext.getStageSpec();
    final JavaSparkExecutionContext sec = dynamicDriverContext.getSparkExecutionContext();
    Transactionals.execute(sec, new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
        SparkExecutionPluginContext sparkPluginContext =
          new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, pipelineRuntime, stageSpec);
        delegate.initialize(sparkPluginContext);
      }
    }, Exception.class);
  }
}
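A minimal sketch of the same transactional block written with a Java 8 lambda, assuming TxRunnable keeps its single run(DatasetContext) method; the explicit (TxRunnable) cast is only there to keep overload resolution unambiguous where Transactionals also accepts a TxCallable:

Transactionals.execute(sec, (TxRunnable) datasetContext -> {
  // delegate, jsc, stageSpec and sec are the same variables as in lazyInit above
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  SparkExecutionPluginContext sparkPluginContext =
    new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, pipelineRuntime, stageSpec);
  delegate.initialize(sparkPluginContext);
}, Exception.class);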
Use of io.cdap.cdap.api.TxRunnable in project cdap by caskdata.
The class DStreamCollection, method compute.
@Override
public <U> SparkCollection<U> compute(StageSpec stageSpec, SparkCompute<T, U> compute) throws Exception {
  SparkCompute<T, U> wrappedCompute =
    new DynamicSparkCompute<>(new DynamicDriverContext(stageSpec, sec, new NoopStageStatisticsCollector()), compute);
  Transactionals.execute(sec, new TxRunnable() {
    @Override
    public void run(DatasetContext datasetContext) throws Exception {
      PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
      SparkExecutionPluginContext sparkPluginContext = new BasicSparkExecutionPluginContext(
        sec, JavaSparkContext.fromSparkContext(stream.context().sparkContext()),
        datasetContext, pipelineRuntime, stageSpec);
      wrappedCompute.initialize(sparkPluginContext);
    }
  }, Exception.class);
  return wrap(stream.transform(new ComputeTransformFunction<>(sec, stageSpec, wrappedCompute)));
}
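The Exception.class argument selects the Transactionals.execute overload that propagates exceptions of that type thrown inside run() to the caller rather than surfacing them wrapped in an unchecked exception. A hedged sketch of a caller that handles such an initialization failure (the try/catch and the message are illustrative, not part of DStreamCollection):

try {
  Transactionals.execute(sec, new TxRunnable() {
    @Override
    public void run(DatasetContext datasetContext) throws Exception {
      wrappedCompute.initialize(new BasicSparkExecutionPluginContext(
        sec, JavaSparkContext.fromSparkContext(stream.context().sparkContext()),
        datasetContext, new SparkPipelineRuntime(sec), stageSpec));
    }
  }, Exception.class);
} catch (Exception e) {
  // illustrative only: the failure from initialize() arrives here as a plain checked exception
  throw new Exception("Failed to initialize the wrapped compute stage", e);
}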
Use of io.cdap.cdap.api.TxRunnable in project cdap by caskdata.
The class CharCountProgram, method run.
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext sc = new JavaSparkContext();
  // Verify the codec is being set
  Preconditions.checkArgument(
    "org.apache.spark.io.LZFCompressionCodec".equals(sc.getConf().get("spark.io.compression.codec")));
  // read the dataset
  JavaPairRDD<byte[], String> inputData = sec.fromDataset("keys");
  // create a new RDD with the same key but with a new value which is the length of the string
  final JavaPairRDD<byte[], byte[]> stringLengths = inputData.mapToPair(
    new PairFunction<Tuple2<byte[], String>, byte[], byte[]>() {
      @Override
      public Tuple2<byte[], byte[]> call(Tuple2<byte[], String> stringTuple2) throws Exception {
        return new Tuple2<>(stringTuple2._1(), Bytes.toBytes(stringTuple2._2().length()));
      }
    });
  // write a total count to a table (that emits a metric we can validate in the test case)
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      long count = stringLengths.count();
      Table totals = context.getDataset("totals");
      totals.increment(new Increment("total").add("total", count));
      // write the character counts to the "count" dataset
      sec.saveAsDataset(stringLengths, "count");
    }
  });
}
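A hedged sketch of reading the incremented total back in a later transaction; the Get and Row accessors are assumed from the standard Table API, and the read-back itself is not part of CharCountProgram:

sec.execute(new TxRunnable() {
  @Override
  public void run(DatasetContext context) throws Exception {
    Table totals = context.getDataset("totals");
    // row "total", column "total" holds the count incremented above; 0L if nothing was written yet
    long total = totals.get(new Get("total")).getLong("total", 0L);
    System.out.println("total count = " + total);
  }
});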
Use of io.cdap.cdap.api.TxRunnable in project cdap by caskdata.
The class SparkCSVToSpaceProgram, method run.
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> fileSetArgs = new HashMap<>();
  final Metrics metrics = sec.getMetrics();
  FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  final List<String> converted = input.values().map(new Function<Text, String>() {
    @Override
    public String call(Text input) throws Exception {
      String line = input.toString();
      metrics.count("num.lines", 1);
      return line.replaceAll(",", " ");
    }
  }).collect();
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      Map<String, String> args = sec.getRuntimeArguments();
      String outputPath = args.get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.setOutputPath(fileSetArgs, outputPath);
      FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
      try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
        for (String line : converted) {
          writer.write(line);
          writer.println();
        }
      }
    }
  });
}
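A side note on the write loop: PrintWriter.println(String) prints the string followed by the line terminator, so the two calls per line can be collapsed. A minimal equivalent of the output block, with the same behavior:

try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
  for (String line : converted) {
    writer.println(line);  // equivalent to write(line) followed by println()
  }
}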
Use of io.cdap.cdap.api.TxRunnable in project cdap by caskdata.
The class SparkLogParser, method run.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> runtimeArguments = sec.getRuntimeArguments();
  String inputFileSet = runtimeArguments.get("input");
  final String outputTable = runtimeArguments.get("output");
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
  final JavaPairRDD<String, String> aggregated = input
    .mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {
      @Override
      public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
        return SparkAppUsingGetDataset.parse(input._2());
      }
    })
    .reduceByKey(new Function2<LogStats, LogStats, LogStats>() {
      @Override
      public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
        return stats1.aggregate(stats2);
      }
    })
    .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {
      @Override
      public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
        final Gson gson = new Gson();
        return Lists.newArrayList(Iterators.transform(itor,
          new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
              return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
            }
          })).iterator();
      }
    });
  // Collect all data to the driver and write to the dataset directly. That is the intent of the test.
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      KeyValueTable kvTable = context.getDataset(outputTable);
      for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
        kvTable.write(entry.getKey(), entry.getValue());
      }
    }
  });
}
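A hedged sketch of reading one aggregated entry back out of the output KeyValueTable in a later transaction; the key literal is hypothetical, and deserializing LogStats with Gson assumes the same JSON form written above:

sec.execute(new TxRunnable() {
  @Override
  public void run(DatasetContext context) throws Exception {
    KeyValueTable kvTable = context.getDataset(outputTable);
    String someLogKeyJson = "...";  // hypothetical: the Gson form of one LogKey written above
    byte[] value = kvTable.read(someLogKeyJson);
    if (value != null) {
      // values were written as Gson JSON strings, so they deserialize back into LogStats
      LogStats stats = new Gson().fromJson(Bytes.toString(value), LogStats.class);
    }
  }
});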