use of org.apache.beam.runners.spark.SparkPipelineOptions in project beam by apache.
the class SparkUnboundedSource method read.
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(JavaStreamingContext jssc, SparkRuntimeContext rc, UnboundedSource<T, CheckpointMarkT> source, String stepName) {
SparkPipelineOptions options = rc.getPipelineOptions().as(SparkPipelineOptions.class);
Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
SourceDStream<T, CheckpointMarkT> sourceDStream = new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);
JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream = JavaPairInputDStream$.MODULE$.fromInputDStream(sourceDStream, JavaSparkContext$.MODULE$.<Source<T>>fakeClassTag(), JavaSparkContext$.MODULE$.<CheckpointMarkT>fakeClassTag());
// call mapWithState to read from a checkpointable sources.
JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>> mapWithStateDStream = inputDStream.mapWithState(StateSpec.function(StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName)).numPartitions(sourceDStream.getNumPartitions()));
// set checkpoint duration for read stream, if set.
checkpointStream(mapWithStateDStream, options);
// report the number of input elements for this InputDStream to the InputInfoTracker.
int id = inputDStream.inputDStream().id();
JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());
// register ReadReportDStream to report information related to this read.
new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName).register();
// output the actual (deserialized) stream.
WindowedValue.FullWindowedValueCoder<T> coder = WindowedValue.FullWindowedValueCoder.of(source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
JavaDStream<WindowedValue<T>> readUnboundedStream = mapWithStateDStream.flatMap(new Tuple2byteFlatMapFunction()).map(CoderHelpers.fromByteFunction(coder));
return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
Aggregations