Use of org.apache.spark.sql.Dataset in the project Gaffer (gchq):
the class DataFrameToIterableRowTest, method shouldConvertDataFrameToIterableOfRows.
// Verifies that DataFrameToIterableRow exposes every row of a DataFrame:
// the "friends" sample graph has seven vertices, so seven Rows come back.
@Test
public void shouldConvertDataFrameToIterableOfRows() {
    // Given: an active Spark session and the vertex DataFrame of the sample graph
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    final Function<Dataset<Row>, Iterable<? extends Row>> converter = new DataFrameToIterableRow();
    final Dataset<Row> vertices = Graphs$.MODULE$.friends().vertices();

    // When: the DataFrame is converted and the Iterable drained into a list
    final List<Row> rows = Lists.newArrayList(converter.apply(vertices));

    // Then: one Row per vertex
    assertThat(rows).hasSize(7);
}
Use of org.apache.spark.sql.Dataset in the project Beam (apache):
the class ParDoTranslatorBatch, method translateTransform.
/**
 * Translates a Beam ParDo-style transform into a Spark {@code Dataset.mapPartitions}
 * call. Rejects splittable, stateful, and time-sorted-input DoFns up front, broadcasts
 * side-input data, and — when the ParDo has several outputs — persists the combined
 * (tag, value) Dataset and splits it into one Dataset per output tag.
 */
@Override
public void translateTransform(PTransform<PCollection<InputT>, PCollectionTuple> transform, AbstractTranslationContext context) {
String stepName = context.getCurrentTransform().getFullName();
// Check for not supported advanced features
// TODO: add support of Splittable DoFn
DoFn<InputT, OutputT> doFn = getDoFn(context);
checkState(!DoFnSignatures.isSplittable(doFn), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
// TODO: add support of states and timers
checkState(!DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
checkState(!DoFnSignatures.requiresTimeSortedInput(doFn), "@RequiresTimeSortedInput is not " + "supported for the moment");
DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
// Init main variables
PValue input = context.getInput();
Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
TupleTag<?> mainOutputTag = getTupleTag(context);
List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
WindowingStrategy<?, ?> windowingStrategy = ((PCollection<InputT>) input).getWindowingStrategy();
Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();
// construct a map from side input to WindowingStrategy so that
// the DoFn runner can map main-input windows to side input windows
List<PCollectionView<?>> sideInputs = getSideInputs(context);
Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
for (PCollectionView<?> sideInput : sideInputs) {
sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
}
// Presumably materializes side-input contents once and ships them to executors
// as Spark broadcast state — TODO(review): confirm against createBroadcastSideInputs.
SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);
Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
// Every output tag except the main one is an additional (tagged) output.
List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
for (TupleTag<?> tag : outputTags) {
if (!tag.equals(mainOutputTag)) {
additionalOutputTags.add(tag);
}
}
Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
// Wraps the DoFn so it can run once per Spark partition; downstream it is
// consumed via mapPartitions and emits (tag, windowed value) pairs.
@SuppressWarnings("unchecked") DoFnFunction<InputT, OutputT> doFnWrapper = new DoFnFunction(metricsAccum, stepName, doFn, windowingStrategy, sideInputStrategies, context.getSerializableOptions(), additionalOutputTags, mainOutputTag, inputCoder, outputCoderMap, broadcastStateData, doFnSchemaInformation, sideInputMapping);
// Coder for the (TupleTag, WindowedValue) pairs produced by the wrapper; the
// per-tag output coders and the window coder are combined into one.
MultiOutputCoder multipleOutputCoder = MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs = inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
if (outputs.entrySet().size() > 1) {
// Multiple outputs: persist before iterating so each per-tag pruning pass
// reuses the computed Dataset instead of re-running the DoFn.
allOutputs.persist();
for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
}
} else {
// Single output: drop the tag, re-encode with the main output's windowed-value
// coder, and register the Dataset for the sole output PCollection.
Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
Coder<WindowedValue<?>> windowedValueCoder = (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
Dataset<WindowedValue<?>> outputDataset = allOutputs.map((MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2, EncoderHelpers.fromBeamCoder(windowedValueCoder));
context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
}
}
Aggregations