use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class PViewToIdMapper method visitPrimitiveTransform.
@Override
public void visitPrimitiveTransform(TransformHierarchy.Node node) {
if (node.getTransform() instanceof SamzaPublishView) {
final PCollectionView view = ((SamzaPublishView) node.getTransform()).getView();
visitValue(view, node);
}
}
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class ParDoTranslatorBatch method translateTransform.
@Override
public void translateTransform(PTransform<PCollection<InputT>, PCollectionTuple> transform, AbstractTranslationContext context) {
String stepName = context.getCurrentTransform().getFullName();
// Check for not supported advanced features
// TODO: add support of Splittable DoFn
DoFn<InputT, OutputT> doFn = getDoFn(context);
checkState(!DoFnSignatures.isSplittable(doFn), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
// TODO: add support of states and timers
checkState(!DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
checkState(!DoFnSignatures.requiresTimeSortedInput(doFn), "@RequiresTimeSortedInput is not " + "supported for the moment");
DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
// Init main variables
PValue input = context.getInput();
Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
TupleTag<?> mainOutputTag = getTupleTag(context);
List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
WindowingStrategy<?, ?> windowingStrategy = ((PCollection<InputT>) input).getWindowingStrategy();
Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();
// construct a map from side input to WindowingStrategy so that
// the DoFn runner can map main-input windows to side input windows
List<PCollectionView<?>> sideInputs = getSideInputs(context);
Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
for (PCollectionView<?> sideInput : sideInputs) {
sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
}
SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);
Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
for (TupleTag<?> tag : outputTags) {
if (!tag.equals(mainOutputTag)) {
additionalOutputTags.add(tag);
}
}
Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
@SuppressWarnings("unchecked") DoFnFunction<InputT, OutputT> doFnWrapper = new DoFnFunction(metricsAccum, stepName, doFn, windowingStrategy, sideInputStrategies, context.getSerializableOptions(), additionalOutputTags, mainOutputTag, inputCoder, outputCoderMap, broadcastStateData, doFnSchemaInformation, sideInputMapping);
MultiOutputCoder multipleOutputCoder = MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs = inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
if (outputs.entrySet().size() > 1) {
allOutputs.persist();
for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
}
} else {
Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
Coder<WindowedValue<?>> windowedValueCoder = (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
Dataset<WindowedValue<?>> outputDataset = allOutputs.map((MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2, EncoderHelpers.fromBeamCoder(windowedValueCoder));
context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
}
}
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class TransformTranslator method parDo.
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {
@Override
@SuppressWarnings("unchecked")
public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
String stepName = context.getCurrentTransform().getFullName();
DoFn<InputT, OutputT> doFn = transform.getFn();
checkState(!DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
JavaRDD<WindowedValue<InputT>> inRDD = ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
JavaPairRDD<TupleTag<?>, WindowedValue<?>> all;
DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
boolean stateful = signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0;
DoFnSchemaInformation doFnSchemaInformation;
doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
MultiDoFnFunction<InputT, OutputT> multiDoFnFunction = new MultiDoFnFunction<>(metricsAccum, stepName, doFn, context.getSerializableOptions(), transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), inputCoder, outputCoders, TranslationUtils.getSideInputs(transform.getSideInputs().values(), context), windowingStrategy, stateful, doFnSchemaInformation, sideInputMapping);
if (stateful) {
// Based on the fact that the signature is stateful, DoFnSignatures ensures
// that it is also keyed
all = statefulParDoTransform((KvCoder) context.getInput(transform).getCoder(), windowingStrategy.getWindowFn().windowCoder(), (JavaRDD) inRDD, getPartitioner(context), (MultiDoFnFunction) multiDoFnFunction, signature.processElement().requiresTimeSortedInput());
} else {
all = inRDD.mapPartitionsToPair(multiDoFnFunction);
}
Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
if (outputs.size() > 1) {
StorageLevel level = StorageLevel.fromString(context.storageLevel());
if (canAvoidRddSerialization(level)) {
// if it is memory only reduce the overhead of moving to bytes
all = all.persist(level);
} else {
// Caching can cause Serialization, we need to code to bytes
// more details in https://issues.apache.org/jira/browse/BEAM-2669
Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap = TranslationUtils.getTupleTagCoders(outputs);
all = all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap)).persist(level).mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
}
}
for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
// Object is the best we can do since different outputs can have different tags
JavaRDD<WindowedValue<Object>> values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
context.putDataset(output.getValue(), new BoundedDataset<>(values));
}
}
@Override
public String toNativeString() {
return "mapPartitions(new <fn>())";
}
};
}
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class ParDoMultiOutputTranslatorBatch method translateNode.
@Override
public void translateNode(ParDo.MultiOutput<InputT, OutputT> transform, Twister2BatchTranslationContext context) {
DoFn<InputT, OutputT> doFn;
doFn = transform.getFn();
if (DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable()) {
throw new UnsupportedOperationException(String.format("Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn));
}
BatchTSetImpl<WindowedValue<InputT>> inputTTSet = context.getInputDataSet(context.getInput(transform));
WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
Map<String, PCollectionView<?>> sideInputMapping;
Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
// DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
DoFnSchemaInformation doFnSchemaInformation;
doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
TupleTag<OutputT> mainOutput = transform.getMainOutputTag();
List<TupleTag<?>> additionalOutputTags = new ArrayList<>(transform.getAdditionalOutputTags().getAll());
Map<String, PCollectionView<?>> sideInputs = transform.getSideInputs();
// TODO : note change from List to map in sideinputs
// construct a map from side input to WindowingStrategy so that
// the DoFn runner can map main-input windows to side input windows
Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
for (PCollectionView<?> sideInput : sideInputs.values()) {
sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
}
TupleTag<?> mainOutputTag;
try {
mainOutputTag = ParDoTranslation.getMainOutputTag(context.getCurrentTransform());
} catch (IOException e) {
throw new RuntimeException(e);
}
Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
outputMap.put(mainOutputTag, 0);
int count = 1;
for (TupleTag<?> tag : outputs.keySet()) {
if (!outputMap.containsKey(tag)) {
outputMap.put(tag, count++);
}
}
ComputeTSet<RawUnionValue, Iterator<WindowedValue<InputT>>> outputTSet = inputTTSet.direct().<RawUnionValue>compute(new DoFnFunction<OutputT, InputT>(context, doFn, inputCoder, outputCoders, additionalOutputTags, windowingStrategy, sideInputStrategies, mainOutput, doFnSchemaInformation, outputMap, sideInputMapping));
for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
ComputeTSet<WindowedValue<OutputT>, Iterator<RawUnionValue>> tempTSet = outputTSet.direct().compute(new OutputTagFilter(outputMap.get(output.getKey())));
context.setOutputDataSet((PCollection) output.getValue(), tempTSet);
}
}
use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.
the class WriteFilesTest method testCustomShardStrategyDisplayData.
@Test
public void testCustomShardStrategyDisplayData() {
DynamicDestinations<String, Void, String> dynamicDestinations = DynamicFileDestinations.constant(DefaultFilenamePolicy.fromParams(new Params().withBaseFilename(getBaseOutputDirectory().resolve("file", StandardResolveOptions.RESOLVE_FILE)).withShardTemplate("-SS-of-NN")));
SimpleSink<Void> sink = new SimpleSink<Void>(getBaseOutputDirectory(), dynamicDestinations, Compression.UNCOMPRESSED) {
@Override
public void populateDisplayData(DisplayData.Builder builder) {
builder.add(DisplayData.item("foo", "bar"));
}
};
WriteFiles<String, ?, String> write = WriteFiles.to(sink).withSharding(new PTransform<PCollection<String>, PCollectionView<Integer>>() {
@Override
public PCollectionView<Integer> expand(PCollection<String> input) {
return null;
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
builder.add(DisplayData.item("spam", "ham"));
}
});
DisplayData displayData = DisplayData.from(write);
assertThat(displayData, hasDisplayItem("sink", sink.getClass()));
assertThat(displayData, includesDisplayDataFor("sink", sink));
assertThat(displayData, hasDisplayItem("spam", "ham"));
}
Aggregations