Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
From the class BeamAggregationRel, method buildPTransform:
@Override
public PTransform<PCollectionList<Row>, PCollection<Row>> buildPTransform() {
  Schema outputSchema = CalciteUtils.toSchema(getRowType());
  List<FieldAggregation> aggregationAdapters =
      getNamedAggCalls().stream()
          .map(aggCall -> new FieldAggregation(aggCall.getKey(), aggCall.getValue()))
          .collect(toList());
  return new Transform(windowFn, windowFieldIndex, getGroupSet(), aggregationAdapters, outputSchema);
}
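The Transform returned above is a composite PTransform over a PCollectionList<Row>. As a rough illustration of what such a composite looks like, here is a hand-written sketch; it is not Beam's actual BeamAggregationRel.Transform, and the class and field names are made up:

// Sketch only: a composite PTransform with the same shape as the Transform built above.
// Real aggregation logic (windowing, grouping, FieldAggregation adapters) is omitted;
// Flatten stands in for the actual expansion.
static class SketchAggregationTransform
    extends PTransform<PCollectionList<Row>, PCollection<Row>> {
  private final Schema outputSchema;

  SketchAggregationTransform(Schema outputSchema) {
    this.outputSchema = outputSchema;
  }

  @Override
  public PCollection<Row> expand(PCollectionList<Row> input) {
    // A real implementation would window, group by the grouping fields, and apply
    // the aggregation adapters; here we only flatten and attach the output schema.
    return input.apply(Flatten.pCollections()).setRowSchema(outputSchema);
  }
}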
Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
From the class BeamSqlErrorTest, method testFailedExpression:
@Test
public void testFailedExpression() {
  Schema resultType =
      Schema.builder().addStringField(ID).addStringField(COUNTRY_CODE).addDoubleField(SUM_AMOUNT).build();
  Schema midResultType =
      Schema.builder().addStringField(ID).addStringField(COUNTRY_CODE).addStringField(CURRENCY).addInt64Field(F_3).build();
  String sql =
      "SELECT id,country_code,CalculatePrice(sum(CastUdf(amount)),currency) as sum_amount FROM PCOLLECTION group by id,country_code,currency";
  PTransform mock = spy(PTransform.class);
  when(mock.expand(Matchers.any()))
      .thenAnswer(invocationOnMock -> invocationOnMock.getArgument(0, PCollection.class));
  ArgumentCaptor<PCollection> captor = ArgumentCaptor.forClass(PCollection.class);
  PCollection<Row> validRowsResult =
      boundedInputBytes
          .apply("calculate", SqlTransform.query(sql)
              .withAutoLoading(false)
              .withErrorsTransformer(mock)
              .registerUdf("CastUdf", CastUdf.class)
              .registerUdf("CalculatePrice", CalculatePrice.class))
          .setCoder(SchemaCoder.of(resultType));
  PAssert.that(validRowsResult)
      .containsInAnyOrder(TestUtils.RowsBuilder.of(resultType).addRows("1", "US", 100.0).getRows());
  Schema firstErrorSchema = Schema.builder().addRowField(ROW, inputType).addStringField(ERROR).build();
  Row failedOnFirstUdfElement =
      TestTableUtils.buildRows(firstErrorSchema,
          Arrays.asList(
              TestTableUtils.buildRows(inputType, Arrays.asList("2", invalidAmount, "US", "$")).get(0),
              "Found invalid value " + invalidAmount))
          .get(0);
  Schema secondErrorSchema = Schema.builder().addRowField(ROW, midResultType).addStringField(ERROR).build();
  Row failedOnSecondUdfElement =
      TestTableUtils.buildRows(secondErrorSchema,
          Arrays.asList(
              TestTableUtils.buildRows(midResultType, Arrays.asList("3", "US", invalidCurrency, 100L)).get(0),
              "Currency isn't supported " + invalidCurrency))
          .get(0);
  Mockito.verify(mock, times(2)).expand(captor.capture());
  PAssert.that(captor.getAllValues().get(0)).containsInAnyOrder(failedOnFirstUdfElement);
  PAssert.that(captor.getAllValues().get(1)).containsInAnyOrder(failedOnSecondUdfElement);
  pipeline.run().waitUntilFinish();
}
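The test above passes a Mockito spy as the error transformer so it can capture the failed rows. In a real pipeline the argument to withErrorsTransformer would be an actual PTransform over the error rows; a minimal sketch of such a transform (the class name and the counter are illustrative, not part of Beam) might be:

// Sketch of a non-mock error transformer: count the failed rows and pass them through.
static class LogFailedRows extends PTransform<PCollection<Row>, PCollection<Row>> {
  @Override
  public PCollection<Row> expand(PCollection<Row> errors) {
    return errors
        .apply(
            ParDo.of(
                new DoFn<Row, Row>() {
                  private final Counter failed = Metrics.counter(LogFailedRows.class, "failed-rows");

                  @ProcessElement
                  public void processElement(@Element Row row, OutputReceiver<Row> out) {
                    failed.inc();
                    out.output(row);
                  }
                }))
        // Reuse the incoming coder so the output PCollection stays encodable.
        .setCoder(errors.getCoder());
  }
}

Assuming the transformer parameter accepts a PCollection<Row>-to-PCollection<Row> transform (as the spy's stubbed expand suggests), it would be plugged in where the mock is used above, e.g. SqlTransform.query(sql).withErrorsTransformer(new LogFailedRows()).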
Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
From the class WriteWithShardingFactoryTest, method withNoShardingSpecifiedReturnsNewTransform:
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDirectory = LocalResources.fromString("/foo", true);
  PTransform<PCollection<Object>, WriteFilesResult<Void>> original =
      WriteFiles.to(
          new FileBasedSink<Object, Void, Object>(
              StaticValueProvider.of(outputDirectory),
              DynamicFileDestinations.constant(new FakeFilenamePolicy())) {
            @Override
            public WriteOperation<Void, Object> createWriteOperation() {
              throw new IllegalArgumentException("Should not be used");
            }
          });
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, PTransform<PCollection<Object>, WriteFilesResult<Void>>>
      originalApplication =
          AppliedPTransform.of(
              "write", PValues.expandInput(objs), Collections.emptyMap(), original, ResourceHints.create(), p);
  assertThat(
      factory.getReplacementTransform(originalApplication).getTransform(),
      not(equalTo((Object) original)));
}
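For context, the factory under test is a runner-side override that applies when a file write leaves sharding unspecified. In user code that situation is usually reached through a high-level sink rather than WriteFiles directly; a minimal sketch (the pipeline variable and output path are illustrative):

// Runner-determined sharding: no withNumShards(...), so a sharding override such as
// this factory may replace the write at graph-construction time. Specifying
// withNumShards(...) would make the sharding explicit instead.
p.apply(Create.of("a", "b", "c"))
    .apply(TextIO.write().to("/foo/out"));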
Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
From the class DataflowPipelineTranslatorTest, method testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker:
@Test
public void testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker() throws Exception {
  List<String> experiments =
      new ArrayList<>(
          ImmutableList.of(
              GcpOptions.STREAMING_ENGINE_EXPERIMENT,
              GcpOptions.WINDMILL_SERVICE_EXPERIMENT,
              "use_runner_v2"));
  JobSpecification jobSpec = runStreamingGroupIntoBatchesAndGetJobSpec(true, experiments);
  List<Step> steps = jobSpec.getJob().getSteps();
  Step shardedStateStep = steps.get(steps.size() - 1);
  Map<String, Object> properties = shardedStateStep.getProperties();
  assertTrue(properties.containsKey(PropertyNames.USES_KEYED_STATE));
  assertTrue(properties.containsKey(PropertyNames.ALLOWS_SHARDABLE_STATE));
  assertEquals("true", getString(properties, PropertyNames.ALLOWS_SHARDABLE_STATE));
  assertTrue(properties.containsKey(PropertyNames.PRESERVES_KEYS));
  assertEquals("true", getString(properties, PropertyNames.PRESERVES_KEYS));

  // Also checks the runner proto is correctly populated.
  Map<String, RunnerApi.PTransform> transformMap =
      jobSpec.getPipelineProto().getComponents().getTransformsMap();
  boolean transformFound = false;
  for (Map.Entry<String, RunnerApi.PTransform> transform : transformMap.entrySet()) {
    RunnerApi.FunctionSpec spec = transform.getValue().getSpec();
    if (spec.getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_WITH_SHARDED_KEY_URN)) {
      for (String subtransform : transform.getValue().getSubtransformsList()) {
        RunnerApi.PTransform ptransform = transformMap.get(subtransform);
        if (ptransform.getSpec().getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_URN)) {
          transformFound = true;
        }
      }
    }
  }
  assertTrue(transformFound);
  boolean coderFound = false;
  Map<String, RunnerApi.Coder> coderMap = jobSpec.getPipelineProto().getComponents().getCodersMap();
  for (Map.Entry<String, RunnerApi.Coder> coder : coderMap.entrySet()) {
    if (coder.getValue().getSpec().getUrn().equals(ModelCoders.SHARDED_KEY_CODER_URN)) {
      coderFound = true;
    }
  }
  assertTrue(coderFound);
}
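The URNs being checked correspond to the user-facing GroupIntoBatches transform with sharded keys. A minimal sketch of the pipeline shape that produces this translation (the key/value types, element values, and batch size are arbitrary here):

// GroupIntoBatches with sharded keys; on Dataflow with the experiments above, this is
// the transform that gets translated under GROUP_INTO_BATCHES_WITH_SHARDED_KEY_URN.
PCollection<KV<String, Integer>> input =
    p.apply(Create.of(KV.of("user", 1), KV.of("user", 2), KV.of("user", 3)));
PCollection<KV<ShardedKey<String>, Iterable<Integer>>> batched =
    input.apply(GroupIntoBatches.<String, Integer>ofSize(2).withShardedKey());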
Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
From the class ParDoTranslatorBatch, method translateTransform:
@Override
public void translateTransform(
    PTransform<PCollection<InputT>, PCollectionTuple> transform,
    AbstractTranslationContext context) {
  String stepName = context.getCurrentTransform().getFullName();

  // Check for unsupported advanced features.
  // TODO: add support for Splittable DoFn
  DoFn<InputT, OutputT> doFn = getDoFn(context);
  checkState(
      !DoFnSignatures.isSplittable(doFn),
      "Not expected to directly translate splittable DoFn, should have been overridden: %s",
      doFn);
  // TODO: add support for state and timers
  checkState(!DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
  checkState(
      !DoFnSignatures.requiresTimeSortedInput(doFn),
      "@RequiresTimeSortedInput is not supported for the moment");
  DoFnSchemaInformation doFnSchemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());

  // Init main variables
  PValue input = context.getInput();
  Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
  Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
  TupleTag<?> mainOutputTag = getTupleTag(context);
  List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
  WindowingStrategy<?, ?> windowingStrategy = ((PCollection<InputT>) input).getWindowingStrategy();
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();

  // Construct a map from side input to WindowingStrategy so that
  // the DoFn runner can map main-input windows to side-input windows.
  List<PCollectionView<?>> sideInputs = getSideInputs(context);
  Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
  for (PCollectionView<?> sideInput : sideInputs) {
    sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
  }
  SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);
  Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
  MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
  List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (TupleTag<?> tag : outputTags) {
    if (!tag.equals(mainOutputTag)) {
      additionalOutputTags.add(tag);
    }
  }
  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
  @SuppressWarnings("unchecked")
  DoFnFunction<InputT, OutputT> doFnWrapper =
      new DoFnFunction(
          metricsAccum,
          stepName,
          doFn,
          windowingStrategy,
          sideInputStrategies,
          context.getSerializableOptions(),
          additionalOutputTags,
          mainOutputTag,
          inputCoder,
          outputCoderMap,
          broadcastStateData,
          doFnSchemaInformation,
          sideInputMapping);
  MultiOutputCoder multipleOutputCoder =
      MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
  Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs =
      inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
  if (outputs.entrySet().size() > 1) {
    allOutputs.persist();
    for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
      pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
    }
  } else {
    Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        allOutputs.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
  }
}
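On the user side, the shape this translator handles is a multi-output ParDo producing a PCollectionTuple, optionally with side inputs. A minimal sketch of such a pipeline fragment (the tag names, the DoFn, and the input PCollection<String> are illustrative):

// Multi-output ParDo: the translator above turns an application like this into a single
// Spark mapPartitions whose tagged outputs are later pruned per TupleTag.
TupleTag<String> mainTag = new TupleTag<String>() {};
TupleTag<String> emptyTag = new TupleTag<String>() {};
PCollectionTuple outputs =
    input.apply(
        ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void processElement(@Element String value, MultiOutputReceiver out) {
                    if (value.isEmpty()) {
                      out.get(emptyTag).output(value);
                    } else {
                      out.get(mainTag).output(value);
                    }
                  }
                })
            .withOutputTags(mainTag, TupleTagList.of(emptyTag)));
PCollection<String> mainOutput = outputs.get(mainTag);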