Use of org.apache.beam.sdk.util.FilePatternMatchingShardedFile in project beam by apache.
The class FlinkRequiresStableInputTest, method testParDoRequiresStableInput.
/**
 * Test for the support of {@link DoFn.RequiresStableInput} in both {@link ParDo.SingleOutput} and
 * {@link ParDo.MultiOutput}.
 *
 * <p>In each test, a singleton string value is paired with a random key. In the following
 * transform, the value is written to a file whose path is specified by the random key, and then
 * the transform fails. When the pipeline retries, the latter transform should receive the same
 * input from the former transform, because its {@link DoFn} is annotated with {@link
 * DoFn.RequiresStableInput}, and it will not fail due to the presence of the file. Therefore,
 * only one file for each transform is expected.
 *
 * <p>Savepoints are taken until the desired state in the operators has been reached. We then
 * restore the savepoint to check that we produce idempotent results.
 */
@Test(timeout = 30_000)
@Ignore("BEAM-13575")
public void testParDoRequiresStableInput() throws Exception {
  FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
  options.setParallelism(1);
  // We only want to trigger external savepoints but we require
  // checkpointing to be enabled for @RequiresStableInput
  options.setCheckpointingInterval(Long.MAX_VALUE);
  options.setRunner(FlinkRunner.class);
  options.setStreaming(true);
  ResourceId outputDir =
      FileSystems.matchNewResource(tempFolder.getRoot().getAbsolutePath(), true)
          .resolve(
              String.format("requires-stable-input-%tF-%<tH-%<tM-%<tS-%<tL", new Date()),
              ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY);
  String singleOutputPrefix =
      outputDir
          .resolve("pardo-single-output", ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("key-", ResolveOptions.StandardResolveOptions.RESOLVE_FILE)
          .toString();
  String multiOutputPrefix =
      outputDir
          .resolve("pardo-multi-output", ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("key-", ResolveOptions.StandardResolveOptions.RESOLVE_FILE)
          .toString();
  Pipeline p = createPipeline(options, singleOutputPrefix, multiOutputPrefix);
  // a latch used by the transforms to signal completion
  latch = new CountDownLatch(2);
  JobID jobID = executePipeline(p);
  String savepointDir;
  do {
    // Take a savepoint (checkpoint) which will trigger releasing the buffered elements
    // and trigger the latch
    savepointDir = takeSavepoint(jobID);
  } while (!latch.await(100, TimeUnit.MILLISECONDS));
  flinkCluster.cancelJob(jobID).get();
  options.setShutdownSourcesAfterIdleMs(0L);
  restoreFromSavepoint(p, savepointDir);
  waitUntilJobIsDone();
  assertThat(
      new FilePatternMatchingShardedFile(singleOutputPrefix + "*"),
      fileContentsHaveChecksum(VALUE_CHECKSUM));
  assertThat(
      new FilePatternMatchingShardedFile(multiOutputPrefix + "*"),
      fileContentsHaveChecksum(VALUE_CHECKSUM));
}
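The MakeSideEffectAndThenFailFn helper referenced by both tests is not part of this snippet. Below is a minimal sketch of what such a DoFn could look like, based only on the javadoc's description (write the value to a file named after the key, then fail once); the field names, first-attempt check, and file-writing details are illustrative assumptions, not the project's actual code.

import java.nio.ByteBuffer;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.sdk.values.KV;

// Illustrative sketch only: write the (stable) input to a file named after the random
// key, then fail once to force a retry. With @RequiresStableInput the retry replays the
// same key, so the same file is overwritten rather than a second one created.
private static class MakeSideEffectAndThenFailFn extends DoFn<KV<String, String>, String> {

  private final String outputPrefix;
  private final SerializableFunction<Void, Void> firstTime;

  MakeSideEffectAndThenFailFn(String outputPrefix, SerializableFunction<Void, Void> firstTime) {
    this.outputPrefix = outputPrefix;
    this.firstTime = firstTime;
  }

  @RequiresStableInput
  @ProcessElement
  public void processElement(ProcessContext c) throws Exception {
    // No matching file yet means this is the first attempt (assumed detection scheme).
    boolean firstAttempt = FileSystems.match(outputPrefix + "*").metadata().isEmpty();
    // Side effect: write the value to <outputPrefix><key>.
    ResourceId file = FileSystems.matchNewResource(outputPrefix + c.element().getKey(), false);
    try (WritableByteChannel channel = FileSystems.create(file, MimeTypes.TEXT)) {
      channel.write(ByteBuffer.wrap(c.element().getValue().getBytes(StandardCharsets.UTF_8)));
    }
    if (firstAttempt) {
      firstTime.apply(null); // throws, triggering the retry
    }
    c.output(c.element().getValue());
  }
}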
Use of org.apache.beam.sdk.util.FilePatternMatchingShardedFile in project beam by apache.
The class RequiresStableInputIT, method testParDoRequiresStableInput.
/**
 * Test for the support of {@link org.apache.beam.sdk.transforms.DoFn.RequiresStableInput} in both
 * {@link ParDo.SingleOutput} and {@link ParDo.MultiOutput}.
 *
 * <p>In each test, a singleton string value is paired with a random key. In the following
 * transform, the value is written to a file whose path is specified by the random key, and then
 * the transform fails. When the pipeline retries, the latter transform should receive the same
 * input from the former transform, because its {@link DoFn} is annotated with {@link
 * org.apache.beam.sdk.transforms.DoFn.RequiresStableInput}, and it will not fail due to the
 * presence of the file. Therefore, only one file for each transform is expected.
 */
@Test
public void testParDoRequiresStableInput() {
  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  ResourceId outputDir =
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format("requires-stable-input-%tF-%<tH-%<tM-%<tS-%<tL", new Date()),
              StandardResolveOptions.RESOLVE_DIRECTORY);
  String singleOutputPrefix =
      outputDir
          .resolve("pardo-single-output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("key-", StandardResolveOptions.RESOLVE_FILE)
          .toString();
  String multiOutputPrefix =
      outputDir
          .resolve("pardo-multi-output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("key-", StandardResolveOptions.RESOLVE_FILE)
          .toString();
  Pipeline p = Pipeline.create(options);
  SerializableFunction<Void, Void> firstTime =
      (SerializableFunction<Void, Void>)
          value -> {
            throw new RuntimeException(
                "Deliberate failure: should happen only once for each application of the DoFn "
                    + "within the transform graph.");
          };
  PCollection<String> singleton = p.apply("CreatePCollectionOfOneValue", Create.of(VALUE));
  singleton
      .apply("Single-PairWithRandomKey", MapElements.via(new PairWithRandomKeyFn()))
      .apply(
          "Single-MakeSideEffectAndThenFail",
          ParDo.of(new MakeSideEffectAndThenFailFn(singleOutputPrefix, firstTime)));
  singleton
      .apply("Multi-PairWithRandomKey", MapElements.via(new PairWithRandomKeyFn()))
      .apply(
          "Multi-MakeSideEffectAndThenFail",
          ParDo.of(new MakeSideEffectAndThenFailFn(multiOutputPrefix, firstTime))
              .withOutputTags(new TupleTag<>(), TupleTagList.empty()));
  p.run().waitUntilFinish();
  assertThat(
      new FilePatternMatchingShardedFile(singleOutputPrefix + "*"),
      fileContentsHaveChecksum(VALUE_CHECKSUM));
  assertThat(
      new FilePatternMatchingShardedFile(multiOutputPrefix + "*"),
      fileContentsHaveChecksum(VALUE_CHECKSUM));
}
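PairWithRandomKeyFn is likewise not shown in these snippets. A plausible minimal implementation, assuming it simply attaches a fresh UUID to each value (the exact key scheme is an assumption):

import java.util.UUID;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;

// Illustrative sketch: pair the value with a random key. If the runner replayed the
// input unstably, a retry would generate a different key, producing a second output
// file, which the checksum assertions over the "*" file pattern would then catch.
private static class PairWithRandomKeyFn extends SimpleFunction<String, KV<String, String>> {
  @Override
  public KV<String, String> apply(String value) {
    return KV.of(UUID.randomUUID().toString(), value);
  }
}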