Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in project Beam by Apache.
The class FileBasedSinkTest, method testGenerateOutputFilenames.
/** Output filenames are generated correctly when an extension is supplied. */
@Test
public void testGenerateOutputFilenames() {
  List<ResourceId> expected;
  List<ResourceId> actual;
  ResourceId root = getBaseOutputDirectory();

  SimpleSink sink = new SimpleSink(root, "file", ".SSSSS.of.NNNNN", ".test");
  FilenamePolicy policy = sink.getFilenamePolicy();

  expected = Arrays.asList(
      root.resolve("file.00000.of.00003.test", StandardResolveOptions.RESOLVE_FILE),
      root.resolve("file.00001.of.00003.test", StandardResolveOptions.RESOLVE_FILE),
      root.resolve("file.00002.of.00003.test", StandardResolveOptions.RESOLVE_FILE));
  actual = generateDestinationFilenames(root, policy, 3);
  assertEquals(expected, actual);

  expected = Collections.singletonList(
      root.resolve("file.00000.of.00001.test", StandardResolveOptions.RESOLVE_FILE));
  actual = generateDestinationFilenames(root, policy, 1);
  assertEquals(expected, actual);

  expected = new ArrayList<>();
  actual = generateDestinationFilenames(root, policy, 0);
  assertEquals(expected, actual);
}
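For readers unfamiliar with the shard template used above: the "SSSSS" and "NNNNN" runs stand for the zero-padded shard number and total shard count, which is how expected names like "file.00000.of.00003.test" arise. A rough stand-alone illustration of that expansion (a hypothetical helper, not Beam's actual implementation):

static String expandShardTemplate(String prefix, int shardNumber, int numShards, String suffix) {
  // Mirrors the ".SSSSS.of.NNNNN" template from the test: zero-padded shard number and shard count,
  // e.g. expandShardTemplate("file", 0, 3, ".test") -> "file.00000.of.00003.test".
  return String.format("%s.%05d.of.%05d%s", prefix, shardNumber, numShards, suffix);
}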
Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in project Beam by Apache.
The class PTransformMatchersTest, method writeWithRunnerDeterminedSharding.
@Test
public void writeWithRunnerDeterminedSharding() {
  ResourceId outputDirectory = LocalResources.fromString("/foo/bar", true);
  FilenamePolicy policy =
      DefaultFilenamePolicy.constructUsingStandardParameters(
          StaticValueProvider.of(outputDirectory),
          DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE,
          "");
  WriteFiles<Integer> write =
      WriteFiles.to(
          new FileBasedSink<Integer>(StaticValueProvider.of(outputDirectory), policy) {
            @Override
            public WriteOperation<Integer> createWriteOperation() {
              return null;
            }
          });
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(write)),
      is(true));

  WriteFiles<Integer> withStaticSharding = write.withNumShards(3);
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withStaticSharding)),
      is(false));

  WriteFiles<Integer> withCustomSharding = write.withSharding(Sum.integersGlobally().asSingletonView());
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withCustomSharding)),
      is(false));
}
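To recap what the assertions above exercise: the matcher only accepts a WriteFiles whose sharding is left for the runner to decide, while setting either a fixed shard count or a custom sharding view makes it stop matching. A summary sketch using only the calls shown in the test (here sink stands for the anonymous FileBasedSink constructed above):

WriteFiles.to(sink);                                                          // no sharding configured -> matches
WriteFiles.to(sink).withNumShards(3);                                         // fixed shard count      -> does not match
WriteFiles.to(sink).withSharding(Sum.integersGlobally().asSingletonView());   // custom sharding        -> does not match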
Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in project Beam by Apache.
The class AvroIOTest, method testWindowedAvroIOWrite.
@Test
@Category({ValidatesRunner.class, UsesTestStream.class})
public void testWindowedAvroIOWrite() throws Throwable {
  Path baseDir = Files.createTempDirectory(tmpFolder.getRoot().toPath(), "testwrite");
  String baseFilename = baseDir.resolve("prefix").toString();

  Instant base = new Instant(0);
  ArrayList<GenericClass> allElements = new ArrayList<>();
  ArrayList<TimestampedValue<GenericClass>> firstWindowElements = new ArrayList<>();
  ArrayList<Instant> firstWindowTimestamps =
      Lists.newArrayList(
          base.plus(Duration.standardSeconds(0)),
          base.plus(Duration.standardSeconds(10)),
          base.plus(Duration.standardSeconds(20)),
          base.plus(Duration.standardSeconds(30)));
  Random random = new Random();
  for (int i = 0; i < 100; ++i) {
    GenericClass item = new GenericClass(i, String.valueOf(i));
    allElements.add(item);
    firstWindowElements.add(
        TimestampedValue.of(
            item, firstWindowTimestamps.get(random.nextInt(firstWindowTimestamps.size()))));
  }

  ArrayList<TimestampedValue<GenericClass>> secondWindowElements = new ArrayList<>();
  ArrayList<Instant> secondWindowTimestamps =
      Lists.newArrayList(
          base.plus(Duration.standardSeconds(60)),
          base.plus(Duration.standardSeconds(70)),
          base.plus(Duration.standardSeconds(80)),
          base.plus(Duration.standardSeconds(90)));
  for (int i = 100; i < 200; ++i) {
    GenericClass item = new GenericClass(i, String.valueOf(i));
    allElements.add(new GenericClass(i, String.valueOf(i)));
    secondWindowElements.add(
        TimestampedValue.of(
            item, secondWindowTimestamps.get(random.nextInt(secondWindowTimestamps.size()))));
  }

  TimestampedValue<GenericClass>[] firstWindowArray =
      firstWindowElements.toArray(new TimestampedValue[100]);
  TimestampedValue<GenericClass>[] secondWindowArray =
      secondWindowElements.toArray(new TimestampedValue[100]);

  TestStream<GenericClass> values =
      TestStream.create(AvroCoder.of(GenericClass.class))
          .advanceWatermarkTo(new Instant(0))
          .addElements(
              firstWindowArray[0],
              Arrays.copyOfRange(firstWindowArray, 1, firstWindowArray.length))
          .advanceWatermarkTo(new Instant(0).plus(Duration.standardMinutes(1)))
          .addElements(
              secondWindowArray[0],
              Arrays.copyOfRange(secondWindowArray, 1, secondWindowArray.length))
          .advanceWatermarkToInfinity();

  FilenamePolicy policy = new WindowedFilenamePolicy(baseFilename);
  windowedAvroWritePipeline
      .apply(values)
      .apply(Window.<GenericClass>into(FixedWindows.of(Duration.standardMinutes(1))))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(baseFilename)
              .withFilenamePolicy(policy)
              .withWindowedWrites()
              .withNumShards(2));
  windowedAvroWritePipeline.run();

  // Validate that the data written matches the expected elements (order is not asserted).
  List<File> expectedFiles = new ArrayList<>();
  for (int shard = 0; shard < 2; shard++) {
    for (int window = 0; window < 2; window++) {
      Instant windowStart = new Instant(0).plus(Duration.standardMinutes(window));
      IntervalWindow intervalWindow = new IntervalWindow(windowStart, Duration.standardMinutes(1));
      expectedFiles.add(
          new File(
              baseFilename + "-" + intervalWindow.toString() + "-" + shard + "-of-1"
                  + "-pane-0-final"));
    }
  }

  List<GenericClass> actualElements = new ArrayList<>();
  for (File outputFile : expectedFiles) {
    assertTrue("Expected output file " + outputFile.getAbsolutePath(), outputFile.exists());
    try (DataFileReader<GenericClass> reader =
        new DataFileReader<>(
            outputFile,
            new ReflectDatumReader<GenericClass>(ReflectData.get().getSchema(GenericClass.class)))) {
      Iterators.addAll(actualElements, reader);
    }
    outputFile.delete();
  }
  assertThat(actualElements, containsInAnyOrder(allElements.toArray()));
}
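The WindowedFilenamePolicy constructed above is a test-local helper whose implementation is not part of this excerpt. Judging from the expected file names, it appears to encode the window, the shard number, the shard count minus one, and the pane information into the name. A stand-alone sketch of that naming scheme (a hypothetical helper, not the actual policy class or the FilenamePolicy API):

static String windowedName(
    String prefix, IntervalWindow window, int shard, int numShards, long paneIndex, boolean isLastPane) {
  // Produces names of the shape the assertions look for, e.g. with two shards:
  //   <prefix>-<window>-0-of-1-pane-0-final
  return String.format(
      "%s-%s-%d-of-%d-pane-%d%s",
      prefix, window, shard, numShards - 1, paneIndex, isLastPane ? "-final" : "");
}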
Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in project Beam by Apache.
The class FileBasedSinkTest, method testGenerateOutputFilenamesWithoutExtension.
/** Output filenames are generated correctly when an extension is not supplied. */
@Test
public void testGenerateOutputFilenamesWithoutExtension() {
  List<ResourceId> expected;
  List<ResourceId> actual;
  ResourceId root = getBaseOutputDirectory();

  SimpleSink sink = new SimpleSink(root, "file", "-SSSSS-of-NNNNN", "");
  FilenamePolicy policy = sink.getFilenamePolicy();

  expected = Arrays.asList(
      root.resolve("file-00000-of-00003", StandardResolveOptions.RESOLVE_FILE),
      root.resolve("file-00001-of-00003", StandardResolveOptions.RESOLVE_FILE),
      root.resolve("file-00002-of-00003", StandardResolveOptions.RESOLVE_FILE));
  actual = generateDestinationFilenames(root, policy, 3);
  assertEquals(expected, actual);

  expected = Collections.singletonList(
      root.resolve("file-00000-of-00001", StandardResolveOptions.RESOLVE_FILE));
  actual = generateDestinationFilenames(root, policy, 1);
  assertEquals(expected, actual);

  expected = new ArrayList<>();
  actual = generateDestinationFilenames(root, policy, 0);
  assertEquals(expected, actual);
}
Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in project Beam by Apache.
The class WriteWithShardingFactoryTest, method withNoShardingSpecifiedReturnsNewTransform.
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDirectory = LocalResources.fromString("/foo", true);
  FilenamePolicy policy =
      DefaultFilenamePolicy.constructUsingStandardParameters(
          StaticValueProvider.of(outputDirectory),
          DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE,
          "");
  WriteFiles<Object> original =
      WriteFiles.to(
          new FileBasedSink<Object>(StaticValueProvider.of(outputDirectory), policy) {
            @Override
            public WriteOperation<Object> createWriteOperation() {
              throw new IllegalArgumentException("Should not be used");
            }
          });
  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
  AppliedPTransform<PCollection<Object>, PDone, WriteFiles<Object>> originalApplication =
      AppliedPTransform.of(
          "write", objs.expand(), Collections.<TupleTag<?>, PValue>emptyMap(), original, p);
  assertThat(
      factory.getReplacementTransform(originalApplication).getTransform(),
      not(equalTo((Object) original)));
}