Search in sources:

Example 1 with FilenamePolicy

Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in the Apache Beam project.

From the class FileBasedSinkTest, method testGenerateOutputFilenames:

/**
 * Checks the filenames produced by a {@code SimpleSink} filename policy configured with the
 * {@code ".SSSSS.of.NNNNN"} shard template and the {@code ".test"} extension, for three shards,
 * one shard, and zero shards.
 */
@Test
public void testGenerateOutputFilenames() {
    ResourceId outputDir = getBaseOutputDirectory();
    FilenamePolicy filenamePolicy = new SimpleSink(outputDir, "file", ".SSSSS.of.NNNNN", ".test").getFilenamePolicy();

    // Three shards: one name per shard index, each carrying the total shard count.
    List<ResourceId> threeShardNames = Arrays.asList(
        outputDir.resolve("file.00000.of.00003.test", StandardResolveOptions.RESOLVE_FILE),
        outputDir.resolve("file.00001.of.00003.test", StandardResolveOptions.RESOLVE_FILE),
        outputDir.resolve("file.00002.of.00003.test", StandardResolveOptions.RESOLVE_FILE));
    assertEquals(threeShardNames, generateDestinationFilenames(outputDir, filenamePolicy, 3));

    // A single shard yields exactly one name.
    List<ResourceId> singleShardName = Collections.singletonList(
        outputDir.resolve("file.00000.of.00001.test", StandardResolveOptions.RESOLVE_FILE));
    assertEquals(singleShardName, generateDestinationFilenames(outputDir, filenamePolicy, 1));

    // Zero shards yields an empty list.
    List<ResourceId> noShardNames = new ArrayList<>();
    assertEquals(noShardNames, generateDestinationFilenames(outputDir, filenamePolicy, 0));
}
Also used : ResourceId(org.apache.beam.sdk.io.fs.ResourceId) FilenamePolicy(org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy) Test(org.junit.Test)

Example 2 with FilenamePolicy

Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in the Apache Beam project.

From the class PTransformMatchersTest, method writeWithRunnerDeterminedSharding:

/**
 * Checks {@code PTransformMatchers.writeWithRunnerDeterminedSharding()} against three
 * {@code WriteFiles} variants: the plain write is expected to match, while the variants built
 * with {@code withNumShards} and {@code withSharding} are expected not to match.
 */
@Test
public void writeWithRunnerDeterminedSharding() {
    ResourceId targetDir = LocalResources.fromString("/foo/bar", true);
    FilenamePolicy filenamePolicy = DefaultFilenamePolicy.constructUsingStandardParameters(
        StaticValueProvider.of(targetDir), DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE, "");
    // Stub sink: its write operation is never needed by the matcher checks below.
    FileBasedSink<Integer> stubSink = new FileBasedSink<Integer>(StaticValueProvider.of(targetDir), filenamePolicy) {

        @Override
        public WriteOperation<Integer> createWriteOperation() {
            return null;
        }
    };
    WriteFiles<Integer> plainWrite = WriteFiles.to(stubSink);

    // No sharding configured on the write.
    boolean plainMatches = PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(plainWrite));
    assertThat(plainMatches, is(true));

    // Fixed shard count.
    WriteFiles<Integer> fixedShards = plainWrite.withNumShards(3);
    boolean fixedMatches = PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(fixedShards));
    assertThat(fixedMatches, is(false));

    // Sharding supplied through a singleton view.
    WriteFiles<Integer> viewShards = plainWrite.withSharding(Sum.integersGlobally().asSingletonView());
    boolean viewMatches = PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(viewShards));
    assertThat(viewMatches, is(false));
}
Also used : ResourceId(org.apache.beam.sdk.io.fs.ResourceId) DefaultFilenamePolicy(org.apache.beam.sdk.io.DefaultFilenamePolicy) FilenamePolicy(org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy) Test(org.junit.Test)

Example 3 with FilenamePolicy

Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in the Apache Beam project.

From the class AvroIOTest, method testWindowedAvroIOWrite:

/**
 * Writes 200 {@code GenericClass} records through a windowed {@code AvroIO.write} with two shards
 * and one-minute fixed windows, then reads every expected output file back and checks that the
 * records read match the records written, ignoring order.
 *
 * <p>Elements 0-99 get timestamps in the first minute and elements 100-199 in the second minute;
 * the exact timestamp within each window is picked at random.
 *
 * @throws Throwable if creating the temp directory, running the pipeline, or reading a file fails
 */
@Test
@Category({ ValidatesRunner.class, UsesTestStream.class })
public void testWindowedAvroIOWrite() throws Throwable {
    Path baseDir = Files.createTempDirectory(tmpFolder.getRoot().toPath(), "testwrite");
    String baseFilename = baseDir.resolve("prefix").toString();
    Instant base = new Instant(0);
    List<GenericClass> allElements = new ArrayList<>();

    // First window: elements 0..99, timestamps within [0s, 60s).
    List<TimestampedValue<GenericClass>> firstWindowElements = new ArrayList<>();
    List<Instant> firstWindowTimestamps = Lists.newArrayList(base.plus(Duration.standardSeconds(0)), base.plus(Duration.standardSeconds(10)), base.plus(Duration.standardSeconds(20)), base.plus(Duration.standardSeconds(30)));
    Random random = new Random();
    for (int i = 0; i < 100; ++i) {
        GenericClass item = new GenericClass(i, String.valueOf(i));
        allElements.add(item);
        firstWindowElements.add(TimestampedValue.of(item, firstWindowTimestamps.get(random.nextInt(firstWindowTimestamps.size()))));
    }

    // Second window: elements 100..199, timestamps within [60s, 120s).
    List<TimestampedValue<GenericClass>> secondWindowElements = new ArrayList<>();
    List<Instant> secondWindowTimestamps = Lists.newArrayList(base.plus(Duration.standardSeconds(60)), base.plus(Duration.standardSeconds(70)), base.plus(Duration.standardSeconds(80)), base.plus(Duration.standardSeconds(90)));
    for (int i = 100; i < 200; ++i) {
        GenericClass item = new GenericClass(i, String.valueOf(i));
        // Reuse the same instance for the expected list, as the first loop does.
        allElements.add(item);
        secondWindowElements.add(TimestampedValue.of(item, secondWindowTimestamps.get(random.nextInt(secondWindowTimestamps.size()))));
    }

    // addElements takes a first element plus varargs, hence the head/tail split of each array.
    TimestampedValue<GenericClass>[] firstWindowArray = firstWindowElements.toArray(new TimestampedValue[100]);
    TimestampedValue<GenericClass>[] secondWindowArray = secondWindowElements.toArray(new TimestampedValue[100]);
    TestStream<GenericClass> values = TestStream.create(AvroCoder.of(GenericClass.class))
        .advanceWatermarkTo(new Instant(0))
        .addElements(firstWindowArray[0], Arrays.copyOfRange(firstWindowArray, 1, firstWindowArray.length))
        .advanceWatermarkTo(new Instant(0).plus(Duration.standardMinutes(1)))
        .addElements(secondWindowArray[0], Arrays.copyOfRange(secondWindowArray, 1, secondWindowArray.length))
        .advanceWatermarkToInfinity();

    FilenamePolicy policy = new WindowedFilenamePolicy(baseFilename);
    windowedAvroWritePipeline
        .apply(values)
        .apply(Window.<GenericClass>into(FixedWindows.of(Duration.standardMinutes(1))))
        .apply(AvroIO.write(GenericClass.class).to(baseFilename).withFilenamePolicy(policy).withWindowedWrites().withNumShards(2));
    windowedAvroWritePipeline.run();

    // Build the expected file names: one per (shard, window) pair. The shard bound here must stay
    // in step with the withNumShards(2) call above.
    // NOTE(review): the "-of-1" suffix presumably mirrors how WindowedFilenamePolicy renders the
    // shard count; that class is not visible here - confirm before changing the shard count.
    List<File> expectedFiles = new ArrayList<>();
    for (int shard = 0; shard < 2; shard++) {
        for (int window = 0; window < 2; window++) {
            Instant windowStart = new Instant(0).plus(Duration.standardMinutes(window));
            IntervalWindow intervalWindow = new IntervalWindow(windowStart, Duration.standardMinutes(1));
            expectedFiles.add(new File(baseFilename + "-" + intervalWindow.toString() + "-" + shard + "-of-1" + "-pane-0-final"));
        }
    }

    // Read every expected file back; the final check compares contents regardless of order.
    List<GenericClass> actualElements = new ArrayList<>();
    for (File outputFile : expectedFiles) {
        assertTrue("Expected output file " + outputFile.getAbsolutePath(), outputFile.exists());
        try (DataFileReader<GenericClass> reader = new DataFileReader<>(outputFile, new ReflectDatumReader<GenericClass>(ReflectData.get().getSchema(GenericClass.class)))) {
            Iterators.addAll(actualElements, reader);
        }
        // Best-effort cleanup; the result is deliberately not asserted.
        outputFile.delete();
    }
    assertThat(actualElements, containsInAnyOrder(allElements.toArray()));
}
Also used : Path(java.nio.file.Path) Instant(org.joda.time.Instant) ArrayList(java.util.ArrayList) FilenamePolicy(org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy) DataFileReader(org.apache.avro.file.DataFileReader) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) Random(java.util.Random) File(java.io.File) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 4 with FilenamePolicy

Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in the Apache Beam project.

From the class FileBasedSinkTest, method testGenerateOutputFilenamesWithoutExtension:

/**
 * Checks the filenames produced by a {@code SimpleSink} filename policy configured with the
 * {@code "-SSSSS-of-NNNNN"} shard template and an empty extension, for three shards, one shard,
 * and zero shards.
 */
@Test
public void testGenerateOutputFilenamesWithoutExtension() {
    ResourceId baseDir = getBaseOutputDirectory();
    SimpleSink extensionlessSink = new SimpleSink(baseDir, "file", "-SSSSS-of-NNNNN", "");
    FilenamePolicy namePolicy = extensionlessSink.getFilenamePolicy();

    // Three shards.
    List<ResourceId> wantThree = Arrays.asList(
        baseDir.resolve("file-00000-of-00003", StandardResolveOptions.RESOLVE_FILE),
        baseDir.resolve("file-00001-of-00003", StandardResolveOptions.RESOLVE_FILE),
        baseDir.resolve("file-00002-of-00003", StandardResolveOptions.RESOLVE_FILE));
    List<ResourceId> gotThree = generateDestinationFilenames(baseDir, namePolicy, 3);
    assertEquals(wantThree, gotThree);

    // One shard.
    List<ResourceId> wantOne = Collections.singletonList(
        baseDir.resolve("file-00000-of-00001", StandardResolveOptions.RESOLVE_FILE));
    List<ResourceId> gotOne = generateDestinationFilenames(baseDir, namePolicy, 1);
    assertEquals(wantOne, gotOne);

    // No shards: nothing should be generated.
    List<ResourceId> wantNone = new ArrayList<>();
    List<ResourceId> gotNone = generateDestinationFilenames(baseDir, namePolicy, 0);
    assertEquals(wantNone, gotNone);
}
Also used : ResourceId(org.apache.beam.sdk.io.fs.ResourceId) FilenamePolicy(org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy) Test(org.junit.Test)

Example 5 with FilenamePolicy

Use of org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy in the Apache Beam project.

From the class WriteWithShardingFactoryTest, method withNoShardingSpecifiedReturnsNewTransform:

/**
 * Checks that, for a {@code WriteFiles} created without any sharding configuration, the
 * transform returned by {@code factory.getReplacementTransform} is not equal to the original.
 */
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
    ResourceId targetDir = LocalResources.fromString("/foo", true);
    FilenamePolicy filenamePolicy = DefaultFilenamePolicy.constructUsingStandardParameters(
        StaticValueProvider.of(targetDir), DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE, "");
    // Stub sink that fails loudly if a write operation is ever requested from it.
    FileBasedSink<Object> stubSink = new FileBasedSink<Object>(StaticValueProvider.of(targetDir), filenamePolicy) {

        @Override
        public WriteOperation<Object> createWriteOperation() {
            throw new IllegalArgumentException("Should not be used");
        }
    };
    WriteFiles<Object> original = WriteFiles.to(stubSink);

    @SuppressWarnings("unchecked")
    PCollection<Object> emptyInput = (PCollection) p.apply(Create.empty(VoidCoder.of()));
    AppliedPTransform<PCollection<Object>, PDone, WriteFiles<Object>> originalApplication =
        AppliedPTransform.of(
            "write",
            emptyInput.expand(),
            Collections.<TupleTag<?>, PValue>emptyMap(),
            original,
            p);

    Object replacement = factory.getReplacementTransform(originalApplication).getTransform();
    assertThat(replacement, not(equalTo((Object) original)));
}
Also used : DefaultFilenamePolicy(org.apache.beam.sdk.io.DefaultFilenamePolicy) FilenamePolicy(org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy) PCollection(org.apache.beam.sdk.values.PCollection) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) PDone(org.apache.beam.sdk.values.PDone) WriteFiles(org.apache.beam.sdk.io.WriteFiles) Test(org.junit.Test)

Aggregations

FilenamePolicy (org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy): 5, Test (org.junit.Test): 5, ResourceId (org.apache.beam.sdk.io.fs.ResourceId): 4, DefaultFilenamePolicy (org.apache.beam.sdk.io.DefaultFilenamePolicy): 2, File (java.io.File): 1, Path (java.nio.file.Path): 1, ArrayList (java.util.ArrayList): 1, Random (java.util.Random): 1, DataFileReader (org.apache.avro.file.DataFileReader): 1, WriteFiles (org.apache.beam.sdk.io.WriteFiles): 1, IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 1, PCollection (org.apache.beam.sdk.values.PCollection): 1, PDone (org.apache.beam.sdk.values.PDone): 1, TimestampedValue (org.apache.beam.sdk.values.TimestampedValue): 1, Instant (org.joda.time.Instant): 1, Category (org.junit.experimental.categories.Category): 1