use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class FileBasedSourceTest method testSplitAtFractionExhaustive.
/**
 * Runs the exhaustive split-at-fraction check against a {@code TestFileBasedSource} backed by a
 * deliberately small file.
 */
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  // Keep the input small: the exhaustive check is only practical on a small file.
  File smallFile = createFileWithData("file", createStringDataset(3, 20));
  Metadata smallFileMetadata = FileSystems.matchSingleFileSpec(smallFile.getPath());
  TestFileBasedSource sourceUnderTest =
      new TestFileBasedSource(smallFileMetadata, 1, 0, smallFile.length(), null);

  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  assertSplitAtFractionExhaustive(sourceUnderTest, pipelineOptions);
}
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class FileBasedSourceTest method testFullyReadFilePattern.
/**
 * Verifies that a {@code TestFileBasedSource} built from a {@code file*} pattern reads the full
 * contents of {@code file1}, {@code file2} and {@code file3}, and nothing else.
 *
 * <p>A fourth file named {@code otherfile} is also created; its records are not part of the
 * expected results.
 *
 * @throws IOException if creating the files or reading from the source fails
 */
@Test
public void testFullyReadFilePattern() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();

  List<String> data1 = createStringDataset(3, 50);
  File file1 = createFileWithData("file1", data1);
  List<String> data2 = createStringDataset(3, 50);
  createFileWithData("file2", data2);
  List<String> data3 = createStringDataset(3, 50);
  createFileWithData("file3", data3);
  // Decoy: its name does not start with "file", so its records must not show up in the output.
  List<String> data4 = createStringDataset(3, 50);
  createFileWithData("otherfile", data4);

  // Pattern rooted at file1's parent directory (presumably all four files live there; this
  // depends on createFileWithData, which is defined outside this method).
  TestFileBasedSource source =
      new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);

  List<String> expectedResults = new ArrayList<>();
  expectedResults.addAll(data1);
  expectedResults.addAll(data2);
  expectedResults.addAll(data3);

  // assertThat takes the actual value first and the matcher second, so the records read from the
  // source are the value under test and the matcher is built from the expected records. That way
  // a failure message reports "expected" and "was" on the correct sides.
  assertThat(readFromSource(source, options), containsInAnyOrder(expectedResults.toArray()));
}
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class MinimalWordCountJava8 method main.
/**
 * Builds and runs a minimal word-count pipeline: reads text matching
 * {@code gs://apache-beam-samples/shakespeare/*}, splits it into words, counts each distinct
 * word, formats the counts as {@code word: count} strings and writes them out.
 *
 * @param args command-line arguments; not consulted by this method
 */
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Running this pipeline requires a few runner-specific edits first:
  //
  // CHANGE 1/3: Pick a Beam runner, for example BlockingDataflowRunner
  // or FlinkRunner.
  // CHANGE 2/3: Supply the options that the chosen runner requires.
  // With BlockingDataflowRunner, configure the project and temp location like this:
  // DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
  // dataflowOptions.setRunner(BlockingDataflowRunner.class);
  // dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
  // dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
  // With FlinkRunner, configure the runner like this. See {@code FlinkPipelineOptions}
  // for further details.
  // options.as(FlinkPipelineOptions.class)
  // .setRunner(FlinkRunner.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
      // Split every input element on runs of non-letter characters.
      .apply(
          FlatMapElements.into(TypeDescriptors.strings())
              .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      // Drop the empty tokens that the split can produce.
      .apply(Filter.by((String token) -> !token.isEmpty()))
      // Count the occurrences of each distinct token.
      .apply(Count.<String>perElement())
      // Render each (word, count) pair as "word: count".
      .apply(
          MapElements.into(TypeDescriptors.strings())
              .via((KV<String, Long> entry) -> entry.getKey() + ": " + entry.getValue()))
      // The output location below is a placeholder and has to be replaced before running.
      .apply(TextIO.write().to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));

  pipeline.run().waitUntilFinish();
}
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class AvroSourceTest method testSplitsWithSmallBlocks.
/**
 * Reads and splits an Avro file made of many small, random-sized blocks.
 *
 * <p>The test first checks that the unsplit source yields the expected records, then splits the
 * source with three desired bundle sizes (the minimum bundle size, a quarter of the file length,
 * and the full file length). For every split it checks the number of resulting sources and uses
 * {@code SourceTestUtils.assertSourcesEqualReferenceSource} to compare them with the unsplit
 * source.
 */
@Test
public void testSplitsWithSmallBlocks() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Test reading from an object file with many small random-sized blocks.
  // The file itself doesn't have to be big; we can use a decreased record count.
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_RANDOM,
          DEFAULT_RECORD_COUNT / 20 /* max records/block */,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  // Small minimum bundle size
  AvroSource<Bird> source =
      AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);

  // Assert that the source produces the expected records
  assertEquals(expected, SourceTestUtils.readFromSource(source, options));

  List<? extends BoundedSource<Bird>> splits;

  // Split with the minimum bundle size
  splits = source.split(100L, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  assertTrue(countNonEmptySplits(splits, options) > 2);

  // Split with larger bundle size
  splits = source.split(file.length() / 4, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  assertTrue(countNonEmptySplits(splits, options) > 2);

  // Split with the file length
  splits = source.split(file.length(), options);
  // assertEquals (rather than assertTrue on ==) reports the actual split count on failure.
  assertEquals(1, splits.size());
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
}

/**
 * Counts how many of the given splits produce at least one record when read.
 *
 * @param splits the sources to read
 * @param options pipeline options passed through to {@code SourceTestUtils.readFromSource}
 * @return the number of splits whose read result is non-empty
 * @throws Exception if reading any of the splits fails
 */
private static int countNonEmptySplits(
    List<? extends BoundedSource<Bird>> splits, PipelineOptions options) throws Exception {
  int nonEmptySplits = 0;
  for (BoundedSource<Bird> subSource : splits) {
    if (!SourceTestUtils.readFromSource(subSource, options).isEmpty()) {
      nonEmptySplits += 1;
    }
  }
  return nonEmptySplits;
}
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class CompressedSourceTest method testEmptyGzipProgress.
/**
 * Checks the progress values reported by a {@code CompressedReader} over a file written from a
 * zero-length byte array in {@code CompressionMode.GZIP}: once before the reader is started and
 * once after {@code start()} has reported that there is nothing to read.
 *
 * @throws IOException if creating the file or reading from the source fails
 */
@Test
public void testEmptyGzipProgress() throws IOException {
  final double tolerance = 1e-6;

  File emptyGzip = tmpFolder.newFile("empty.gz");
  writeFile(emptyGzip, new byte[0], CompressionMode.GZIP);
  String emptyGzipPath = emptyGzip.toPath().toString();

  CompressedSource<Byte> compressedSource =
      CompressedSource.from(new ByteSource(emptyGzipPath, 1));
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

  try (BoundedReader<Byte> createdReader = compressedSource.createReader(pipelineOptions)) {
    // The progress accessors are called on CompressedReader, so check the type, then narrow.
    assertThat(createdReader, instanceOf(CompressedReader.class));
    CompressedReader<Byte> compressedReader = (CompressedReader<Byte>) createdReader;

    // Not started yet: nothing consumed, one split point remaining.
    assertEquals(0.0, compressedReader.getFractionConsumed(), tolerance);
    assertEquals(0, compressedReader.getSplitPointsConsumed());
    assertEquals(1, compressedReader.getSplitPointsRemaining());

    // Starting the reader must report that there are no records.
    assertFalse(compressedReader.start());

    // Finished: fully consumed, with no split points consumed or remaining.
    assertEquals(1.0, compressedReader.getFractionConsumed(), tolerance);
    assertEquals(0, compressedReader.getSplitPointsConsumed());
    assertEquals(0, compressedReader.getSplitPointsRemaining());
  }
}
Aggregations