Example 81 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from the class FileBasedSourceTest, method testSplitAtFractionExhaustive.

@Test
public void testSplitAtFractionExhaustive() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Smaller file for exhaustive testing.
    File file = createFileWithData("file", createStringDataset(3, 20));
    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null);
    // Exhaustively checks dynamic splitting: every split fraction is tried after
    // reading every possible number of items, and the results must stay consistent.
    assertSplitAtFractionExhaustive(source, options);
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) File(java.io.File) Test(org.junit.Test)
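
Every snippet on this page builds its options with PipelineOptionsFactory.create(), which yields an options object populated with defaults. For comparison, a minimal standalone sketch of the two standard construction paths (both factory methods are Beam API; the class name and the example flag are invented for illustration):

import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsCreationSketch {
    public static void main(String[] args) {
        // Default-valued options, as used throughout these tests.
        PipelineOptions defaults = PipelineOptionsFactory.create();

        // Options parsed and validated from command-line arguments,
        // e.g. java OptionsCreationSketch --jobName=myJob
        PipelineOptions parsed = PipelineOptionsFactory.fromArgs(args).withValidation().create();
        System.out.println(parsed.getJobName());
    }
}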

Example 82 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from the class FileBasedSourceTest, method testFullyReadFilePattern.

@Test
public void testFullyReadFilePattern() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data1 = createStringDataset(3, 50);
    File file1 = createFileWithData("file1", data1);
    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);
    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);
    List<String> data4 = createStringDataset(3, 50);
    // "otherfile" does not match the "file*" pattern below, so data4 is
    // deliberately excluded from the expected results.
    createFileWithData("otherfile", data4);
    TestFileBasedSource source = new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);
    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data1);
    expectedResults.addAll(data2);
    expectedResults.addAll(data3);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) File(java.io.File) Test(org.junit.Test)
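
createStringDataset and createFileWithData are private helpers of FileBasedSourceTest and are not reproduced on this page. A rough sketch of what such helpers might look like (the bodies below are an assumption for illustration, not Beam's actual test code; tempFolder stands in for a JUnit TemporaryFolder rule):

// Hypothetical reconstruction: random fixed-length lowercase strings.
private static List<String> createStringDataset(int dataItemLength, int numItems) {
    List<String> items = new ArrayList<>();
    Random random = new Random(0);
    for (int i = 0; i < numItems; i++) {
        char[] chars = new char[dataItemLength];
        for (int j = 0; j < chars.length; j++) {
            chars[j] = (char) ('a' + random.nextInt(26));
        }
        items.add(new String(chars));
    }
    return items;
}

// Hypothetical reconstruction: one data item per line in a fresh temp file.
private File createFileWithData(String fileName, List<String> data) throws IOException {
    File file = tempFolder.newFile(fileName);
    Files.write(file.toPath(), data, StandardCharsets.UTF_8);
    return file;
}
Also used in this sketch: Random (java.util.Random) Files (java.nio.file.Files) StandardCharsets (java.nio.charset.StandardCharsets) TemporaryFolder (org.junit.rules.TemporaryFolder)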

Example 83 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from the class MinimalWordCountJava8, method main.

public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.create();
    // In order to run your pipeline, you need to make the following runner-specific changes:
    //
    // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner
    // or FlinkRunner.
    // CHANGE 2/3: Specify runner-required options.
    // For BlockingDataflowRunner, set project and temp location as follows:
    //   DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    //   dataflowOptions.setRunner(BlockingDataflowRunner.class);
    //   dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
    //   dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
    // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions}
    // for more details.
    //   options.as(FlinkPipelineOptions.class)
    //      .setRunner(FlinkRunner.class);
    Pipeline p = Pipeline.create(options);
    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
        .apply(FlatMapElements.into(TypeDescriptors.strings())
            .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
        .apply(Filter.by((String word) -> !word.isEmpty()))
        .apply(Count.<String>perElement())
        .apply(MapElements.into(TypeDescriptors.strings())
            .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
        .apply(TextIO.write().to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
    p.run().waitUntilFinish();
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline)
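
The comments above configure a runner by casting with options.as(...). The other common pattern in Beam programs is to declare a small options interface up front; a minimal sketch follows (MyWordCountOptions and its two properties are invented for this illustration; the annotations and factory calls are standard Beam API):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Hypothetical options interface for a word count like the one above.
public interface MyWordCountOptions extends PipelineOptions {
    @Description("Glob of input text files to read")
    @Default.String("gs://apache-beam-samples/shakespeare/*")
    String getInputFile();
    void setInputFile(String value);

    @Description("Output path prefix for the word counts")
    String getOutput();
    void setOutput(String value);
}

With such an interface, main would begin with PipelineOptionsFactory.fromArgs(args).withValidation().as(MyWordCountOptions.class) and read both paths from the options instead of hard-coding them.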

Example 84 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from the class AvroSourceTest, method testSplitsWithSmallBlocks.

@Test
public void testSplitsWithSmallBlocks() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Test reading from an object file with many small random-sized blocks.
    // The file itself doesn't have to be big; we can use a decreased record count.
    List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
    String filename = generateTestFile(
        "tmp.avro",
        expected,
        SyncBehavior.SYNC_RANDOM,
        DEFAULT_RECORD_COUNT / 20, /* max records per block */
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
    File file = new File(filename);
    // Small minimum bundle size
    AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);
    // Assert that the source produces the expected records
    assertEquals(expected, SourceTestUtils.readFromSource(source, options));
    List<? extends BoundedSource<Bird>> splits;
    int nonEmptySplits;
    // Split with the minimum bundle size
    splits = source.split(100L, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    nonEmptySplits = 0;
    for (BoundedSource<Bird> subSource : splits) {
        if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
            nonEmptySplits += 1;
        }
    }
    assertTrue(nonEmptySplits > 2);
    // Split with larger bundle size
    splits = source.split(file.length() / 4, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    nonEmptySplits = 0;
    for (BoundedSource<Bird> subSource : splits) {
        if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
            nonEmptySplits += 1;
        }
    }
    assertTrue(nonEmptySplits > 2);
    // Split with the file length
    splits = source.split(file.length(), options);
    assertTrue(splits.size() == 1);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) File(java.io.File) Test(org.junit.Test)
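
The loop counting splits that produce at least one record appears twice in this test. Were it being refactored, it could be extracted into a helper along these lines (a sketch reusing SourceTestUtils.readFromSource, which the test already calls):

// Sketch of a helper that would collapse the two counting loops above.
private static <T> int countNonEmptySplits(
        List<? extends BoundedSource<T>> splits, PipelineOptions options) throws Exception {
    int nonEmptySplits = 0;
    for (BoundedSource<T> subSource : splits) {
        // readFromSource materializes every record the sub-source produces.
        if (!SourceTestUtils.readFromSource(subSource, options).isEmpty()) {
            nonEmptySplits++;
        }
    }
    return nonEmptySplits;
}

Each loop-plus-assertion pair above would then reduce to assertTrue(countNonEmptySplits(splits, options) > 2);.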

Example 85 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from the class CompressedSourceTest, method testEmptyGzipProgress.

@Test
public void testEmptyGzipProgress() throws IOException {
    File tmpFile = tmpFolder.newFile("empty.gz");
    String filename = tmpFile.toPath().toString();
    writeFile(tmpFile, new byte[0], CompressionMode.GZIP);
    PipelineOptions options = PipelineOptionsFactory.create();
    CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filename, 1));
    try (BoundedReader<Byte> readerOrig = source.createReader(options)) {
        assertThat(readerOrig, instanceOf(CompressedReader.class));
        CompressedReader<Byte> reader = (CompressedReader<Byte>) readerOrig;
        // before starting
        assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
        assertEquals(0, reader.getSplitPointsConsumed());
        assertEquals(1, reader.getSplitPointsRemaining());
        // confirm empty
        assertFalse(reader.start());
        // after reading empty source
        assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
        assertEquals(0, reader.getSplitPointsConsumed());
        assertEquals(0, reader.getSplitPointsRemaining());
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) CompressedReader(org.apache.beam.sdk.io.CompressedSource.CompressedReader) Matchers.containsString(org.hamcrest.Matchers.containsString) File(java.io.File) Test(org.junit.Test)
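
writeFile is another helper local to the test class. For the GZIP case it presumably wraps a FileOutputStream in a GZIPOutputStream, roughly as sketched below (an assumption about the helper, using only the JDK; for the empty.gz file above, data is a zero-length array, so only the gzip header and trailer end up on disk):

// Hypothetical sketch of what writeFile(tmpFile, data, CompressionMode.GZIP) might do.
private static void writeGzipFile(File file, byte[] data) throws IOException {
    try (OutputStream out = new GZIPOutputStream(new FileOutputStream(file))) {
        out.write(data);
    }
}
Also used in this sketch: FileOutputStream (java.io.FileOutputStream) OutputStream (java.io.OutputStream) GZIPOutputStream (java.util.zip.GZIPOutputStream)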

Aggregations

PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 92
Test (org.junit.Test): 79
File (java.io.File): 26
ArrayList (java.util.ArrayList): 16
Pipeline (org.apache.beam.sdk.Pipeline): 10
Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 9
Path (java.nio.file.Path): 6
BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 6
SerializedPipelineOptions (org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions): 5
KV (org.apache.beam.sdk.values.KV): 5
Matchers.containsString (org.hamcrest.Matchers.containsString): 5
Table (com.google.api.services.bigquery.model.Table): 4
TableReference (com.google.api.services.bigquery.model.TableReference): 4
TableRow (com.google.api.services.bigquery.model.TableRow): 4
HashBasedTable (com.google.common.collect.HashBasedTable): 4
BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter): 4
BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 4
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 4
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 3
TableSchema (com.google.api.services.bigquery.model.TableSchema): 3