Example 31 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class FileBasedSourceTest, method testReadAllSplitsOfSingleFile.

@Test
public void testReadAllSplitsOfSingleFile() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data = createStringDataset(3, 50);
    String fileName = "file";
    File file = createFileWithData(fileName, data);
    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 16, null);
    List<? extends BoundedSource<String>> sources = source.split(32, null);
    // Not a trivial split.
    assertTrue(sources.size() > 1);
    List<String> results = new ArrayList<String>();
    for (BoundedSource<String> split : sources) {
        results.addAll(readFromSource(split, options));
    }
    assertThat(data, containsInAnyOrder(results.toArray()));
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) File(java.io.File) Test(org.junit.Test)
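
The pattern above (split a source, read every split, and compare against the original data) generalizes to any BoundedSource. A minimal sketch of that pattern as a reusable helper, assuming the SourceTestUtils.readFromSource utility already used in these examples; the helper name readAllSplits is hypothetical:

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.testing.SourceTestUtils;

// Hypothetical helper: split a BoundedSource and read back every resulting split.
static <T> List<T> readAllSplits(
        BoundedSource<T> source, long desiredBundleSizeBytes, PipelineOptions options)
        throws Exception {
    List<T> results = new ArrayList<>();
    for (BoundedSource<T> split : source.split(desiredBundleSizeBytes, options)) {
        results.addAll(SourceTestUtils.readFromSource(split, options));
    }
    return results;
}

With such a helper, the test only needs to compare the returned list against the expected dataset, independent of how many splits were produced.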

Example 32 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class CompressedSourceTest, method testUnsplittable.

@Test
public void testUnsplittable() throws IOException {
    String baseName = "test-input";
    File compressedFile = tmpFolder.newFile(baseName + ".gz");
    byte[] input = generateInput(10000);
    writeFile(compressedFile, input, CompressionMode.GZIP);
    CompressedSource<Byte> source = CompressedSource.from(new ByteSource(compressedFile.getPath(), 1));
    List<Byte> expected = Lists.newArrayList();
    for (byte i : input) {
        expected.add(i);
    }
    PipelineOptions options = PipelineOptionsFactory.create();
    BoundedReader<Byte> reader = source.createReader(options);
    List<Byte> actual = Lists.newArrayList();
    for (boolean hasNext = reader.start(); hasNext; hasNext = reader.advance()) {
        actual.add(reader.getCurrent());
        // every 9 elements, verify that dynamic splitting is rejected (the compressed source is unsplittable)
        if (actual.size() % 9 == 0) {
            Double fractionConsumed = reader.getFractionConsumed();
            assertNotNull(fractionConsumed);
            assertNull(reader.splitAtFraction(fractionConsumed));
        }
    }
    assertEquals(expected.size(), actual.size());
    assertEquals(Sets.newHashSet(expected), Sets.newHashSet(actual));
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Matchers.containsString(org.hamcrest.Matchers.containsString) File(java.io.File) Test(org.junit.Test)
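
The start()/advance()/getCurrent() loop used above is the standard way to drain a BoundedReader outside of a running pipeline. A minimal sketch of that loop as a helper, assuming only the BoundedSource API shown in these examples; the helper name drainReader is hypothetical:

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;

// Hypothetical helper: read every element from a BoundedSource through its reader.
static <T> List<T> drainReader(BoundedSource<T> source, PipelineOptions options)
        throws Exception {
    List<T> elements = new ArrayList<>();
    BoundedSource.BoundedReader<T> reader = source.createReader(options);
    try {
        for (boolean more = reader.start(); more; more = reader.advance()) {
            elements.add(reader.getCurrent());
        }
    } finally {
        // Readers may hold file handles or connections, so always close them.
        reader.close();
    }
    return elements;
}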

Example 33 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class MongoDBGridFSIOTest, method testSplit.

@Test
public void testSplit() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    MongoDbGridFSIO.Read<String> read = MongoDbGridFSIO.<String>read().withUri("mongodb://localhost:" + port).withDatabase(DATABASE);
    BoundedGridFSSource src = new BoundedGridFSSource(read, null);
    // size bundles so that roughly two of the five stored files fit in each one, giving three splits
    long desiredBundleSizeBytes = (src.getEstimatedSizeBytes(options) * 2L) / 5L + 1000;
    List<? extends BoundedSource<ObjectId>> splits = src.split(desiredBundleSizeBytes, options);
    int expectedNbSplits = 3;
    assertEquals(expectedNbSplits, splits.size());
    SourceTestUtils.assertSourcesEqualReferenceSource(src, splits, options);
    int nonEmptySplits = 0;
    int count = 0;
    for (BoundedSource<ObjectId> subSource : splits) {
        List<ObjectId> result = SourceTestUtils.readFromSource(subSource, options);
        if (result.size() > 0) {
            nonEmptySplits += 1;
        }
        count += result.size();
    }
    assertEquals(expectedNbSplits, nonEmptySplits);
    assertEquals(5, count);
}
Also used : ObjectId(org.bson.types.ObjectId) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) BoundedGridFSSource(org.apache.beam.sdk.io.mongodb.MongoDbGridFSIO.Read.BoundedGridFSSource) Test(org.junit.Test)
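
The bundle-size arithmetic in this test is how split() is usually steered toward a target number of bundles: the desired bundle size is derived from getEstimatedSizeBytes(). A minimal sketch of the same calculation for an arbitrary BoundedSource; the helper name splitIntoBundles and the targetBundles parameter are hypothetical:

import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;

// Hypothetical helper: derive a bundle size that yields roughly targetBundles splits.
static <T> List<? extends BoundedSource<T>> splitIntoBundles(
        BoundedSource<T> source, int targetBundles, PipelineOptions options)
        throws Exception {
    long estimatedSizeBytes = source.getEstimatedSizeBytes(options);
    // Add 1 so the bundle size is never zero for tiny or empty sources.
    long desiredBundleSizeBytes = estimatedSizeBytes / targetBundles + 1;
    return source.split(desiredBundleSizeBytes, options);
}

Note that split() treats the bundle size as a hint, which is why tests like the one above still assert the resulting split count explicitly.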

Example 34 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class JmsIOTest, method testSplitForQueue.

@Test
public void testSplitForQueue() throws Exception {
    JmsIO.Read read = JmsIO.read().withQueue(QUEUE);
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    int desiredNumSplits = 5;
    JmsIO.UnboundedJmsSource initialSource = new JmsIO.UnboundedJmsSource(read);
    List<JmsIO.UnboundedJmsSource> splits = initialSource.split(desiredNumSplits, pipelineOptions);
    // in the case of a queue, we have concurrent consumers by default, so the initial number
    // of splits is equal to the desired number of splits
    assertEquals(desiredNumSplits, splits.size());
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)
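
PipelineOptionsFactory.create() used throughout these tests builds empty default options; in a real pipeline the same factory normally parses command-line arguments into a custom options sub-interface. A minimal sketch of that setup; the class name BrokerOptionsExample, the interface BrokerOptions and its property are hypothetical, while @Description, @Default, register() and fromArgs() are standard members of org.apache.beam.sdk.options:

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class BrokerOptionsExample {

    // Hypothetical options interface carrying connection settings for a test like the one above.
    public interface BrokerOptions extends PipelineOptions {

        @Description("URI of the message broker to connect to")
        @Default.String("tcp://localhost:61616")
        String getBrokerUri();

        void setBrokerUri(String value);
    }

    public static void main(String[] args) {
        // Register the interface (enables --help and validation), then parse and view the options.
        PipelineOptionsFactory.register(BrokerOptions.class);
        BrokerOptions options =
                PipelineOptionsFactory.fromArgs(args).withValidation().as(BrokerOptions.class);
        System.out.println("broker URI: " + options.getBrokerUri());
    }
}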

Example 35 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class DirectRunnerTest, method testMutatingOutputWithEnforcementDisabledSucceeds.

/**
   * Tests that a {@link DoFn} that mutates an output with a good equals() succeeds in the
   * {@link DirectRunner} when immutability enforcement is disabled.
   */
@Test
public void testMutatingOutputWithEnforcementDisabledSucceeds() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setRunner(DirectRunner.class);
    options.as(DirectOptions.class).setEnforceImmutability(false);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of(42)).apply(ParDo.of(new DoFn<Integer, List<Integer>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            List<Integer> outputList = Arrays.asList(1, 2, 3, 4);
            c.output(outputList);
            outputList.set(0, 37);
            c.output(outputList);
        }
    }));
    pipeline.run();
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DoFn(org.apache.beam.sdk.transforms.DoFn) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
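
For contrast with the test above, leaving immutability enforcement at its default (enabled) should cause the DirectRunner to reject the same mutating DoFn when the pipeline runs. A minimal sketch of that counterpart check, assuming the same imports as the example above plus org.junit.Assert.fail; the test name is hypothetical, and the concrete exception type surfaced by the runner can vary by version, so the sketch only checks that run() fails:

@Test
public void testMutatingOutputWithEnforcementEnabledFails() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setRunner(DirectRunner.class);
    // Enforcement is enabled by default, so no DirectOptions tweak is needed.
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of(42)).apply(ParDo.of(new DoFn<Integer, List<Integer>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            List<Integer> outputList = Arrays.asList(1, 2, 3, 4);
            c.output(outputList);
            // Mutating an element after it has been output violates the Beam model.
            outputList.set(0, 37);
            c.output(outputList);
        }
    }));
    try {
        pipeline.run();
        fail("expected the DirectRunner to detect the mutated output");
    } catch (RuntimeException expected) {
        // The DirectRunner surfaces the illegal mutation as a runtime failure.
    }
}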

Aggregations

Types most frequently used together with PipelineOptions across these examples, with usage counts:

PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 92
Test (org.junit.Test): 79
File (java.io.File): 26
ArrayList (java.util.ArrayList): 16
Pipeline (org.apache.beam.sdk.Pipeline): 10
Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 9
Path (java.nio.file.Path): 6
BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 6
SerializedPipelineOptions (org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions): 5
KV (org.apache.beam.sdk.values.KV): 5
Matchers.containsString (org.hamcrest.Matchers.containsString): 5
Table (com.google.api.services.bigquery.model.Table): 4
TableReference (com.google.api.services.bigquery.model.TableReference): 4
TableRow (com.google.api.services.bigquery.model.TableRow): 4
HashBasedTable (com.google.common.collect.HashBasedTable): 4
BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter): 4
BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 4
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 4
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 3
TableSchema (com.google.api.services.bigquery.model.TableSchema): 3