Search in sources:

Example 66 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.

From the class OffsetBasedSourceTest, method testSplitAtFractionExhaustive.

/**
 * Runs the exhaustive split-at-fraction check against a coarse range source built with the
 * arguments (13, 35, 1, 10).
 */
@Test
public void testSplitAtFractionExhaustive() throws Exception {
    final PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    final CoarseRangeSource coarseSource = new CoarseRangeSource(13, 35, 1, 10);

    assertSplitAtFractionExhaustive(coarseSource, pipelineOptions);
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)

Example 67 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.

From the class FileBasedSourceTest, method testReadRangeFromFileWithSplitsFromMiddleOfHeader.

/**
 * Reads a file made of ten groups, each a {@code "<h>"} header followed by the output of
 * {@code createStringDataset(3, 9)}, from sources whose start offset lands inside or directly
 * after the first header (offsets 1, 2 and 3).
 *
 * <p>For each start offset the records read must equal, in any order, the data entries from
 * index 10 onwards with every header entry removed.
 *
 * @throws IOException if creating, matching or reading the test file fails
 */
@Test
public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";
    List<String> data = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        data.add(header);
        data.addAll(createStringDataset(3, 9));
    }
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    // Expected output: everything from index 10 on, with all occurrences of the header removed.
    List<String> expectedResults = new ArrayList<>(data.subList(10, data.size()));
    expectedResults.removeAll(Collections.singletonList(header));
    Object[] expectedRecords = expectedResults.toArray();

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

    // Start offsets 1, 2 and 3 begin the range after "<", "<h" and "<h>" of the first header.
    for (int startOffset = 1; startOffset <= header.length(); startOffset++) {
        TestFileBasedSource source =
                new TestFileBasedSource(metadata, 64, startOffset, Long.MAX_VALUE, header);
        // Actual value first, matcher built from the expectation second, so that a failure
        // report labels the two sides correctly.
        assertThat(
                "records read from start offset " + startOffset,
                readFromSource(source, options),
                containsInAnyOrder(expectedRecords));
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) File(java.io.File) Test(org.junit.Test)

Example 68 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.

From the class OffsetBasedSourceTest, method testProgress.

/**
 * Walks a reader over a coarse range source built with (13, 17, 1, 2) one step at a time and
 * checks, after every step, the reported fraction consumed and the split-point counters.
 *
 * @throws IOException if the reader fails while starting, advancing or closing
 */
@Test
public void testProgress() throws IOException {
    final double tolerance = 1e-6;
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    CoarseRangeSource rangeSource = new CoarseRangeSource(13, 17, 1, 2);

    try (OffsetBasedReader<Integer> offsetReader = rangeSource.createReader(pipelineOptions)) {
        // Before start(): nothing consumed, remaining split points not yet known.
        assertEquals(0.0, offsetReader.getFractionConsumed(), tolerance);
        assertEquals(0, offsetReader.getSplitPointsConsumed());
        assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, offsetReader.getSplitPointsRemaining());

        // start() yields 14 (granularity is 2), which is a split point.
        assertTrue(offsetReader.start());
        assertTrue(offsetReader.isAtSplitPoint());
        assertEquals(14, offsetReader.getCurrent().intValue());
        assertEquals(0, offsetReader.getSplitPointsConsumed());
        assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, offsetReader.getSplitPointsRemaining());

        // First advance() yields 15, which is not a split point; counters are unchanged.
        assertTrue(offsetReader.advance());
        assertEquals(15, offsetReader.getCurrent().intValue());
        assertEquals(0, offsetReader.getSplitPointsConsumed());
        assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, offsetReader.getSplitPointsRemaining());

        // Second advance() yields 16, a split point. The following offset (17) lies outside
        // [13, 17), so the remaining count switches from UNKNOWN to 1.
        assertTrue(offsetReader.advance());
        assertTrue(offsetReader.isAtSplitPoint());
        assertEquals(16, offsetReader.getCurrent().intValue());
        assertEquals(1, offsetReader.getSplitPointsConsumed());
        assertEquals(1, offsetReader.getSplitPointsRemaining());

        // Third advance() yields 17, not a split point; counters stay as they were.
        assertTrue(offsetReader.advance());
        assertEquals(17, offsetReader.getCurrent().intValue());
        assertEquals(1, offsetReader.getSplitPointsConsumed());
        assertEquals(1, offsetReader.getSplitPointsRemaining());

        // Final advance() hits the end: everything is consumed and nothing remains.
        assertFalse(offsetReader.advance());
        assertEquals(1.0, offsetReader.getFractionConsumed(), tolerance);
        assertEquals(2, offsetReader.getSplitPointsConsumed());
        assertEquals(0, offsetReader.getSplitPointsRemaining());
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Test(org.junit.Test)

Example 69 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.

From the class OffsetBasedSourceTest, method testReadingGranularityAndFractionConsumed.

/**
 * Checks that the coarse reader snaps to multiples of its granularity (this exercises test
 * code) and that {@code getFractionConsumed} stays within (0, 1] while advancing and reaches
 * 1.0 at the end. Also checks that a reader over (13, 17, 1, 10) fails to start.
 *
 * @throws IOException if a reader fails while starting, advancing or closing
 */
@Test
public void testReadingGranularityAndFractionConsumed() throws IOException {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

    CoarseRangeSource wideSource = new CoarseRangeSource(13, 35, 1, 10);
    try (CoarseRangeReader coarseReader = wideSource.createReader(pipelineOptions)) {
        List<Integer> produced = new ArrayList<>();
        assertEquals(0.0, coarseReader.getFractionConsumed(), 1e-6);

        assertTrue(coarseReader.start());
        produced.add(coarseReader.getCurrent());
        while (coarseReader.advance()) {
            // After every successful advance the fraction must be present and within (0, 1].
            Double consumed = coarseReader.getFractionConsumed();
            assertNotNull(consumed);
            assertTrue(consumed.toString(), consumed > 0.0);
            assertTrue(consumed.toString(), consumed <= 1.0);
            produced.add(coarseReader.getCurrent());
        }

        assertEquals(1.0, coarseReader.getFractionConsumed(), 1e-6);
        // Twenty items are expected, the first being 20 and the last 39.
        assertEquals(20, produced.size());
        assertEquals(20, produced.get(0).intValue());
        assertEquals(39, produced.get(produced.size() - 1).intValue());
    }

    // With the arguments (13, 17, 1, 10) the reader is expected to produce no element at all.
    CoarseRangeSource narrowSource = new CoarseRangeSource(13, 17, 1, 10);
    try (BoundedSource.BoundedReader<Integer> emptyReader =
            narrowSource.createReader(pipelineOptions)) {
        assertFalse(emptyReader.start());
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) Test(org.junit.Test)

Example 70 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.

From the class OffsetBasedSourceTest, method testSplitAtFraction.

/**
 * Reads four elements from a coarse range reader, verifies that split requests at fraction 0.0
 * and slightly behind the consumed fraction return {@code null}, then splits slightly ahead of
 * the consumed fraction and checks that the primary and residual sources are consistent with
 * what the original reader goes on to produce.
 *
 * @throws IOException if a reader fails while starting, advancing or closing
 */
@Test
public void testSplitAtFraction() throws IOException {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    CoarseRangeSource rangeSource = new CoarseRangeSource(13, 35, 1, 10);

    try (CoarseRangeReader coarseReader = rangeSource.createReader(pipelineOptions)) {
        List<Integer> seenByOriginal = new ArrayList<>();

        // Consume the first element, then three more.
        assertTrue(coarseReader.start());
        seenByOriginal.add(coarseReader.getCurrent());
        for (int step = 0; step < 3; step++) {
            assertTrue(coarseReader.advance());
            seenByOriginal.add(coarseReader.getCurrent());
        }

        // Split requests at 0.0 and just behind the consumed fraction must return null.
        assertNull(coarseReader.splitAtFraction(0.0));
        assertNull(coarseReader.splitAtFraction(coarseReader.getFractionConsumed() - 0.1));

        // A split just ahead of the consumed fraction yields a residual source.
        BoundedSource<Integer> residualSource =
                coarseReader.splitAtFraction(coarseReader.getFractionConsumed() + 0.1);
        BoundedSource<Integer> primarySource = coarseReader.getCurrentSource();
        List<Integer> fromPrimary = readFromSource(primarySource, pipelineOptions);
        List<Integer> fromResidual = readFromSource(residualSource, pipelineOptions);

        // Every residual item lies beyond the reader's current offset.
        for (Integer residualItem : fromResidual) {
            assertTrue(residualItem > coarseReader.getCurrentOffset());
        }
        assertFalse(fromPrimary.isEmpty());
        assertFalse(fromResidual.isEmpty());
        assertTrue(fromPrimary.get(fromPrimary.size() - 1) <= fromResidual.get(0));

        // Draining the original reader must reproduce exactly the primary source's items.
        while (coarseReader.advance()) {
            seenByOriginal.add(coarseReader.getCurrent());
        }
        assertEquals(seenByOriginal, fromPrimary);
    }
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ArrayList(java.util.ArrayList) Test(org.junit.Test)

Aggregations

PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 92; Test (org.junit.Test): 79; File (java.io.File): 26; ArrayList (java.util.ArrayList): 16; Pipeline (org.apache.beam.sdk.Pipeline): 10; Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 9; Path (java.nio.file.Path): 6; BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 6; SerializedPipelineOptions (org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions): 5; KV (org.apache.beam.sdk.values.KV): 5; Matchers.containsString (org.hamcrest.Matchers.containsString): 5; Table (com.google.api.services.bigquery.model.Table): 4; TableReference (com.google.api.services.bigquery.model.TableReference): 4; TableRow (com.google.api.services.bigquery.model.TableRow): 4; HashBasedTable (com.google.common.collect.HashBasedTable): 4; BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter): 4; BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 4; TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 4; TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 3; TableSchema (com.google.api.services.bigquery.model.TableSchema): 3