
Example 6 with Pipeline

Use of org.apache.beam.sdk.Pipeline in the apache/beam project.

From class TestDataflowRunnerTest, method testCheckingForSuccessSkipsNonTentativeMetrics.

@Test
public void testCheckingForSuccessSkipsNonTentativeMetrics() throws Exception {
    DataflowPipelineJob job = spy(new DataflowPipelineJob(mockClient, "test-job", options, null));
    Pipeline p = TestPipeline.create(options);
    PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
    PAssert.that(pc).containsInAnyOrder(1, 2, 3);
    when(mockClient.getJobMetrics(anyString())).thenReturn(buildJobMetrics(generateMockMetrics(true /* success */, false)));
    TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
    runner.updatePAssertCount(p);
    doReturn(State.RUNNING).when(job).getState();
    assertThat(runner.checkForPAssertSuccess(job), equalTo(Optional.<Boolean>absent()));
}
Also used: TestPipeline (org.apache.beam.sdk.testing.TestPipeline), Pipeline (org.apache.beam.sdk.Pipeline), Test (org.junit.Test)
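
A minimal, self-contained sketch of the PAssert pattern that the mocked test above ultimately verifies; it assumes a local runner on the test classpath, and the class and method names are illustrative rather than taken from the Beam sources.

import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;

public class MinimalPAssertSketch {

    // As a JUnit rule, TestPipeline builds its own options and picks the runner from the classpath.
    @Rule
    public final transient TestPipeline pipeline = TestPipeline.create();

    @Test
    public void assertionIsCheckedWhenThePipelineRuns() {
        PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3));
        // PAssert installs an assertion transform; it is evaluated when the pipeline runs.
        PAssert.that(numbers).containsInAnyOrder(1, 2, 3);
        pipeline.run().waitUntilFinish();
    }
}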

Example 7 with Pipeline

Use of org.apache.beam.sdk.Pipeline in the apache/beam project.

From class TestDataflowRunnerTest, method testRunBatchJobThatSucceeds.

@Test
public void testRunBatchJobThatSucceeds() throws Exception {
    Pipeline p = Pipeline.create(options);
    PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
    PAssert.that(pc).containsInAnyOrder(1, 2, 3);
    DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
    when(mockJob.getState()).thenReturn(State.DONE);
    when(mockJob.getProjectId()).thenReturn("test-project");
    when(mockJob.getJobId()).thenReturn("test-job");
    DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
    when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);
    TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
    when(mockClient.getJobMetrics(anyString())).thenReturn(generateMockMetricResponse(true /* success */, true));
    assertEquals(mockJob, runner.run(p, mockRunner));
}
Also used: TestPipeline (org.apache.beam.sdk.testing.TestPipeline), Pipeline (org.apache.beam.sdk.Pipeline), Test (org.junit.Test)
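
Outside of tests, the runner that run() dispatches to is normally selected through PipelineOptions rather than by injecting a mock. A hedged sketch of that pattern; the flag values in the comment are placeholders, not taken from the example above.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class RunnerSelectionSketch {
    public static void main(String[] args) {
        // e.g. --runner=DataflowRunner --project=my-project --tempLocation=gs://my-bucket/tmp
        PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
        Pipeline pipeline = Pipeline.create(options);
        // ... attach transforms to the pipeline here ...
        pipeline.run().waitUntilFinish();
    }
}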

Example 8 with Pipeline

Use of org.apache.beam.sdk.Pipeline in the apache/beam project.

From class WriteFilesTest, method testCustomShardedWrite.

@Test
@Category(NeedsRunner.class)
public void testCustomShardedWrite() throws IOException {
    // Flag to validate that the pipeline options are passed to the Sink
    WriteOptions options = TestPipeline.testingPipelineOptions().as(WriteOptions.class);
    options.setTestFlag("test_value");
    Pipeline p = TestPipeline.create(options);
    List<String> inputs = new ArrayList<>();
    // Prepare timestamps for the elements.
    List<Long> timestamps = new ArrayList<>();
    for (long i = 0; i < 1000; i++) {
        inputs.add(Integer.toString(3));
        timestamps.add(i + 1);
    }
    SimpleSink sink = makeSimpleSink();
    WriteFiles<String> write = WriteFiles.to(sink).withSharding(new LargestInt());
    p.apply(Create.timestamped(inputs, timestamps).withCoder(StringUtf8Coder.of())).apply(IDENTITY_MAP).apply(write);
    p.run();
    checkFileContents(getBaseOutputFilename(), inputs, Optional.of(3));
}
Also used: ArrayList (java.util.ArrayList), TestPipeline (org.apache.beam.sdk.testing.TestPipeline), Pipeline (org.apache.beam.sdk.Pipeline), Category (org.junit.experimental.categories.Category), Test (org.junit.Test)
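
WriteFiles.withSharding(...) above plugs in a custom sharding transform; the more common way to pin a shard count is through the file-based IO transforms that wrap WriteFiles. A hedged sketch using TextIO with a placeholder output path, not part of the test above:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class FixedShardCountSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        p.apply(Create.of("a", "b", "c"))
                // Force exactly three output shards instead of letting the runner decide.
                .apply(TextIO.write().to("/tmp/output/part").withNumShards(3));
        p.run().waitUntilFinish();
    }
}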

Example 9 with Pipeline

Use of org.apache.beam.sdk.Pipeline in the apache/beam project.

From class BatchLoads, method expand.

@Override
public WriteResult expand(PCollection<KV<DestinationT, TableRow>> input) {
    Pipeline p = input.getPipeline();
    final String stepUuid = BigQueryHelpers.randomUUIDString();
    PCollectionView<String> tempFilePrefix = p.apply("Create", Create.of((Void) null)).apply("GetTempFilePrefix", ParDo.of(new DoFn<Void, String>() {

        @ProcessElement
        public void getTempFilePrefix(ProcessContext c) {
            c.output(resolveTempLocation(c.getPipelineOptions().getTempLocation(), "BigQueryWriteTemp", stepUuid));
        }
    })).apply("TempFilePrefixView", View.<String>asSingleton());
    // Create a singleton job ID token at execution time. This will be used as the base for all
    // load jobs issued from this instance of the transform.
    PCollectionView<String> jobIdTokenView = p.apply("TriggerIdCreation", Create.of("ignored")).apply("CreateJobId", MapElements.via(new SimpleFunction<String, String>() {

        @Override
        public String apply(String input) {
            return stepUuid;
        }
    })).apply(View.<String>asSingleton());
    PCollection<KV<DestinationT, TableRow>> inputInGlobalWindow =
        input.apply(
            "rewindowIntoGlobal",
            Window.<KV<DestinationT, TableRow>>into(new GlobalWindows())
                .triggering(DefaultTrigger.of())
                .discardingFiredPanes());
    PCollectionView<Map<DestinationT, String>> schemasView = inputInGlobalWindow.apply(new CalculateSchemas<>(dynamicDestinations));
    TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag = new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {
    };
    TupleTag<KV<ShardedKey<DestinationT>, TableRow>> unwrittedRecordsTag = new TupleTag<KV<ShardedKey<DestinationT>, TableRow>>("unwrittenRecords") {
    };
    PCollectionTuple writeBundlesTuple =
        inputInGlobalWindow.apply(
            "WriteBundlesToFiles",
            ParDo.of(new WriteBundlesToFiles<>(stepUuid, unwrittedRecordsTag, maxNumWritersPerBundle, maxFileSize))
                .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
    PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles = writeBundlesTuple.get(writtenFilesTag).setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
    // If the bundles contain too many output tables to be written inline to files (due to memory
    // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
    // Group these records by key, and write the files after grouping. Since the record is grouped
    // by key, we can ensure that only one file is open at a time in each bundle.
    PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
        writeBundlesTuple
            .get(unwrittedRecordsTag)
            .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), TableRowJsonCoder.of()))
            .apply(GroupByKey.<ShardedKey<DestinationT>, TableRow>create())
            .apply(
                ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize))
                    .withSideInputs(tempFilePrefix))
            .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
    // PCollection of filename, file byte size, and table destination.
    PCollection<WriteBundlesToFiles.Result<DestinationT>> results = PCollectionList.of(writtenFiles).and(writtenFilesGrouped).apply(Flatten.<Result<DestinationT>>pCollections());
    TupleTag<KV<ShardedKey<DestinationT>, List<String>>> multiPartitionsTag = new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("multiPartitionsTag") {
    };
    TupleTag<KV<ShardedKey<DestinationT>, List<String>>> singlePartitionTag = new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("singlePartitionTag") {
    };
    // Turn the list of files and record counts in a PCollectionView that can be used as a
    // side input.
    PCollectionView<Iterable<WriteBundlesToFiles.Result<DestinationT>>> resultsView = results.apply("ResultsView", View.<WriteBundlesToFiles.Result<DestinationT>>asIterable());
    // This transform will look at the set of files written for each table, and if any table has
    // too many files or bytes, will partition that table's files into multiple partitions for
    // loading.
    PCollection<Void> singleton = p.apply("singleton", Create.of((Void) null).withCoder(VoidCoder.of()));
    PCollectionTuple partitions =
        singleton.apply(
            "WritePartition",
            ParDo.of(new WritePartition<>(singletonTable, tempFilePrefix, resultsView, multiPartitionsTag, singlePartitionTag))
                .withSideInputs(tempFilePrefix, resultsView)
                .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
    List<PCollectionView<?>> writeTablesSideInputs = Lists.newArrayList(jobIdTokenView, schemasView);
    writeTablesSideInputs.addAll(dynamicDestinations.getSideInputs());
    Coder<KV<ShardedKey<DestinationT>, List<String>>> partitionsCoder = KvCoder.of(ShardedKeyCoder.of(NullableCoder.of(destinationCoder)), ListCoder.of(StringUtf8Coder.of()));
    // If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
    // the import needs to be split into multiple partitions, and those partitions will be
    // specified in multiPartitionsTag.
    PCollection<KV<TableDestination, String>> tempTables =
        partitions
            .get(multiPartitionsTag)
            .setCoder(partitionsCoder)
            .apply("MultiPartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of())
            .apply(
                "MultiPartitionsWriteTables",
                ParDo.of(
                        new WriteTables<>(
                            false,
                            bigQueryServices,
                            jobIdTokenView,
                            schemasView,
                            WriteDisposition.WRITE_EMPTY,
                            CreateDisposition.CREATE_IF_NEEDED,
                            dynamicDestinations))
                    .withSideInputs(writeTablesSideInputs));
    // This view maps each final table destination to the set of temporary partitioned tables
    // the PCollection was loaded into.
    PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView = tempTables.apply("TempTablesView", View.<TableDestination, String>asMultimap());
    singleton.apply("WriteRename", ParDo.of(new WriteRename(bigQueryServices, jobIdTokenView, writeDisposition, createDisposition, tempTablesView)).withSideInputs(tempTablesView, jobIdTokenView));
    // Write single partition to final table
    partitions
        .get(singlePartitionTag)
        .setCoder(partitionsCoder)
        .apply("SinglePartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of())
        .apply(
            "SinglePartitionWriteTables",
            ParDo.of(
                    new WriteTables<>(
                        true,
                        bigQueryServices,
                        jobIdTokenView,
                        schemasView,
                        writeDisposition,
                        createDisposition,
                        dynamicDestinations))
                .withSideInputs(writeTablesSideInputs));
    PCollection<TableRow> empty = p.apply("CreateEmptyFailedInserts", Create.empty(TypeDescriptor.of(TableRow.class)));
    return WriteResult.in(input.getPipeline(), new TupleTag<TableRow>("failedInserts"), empty);
}
Also used: TupleTag (org.apache.beam.sdk.values.TupleTag), Result (org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), TupleTagList (org.apache.beam.sdk.values.TupleTagList), PCollectionList (org.apache.beam.sdk.values.PCollectionList), List (java.util.List), GlobalWindows (org.apache.beam.sdk.transforms.windowing.GlobalWindows), KV (org.apache.beam.sdk.values.KV), Pipeline (org.apache.beam.sdk.Pipeline), PCollectionView (org.apache.beam.sdk.values.PCollectionView), TableRow (com.google.api.services.bigquery.model.TableRow), Map (java.util.Map)
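
BatchLoads.expand leans heavily on multi-output ParDo: withOutputTags routes each record either to the written-files output or to the spilled-records output. A stripped-down, hedged sketch of that pattern; the tag names, element type, and threshold are illustrative only.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputParDoSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        // Anonymous subclasses so each tag retains its element type for coder inference.
        final TupleTag<Integer> smallTag = new TupleTag<Integer>("small") {};
        final TupleTag<Integer> largeTag = new TupleTag<Integer>("large") {};

        PCollectionTuple outputs =
            p.apply(Create.of(1, 50, 200, 7))
                .apply(ParDo.of(new DoFn<Integer, Integer>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        if (c.element() < 100) {
                            c.output(c.element());           // main output, bound to smallTag
                        } else {
                            c.output(largeTag, c.element()); // additional output
                        }
                    }
                }).withOutputTags(smallTag, TupleTagList.of(largeTag)));

        PCollection<Integer> small = outputs.get(smallTag); // elements under 100
        PCollection<Integer> large = outputs.get(largeTag); // the rest
        p.run().waitUntilFinish();
    }
}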

Example 10 with Pipeline

Use of org.apache.beam.sdk.Pipeline in the apache/beam project.

From class PCollectionTupleTest, method testExpandHasMatchingTags.

@Test
public void testExpandHasMatchingTags() {
    TupleTag<Integer> intTag = new TupleTag<>();
    TupleTag<String> strTag = new TupleTag<>();
    TupleTag<Long> longTag = new TupleTag<>();
    Pipeline p = TestPipeline.create();
    PCollection<Long> longs = p.apply(GenerateSequence.from(0).to(100));
    PCollection<String> strs = p.apply(Create.of("foo", "bar", "baz"));
    PCollection<Integer> ints = longs.apply(MapElements.via(new SimpleFunction<Long, Integer>() {

        @Override
        public Integer apply(Long input) {
            return input.intValue();
        }
    }));
    Map<TupleTag<?>, PCollection<?>> pcsByTag = ImmutableMap.<TupleTag<?>, PCollection<?>>builder().put(strTag, strs).put(intTag, ints).put(longTag, longs).build();
    PCollectionTuple tuple = PCollectionTuple.of(intTag, ints).and(longTag, longs).and(strTag, strs);
    assertThat(tuple.getAll(), equalTo(pcsByTag));
    PCollectionTuple reconstructed = PCollectionTuple.empty(p);
    for (Entry<TupleTag<?>, PValue> taggedValue : tuple.expand().entrySet()) {
        TupleTag<?> tag = taggedValue.getKey();
        PValue value = taggedValue.getValue();
        assertThat("The tag should map back to the value", tuple.get(tag), equalTo(value));
        assertThat(value, Matchers.<PValue>equalTo(pcsByTag.get(tag)));
        reconstructed = reconstructed.and(tag, (PCollection) value);
    }
    assertThat(reconstructed, equalTo(tuple));
}
Also used: TestPipeline (org.apache.beam.sdk.testing.TestPipeline), Pipeline (org.apache.beam.sdk.Pipeline), SimpleFunction (org.apache.beam.sdk.transforms.SimpleFunction), Test (org.junit.Test)
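
The Long-to-Integer step above uses a SimpleFunction subclass so Beam can infer the output type. A hedged sketch of the equivalent lambda form, which restates the erased output type through a TypeDescriptor; the names are illustrative and this form assumes a Beam SDK that provides MapElements.into().

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class MapElementsLambdaSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        PCollection<Long> longs = p.apply(GenerateSequence.from(0).to(100));
        // With a lambda the element type is erased, so into(...) supplies it explicitly.
        PCollection<Integer> ints =
            longs.apply(MapElements.into(TypeDescriptors.integers()).via(Long::intValue));
        p.run().waitUntilFinish();
    }
}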

Aggregations

Pipeline (org.apache.beam.sdk.Pipeline): 184
Test (org.junit.Test): 123
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 86
DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions): 39
KV (org.apache.beam.sdk.values.KV): 35
Job (com.google.api.services.dataflow.model.Job): 26
DoFn (org.apache.beam.sdk.transforms.DoFn): 24
PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 22
DataflowPackage (com.google.api.services.dataflow.model.DataflowPackage): 21
TableRow (com.google.api.services.bigquery.model.TableRow): 16
PipelineResult (org.apache.beam.sdk.PipelineResult): 14
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 13
TableSchema (com.google.api.services.bigquery.model.TableSchema): 12
ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions): 12
Map (java.util.Map): 11
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 10
ArrayList (java.util.ArrayList): 10
Instant (org.joda.time.Instant): 10
TableReference (com.google.api.services.bigquery.model.TableReference): 9
JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema): 9