Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class BigQueryIOTest, method testWriteRename.
@Test
public void testWriteRename() throws Exception {
  p.enableAbandonedNodeEnforcement(false);
  FakeDatasetService datasetService = new FakeDatasetService();
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(datasetService);
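  // Register the dataset with the fake service so temp and final tables can be created in it.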
  datasetService.createDataset("project-id", "dataset-id", "", "");
  final int numFinalTables = 3;
  final int numTempTablesPerFinalTable = 3;
  final int numRecordsPerTempTable = 10;
  Map<TableDestination, List<TableRow>> expectedRowsPerTable = Maps.newHashMap();
  String jobIdToken = "jobIdToken";
  Map<TableDestination, Iterable<String>> tempTables = Maps.newHashMap();
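  // Build three final destinations, each fed by three temp tables of ten rows apiece.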
  for (int i = 0; i < numFinalTables; ++i) {
    String tableName = "project-id:dataset-id.table_" + i;
    TableDestination tableDestination = new TableDestination(tableName, "table_" + i + "_desc");
    List<String> tables = Lists.newArrayList();
    tempTables.put(tableDestination, tables);
    List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
    if (expectedRows == null) {
      expectedRows = Lists.newArrayList();
      expectedRowsPerTable.put(tableDestination, expectedRows);
    }
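    // Create and populate the temp tables whose rows should end up in this destination.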
    for (int j = 0; j < numTempTablesPerFinalTable; ++j) {
      TableReference tempTable =
          new TableReference()
              .setProjectId("project-id")
              .setDatasetId("dataset-id")
              .setTableId(String.format("%s_%05d_%05d", jobIdToken, i, j));
      datasetService.createTable(new Table().setTableReference(tempTable));
      List<TableRow> rows = Lists.newArrayList();
      for (int k = 0; k < numRecordsPerTempTable; ++k) {
        rows.add(new TableRow().set("number", j * numTempTablesPerFinalTable + k));
      }
      datasetService.insertAll(tempTable, rows, null);
      expectedRows.addAll(rows);
      tables.add(BigQueryHelpers.toJsonString(tempTable));
    }
  }
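  // Flatten the multimap into individual (destination, temp table) pairs.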
  PCollection<KV<TableDestination, String>> tempTablesPCollection =
      p.apply(Create.of(tempTables)
              .withCoder(KvCoder.of(
                  TableDestinationCoder.of(), IterableCoder.of(StringUtf8Coder.of()))))
          .apply(ParDo.of(
              new DoFn<KV<TableDestination, Iterable<String>>, KV<TableDestination, String>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  TableDestination tableDestination = c.element().getKey();
                  for (String tempTable : c.element().getValue()) {
                    c.output(KV.of(tableDestination, tempTable));
                  }
                }
              }));
  PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
      PCollectionViews.multimapView(
          tempTablesPCollection,
          WindowingStrategy.globalDefault(),
          KvCoder.of(TableDestinationCoder.of(), StringUtf8Coder.of()));
  PCollectionView<String> jobIdTokenView =
      p.apply("CreateJobId", Create.of("jobId")).apply(View.<String>asSingleton());
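  // WriteRename copies each destination's temp tables into its final table and then
  // deletes the temp tables.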
  WriteRename writeRename =
      new WriteRename(
          fakeBqServices,
          jobIdTokenView,
          WriteDisposition.WRITE_EMPTY,
          CreateDisposition.CREATE_IF_NEEDED,
          tempTablesView);
  DoFnTester<Void, Void> tester = DoFnTester.of(writeRename);
  tester.setSideInput(tempTablesView, GlobalWindow.INSTANCE, tempTables);
  tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
  tester.processElement(null);
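  // Each final table should now contain every row from its temp tables.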
  for (Map.Entry<TableDestination, Iterable<String>> entry : tempTables.entrySet()) {
    TableDestination tableDestination = entry.getKey();
    TableReference tableReference = tableDestination.getTableReference();
    Table table = checkNotNull(datasetService.getTable(tableReference));
    assertEquals(tableReference.getTableId() + "_desc", tableDestination.getTableDescription());
    List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
    assertThat(
        datasetService.getAllRows(
            tableReference.getProjectId(),
            tableReference.getDatasetId(),
            tableReference.getTableId()),
        containsInAnyOrder(Iterables.toArray(expectedRows, TableRow.class)));
    // Temp tables should be deleted.
    for (String tempTableJson : entry.getValue()) {
      TableReference tempTable =
          BigQueryHelpers.fromJsonString(tempTableJson, TableReference.class);
      assertEquals(null, datasetService.getTable(tempTable));
    }
  }
}
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class DirectGraphVisitorTest, method getStepNamesContainsAllTransforms.
@Test
public void getStepNamesContainsAllTransforms() {
  PCollection<String> created = p.apply(Create.of("1", "2", "3"));
  PCollection<String> transformed =
      created.apply(ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          c.output(Integer.toString(c.element().length()));
        }
      }));
  PDone finished =
      transformed.apply(new PTransform<PInput, PDone>() {
        @Override
        public PDone expand(PInput input) {
          return PDone.in(input.getPipeline());
        }
      });
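  // Walk the pipeline so the visitor records every applied transform.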
  p.traverseTopologically(visitor);
  DirectGraph graph = visitor.getGraph();
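  // Step names are assigned in the order the transforms were encountered during traversal.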
  assertThat(graph.getStepName(graph.getProducer(created)), equalTo("s0"));
  assertThat(graph.getStepName(graph.getProducer(transformed)), equalTo("s1"));
  // finished doesn't have a producer, because it's not a PValue.
  // TODO: Demonstrate that PCollectionList/Tuple and other composite PValues are either safe to
  // use, or make them so.
}
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class CalculateSchemas, method expand.
@Override
public PCollectionView<Map<DestinationT, String>> expand(
    PCollection<KV<DestinationT, TableRow>> input) {
  List<PCollectionView<?>> sideInputs = Lists.newArrayList();
  sideInputs.addAll(dynamicDestinations.getSideInputs());
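  // Deduplicate destinations so each schema is looked up only once.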
  return input
      .apply("Keys", Keys.<DestinationT>create())
      .apply("Distinct Keys", Distinct.<DestinationT>create())
      .apply("GetSchemas", ParDo.of(new DoFn<DestinationT, KV<DestinationT, String>>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          dynamicDestinations.setSideInputAccessorFromProcessContext(c);
          TableSchema tableSchema = dynamicDestinations.getSchema(c.element());
          if (tableSchema != null) {
            // If the createDisposition is CREATE_NEVER, then there's no need for a
            // schema, and getSchema might return null. In this case, we simply
            // leave it out of the map.
            c.output(KV.of(c.element(), BigQueryHelpers.toJsonString(tableSchema)));
          }
        }
      }).withSideInputs(sideInputs))
      .apply("asMap", View.<DestinationT, String>asMap());
}
Use of org.apache.beam.sdk.transforms.DoFn in project components by Talend.
The class TCompBoundedSourceSinkAdapterTest, method testSink.
@Test
public void testSink() {
  Pipeline pipeline = TestPipeline.create();
  AssertResultProperties assertResultProperties =
      new AssertResultProperties("assertResultProperties");
  assertResultProperties.init();
  assertResultProperties.data.setValue("b;c;a");
  assertResultProperties.rowDelimited.setValue(";");
  AssertResultSink assertResultSink = new AssertResultSink();
  assertResultSink.initialize(null, assertResultProperties);
  TCompSinkAdapter sink = new TCompSinkAdapter(assertResultSink);
  final String schemaStr = assertResultProperties.schema.getValue().toString();
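  // Wrap each input string in an Avro IndexedRecord that matches the sink's schema.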
  pipeline
      .apply(Create.of("a", "b", "c"))
      .apply(ParDo.of(new DoFn<String, IndexedRecord>() {
        @DoFn.ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          IndexedRecord row = new GenericData.Record(new Schema.Parser().parse(schemaStr));
          row.put(0, c.element());
          c.output(row);
        }
      }))
      .setCoder(LazyAvroCoder.of())
      .apply(Write.to(sink));
  pipeline.run();
}
Use of org.apache.beam.sdk.transforms.DoFn in project components by Talend.
The class ElasticsearchInputRuntime, method expand.
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
  ElasticsearchIO.Read esRead =
      ElasticsearchIO.read()
          .withConnectionConfiguration(createConnectionConf(properties.getDatasetProperties()));
  if (properties.query.getValue() != null) {
    esRead = esRead.withQuery(properties.query.getValue());
  }
  PCollection<String> readFromElasticsearch = in.apply(esRead);
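  // Convert each JSON document to an Avro record; the schema is inferred lazily from
  // the first document this DoFn instance sees.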
  PCollection<IndexedRecord> elasticsearchDataAsAvro =
      readFromElasticsearch.apply(
          "DocumentToIndexedRecord",
          ParDo.of(new DoFn<String, IndexedRecord>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              if (jsonGenericRecordConverter == null) {
                JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(new ObjectMapper());
                Schema jsonSchema = jsonSchemaInferrer.inferSchema(c.element());
                jsonGenericRecordConverter = new JsonGenericRecordConverter(jsonSchema);
              }
              GenericRecord outputRecord = jsonGenericRecordConverter.convertToAvro(c.element());
              c.output(outputRecord);
            }
          }));
  return elasticsearchDataAsAvro;
}