Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class BigQueryIOTest, method testWriteRename.
@Test
public void testWriteRename() throws Exception {
  p.enableAbandonedNodeEnforcement(false);
  FakeDatasetService datasetService = new FakeDatasetService();
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(datasetService);
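  // Register the dataset with the fake service so temp and final tables can be created in it.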
  datasetService.createDataset("project-id", "dataset-id", "", "");
  final int numFinalTables = 3;
  final int numTempTablesPerFinalTable = 3;
  final int numRecordsPerTempTable = 10;
  Map<TableDestination, List<TableRow>> expectedRowsPerTable = Maps.newHashMap();
  String jobIdToken = "jobIdToken";
  Map<TableDestination, Iterable<String>> tempTables = Maps.newHashMap();
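  // Build three final destinations, each fed by three temp tables of ten rows apiece.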
  for (int i = 0; i < numFinalTables; ++i) {
    String tableName = "project-id:dataset-id.table_" + i;
    TableDestination tableDestination = new TableDestination(tableName, "table_" + i + "_desc");
    List<String> tables = Lists.newArrayList();
    tempTables.put(tableDestination, tables);
    List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
    if (expectedRows == null) {
      expectedRows = Lists.newArrayList();
      expectedRowsPerTable.put(tableDestination, expectedRows);
    }
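    // Create and populate the temp tables whose rows should end up in this destination.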
    for (int j = 0; j < numTempTablesPerFinalTable; ++j) {
      TableReference tempTable =
          new TableReference()
              .setProjectId("project-id")
              .setDatasetId("dataset-id")
              .setTableId(String.format("%s_%05d_%05d", jobIdToken, i, j));
      datasetService.createTable(new Table().setTableReference(tempTable));
      List<TableRow> rows = Lists.newArrayList();
      for (int k = 0; k < numRecordsPerTempTable; ++k) {
        rows.add(new TableRow().set("number", j * numTempTablesPerFinalTable + k));
      }
      datasetService.insertAll(tempTable, rows, null);
      expectedRows.addAll(rows);
      tables.add(BigQueryHelpers.toJsonString(tempTable));
    }
  }
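  // Flatten the multimap into individual (destination, temp table) pairs.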
  PCollection<KV<TableDestination, String>> tempTablesPCollection =
      p.apply(Create.of(tempTables)
              .withCoder(KvCoder.of(
                  TableDestinationCoder.of(), IterableCoder.of(StringUtf8Coder.of()))))
          .apply(ParDo.of(
              new DoFn<KV<TableDestination, Iterable<String>>, KV<TableDestination, String>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  TableDestination tableDestination = c.element().getKey();
                  for (String tempTable : c.element().getValue()) {
                    c.output(KV.of(tableDestination, tempTable));
                  }
                }
              }));
  PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
      PCollectionViews.multimapView(
          tempTablesPCollection,
          WindowingStrategy.globalDefault(),
          KvCoder.of(TableDestinationCoder.of(), StringUtf8Coder.of()));
  PCollectionView<String> jobIdTokenView =
      p.apply("CreateJobId", Create.of("jobId")).apply(View.<String>asSingleton());
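  // WriteRename copies each destination's temp tables into its final table and then
  // deletes the temp tables.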
  WriteRename writeRename =
      new WriteRename(
          fakeBqServices,
          jobIdTokenView,
          WriteDisposition.WRITE_EMPTY,
          CreateDisposition.CREATE_IF_NEEDED,
          tempTablesView);
  DoFnTester<Void, Void> tester = DoFnTester.of(writeRename);
  tester.setSideInput(tempTablesView, GlobalWindow.INSTANCE, tempTables);
  tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
  tester.processElement(null);
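  // Each final table should now contain every row from its temp tables.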
  for (Map.Entry<TableDestination, Iterable<String>> entry : tempTables.entrySet()) {
    TableDestination tableDestination = entry.getKey();
    TableReference tableReference = tableDestination.getTableReference();
    Table table = checkNotNull(datasetService.getTable(tableReference));
    assertEquals(tableReference.getTableId() + "_desc", tableDestination.getTableDescription());
    List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
    assertThat(
        datasetService.getAllRows(
            tableReference.getProjectId(),
            tableReference.getDatasetId(),
            tableReference.getTableId()),
        containsInAnyOrder(Iterables.toArray(expectedRows, TableRow.class)));
    // Temp tables should be deleted.
    for (String tempTableJson : entry.getValue()) {
      TableReference tempTable =
          BigQueryHelpers.fromJsonString(tempTableJson, TableReference.class);
      assertEquals(null, datasetService.getTable(tempTable));
    }
  }
}
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class DirectGraphVisitorTest, method getStepNamesContainsAllTransforms.
@Test
public void getStepNamesContainsAllTransforms() {
  PCollection<String> created = p.apply(Create.of("1", "2", "3"));
  PCollection<String> transformed =
      created.apply(ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          c.output(Integer.toString(c.element().length()));
        }
      }));
  PDone finished =
      transformed.apply(new PTransform<PInput, PDone>() {
        @Override
        public PDone expand(PInput input) {
          return PDone.in(input.getPipeline());
        }
      });
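  // Walk the pipeline so the visitor records every applied transform.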
  p.traverseTopologically(visitor);
  DirectGraph graph = visitor.getGraph();
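  // Step names are assigned in the order the transforms were encountered during traversal.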
  assertThat(graph.getStepName(graph.getProducer(created)), equalTo("s0"));
  assertThat(graph.getStepName(graph.getProducer(transformed)), equalTo("s1"));
  // finished doesn't have a producer, because it's not a PValue.
  // TODO: Demonstrate that PCollectionList/Tuple and other composite PValues are either safe to
  // use, or make them so.
}
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class CalculateSchemas, method expand.
@Override
public PCollectionView<Map<DestinationT, String>> expand(
    PCollection<KV<DestinationT, TableRow>> input) {
  List<PCollectionView<?>> sideInputs = Lists.newArrayList();
  sideInputs.addAll(dynamicDestinations.getSideInputs());
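  // Deduplicate destinations so each schema is looked up only once.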
  return input
      .apply("Keys", Keys.<DestinationT>create())
      .apply("Distinct Keys", Distinct.<DestinationT>create())
      .apply("GetSchemas", ParDo.of(new DoFn<DestinationT, KV<DestinationT, String>>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          dynamicDestinations.setSideInputAccessorFromProcessContext(c);
          TableSchema tableSchema = dynamicDestinations.getSchema(c.element());
          if (tableSchema != null) {
            // If the createDisposition is CREATE_NEVER, then there's no need for a
            // schema, and getSchema might return null. In this case, we simply
            // leave it out of the map.
            c.output(KV.of(c.element(), BigQueryHelpers.toJsonString(tableSchema)));
          }
        }
      }).withSideInputs(sideInputs))
      .apply("asMap", View.<DestinationT, String>asMap());
}
Use of org.apache.beam.sdk.transforms.DoFn in project components by Talend.
The class TCompBoundedSourceSinkAdapterTest, method testSink.
@Test
public void testSink() {
  Pipeline pipeline = TestPipeline.create();
  AssertResultProperties assertResultProperties =
      new AssertResultProperties("assertResultProperties");
  assertResultProperties.init();
  assertResultProperties.data.setValue("b;c;a");
  assertResultProperties.rowDelimited.setValue(";");
  AssertResultSink assertResultSink = new AssertResultSink();
  assertResultSink.initialize(null, assertResultProperties);
  TCompSinkAdapter sink = new TCompSinkAdapter(assertResultSink);
  final String schemaStr = assertResultProperties.schema.getValue().toString();
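  // Wrap each input string in an Avro IndexedRecord that matches the sink's schema.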
  pipeline
      .apply(Create.of("a", "b", "c"))
      .apply(ParDo.of(new DoFn<String, IndexedRecord>() {
        @DoFn.ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          IndexedRecord row = new GenericData.Record(new Schema.Parser().parse(schemaStr));
          row.put(0, c.element());
          c.output(row);
        }
      }))
      .setCoder(LazyAvroCoder.of())
      .apply(Write.to(sink));
  pipeline.run();
}
Use of org.apache.beam.sdk.transforms.DoFn in project components by Talend.
The class ElasticsearchInputRuntime, method expand.
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
  ElasticsearchIO.Read esRead =
      ElasticsearchIO.read()
          .withConnectionConfiguration(createConnectionConf(properties.getDatasetProperties()));
  if (properties.query.getValue() != null) {
    esRead = esRead.withQuery(properties.query.getValue());
  }
  PCollection<String> readFromElasticsearch = in.apply(esRead);
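  // Convert each JSON document to an Avro record; the schema is inferred lazily from
  // the first document this DoFn instance sees.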
  PCollection<IndexedRecord> elasticsearchDataAsAvro =
      readFromElasticsearch.apply(
          "DocumentToIndexedRecord",
          ParDo.of(new DoFn<String, IndexedRecord>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              if (jsonGenericRecordConverter == null) {
                JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(new ObjectMapper());
                Schema jsonSchema = jsonSchemaInferrer.inferSchema(c.element());
                jsonGenericRecordConverter = new JsonGenericRecordConverter(jsonSchema);
              }
              GenericRecord outputRecord = jsonGenericRecordConverter.convertToAvro(c.element());
              c.output(outputRecord);
            }
          }));
  return elasticsearchDataAsAvro;
}