Use of org.apache.beam.sdk.Pipeline in project beam by apache.
The class BigtableWriteIT, method testE2EBigtableWrite.
@Test
public void testE2EBigtableWrite() throws Exception {
final String tableName = bigtableOptions.getInstanceName().toTableNameStr(tableId);
final String instanceName = bigtableOptions.getInstanceName().toString();
final int numRows = 1000;
final List<KV<ByteString, ByteString>> testData = generateTableData(numRows);
createEmptyTable(instanceName, tableId);
Pipeline p = Pipeline.create(options);
p.apply(GenerateSequence.from(0).to(numRows))
    .apply(
        ParDo.of(
            new DoFn<Long, KV<ByteString, Iterable<Mutation>>>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                int index = c.element().intValue();
                Iterable<Mutation> mutations =
                    ImmutableList.of(
                        Mutation.newBuilder()
                            .setSetCell(
                                Mutation.SetCell.newBuilder()
                                    .setValue(testData.get(index).getValue())
                                    .setFamilyName(COLUMN_FAMILY_NAME))
                            .build());
                c.output(KV.of(testData.get(index).getKey(), mutations));
              }
            }))
    .apply(BigtableIO.write().withBigtableOptions(bigtableOptions).withTableId(tableId));
p.run();
// Test number of column families and column family name equality
Table table = getTable(tableName);
assertThat(table.getColumnFamiliesMap().keySet(), Matchers.hasSize(1));
assertThat(table.getColumnFamiliesMap(), Matchers.hasKey(COLUMN_FAMILY_NAME));
// Test table data equality
List<KV<ByteString, ByteString>> tableData = getTableData(tableName);
assertThat(tableData, Matchers.containsInAnyOrder(testData.toArray()));
}
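The generateTableData, createEmptyTable, getTable, and getTableData helpers are not shown on this page. As an illustration, here is a minimal sketch of what generateTableData plausibly looks like; the key/value layout and naming are assumptions, not the actual BigtableWriteIT helper:

// Hypothetical helper: builds one deterministic KV per row index so the
// containsInAnyOrder assertion above has a stable expected set.
// Requires java.util.ArrayList, java.util.List, and com.google.protobuf.ByteString.
private static List<KV<ByteString, ByteString>> generateTableData(int numRows) {
  List<KV<ByteString, ByteString>> testData = new ArrayList<>(numRows);
  for (int i = 0; i < numRows; ++i) {
    ByteString key = ByteString.copyFromUtf8(String.format("key%09d", i));
    ByteString value = ByteString.copyFromUtf8(String.format("value%09d", i));
    testData.add(KV.of(key, value));
  }
  return testData;
}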
Use of org.apache.beam.sdk.Pipeline in project beam by apache.
The class DatastoreV1Test, method testRuntimeOptionsNotCalledInApplyQuery.
/**
* Test to ensure that {@link ValueProvider} values are not accessed at pipeline construction time
* when built with {@link DatastoreV1.Read#withQuery(Query)}.
*/
@Test
public void testRuntimeOptionsNotCalledInApplyQuery() {
RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);
Pipeline pipeline = TestPipeline.fromOptions(options);
pipeline
    .apply(
        DatastoreIO.v1()
            .read()
            .withProjectId(options.getDatastoreProject())
            .withQuery(QUERY)
            .withNamespace(options.getNamespace()))
    .apply(DatastoreIO.v1().write().withProjectId(options.getDatastoreProject()));
}
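For this test to compile, RuntimeTestOptions must expose its values as ValueProvider rather than plain String, so the transforms can be configured before the values are available. A minimal sketch of such an interface, assuming the getter names used above (the real interface in DatastoreV1Test may differ):

// Hypothetical sketch: ValueProvider-typed options are not resolved at
// pipeline construction time; calling get() on them before the pipeline
// runs would throw, which is exactly what the test guards against.
public interface RuntimeTestOptions extends PipelineOptions {
  ValueProvider<String> getDatastoreProject();
  void setDatastoreProject(ValueProvider<String> value);

  ValueProvider<String> getNamespace();
  void setNamespace(ValueProvider<String> value);
}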
Use of org.apache.beam.sdk.Pipeline in project beam by apache.
The class V1ReadIT, method testE2EV1ReadWithGQLQuery.
/**
 * An end-to-end test for {@link DatastoreV1.Read#withLiteralGqlQuery(String)}.
 *
 * <p>Writes some test entities to Cloud Datastore and then runs a pipeline that
 * reads and counts the total number of entities. Verifies that the count matches
 * the number of entities written.
 */
private void testE2EV1ReadWithGQLQuery(long limit) throws Exception {
String gqlQuery =
    String.format(
        "SELECT * from %s WHERE __key__ HAS ANCESTOR KEY(%s, '%s')",
        options.getKind(), options.getKind(), ancestor);
long expectedNumEntities = numEntities;
if (limit > 0) {
gqlQuery = String.format("%s LIMIT %d", gqlQuery, limit);
expectedNumEntities = limit;
}
DatastoreV1.Read read =
    DatastoreIO.v1()
        .read()
        .withProjectId(project)
        .withLiteralGqlQuery(gqlQuery)
        .withNamespace(options.getNamespace());
// Count the total number of entities
Pipeline p = Pipeline.create(options);
PCollection<Long> count = p.apply(read).apply(Count.<Entity>globally());
PAssert.thatSingleton(count).isEqualTo(expectedNumEntities);
p.run();
}
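Since the method above is private, it is presumably driven by @Test wrappers that pass different limits. A hypothetical pair of callers (the names and the concrete limit value are assumptions, not the actual V1ReadIT tests):

@Test
public void testE2EV1ReadWithGQLQueryWithNoLimit() throws Exception {
  // limit <= 0 leaves the GQL query unmodified.
  testE2EV1ReadWithGQLQuery(0);
}

@Test
public void testE2EV1ReadWithGQLQueryWithLimit() throws Exception {
  // A positive limit appends "LIMIT 99" and caps the expected count.
  testE2EV1ReadWithGQLQuery(99);
}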
Use of org.apache.beam.sdk.Pipeline in project beam by apache.
The class WindowedWordCount, method main.
public static void main(String[] args) throws IOException {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
final String output = options.getOutput();
final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());
Pipeline pipeline = Pipeline.create(options);
/**
* Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
* unbounded input source.
*/
PCollection<String> input =
    pipeline
        .apply(TextIO.read().from(options.getInputFile()))
        .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
/**
* Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1
* minute (you can change this with a command-line option). See the documentation for more
* information on how fixed windows work, and for information on the other types of windowing
* available (e.g., sliding windows).
*/
PCollection<String> windowedWords =
    input.apply(
        Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
/**
* Concept #4: Re-use our existing CountWords transform that does not have knowledge of
* windows over a PCollection containing windowed values.
*/
PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
/**
 * Concept #5: Format the results and write to a sharded file partitioned by window, using a
 * simple ParDo operation. Because there may be failures followed by retries, the
 * writes must be idempotent, but the details of writing to files are elided here.
 */
wordCounts
    .apply(MapElements.via(new WordCount.FormatAsTextFn()))
    .apply(new WriteOneFilePerWindow(output, options.getNumShards()));
PipelineResult result = pipeline.run();
try {
result.waitUntilFinish();
} catch (Exception exc) {
result.cancel();
}
}
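AddTimestampFn is referenced above but not defined on this page. A minimal sketch of what it might do, assuming it scatters each line's event timestamp uniformly between minTimestamp and maxTimestamp so a bounded text file behaves like timestamped input (class and field details are assumptions):

// Hypothetical sketch of AddTimestampFn; the real class in WindowedWordCount
// may differ. Uses org.joda.time.Instant and
// java.util.concurrent.ThreadLocalRandom.
static class AddTimestampFn extends DoFn<String, String> {
  private final Instant minTimestamp;
  private final Instant maxTimestamp;

  AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) {
    this.minTimestamp = minTimestamp;
    this.maxTimestamp = maxTimestamp;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    // Pick a pseudo-random event time in [minTimestamp, maxTimestamp).
    Instant randomTimestamp =
        new Instant(
            ThreadLocalRandom.current()
                .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis()));
    // Emit the element with an explicit event-time timestamp.
    c.outputWithTimestamp(c.element(), randomTimestamp);
  }
}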
Use of org.apache.beam.sdk.Pipeline in project beam by apache.
The class WordCount, method main.
public static void main(String[] args) {
WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
Pipeline p = Pipeline.create(options);
// Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
// static FormatAsTextFn() to the ParDo transform.
p.apply("ReadLines", TextIO.read().from(options.getInputFile())).apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn())).apply("WriteCounts", TextIO.write().to(options.getOutput()));
p.run().waitUntilFinish();
}
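FormatAsTextFn and CountWords are defined elsewhere in the WordCount example. As a reference point, here is a minimal sketch of a formatting function compatible with the MapElements call above (treat the exact rendering as an assumption):

// Hypothetical sketch of FormatAsTextFn; the actual class in WordCount may
// differ. Renders each word/count pair as one output line, e.g. "beam: 42".
// Uses org.apache.beam.sdk.transforms.SimpleFunction.
public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
  @Override
  public String apply(KV<String, Long> input) {
    return input.getKey() + ": " + input.getValue();
  }
}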