
Example 41 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

From the class HIFIOElasticIT, method testHifIOWithElasticQuery.

/**
   * This test reads data from the Elasticsearch instance based on a query and verifies that the
   * data is read successfully.
   */
@Test
public void testHifIOWithElasticQuery() {
    String expectedHashCode = "d7a7e4e42c2ca7b83ef7c1ad1ebce000";
    Long expectedRecordsCount = 1L;
    Configuration conf = getConfiguration(options);
    String query = "{" + "  \"query\": {" + "  \"match\" : {" + "    \"Title\" : {"
        + "      \"query\" : \"Title9\"," + "      \"type\" : \"boolean\""
        + "    }" + "  }" + "  }" + "}";
    conf.set(ConfigurationOptions.ES_QUERY, query);
    PCollection<KV<Text, LinkedMapWritable>> esData = pipeline.apply(
        HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
    PCollection<Long> count = esData.apply(Count.<KV<Text, LinkedMapWritable>>globally());
    // Verify that the count of objects fetched using HIFInputFormat IO is correct.
    PAssert.thatSingleton(count).isEqualTo(expectedRecordsCount);
    PCollection<LinkedMapWritable> values = esData.apply(Values.<LinkedMapWritable>create());
    PCollection<String> textValues = values.apply(transformFunc);
    // Verify the output values using checksum comparison.
    PCollection<String> consolidatedHashcode = textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
    pipeline.run().waitUntilFinish();
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), LinkedMapWritable (org.elasticsearch.hadoop.mr.LinkedMapWritable), Text (org.apache.hadoop.io.Text), KV (org.apache.beam.sdk.values.KV), HashingFn (org.apache.beam.sdk.io.common.HashingFn), Test (org.junit.Test)
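
The test applies a transformFunc that this page does not show. As a rough, hypothetical sketch only (the real field in HIFIOElasticIT may differ), a transform that flattens each LinkedMapWritable row into a string for HashingFn could look like this:

    // Hypothetical stand-in for the transformFunc referenced above; assumes
    // MapElements/SimpleFunction from the Beam SDK and Writable from Hadoop.
    static final MapElements<LinkedMapWritable, String> transformFunc =
        MapElements.via(new SimpleFunction<LinkedMapWritable, String>() {

            @Override
            public String apply(LinkedMapWritable input) {
                // Concatenate the field values so HashingFn sees a stable string per row.
                StringBuilder row = new StringBuilder();
                for (Writable value : input.values()) {
                    row.append(value.toString());
                }
                return row.toString();
            }
        });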

Example 42 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

From the class JdbcIOTest, method testWrite.

@Test
@Category(NeedsRunner.class)
public void testWrite() throws Exception {
    String tableName = JdbcTestDataSet.createWriteDataTable(dataSource);
    try {
        ArrayList<KV<Integer, String>> data = new ArrayList<>();
        for (int i = 0; i < 1000; i++) {
            KV<Integer, String> kv = KV.of(i, "Test");
            data.add(kv);
        }
        pipeline
            .apply(Create.of(data))
            .apply(JdbcIO.<KV<Integer, String>>write()
                .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
                    "org.apache.derby.jdbc.ClientDriver",
                    "jdbc:derby://localhost:" + port + "/target/beam"))
                .withStatement(String.format("insert into %s values(?, ?)", tableName))
                .withPreparedStatementSetter(new JdbcIO.PreparedStatementSetter<KV<Integer, String>>() {

                    @Override
                    public void setParameters(KV<Integer, String> element, PreparedStatement statement) throws Exception {
                        statement.setInt(1, element.getKey());
                        statement.setString(2, element.getValue());
                    }
                }));
        // Block until the write finishes before querying the table directly.
        pipeline.run().waitUntilFinish();
        try (Connection connection = dataSource.getConnection()) {
            try (Statement statement = connection.createStatement()) {
                try (ResultSet resultSet = statement.executeQuery("select count(*) from " + tableName)) {
                    resultSet.next();
                    int count = resultSet.getInt(1);
                    // 1,000 KV elements were created above, so expect exactly 1,000 rows.
                    Assert.assertEquals(1000, count);
                }
            }
        }
    } finally {
        JdbcTestDataSet.cleanUpDataTable(dataSource, tableName);
    }
}
Also used: PreparedStatement (java.sql.PreparedStatement), Statement (java.sql.Statement), ArrayList (java.util.ArrayList), Connection (java.sql.Connection), ResultSet (java.sql.ResultSet), KV (org.apache.beam.sdk.values.KV), Category (org.junit.experimental.categories.Category), Test (org.junit.Test)
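
As a counterpart to the write test, here is a hedged sketch of reading the rows back as KV pairs. The column names ("id", "name") and the coder setup are assumptions for illustration, not taken from the test:

    // Illustrative read-back of the table written above; assumes two columns (id, name).
    PCollection<KV<Integer, String>> rows = pipeline.apply(
        JdbcIO.<KV<Integer, String>>read()
            .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
                "org.apache.derby.jdbc.ClientDriver",
                "jdbc:derby://localhost:" + port + "/target/beam"))
            .withQuery("select id, name from " + tableName)
            .withRowMapper(new JdbcIO.RowMapper<KV<Integer, String>>() {

                @Override
                public KV<Integer, String> mapRow(ResultSet resultSet) throws Exception {
                    // Rebuild the KV pair from each result row.
                    return KV.of(resultSet.getInt(1), resultSet.getString(2));
                }
            })
            .withCoder(KvCoder.of(VarIntCoder.of(), StringUtf8Coder.of())));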

Example 43 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

From the class DataflowPipelineTranslatorTest, method testBatchStatefulParDoTranslation.

/**
   * Smoke test to fail fast if translation of a stateful ParDo
   * in batch breaks.
   */
@Test
public void testBatchStatefulParDoTranslation() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    options.setStreaming(false);
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline pipeline = Pipeline.create(options);
    TupleTag<Integer> mainOutputTag = new TupleTag<Integer>() {
    };
    pipeline
        .apply(Create.of(KV.of(1, 1)))
        .apply(ParDo.of(new DoFn<KV<Integer, Integer>, Integer>() {

        @StateId("unused")
        final StateSpec<ValueState<Integer>> stateSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void process(ProcessContext c) {
            // noop
        }
    }).withOutputTags(mainOutputTag, TupleTagList.empty()));
    runner.replaceTransforms(pipeline);
    Job job = translator.translate(pipeline, runner, Collections.<DataflowPackage>emptyList()).getJob();
    // The job should look like:
    // 0. ParallelRead (Create)
    // 1. ParDo(ReifyWVs)
    // 2. GroupByKeyAndSortValuesOnly
    // 3. A ParDo over grouped and sorted KVs that is executed via ungrouping service-side
    List<Step> steps = job.getSteps();
    assertEquals(4, steps.size());
    Step createStep = steps.get(0);
    assertEquals("ParallelRead", createStep.getKind());
    Step reifyWindowedValueStep = steps.get(1);
    assertEquals("ParallelDo", reifyWindowedValueStep.getKind());
    Step gbkStep = steps.get(2);
    assertEquals("GroupByKey", gbkStep.getKind());
    Step statefulParDoStep = steps.get(3);
    assertEquals("ParallelDo", statefulParDoStep.getKind());
    assertThat((String) statefulParDoStep.getProperties().get(PropertyNames.USES_KEYED_STATE), not(equalTo("true")));
}
Also used: DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions), TupleTag (org.apache.beam.sdk.values.TupleTag), KV (org.apache.beam.sdk.values.KV), Step (com.google.api.services.dataflow.model.Step), Pipeline (org.apache.beam.sdk.Pipeline), ValueState (org.apache.beam.sdk.state.ValueState), Job (com.google.api.services.dataflow.model.Job), DataflowPackage (com.google.api.services.dataflow.model.DataflowPackage), Test (org.junit.Test)
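
The DoFn above declares a StateSpec only so the translator sees a stateful ParDo; it never reads or writes the state. As a hedged sketch of how such state is typically used (the "count" id and the counting logic are illustrative, not from this test):

    // Illustrative only: a stateful DoFn that actually uses its ValueState.
    new DoFn<KV<Integer, Integer>, Integer>() {

        @StateId("count")
        final StateSpec<ValueState<Integer>> countSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void process(ProcessContext c, @StateId("count") ValueState<Integer> count) {
            // ValueState.read() returns null the first time a key is seen.
            Integer current = count.read();
            int next = (current == null ? 0 : current) + 1;
            count.write(next);
            c.output(next);
        }
    };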

Example 44 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

From the class BoundedReadFromUnboundedSourceTest, method testForwardsDisplayData.

@Test
public void testForwardsDisplayData() {
    TestCountingSource src = new TestCountingSource(1234) {

        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
            builder.add(DisplayData.item("foo", "bar"));
        }
    };
    BoundedReadFromUnboundedSource<KV<Integer, Integer>> read = Read.from(src).withMaxNumRecords(5);
    assertThat(DisplayData.from(read), includesDisplayDataFor("source", src));
}
Also used: TestCountingSource (org.apache.beam.sdk.runners.dataflow.TestCountingSource), KV (org.apache.beam.sdk.values.KV), Test (org.junit.Test)
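
Beyond forwarding display data, the same transform caps an unbounded source at a fixed record count. A hedged usage sketch (the pipeline object p and the Count/PAssert setup are assumed, not part of this test):

    // Illustrative only: read at most 5 records from the unbounded source and count them.
    PCollection<KV<Integer, Integer>> capped =
        p.apply(Read.from(new TestCountingSource(1234)).withMaxNumRecords(5));
    PAssert.thatSingleton(capped.apply(Count.<KV<Integer, Integer>>globally())).isEqualTo(5L);
    p.run().waitUntilFinish();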

Example 45 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

From the class AutoCompleteTest, method testTinyAutoComplete.

@Test
public void testTinyAutoComplete() {
    List<String> words = Arrays.asList("x", "x", "x", "xy", "xy", "xyz");
    PCollection<String> input = p.apply(Create.of(words));
    PCollection<KV<String, List<CompletionCandidate>>> output = input.apply(new ComputeTopCompletions(2, recursive));
    PAssert.that(output).containsInAnyOrder(
        KV.of("x", parseList("x:3", "xy:2")),
        KV.of("xy", parseList("xy:2", "xyz:1")),
        KV.of("xyz", parseList("xyz:1")));
    p.run().waitUntilFinish();
}
Also used: CompletionCandidate (org.apache.beam.examples.complete.AutoComplete.CompletionCandidate), KV (org.apache.beam.sdk.values.KV), ComputeTopCompletions (org.apache.beam.examples.complete.AutoComplete.ComputeTopCompletions), Test (org.junit.Test)
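
parseList is a private helper of AutoCompleteTest that this page does not show. A hypothetical reconstruction that matches its usage above (the "value:count" token format and the CompletionCandidate(String, long) constructor are assumptions):

    // Hypothetical reconstruction of the parseList helper referenced above.
    private static List<CompletionCandidate> parseList(String... entries) {
        List<CompletionCandidate> candidates = new ArrayList<>();
        for (String entry : entries) {
            // Each token is assumed to be "value:count", e.g. "x:3".
            String[] parts = entry.split(":", 2);
            candidates.add(new CompletionCandidate(parts[0], Long.parseLong(parts[1])));
        }
        return candidates;
    }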

Aggregations

KV (org.apache.beam.sdk.values.KV): 192
Test (org.junit.Test): 143
Instant (org.joda.time.Instant): 66
Category (org.junit.experimental.categories.Category): 62
Pipeline (org.apache.beam.sdk.Pipeline): 35
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 34
StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString): 33
Matchers.containsString (org.hamcrest.Matchers.containsString): 33
StateSpec (org.apache.beam.sdk.state.StateSpec): 25
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 22
ArrayList (java.util.ArrayList): 19
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 19
TupleTag (org.apache.beam.sdk.values.TupleTag): 16
TableRow (com.google.api.services.bigquery.model.TableRow): 15
Map (java.util.Map): 15
ValueState (org.apache.beam.sdk.state.ValueState): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
HashMap (java.util.HashMap): 12
Timer (org.apache.beam.sdk.state.Timer): 12