Example 6 with Write

Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.

The class ElasticsearchIOTestCommon, method testWriteScriptedUpsert.

/**
 * Tests the upsert script by adding a group field to each document in the standard test set. The
 * group field is populated with the document id modulo 2, allowing the test to verify that the
 * documents are split into 2 groups.
 */
void testWriteScriptedUpsert() throws Exception {
    List<String> data =
        ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withIdFn(new ExtractValueFn("id"))
            .withUpsertScript(SCRIPT_SOURCE);
    // Test that documents can be inserted/created by using withUpsertScript
    pipeline.apply(Create.of(data)).apply(write);
    pipeline.run();
    // defensive coding to ensure our initial state is as expected
    long currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
    // check we have not unwittingly modified existing behaviour
    assertEquals(numDocs, currentNumDocs);
    assertEquals(numDocs / NUM_SCIENTISTS, countByScientistName(connectionConfiguration, restClient, "Einstein", null));
    // All docs should have group = 0 added by the script upon creation
    assertEquals(numDocs, countByMatch(connectionConfiguration, restClient, "group", "0", null, null));
    // Run the same data again. This time, because all docs exist in the index already, scripted
    // updates should happen rather than scripted inserts.
    pipeline.apply(Create.of(data)).apply(write);
    pipeline.run();
    currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
    // check we have not unwittingly modified existing behaviour
    assertEquals(numDocs, currentNumDocs);
    assertEquals(numDocs / NUM_SCIENTISTS, countByScientistName(connectionConfiguration, restClient, "Einstein", null));
    // The script will set either 0 or 1 for the group value on update operations
    assertEquals(numDocs / 2, countByMatch(connectionConfiguration, restClient, "group", "0", null, null));
    assertEquals(numDocs / 2, countByMatch(connectionConfiguration, restClient, "group", "1", null, null));
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write)
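
For reference, here is a minimal sketch of a Painless upsert script that would satisfy the assertions above. The script body and the params.id binding are assumptions for illustration; the actual SCRIPT_SOURCE constant is defined in ElasticsearchIOTestUtils and may differ.

// Hedged sketch: on first write the document has no group field yet, so the
// script assigns group = 0; on a later scripted update the field already
// exists, so it is reassigned to id % 2, splitting the corpus into two halves.
// ASSUMPTION: the document's "id" field is exposed to the script as params.id.
static final String SCRIPT_SOURCE_SKETCH =
    "if (ctx._source.group == null) {"
        + "  ctx._source.group = 0;"
        + "} else {"
        + "  ctx._source.group = params.id % 2;"
        + "}";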

Example 7 with Write

Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.

The class ElasticsearchIOTestCommon, method testMaxParallelRequestsPerWindow.

void testMaxParallelRequestsPerWindow() throws Exception {
    List<Document> data =
        ElasticsearchIOTestUtils.createDocuments(
                numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS)
            .stream()
            .map(doc -> Document.create().withInputDoc(doc).withTimestamp(Instant.now()))
            .collect(Collectors.toList());
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxParallelRequestsPerWindow(1);
    PCollection<KV<Integer, Iterable<Document>>> batches =
        pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));
    PCollection<Integer> keyValues =
        batches.apply(
            MapElements.into(integers())
                .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));
    // The number of unique keys produced should be maxParallelRequestsPerWindow * numWindows.
    // Here there is 1 request (key) per window and a single (global) window, i.e. one key
    // total, whose value is 0.
    PAssert.that(keyValues).containsInAnyOrder(0);
    PAssert.that(batches).satisfies(new AssertThatHasExpectedContents(0, data));
    pipeline.run();
}
Also used : Read(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read) Count(org.apache.beam.sdk.transforms.Count) Arrays(java.util.Arrays) BiFunction(java.util.function.BiFunction) LoggerFactory(org.slf4j.LoggerFactory) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) InjectionMode(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.InjectionMode) ElasticsearchIOTestUtils.countByMatch(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.countByMatch) DoFnTester(org.apache.beam.sdk.transforms.DoFnTester) FAMOUS_SCIENTISTS(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.FAMOUS_SCIENTISTS) PipedInputStream(java.io.PipedInputStream) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Is.is(org.hamcrest.core.Is.is) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Assert.fail(org.junit.Assert.fail) JsonNode(com.fasterxml.jackson.databind.JsonNode) Document(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document) NStringEntity(org.apache.http.nio.entity.NStringEntity) ElasticsearchIO.getBackendVersion(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.getBackendVersion) ValueProvider(org.apache.beam.sdk.options.ValueProvider) DocToBulk(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.DocToBulk) MapElements(org.apache.beam.sdk.transforms.MapElements) ElasticsearchIOTestUtils.mapToInputId(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.mapToInputId) SourceTestUtils.readFromSource(org.apache.beam.sdk.testing.SourceTestUtils.readFromSource) CustomMatcher(org.hamcrest.CustomMatcher) ConnectionConfiguration(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.ConnectionConfiguration) HttpEntity(org.apache.http.HttpEntity) ContentType(org.apache.http.entity.ContentType) Set(java.util.Set) ElasticsearchIOTestUtils.flushAndRefreshAllIndices(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.flushAndRefreshAllIndices) Collectors(java.util.stream.Collectors) TypeSafeMatcher(org.hamcrest.TypeSafeMatcher) StandardCharsets(java.nio.charset.StandardCharsets) Serializable(java.io.Serializable) List(java.util.List) Assert.assertFalse(org.junit.Assert.assertFalse) Response(org.elasticsearch.client.Response) State(org.apache.beam.sdk.PipelineResult.State) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) SCRIPT_SOURCE(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.SCRIPT_SOURCE) BulkIO(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) IntStream(java.util.stream.IntStream) RestClient(org.elasticsearch.client.RestClient) TypeDescriptors.integers(org.apache.beam.sdk.values.TypeDescriptors.integers) KV(org.apache.beam.sdk.values.KV) ElasticsearchIOTestUtils.refreshIndexAndGetCurrentNumDocs(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.refreshIndexAndGetCurrentNumDocs) Duration(org.joda.time.Duration) RetryPredicate(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.RetryConfiguration.RetryPredicate) ElasticsearchIOTestUtils.insertTestDocuments(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.insertTestDocuments) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) INVALID_DOCS_IDS(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.INVALID_DOCS_IDS) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) ArrayList(java.util.ArrayList) Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) 
DefaultRetryPredicate(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.RetryConfiguration.DefaultRetryPredicate) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Matchers.lessThan(org.hamcrest.Matchers.lessThan) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExpectedException(org.junit.rules.ExpectedException) BoundedElasticsearchSource(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BoundedElasticsearchSource) Description(org.hamcrest.Description) SourceTestUtils(org.apache.beam.sdk.testing.SourceTestUtils) Logger(org.slf4j.Logger) StatefulBatching(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO.StatefulBatching) PAssert(org.apache.beam.sdk.testing.PAssert) ElasticsearchIOTestUtils.countByScientistName(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.countByScientistName) DocumentCoder(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.DocumentCoder) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) PipedOutputStream(java.io.PipedOutputStream) PCollection(org.apache.beam.sdk.values.PCollection) Request(org.elasticsearch.client.Request) Is.isA(org.hamcrest.core.Is.isA) NUM_SCIENTISTS(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIOTestUtils.NUM_SCIENTISTS) BoundedSource(org.apache.beam.sdk.io.BoundedSource) DEFAULT_RETRY_PREDICATE(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.RetryConfiguration.DEFAULT_RETRY_PREDICATE) Matcher(org.hamcrest.Matcher) Instant(org.joda.time.Instant) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) IsIterableContainingInAnyOrder(org.hamcrest.collection.IsIterableContainingInAnyOrder)
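
The single key 0 above follows from using one parallel request per window combined with the global window. As a hedged variation, not taken from the Beam test suite, raising the setting should shard each window's bulk requests across that many keys. The expected key set below is inferred from the comment in the test, and Distinct (org.apache.beam.sdk.transforms.Distinct) is used to collapse repeated keys:

    // Hedged sketch with illustrative values: with withMaxParallelRequestsPerWindow(3),
    // StatefulBatching should emit shard keys in the range [0, 3).
    Write shardedWrite =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxParallelRequestsPerWindow(3);
    PCollection<Integer> distinctKeys =
        pipeline
            .apply(Create.of(data))
            .apply(StatefulBatching.fromSpec(shardedWrite.getBulkIO()))
            .apply(
                MapElements.into(integers())
                    .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey))
            .apply(Distinct.<Integer>create());
    // ASSUMPTION: numDocs is large enough that all three shard keys are observed.
    PAssert.that(distinctKeys).containsInAnyOrder(0, 1, 2);
    pipeline.run();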

Example 8 with Write

Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.

The class ElasticsearchIOTestCommon, method testWriteAppendOnly.

void testWriteAppendOnly() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withIdFn(new ExtractValueFn("id"))
            .withAppendOnly(true);
    executeWriteTest(write);
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write)
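
Outside the test harness, a standalone append-only pipeline might look like the following hedged sketch. The cluster address, index name, and sample documents are hypothetical placeholders, and ExtractValueFn is the test helper used above. withAppendOnly(true) declares that documents are only ever added, never updated, which the connector can use to write more efficiently.

    // Hedged sketch; host, index, and documents are hypothetical placeholders.
    Pipeline p = Pipeline.create();
    List<String> docs =
        Arrays.asList(
            "{\"id\": \"1\", \"scientist\": \"einstein\"}",
            "{\"id\": \"2\", \"scientist\": \"curie\"}");
    Write appendOnlyWrite =
        ElasticsearchIO.write()
            .withConnectionConfiguration(
                ElasticsearchIO.ConnectionConfiguration.create(
                    new String[] {"http://localhost:9200"}, "my-index", "_doc"))
            .withIdFn(new ExtractValueFn("id")) // deterministic ids, as in the test above
            .withAppendOnly(true); // documents are written once and never updated
    p.apply(Create.of(docs)).apply(appendOnlyWrite);
    p.run().waitUntilFinish();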

Example 9 with Write

Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.

The class ElasticsearchIOTestCommon, method testWriteWithErrorsReturnedAllowedErrors.

void testWriteWithErrorsReturnedAllowedErrors() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE)
            .withThrowWriteErrors(false)
            .withAllowableResponseErrors(Collections.singleton("json_parse_exception"));
    List<String> data =
        ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
    PCollectionTuple outputs = pipeline.apply(Create.of(data)).apply(write);
    PCollection<Integer> success =
        outputs
            .get(Write.SUCCESSFUL_WRITES)
            .apply("Convert success to input ID", MapElements.via(mapToInputId));
    PCollection<Integer> fail =
        outputs
            .get(Write.FAILED_WRITES)
            .apply("Convert fails to input ID", MapElements.via(mapToInputId));
    // All IDs should be reported as successful: we explicitly tell the ES transform to ignore
    // failures of this kind, so those documents are treated as successfully processed.
    Set<Integer> successfulIds =
        IntStream.range(0, data.size()).boxed().collect(Collectors.toSet());
    PAssert.that(success).containsInAnyOrder(successfulIds);
    PAssert.that(fail).empty();
    pipeline.run();
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
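
Because failures surface as a PCollection rather than an exception, they can feed a dead-letter branch. Here is a hedged sketch, not part of the Beam tests: the output path is a placeholder, and getInputDoc() is assumed to return the original JSON carried by the failed Document.

    // Hedged sketch: persist the original JSON of each failed write for later replay.
    PCollection<String> failedJson =
        outputs
            .get(Write.FAILED_WRITES)
            .apply(
                "ExtractFailedInput",
                MapElements.into(TypeDescriptors.strings())
                    .via((Document d) -> d.getInputDoc())); // assumed accessor
    failedJson.apply(TextIO.write().to("/tmp/es-failed-writes")); // hypothetical sink path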

Example 10 with Write

Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.

The class ElasticsearchIOTestCommon, method testWriteWithErrorsReturned.

void testWriteWithErrorsReturned() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE)
            .withThrowWriteErrors(false);
    List<String> data =
        ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
    PCollectionTuple outputs = pipeline.apply(Create.of(data)).apply(write);
    PCollection<Integer> success =
        outputs
            .get(Write.SUCCESSFUL_WRITES)
            .apply("Convert success to input ID", MapElements.via(mapToInputId));
    PCollection<Integer> fail =
        outputs
            .get(Write.FAILED_WRITES)
            .apply("Convert fails to input ID", MapElements.via(mapToInputId));
    Set<Integer> successfulIds =
        IntStream.range(0, data.size()).boxed().collect(Collectors.toSet());
    successfulIds.removeAll(INVALID_DOCS_IDS);
    PAssert.that(success).containsInAnyOrder(successfulIds);
    PAssert.that(fail).containsInAnyOrder(INVALID_DOCS_IDS);
    pipeline.run();
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
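
Transient failures can additionally be retried before a document lands in FAILED_WRITES. The following is a hedged sketch combining the error-capturing setup above with ElasticsearchIO's retry configuration; the attempt count and duration are illustrative values, not taken from the test:

    // Hedged sketch: retry each failing bulk request up to 3 attempts within
    // 30 seconds before it is reported under FAILED_WRITES. Illustrative values only.
    Write retryingWrite =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE)
            .withThrowWriteErrors(false)
            .withRetryConfiguration(
                ElasticsearchIO.RetryConfiguration.create(3, Duration.standardSeconds(30)));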

Aggregations

Write (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) 17
Document (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document) 6
BulkIO (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) 5
ArrayList (java.util.ArrayList) 4
IOException (java.io.IOException) 3
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple) 3
ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode) 2
PipedInputStream (java.io.PipedInputStream) 2
PipedOutputStream (java.io.PipedOutputStream) 2
DocumentCoder (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.DocumentCoder) 2
Instant (org.joda.time.Instant) 2
JsonNode (com.fasterxml.jackson.databind.JsonNode) 1
Serializable (java.io.Serializable) 1
StandardCharsets (java.nio.charset.StandardCharsets) 1
Arrays (java.util.Arrays) 1
Collections (java.util.Collections) 1
List (java.util.List) 1
Map (java.util.Map) 1
Set (java.util.Set) 1
ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom) 1