use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.
the class ElasticsearchIOTestCommon method testWriteScriptedUpsert.
/**
* Tests the upsert script by adding a group field to each document in the standard test set. The
* group field is populated with the document id modulo 2, allowing the test to verify that the
* documents are split into 2 groups.
*/
void testWriteScriptedUpsert() throws Exception {
List<String> data =
    ElasticsearchIOTestUtils.createDocuments(
        numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withIdFn(new ExtractValueFn("id"))
        .withUpsertScript(SCRIPT_SOURCE);
// Test that documents can be inserted/created by using withUpsertScript
pipeline.apply(Create.of(data)).apply(write);
pipeline.run();
// defensive coding to ensure our initial state is as expected
long currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
// check we have not unwittingly modified existing behaviour
assertEquals(numDocs, currentNumDocs);
assertEquals(
    numDocs / NUM_SCIENTISTS,
    countByScientistName(connectionConfiguration, restClient, "Einstein", null));
// All docs should have group = 0 added by the script upon creation
assertEquals(
    numDocs, countByMatch(connectionConfiguration, restClient, "group", "0", null, null));
// Run the same data again. This time, because all docs exist in the index already, scripted
// updates should happen rather than scripted inserts.
pipeline.apply(Create.of(data)).apply(write);
pipeline.run();
currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
// check we have not unwittingly modified existing behaviour
assertEquals(numDocs, currentNumDocs);
assertEquals(
    numDocs / NUM_SCIENTISTS,
    countByScientistName(connectionConfiguration, restClient, "Einstein", null));
// The script will set either 0 or 1 for the group value on update operations
assertEquals(
    numDocs / 2, countByMatch(connectionConfiguration, restClient, "group", "0", null, null));
assertEquals(
    numDocs / 2, countByMatch(connectionConfiguration, restClient, "group", "1", null, null));
}
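The SCRIPT_SOURCE constant referenced above is defined elsewhere in ElasticsearchIOTestCommon and is not part of this snippet. A minimal Painless script consistent with the assertions (group = 0 on first insert, group = id % 2 on update, with scripted upserts enabled) might look like the following sketch; the actual constant in the Beam suite may differ:
// Illustrative sketch only, not the actual Beam constant. With scripted
// upserts the script runs for both inserts and updates: on an insert the
// source has no group field yet, so it is set to 0; on an update the field
// exists, so it is recomputed from the document id passed in params.
static final String SCRIPT_SOURCE =
    "if (ctx._source.group != null) {"
        + " ctx._source.group = params.id % 2;"
        + " } else {"
        + " ctx._source.group = 0;"
        + " }";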
use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.
the class ElasticsearchIOTestCommon method testMaxParallelRequestsPerWindow.
void testMaxParallelRequestsPerWindow() throws Exception {
List<Document> data =
    ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS)
        .stream()
        .map(doc -> Document.create().withInputDoc(doc).withTimestamp(Instant.now()))
        .collect(Collectors.toList());
Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withMaxParallelRequestsPerWindow(1);
PCollection<KV<Integer, Iterable<Document>>> batches =
    pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));
PCollection<Integer> keyValues =
    batches.apply(
        MapElements.into(integers())
            .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));
// The number of unique keys produced should be maxParallelRequestsPerWindow * numWindows.
// Here there is only 1 request (key) per window and 1 (global) window, i.e. one key in
// total, with value 0.
PAssert.that(keyValues).containsInAnyOrder(0);
PAssert.that(batches).satisfies(new AssertThatHasExpectedContents(0, data));
pipeline.run();
}
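To see more than one key, the parallelism can be raised. A hypothetical variant of the same scaffolding follows; the expected key set {0, 1, 2} is an assumption derived from the keying contract described above, not an assertion taken from the Beam suite:
// Hypothetical variant: 3 parallel requests per window, single global window.
Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withMaxParallelRequestsPerWindow(3);
PCollection<KV<Integer, Iterable<Document>>> batches =
    pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));
// Assuming each key flushes exactly one batch, the observed keys should be
// exactly {0, 1, 2}.
PAssert.that(
        batches.apply(
            MapElements.into(integers())
                .via(
                    (SerializableFunction<KV<Integer, Iterable<Document>>, Integer>)
                        KV::getKey)))
    .containsInAnyOrder(0, 1, 2);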
use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.
the class ElasticsearchIOTestCommon method testWriteAppendOnly.
void testWriteAppendOnly() throws Exception {
Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withIdFn(new ExtractValueFn("id"))
        .withAppendOnly(true);
executeWriteTest(write);
}
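ExtractValueFn, used as the id function here and in the scripted upsert test above, is defined elsewhere in ElasticsearchIOTestCommon. A minimal sketch consistent with its usage, assuming Write.FieldValueExtractFn (a serializable function from Jackson JsonNode to String) is the interface withIdFn expects:
// Sketch: pull a named field out of each document's JSON to use as its ES _id.
static class ExtractValueFn implements Write.FieldValueExtractFn {
  private final String fieldName;

  ExtractValueFn(String fieldName) {
    this.fieldName = fieldName;
  }

  @Override
  public String apply(JsonNode input) {
    // path() returns a missing node rather than null for absent fields.
    return input.path(fieldName).asText();
  }
}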
use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.
the class ElasticsearchIOTestCommon method testWriteWithErrorsReturnedAllowedErrors.
void testWriteWithErrorsReturnedAllowedErrors() throws Exception {
Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withMaxBatchSize(BATCH_SIZE)
        .withThrowWriteErrors(false)
        .withAllowableResponseErrors(Collections.singleton("json_parse_exception"));
List<String> data =
    ElasticsearchIOTestUtils.createDocuments(
        numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
PCollectionTuple outputs = pipeline.apply(Create.of(data)).apply(write);
PCollection<Integer> success =
    outputs
        .get(Write.SUCCESSFUL_WRITES)
        .apply("Convert success to input ID", MapElements.via(mapToInputId));
PCollection<Integer> fail =
    outputs
        .get(Write.FAILED_WRITES)
        .apply("Convert fails to input ID", MapElements.via(mapToInputId));
// Successful IDs should be all IDs, because we are explicitly telling the ES transform to
// ignore failures of a certain kind and therefore to treat those failures as having been
// successfully processed.
Set<Integer> successfulIds = IntStream.range(0, data.size()).boxed().collect(Collectors.toSet());
PAssert.that(success).containsInAnyOrder(successfulIds);
PAssert.that(fail).empty();
pipeline.run();
}
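The mapToInputId helper used in this test and the next converts each output Document back to the integer id of the original input JSON, so assertions can compare against input ids. A plausible sketch, assuming a shared Jackson ObjectMapper named MAPPER and that Document.getInputDoc() returns the original JSON string; the actual helper in ElasticsearchIOTestCommon may differ:
// Sketch of mapToInputId: recover the "id" field from the original input doc.
static final SimpleFunction<Document, Integer> mapToInputId =
    new SimpleFunction<Document, Integer>() {
      @Override
      public Integer apply(Document document) {
        try {
          return MAPPER.readTree(document.getInputDoc()).path("id").asInt();
        } catch (JsonProcessingException e) {
          // Deliberately invalid test documents cannot be parsed back.
          return -1;
        }
      }
    };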
use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write in project beam by apache.
the class ElasticsearchIOTestCommon method testWriteWithErrorsReturned.
void testWriteWithErrorsReturned() throws Exception {
Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withMaxBatchSize(BATCH_SIZE)
        .withThrowWriteErrors(false);
List<String> data =
    ElasticsearchIOTestUtils.createDocuments(
        numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
PCollectionTuple outputs = pipeline.apply(Create.of(data)).apply(write);
PCollection<Integer> success =
    outputs
        .get(Write.SUCCESSFUL_WRITES)
        .apply("Convert success to input ID", MapElements.via(mapToInputId));
PCollection<Integer> fail =
    outputs
        .get(Write.FAILED_WRITES)
        .apply("Convert fails to input ID", MapElements.via(mapToInputId));
Set<Integer> successfulIds = IntStream.range(0, data.size()).boxed().collect(Collectors.toSet());
successfulIds.removeAll(INVALID_DOCS_IDS);
PAssert.that(success).containsInAnyOrder(successfulIds);
PAssert.that(fail).containsInAnyOrder(INVALID_DOCS_IDS);
pipeline.run();
}
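Outside of a test, the FAILED_WRITES output would typically be routed to a dead-letter sink rather than an assertion. A hypothetical sketch follows; the TextIO destination path is a placeholder, and Document.getInputDoc() recovering the original JSON is the same assumption as in the mapToInputId sketch above:
// Hypothetical dead-letter routing for failed writes (not part of the test).
outputs
    .get(Write.FAILED_WRITES)
    .apply(
        "Recover original JSON",
        MapElements.into(TypeDescriptors.strings())
            .via((SerializableFunction<Document, String>) Document::getInputDoc))
    .apply("Write dead letters", TextIO.write().to("/tmp/es-failed-writes"));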