Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.
From the class ElasticsearchIOTestCommon, method testWriteWithAllowedErrors:
void testWriteWithAllowedErrors() throws Exception {
  Write write =
      ElasticsearchIO.write()
          .withConnectionConfiguration(connectionConfiguration)
          .withMaxBatchSize(BATCH_SIZE)
          .withAllowableResponseErrors(Collections.singleton("json_parse_exception"));
  List<String> input =
      ElasticsearchIOTestUtils.createDocuments(
          numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
  // Bundle size is the runner's decision and cannot be forced,
  // so we test the Writer as a DoFn outside of a runner.
  try (DoFnTester<Document, Document> fnTester =
      DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
    // Inserts into Elasticsearch; "json_parse_exception" responses are tolerated.
    fnTester.processBundle(serializeDocs(write, input));
  }
}
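For orientation, withAllowableResponseErrors is the knob a pipeline author would set on a production write, not just in tests. A minimal sketch under stated assumptions (the ConnectionConfiguration is assumed to be built elsewhere; the batch size of 100 is an arbitrary placeholder):

// Sketch only: builds a Write that tolerates "json_parse_exception" responses
// instead of failing the bundle. connConfig is an assumed, pre-built configuration.
static ElasticsearchIO.Write parseErrorTolerantWrite(
    ElasticsearchIO.ConnectionConfiguration connConfig) {
  return ElasticsearchIO.write()
      .withConnectionConfiguration(connConfig)
      .withMaxBatchSize(100) // placeholder batch size
      .withAllowableResponseErrors(Collections.singleton("json_parse_exception"));
}

A pipeline would then apply this transform to a PCollection<String> of JSON documents, e.g. jsonDocs.apply(parseErrorTolerantWrite(connConfig)).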
Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.
From the class ElasticsearchIOTestCommon, method testWriteWithMaxBatchSizeBytes:
void testWriteWithMaxBatchSizeBytes() throws Exception {
  Write write =
      ElasticsearchIO.write()
          .withConnectionConfiguration(connectionConfiguration)
          .withMaxBatchSizeBytes(BATCH_SIZE_BYTES);
  // Bundle size is the runner's decision and cannot be forced,
  // so we test the Writer as a DoFn outside of a runner.
  try (DoFnTester<Document, Document> fnTester =
      DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
    List<String> input =
        ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
    List<Document> serializedInput = new ArrayList<>();
    for (String doc : input) {
      String bulkDoc =
          DocToBulk.createBulkApiEntity(
              write.getDocToBulk(), doc, getBackendVersion(connectionConfiguration));
      Document r =
          Document.create()
              .withInputDoc(doc)
              .withBulkDirective(bulkDoc)
              .withTimestamp(Instant.now());
      serializedInput.add(r);
    }
    long numDocsProcessed = 0;
    long sizeProcessed = 0;
    long numDocsInserted = 0;
    long batchInserted = 0;
    for (Document document : serializedInput) {
      fnTester.processElement(document);
      numDocsProcessed++;
      sizeProcessed += document.getBulkDirective().getBytes(StandardCharsets.UTF_8).length;
      // Test every 40 docs to avoid overloading ES.
      if ((numDocsProcessed % 40) == 0) {
        // Force the index to refresh after inserting so that the inserted docs
        // are searchable immediately.
        long currentNumDocs =
            refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
        if (sizeProcessed / BATCH_SIZE_BYTES > batchInserted) {
          /* bundle end */
          assertThat(
              "we have passed a bundle size, we should have inserted some documents",
              currentNumDocs, greaterThan(numDocsInserted));
          numDocsInserted = currentNumDocs;
          batchInserted = (sizeProcessed / BATCH_SIZE_BYTES);
        } else {
          /* not bundle end */
          assertEquals(
              "we are not at the end of a bundle, we should have inserted no more documents",
              numDocsInserted, currentNumDocs);
        }
      }
    }
  }
}
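The boundary check in the loop above is worth spelling out: the test expects a new bulk flush every time the cumulative serialized size crosses another multiple of BATCH_SIZE_BYTES. A standalone sketch of that arithmetic (the helper name and parameters here are illustrative, not Beam API):

// Illustrative helper, not Beam code: true when the cumulative byte count has
// crossed another multiple of the batch-size limit, i.e. when the test expects
// the writer to have flushed an additional bulk request.
static boolean crossedBatchBoundary(
    long bytesProcessed, long batchSizeBytes, long batchesAlreadyFlushed) {
  return bytesProcessed / batchSizeBytes > batchesAlreadyFlushed;
}

For example, with batchSizeBytes = 1000 and 2050 bytes processed so far, two flushes are expected: crossedBatchBoundary(2050, 1000, 1) is true, while crossedBatchBoundary(2050, 1000, 2) is false.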
Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.
From the class ElasticsearchIOTestCommon, method testMaxParallelRequestsPerWindow:
void testMaxParallelRequestsPerWindow() throws Exception {
  List<Document> data =
      ElasticsearchIOTestUtils.createDocuments(
              numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS)
          .stream()
          .map(doc -> Document.create().withInputDoc(doc).withTimestamp(Instant.now()))
          .collect(Collectors.toList());
  Write write =
      ElasticsearchIO.write()
          .withConnectionConfiguration(connectionConfiguration)
          .withMaxParallelRequestsPerWindow(1);
  PCollection<KV<Integer, Iterable<Document>>> batches =
      pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));
  PCollection<Integer> keyValues =
      batches.apply(MapElements.into(integers())
          .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));
  // The number of unique keys produced should be maxParallelRequestsPerWindow * numWindows.
  // Here there is only 1 request (key) per window and a single (global) window,
  // i.e. one key in total, with value 0.
  PAssert.that(keyValues).containsInAnyOrder(0);
  PAssert.that(batches).satisfies(new AssertThatHasExpectedContents(0, data));
  pipeline.run();
}
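As a point of comparison, here is a hedged sketch of the same configuration with higher parallelism. Following the key-count reasoning in the comment above, with N parallel requests and a single global window the stateful batching step would be expected to emit N distinct keys (0 through N-1) rather than the single key 0; the value 4 below is an arbitrary illustration:

// Sketch only: the same write as above, but with a hypothetical parallelism of 4.
// In a single global window the batching step would then spread elements across
// keys 0..3 instead of the single key 0 asserted in the test.
Write parallelWrite =
    ElasticsearchIO.write()
        .withConnectionConfiguration(connectionConfiguration)
        .withMaxParallelRequestsPerWindow(4); // hypothetical value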
Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.
From the class ElasticsearchIOTestCommon, method serializeDocs:
List<Document> serializeDocs(ElasticsearchIO.Write write, List<String> jsonDocs)
    throws IOException {
  List<Document> serializedInput = new ArrayList<>();
  for (String doc : jsonDocs) {
    String bulkDoc =
        DocToBulk.createBulkApiEntity(
            write.getDocToBulk(), doc, getBackendVersion(connectionConfiguration));
    Document r =
        Document.create()
            .withInputDoc(doc)
            .withBulkDirective(bulkDoc)
            .withTimestamp(Instant.now());
    serializedInput.add(r);
  }
  return serializedInput;
}
Use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.
From the class ElasticsearchIOTestCommon, method testWriteWithErrors:
void testWriteWithErrors() throws Exception {
  Write write =
      ElasticsearchIO.write()
          .withConnectionConfiguration(connectionConfiguration)
          .withMaxBatchSize(BATCH_SIZE);
  List<String> input =
      ElasticsearchIOTestUtils.createDocuments(
          numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
  expectedException.expect(isA(IOException.class));
  expectedException.expectMessage(
      new CustomMatcher<String>("RegExp matcher") {
        @Override
        public boolean matches(Object o) {
          String message = (String) o;
          // The fixed fragments are matched literally; the variable parts are matched with .+
          return message.matches(
              "(?is).*Error writing to Elasticsearch, some elements could not be inserted"
                  + ".*Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*"
                  + "Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*");
        }
      });
  // Bundle size is the runner's decision and cannot be forced,
  // so we test the Writer as a DoFn outside of a runner.
  try (DoFnTester<Document, Document> fnTester =
      DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
    // Inserts into Elasticsearch; the injected invalid documents should trigger the failure.
    fnTester.processBundle(serializeDocs(write, input));
  }
}
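The same expectation can also be written without the ExpectedException rule. A sketch using JUnit's assertThrows (available since JUnit 4.13), reusing the write and input built in the test above and checking only the stable prefix of the message rather than the full regular expression:

// Sketch only: an equivalent failure check with org.junit.Assert.assertThrows
// instead of the ExpectedException rule used above.
IOException thrown =
    assertThrows(
        IOException.class,
        () -> {
          try (DoFnTester<Document, Document> fnTester =
              DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
            fnTester.processBundle(serializeDocs(write, input));
          }
        });
assertTrue(
    thrown.getMessage()
        .contains("Error writing to Elasticsearch, some elements could not be inserted"));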