Example 1 with Document

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.

the class ElasticsearchIOTestCommon method testWriteWithAllowedErrors.

void testWriteWithAllowedErrors() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE)
            .withAllowableResponseErrors(Collections.singleton("json_parse_exception"));
    List<String> input =
        ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
    // Bundle size is the runner's decision, so we test the Writer as a DoFn outside of a runner.
    try (DoFnTester<Document, Document> fnTester = DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
        // inserts into Elasticsearch
        fnTester.processBundle(serializeDocs(write, input));
    }
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) BulkIO(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) Document(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document)
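For context, a minimal sketch of how withAllowableResponseErrors might be wired into a full pipeline. The cluster address, index and type names, and the malformed sample document are illustrative assumptions, not taken from the test above:

import java.util.Arrays;
import java.util.HashSet;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO;
import org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.ConnectionConfiguration;
import org.apache.beam.sdk.transforms.Create;

public class TolerantWriteSketch {
    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create();
        // Assumed local cluster and index; adjust to your environment.
        ConnectionConfiguration conn =
            ConnectionConfiguration.create(
                new String[] {"http://localhost:9200"}, "my-index", "_doc");
        pipeline
            // The second document is deliberately malformed JSON.
            .apply(Create.of("{\"scientist\":\"Einstein\"}", "{\"scientist\":}"))
            .apply(
                ElasticsearchIO.write()
                    .withConnectionConfiguration(conn)
                    // Bulk item failures of these error types are tolerated instead of
                    // failing the bundle, mirroring what the test above exercises.
                    .withAllowableResponseErrors(
                        new HashSet<>(Arrays.asList("json_parse_exception"))));
        pipeline.run().waitUntilFinish();
    }
}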

Example 2 with Document

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.

the class ElasticsearchIOTestCommon method testWriteWithMaxBatchSizeBytes.

void testWriteWithMaxBatchSizeBytes() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSizeBytes(BATCH_SIZE_BYTES);
    // Bundle size is the runner's decision, so we test the Writer as a DoFn outside of a runner.
    try (DoFnTester<Document, Document> fnTester = DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
        List<String> input =
            ElasticsearchIOTestUtils.createDocuments(
                numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS);
        List<Document> serializedInput = new ArrayList<>();
        for (String doc : input) {
            String bulkDoc =
                DocToBulk.createBulkApiEntity(
                    write.getDocToBulk(), doc, getBackendVersion(connectionConfiguration));
            Document r =
                Document.create()
                    .withInputDoc(doc)
                    .withBulkDirective(bulkDoc)
                    .withTimestamp(Instant.now());
            serializedInput.add(r);
        }
        long numDocsProcessed = 0;
        long sizeProcessed = 0;
        long numDocsInserted = 0;
        long batchInserted = 0;
        for (Document document : serializedInput) {
            fnTester.processElement(document);
            numDocsProcessed++;
            sizeProcessed += document.getBulkDirective().getBytes(StandardCharsets.UTF_8).length;
            // test every 40 docs to avoid overloading ES
            if ((numDocsProcessed % 40) == 0) {
                // force an index refresh so the inserted docs
                // are searchable immediately
                long currentNumDocs = refreshIndexAndGetCurrentNumDocs(connectionConfiguration, restClient);
                if (sizeProcessed / BATCH_SIZE_BYTES > batchInserted) {
                    /* bundle end */
                    assertThat("we have passed a bundle size, we should have inserted some documents", currentNumDocs, greaterThan(numDocsInserted));
                    numDocsInserted = currentNumDocs;
                    batchInserted = (sizeProcessed / BATCH_SIZE_BYTES);
                } else {
                    /* not bundle end */
                    assertEquals("we are not at the end of a bundle, we should have inserted no more documents", numDocsInserted, currentNumDocs);
                }
            }
        }
    }
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) ArrayList(java.util.ArrayList) BulkIO(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) Document(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document)
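Note that the test measures batch size as the UTF-8 byte length of each bulk directive, not String.length(); the two diverge for non-ASCII documents. A small standalone illustration of that distinction:

import java.nio.charset.StandardCharsets;

public class Utf8SizeDemo {
    public static void main(String[] args) {
        String doc = "{\"scientist\":\"Skłodowska-Curie\"}";
        // length() counts UTF-16 code units of the Java String.
        System.out.println(doc.length()); // 32
        // getBytes(UTF_8).length counts the bytes actually sent on the wire;
        // the character 'ł' takes two bytes in UTF-8.
        System.out.println(doc.getBytes(StandardCharsets.UTF_8).length); // 33
    }
}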

Example 3 with Document

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.

the class ElasticsearchIOTestCommon method testMaxParallelRequestsPerWindow.

void testMaxParallelRequestsPerWindow() throws Exception {
    List<Document> data =
        ElasticsearchIOTestUtils.createDocuments(
                numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS)
            .stream()
            .map(doc -> Document.create().withInputDoc(doc).withTimestamp(Instant.now()))
            .collect(Collectors.toList());
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxParallelRequestsPerWindow(1);
    PCollection<KV<Integer, Iterable<Document>>> batches =
        pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));
    PCollection<Integer> keyValues =
        batches.apply(
            MapElements.into(integers())
                .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));
    // The number of unique keys produced should be maxParallelRequestsPerWindow * numWindows.
    // Here there is one request (key) per window and a single (global) window, i.e. one key
    // in total, whose value is 0.
    PAssert.that(keyValues).containsInAnyOrder(0);
    PAssert.that(batches).satisfies(new AssertThatHasExpectedContents(0, data));
    pipeline.run();
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) BulkIO(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) StatefulBatching(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO.StatefulBatching) Document(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document) Create(org.apache.beam.sdk.transforms.Create) MapElements(org.apache.beam.sdk.transforms.MapElements) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) KV(org.apache.beam.sdk.values.KV) PCollection(org.apache.beam.sdk.values.PCollection) PAssert(org.apache.beam.sdk.testing.PAssert) TypeDescriptors.integers(org.apache.beam.sdk.values.TypeDescriptors.integers) Collectors(java.util.stream.Collectors) Instant(org.joda.time.Instant)
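Following the relation stated in the test's comment (unique keys = maxParallelRequestsPerWindow * numWindows), a hedged sketch of the same assertion for two parallel requests. This is not part of the Beam test suite; conn is an assumed ConnectionConfiguration, docs an assumed List<Document>, and Distinct is org.apache.beam.sdk.transforms.Distinct:

Write write =
    ElasticsearchIO.write()
        .withConnectionConfiguration(conn)
        .withMaxParallelRequestsPerWindow(2);
PCollection<Integer> keys =
    pipeline
        .apply(Create.of(docs))
        .apply(StatefulBatching.fromSpec(write.getBulkIO()))
        .apply(
            MapElements.into(integers())
                .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));
// Two requests (keys) per window times one global window: with more than a handful of
// documents, the distinct keys should be exactly 0 and 1.
PAssert.that(keys.apply(Distinct.create())).containsInAnyOrder(0, 1);
pipeline.run();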

Example 4 with Document

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.

the class ElasticsearchIOTestCommon method serializeDocs.

List<Document> serializeDocs(ElasticsearchIO.Write write, List<String> jsonDocs) throws IOException {
    List<Document> serializedInput = new ArrayList<>();
    for (String doc : jsonDocs) {
        String bulkDoc =
            DocToBulk.createBulkApiEntity(
                write.getDocToBulk(), doc, getBackendVersion(connectionConfiguration));
        Document r =
            Document.create()
                .withInputDoc(doc)
                .withBulkDirective(bulkDoc)
                .withTimestamp(Instant.now());
        serializedInput.add(r);
    }
    return serializedInput;
}
Also used : ArrayList(java.util.ArrayList) Document(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document)
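To see the helper's output shape, a quick sketch (write and input assumed to be configured as in Example 1). Per the Elasticsearch bulk API, each directive should be an action line such as {"index":{...}} followed by the original document source, though the exact action metadata depends on the Write configuration and backend version:

List<Document> serialized = serializeDocs(write, input);
// Prints one bulk-API entry: action metadata plus the original JSON document.
System.out.println(serialized.get(0).getBulkDirective());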

Example 5 with Document

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document in project beam by apache.

the class ElasticsearchIOTestCommon method testWriteWithErrors.

void testWriteWithErrors() throws Exception {
    Write write =
        ElasticsearchIO.write()
            .withConnectionConfiguration(connectionConfiguration)
            .withMaxBatchSize(BATCH_SIZE);
    List<String> input =
        ElasticsearchIOTestUtils.createDocuments(
            numDocs, ElasticsearchIOTestUtils.InjectionMode.INJECT_SOME_INVALID_DOCS);
    expectedException.expect(isA(IOException.class));
    expectedException.expectMessage(new CustomMatcher<String>("RegExp matcher") {

        @Override
        public boolean matches(Object o) {
            String message = (String) o;
            // variable parts of the message (document ids, exception details) are matched with .+
            return message.matches(
                "(?is).*Error writing to Elasticsearch, some elements could not be inserted"
                    + ".*Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*"
                    + "Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*");
        }
    });
    // Bundle size is the runner's decision, so we test the Writer as a DoFn outside of a runner.
    try (DoFnTester<Document, Document> fnTester = DoFnTester.of(new BulkIO.BulkIOBundleFn(write.getBulkIO()))) {
        // inserts into Elasticsearch
        fnTester.processBundle(serializeDocs(write, input));
    }
}
Also used : Write(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) IOException(java.io.IOException) BulkIO(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) Document(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document)
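As a sanity check, the matcher's regular expression can be exercised against a hypothetical message shaped like the expected failure (the literal text below is illustrative, not captured from Elasticsearch):

public class RegexSanityCheck {
    public static void main(String[] args) {
        String sample =
            "java.io.IOException: Error writing to Elasticsearch, some elements could not be inserted:\n"
                + "Document id 4: failed to parse (mapper_parsing_exception)\n"
                + "Caused by: Unexpected character (json_parse_exception)\n"
                + "Document id 9: failed to parse (mapper_parsing_exception)\n"
                + "Caused by: Unexpected character (json_parse_exception)\n";
        String regex =
            "(?is).*Error writing to Elasticsearch, some elements could not be inserted"
                + ".*Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*"
                + "Document id .+: failed to parse \\(.+\\).*Caused by: .+ \\(.+\\).*";
        System.out.println(sample.matches(regex)); // true
    }
}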

Aggregations

Document (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Document) 7
Write (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write) 6
BulkIO (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BulkIO) 5
ArrayList (java.util.ArrayList) 4
IOException (java.io.IOException) 2
PipedInputStream (java.io.PipedInputStream) 2
PipedOutputStream (java.io.PipedOutputStream) 2
DocumentCoder (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.DocumentCoder) 2
Instant (org.joda.time.Instant) 2
JsonNode (com.fasterxml.jackson.databind.JsonNode) 1
ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode) 1
Serializable (java.io.Serializable) 1
StandardCharsets (java.nio.charset.StandardCharsets) 1
Arrays (java.util.Arrays) 1
Collections (java.util.Collections) 1
List (java.util.List) 1
Map (java.util.Map) 1
Set (java.util.Set) 1
ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom) 1
BiFunction (java.util.function.BiFunction) 1