Search in sources :

Example 1 with Read

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read in project beam by apache.

the class ElasticsearchIOTestCommon method testSplit.

/**
 * Splits a {@link BoundedElasticsearchSource} using the given bundle size and checks the result:
 * the splits are compared to the unsplit source via
 * {@code SourceTestUtils.assertSourcesEqualReferenceSource}, the number of splits must equal the
 * expected count, and the number of splits that read nothing must stay under the threshold
 * derived from {@code ACCEPTABLE_EMPTY_SPLITS_PERCENTAGE}.
 *
 * <p>Test documents are inserted first when {@code useAsITests} is false.
 *
 * @param desiredBundleSizeBytes bundle size handed to {@code split}; when it is {@code 0} the
 *     expected number of sources is fixed to 5
 * @throws Exception if any of the invoked helpers throws
 */
void testSplit(final int desiredBundleSizeBytes) throws Exception {
    if (!useAsITests) {
        ElasticsearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }

    final PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    final Read readSpec = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
    final BoundedElasticsearchSource unsplitSource =
        new BoundedElasticsearchSource(readSpec, null, null, null);

    final List<? extends BoundedSource<String>> subSources =
        unsplitSource.split(desiredBundleSizeBytes, pipelineOptions);
    SourceTestUtils.assertSourcesEqualReferenceSource(unsplitSource, subSources, pipelineOptions);

    final long indexSizeBytes = BoundedElasticsearchSource.estimateIndexSize(connectionConfiguration);

    // With a bundle size of 0 the desired size is ignored, because in ES 2.x there is no way to
    // split shards; the expectation is then 5, the number of ES shards (by default, each index
    // in Elasticsearch is allocated 5 primary shards). Otherwise the expectation is the index
    // size divided by the bundle size, rounded up.
    final int expectedSourceCount =
        desiredBundleSizeBytes == 0
            ? 5
            : (int) Math.ceil((float) indexSizeBytes / desiredBundleSizeBytes);
    assertEquals("Wrong number of splits", expectedSourceCount, subSources.size());

    // Count the splits that yield no element at all.
    int emptyCount = 0;
    for (BoundedSource<String> candidate : subSources) {
        if (!readFromSource(candidate, pipelineOptions).isEmpty()) {
            continue;
        }
        emptyCount++;
    }

    final int emptySplitLimit = (int) (ACCEPTABLE_EMPTY_SPLITS_PERCENTAGE * subSources.size());
    assertThat(
        "There are too many empty splits, parallelism is sub-optimal",
        emptyCount,
        lessThan(emptySplitLimit));
}
Also used : Read(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read) BoundedElasticsearchSource(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BoundedElasticsearchSource) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions)

Example 2 with Read

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read in project beam by apache.

the class ElasticsearchIOTestCommon method testReadWithQueryInternal.

/**
 * Runs a read pipeline restricted by a match query on the {@code scientist} field for
 * {@code "Einstein"} and asserts that the number of returned documents equals
 * {@code numDocs / NUM_SCIENTISTS}.
 *
 * <p>Test documents are inserted first when {@code useAsITests} is false.
 *
 * @param queryConfigurer function that receives the base {@link Read} and the JSON query string
 *     and returns the {@link Read} to apply to the pipeline
 * @throws IOException declared by the signature; presumably raised by the document insertion
 *     helper (not verifiable from this method alone)
 */
private void testReadWithQueryInternal(BiFunction<Read, String, Read> queryConfigurer) throws IOException {
    if (!useAsITests) {
        ElasticsearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }

    // One entry per line of the JSON query; joined with '\n' and no trailing newline.
    final String matchQuery =
        String.join(
            "\n",
            "{",
            "  \"query\": {",
            "  \"match\" : {",
            "    \"scientist\" : {",
            "      \"query\" : \"Einstein\"",
            "    }",
            "  }",
            "  }",
            "}");

    final Read baseRead = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
    final Read configuredRead = queryConfigurer.apply(baseRead, matchQuery);

    final PCollection<String> documents = pipeline.apply(configuredRead);
    final PCollection<Long> documentCount = documents.apply("Count", Count.globally());
    PAssert.thatSingleton(documentCount).isEqualTo(numDocs / NUM_SCIENTISTS);

    pipeline.run();
}
Also used : Read(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read)

Example 3 with Read

use of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read in project beam by apache.

the class ElasticsearchIOTestCommon method testSizes.

/**
 * Checks that the size estimated by a {@link BoundedElasticsearchSource} is greater than
 * {@code AVERAGE_DOC_SIZE * numDocs}, and logs the estimate.
 *
 * <p>Test documents are inserted first when {@code useAsITests} is false.
 *
 * @throws Exception if any of the invoked helpers throws
 */
void testSizes() throws Exception {
    if (!useAsITests) {
        ElasticsearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
    }

    final PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    final Read readSpec = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
    final BoundedElasticsearchSource source =
        new BoundedElasticsearchSource(readSpec, null, null, null);

    final long sizeEstimateBytes = source.getEstimatedSizeBytes(pipelineOptions);
    LOG.info("Estimated size: {}", sizeEstimateBytes);

    // An exact-equality assertion is not usable here: Elasticsearch indexes never have the same
    // size (due to internal Elasticsearch implementation), so only a lower bound is checked.
    assertThat(
        "Wrong estimated size",
        sizeEstimateBytes,
        greaterThan(AVERAGE_DOC_SIZE * numDocs));
}
Also used : Read(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read) BoundedElasticsearchSource(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BoundedElasticsearchSource) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions)

Aggregations

Read (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read): 3, BoundedElasticsearchSource (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.BoundedElasticsearchSource): 2, PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 2