Example usage of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read in the Apache Beam project.
From class ElasticsearchIOTestCommon, method testSplit:
/**
 * Verifies that splitting a BoundedElasticsearchSource produces the expected number of
 * sub-sources and that the splits do not leave excessive empty bundles.
 *
 * @param desiredBundleSizeBytes target bundle size handed to {@code split}; {@code 0} means
 *     "split per shard only"
 * @throws Exception if document insertion, splitting, or reading fails
 */
void testSplit(final int desiredBundleSizeBytes) throws Exception {
  // Seed the index only for unit tests; integration tests pre-populate their own data.
  if (!useAsITests) {
    ElasticsearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
  }

  final PipelineOptions options = PipelineOptionsFactory.create();
  final Read read = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
  final BoundedElasticsearchSource initialSource =
      new BoundedElasticsearchSource(read, null, null, null);

  final List<? extends BoundedSource<String>> splits =
      initialSource.split(desiredBundleSizeBytes, options);
  // The union of the splits must read back the same data as the unsplit source.
  SourceTestUtils.assertSourcesEqualReferenceSource(initialSource, splits, options);

  final long indexSize = BoundedElasticsearchSource.estimateIndexSize(connectionConfiguration);
  final int expectedNumSources;
  if (desiredBundleSizeBytes == 0) {
    // desiredBundleSize is ignored because in ES 2.x there is no way to split shards.
    // Expect one source per primary shard; Elasticsearch allocates 5 primary shards
    // per index by default.
    expectedNumSources = 5;
  } else {
    // Keep the float arithmetic of the original computation so rounding matches exactly.
    final float ratio = (float) indexSize / desiredBundleSizeBytes;
    expectedNumSources = (int) Math.ceil(ratio);
  }
  assertEquals("Wrong number of splits", expectedNumSources, splits.size());

  // Count splits that yield no documents; too many of them means poor parallelism.
  int emptySplits = 0;
  for (BoundedSource<String> split : splits) {
    if (readFromSource(split, options).isEmpty()) {
      emptySplits++;
    }
  }
  assertThat(
      "There are too many empty splits, parallelism is sub-optimal",
      emptySplits,
      lessThan((int) (ACCEPTABLE_EMPTY_SPLITS_PERCENTAGE * splits.size())));
}
Another usage of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read in the Apache Beam project.
From class ElasticsearchIOTestCommon, method testReadWithQueryInternal:
/**
 * Runs a read restricted by a match query on the "scientist" field and asserts that
 * exactly the expected fraction of documents is returned.
 *
 * @param queryConfigurer attaches the query string to the {@code Read} transform (allows the
 *     same scenario to exercise different query-setting APIs)
 * @throws IOException if test-document insertion fails
 */
private void testReadWithQueryInternal(BiFunction<Read, String, Read> queryConfigurer)
    throws IOException {
  // Seed the index only for unit tests; integration tests pre-populate their own data.
  if (!useAsITests) {
    ElasticsearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
  }

  // Match query selecting only the "Einstein" documents.
  String query = "{\n" + " \"query\": {\n" + " \"match\" : {\n" + " \"scientist\" : {\n" + " \"query\" : \"Einstein\"\n" + " }\n" + " }\n" + " }\n" + "}";

  Read baseRead = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
  Read configuredRead = queryConfigurer.apply(baseRead, query);

  PCollection<String> output = pipeline.apply(configuredRead);
  // Documents are evenly distributed across NUM_SCIENTISTS names, so one name
  // matches numDocs / NUM_SCIENTISTS documents.
  PAssert.thatSingleton(output.apply("Count", Count.globally()))
      .isEqualTo(numDocs / NUM_SCIENTISTS);
  pipeline.run();
}
Another usage of org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Read in the Apache Beam project.
From class ElasticsearchIOTestCommon, method testSizes:
/**
 * Checks that the source's size estimate is plausible: at least the number of documents
 * times the average document size.
 *
 * @throws Exception if document insertion or size estimation fails
 */
void testSizes() throws Exception {
  // Seed the index only for unit tests; integration tests pre-populate their own data.
  if (!useAsITests) {
    ElasticsearchIOTestUtils.insertTestDocuments(connectionConfiguration, numDocs, restClient);
  }

  final PipelineOptions options = PipelineOptionsFactory.create();
  final Read read = ElasticsearchIO.read().withConnectionConfiguration(connectionConfiguration);
  final BoundedElasticsearchSource initialSource =
      new BoundedElasticsearchSource(read, null, null, null);

  // Can't assert an exact size: Elasticsearch indexes never report the same size twice
  // (due to internal Elasticsearch implementation), so only check a lower bound.
  final long estimatedSize = initialSource.getEstimatedSizeBytes(options);
  LOG.info("Estimated size: {}", estimatedSize);
  assertThat("Wrong estimated size", estimatedSize, greaterThan(AVERAGE_DOC_SIZE * numDocs));
}
Aggregations