Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project hive by apache.
The class DruidQueryBasedInputFormat, method splitSelectQuery.
/* Method that splits the Select query depending on the threshold so the read can be
 * parallelized. We will only contact the Druid broker to obtain all results. */
private static HiveDruidSplit[] splitSelectQuery(Configuration conf, String address,
    SelectQuery query, Path dummyPath) throws IOException {
  final int selectThreshold = (int) HiveConf.getIntVar(
      conf, HiveConf.ConfVars.HIVE_DRUID_SELECT_THRESHOLD);
  final int numConnection = HiveConf.getIntVar(
      conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  final Period readTimeout = new Period(
      HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
  if (isFetch) {
    // If it has a limit, we use it and we do not split the query
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath,
        new String[] { address }) };
  }
  // We do not have the number of rows, thus we need to execute a
  // Segment Metadata query to obtain the number of rows
  SegmentMetadataQueryBuilder metadataBuilder = new Druids.SegmentMetadataQueryBuilder();
  metadataBuilder.dataSource(query.getDataSource());
  metadataBuilder.intervals(query.getIntervals());
  metadataBuilder.merge(true);
  metadataBuilder.analysisTypes();
  SegmentMetadataQuery metadataQuery = metadataBuilder.build();
  Lifecycle lifecycle = new Lifecycle();
  HttpClient client = HttpClientInit.createClient(
      HttpClientConfig.builder()
          .withNumConnections(numConnection)
          .withReadTimeout(readTimeout.toStandardDuration())
          .build(),
      lifecycle);
  try {
    lifecycle.start();
  } catch (Exception e) {
    LOG.error("Lifecycle start issue");
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  InputStream response;
  try {
    response = DruidStorageHandlerUtils.submitRequest(client,
        DruidStorageHandlerUtils.createRequest(address, metadataQuery));
  } catch (Exception e) {
    lifecycle.stop();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  // Retrieve results
  List<SegmentAnalysis> metadataList;
  try {
    metadataList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
        new TypeReference<List<SegmentAnalysis>>() {
        });
  } catch (Exception e) {
    response.close();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  } finally {
    lifecycle.stop();
  }
  if (metadataList == null) {
    throw new IOException("Connected to Druid but could not retrieve datasource information");
  }
  if (metadataList.isEmpty()) {
    // There are no rows for that time range, we can submit the query as it is
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath,
        new String[] { address }) };
  }
  if (metadataList.size() != 1) {
    throw new IOException("Information about segments should have been merged");
  }
  final long numRows = metadataList.get(0).getNumRows();
  query = query.withPagingSpec(PagingSpec.newSpec(Integer.MAX_VALUE));
  if (numRows <= selectThreshold) {
    // We are not going to split it
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath,
        new String[] { address }) };
  }
  // If the query does not specify a timestamp, we obtain the total time using
  // a Time Boundary query. Then, we use that information to split the query
  // following the Select threshold configuration property
  final List<Interval> intervals = new ArrayList<>();
  if (query.getIntervals().size() == 1
      && query.getIntervals().get(0).withChronology(ISOChronology.getInstanceUTC())
          .equals(DruidTable.DEFAULT_INTERVAL)) {
    // Default max and min, we should execute a time boundary query to get a
    // more precise range
    TimeBoundaryQueryBuilder timeBuilder = new Druids.TimeBoundaryQueryBuilder();
    timeBuilder.dataSource(query.getDataSource());
    TimeBoundaryQuery timeQuery = timeBuilder.build();
    lifecycle = new Lifecycle();
    client = HttpClientInit.createClient(
        HttpClientConfig.builder()
            .withNumConnections(numConnection)
            .withReadTimeout(readTimeout.toStandardDuration())
            .build(),
        lifecycle);
    try {
      lifecycle.start();
    } catch (Exception e) {
      LOG.error("Lifecycle start issue");
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    try {
      response = DruidStorageHandlerUtils.submitRequest(client,
          DruidStorageHandlerUtils.createRequest(address, timeQuery));
    } catch (Exception e) {
      lifecycle.stop();
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    // Retrieve results
    List<Result<TimeBoundaryResultValue>> timeList;
    try {
      timeList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
          new TypeReference<List<Result<TimeBoundaryResultValue>>>() {
          });
    } catch (Exception e) {
      response.close();
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    } finally {
      lifecycle.stop();
    }
    if (timeList == null || timeList.isEmpty()) {
      throw new IOException("Connected to Druid but could not retrieve time boundary information");
    }
    if (timeList.size() != 1) {
      throw new IOException("We should obtain a single time boundary");
    }
    intervals.add(new Interval(timeList.get(0).getValue().getMinTime().getMillis(),
        timeList.get(0).getValue().getMaxTime().getMillis(), ISOChronology.getInstanceUTC()));
  } else {
    intervals.addAll(query.getIntervals());
  }
  // Create (numRows / selectThreshold) input splits
  int numSplits = (int) Math.ceil((double) numRows / selectThreshold);
  List<List<Interval>> newIntervals = createSplitsIntervals(intervals, numSplits);
  HiveDruidSplit[] splits = new HiveDruidSplit[numSplits];
  for (int i = 0; i < numSplits; i++) {
    // Create partial Select query
    final SelectQuery partialQuery = query.withQuerySegmentSpec(
        new MultipleIntervalSegmentSpec(newIntervals.get(i)));
    splits[i] = new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery), dummyPath,
        new String[] { address });
  }
  return splits;
}
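The createSplitsIntervals helper called near the end is not shown in this snippet. Below is a minimal, illustrative sketch of what such a helper could look like, assuming it simply carves the total covered time into numSplits contiguous, roughly equal slices; the actual Hive implementation may differ.

// Illustrative sketch only, not the Hive implementation: split the total time
// span of the given intervals into numSplits contiguous interval lists.
private static List<List<Interval>> createSplitsIntervals(List<Interval> intervals, int numSplits) {
  long total = 0;
  for (Interval interval : intervals) {
    total += interval.toDurationMillis();
  }
  final long sliceSize = total / numSplits;
  List<List<Interval>> result = new ArrayList<>();
  int pos = 0;      // index of the interval currently being consumed
  long offset = 0;  // millis already consumed from intervals.get(pos)
  for (int i = 0; i < numSplits; i++) {
    List<Interval> slice = new ArrayList<>();
    // the last split absorbs any remainder left by integer division
    long remaining = (i == numSplits - 1) ? Long.MAX_VALUE : sliceSize;
    while (pos < intervals.size() && remaining > 0) {
      Interval current = intervals.get(pos);
      long available = current.toDurationMillis() - offset;
      long take = Math.min(available, remaining);
      long start = current.getStartMillis() + offset;
      slice.add(new Interval(start, start + take, ISOChronology.getInstanceUTC()));
      offset += take;
      remaining -= take;
      if (offset == current.toDurationMillis()) {
        pos++;
        offset = 0;
      }
    }
    result.add(slice);
  }
  return result;
}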
Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.
The class CachingClusteredClientTest, method testIfNoneMatch.
@Test
public void testIfNoneMatch() throws Exception {
  Interval interval = new Interval("2016/2017");
  final DataSegment dataSegment = new DataSegment("dataSource", interval, "ver",
      ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp"),
      ImmutableList.of("product"), ImmutableList.of("visited_sum"),
      NoneShardSpec.instance(), 9, 12334);
  final ServerSelector selector = new ServerSelector(dataSegment,
      new HighestPriorityTierSelectorStrategy(new RandomServerSelectorStrategy()));
  selector.addServerAndUpdateSegment(new QueryableDruidServer(servers[0], null), dataSegment);
  timeline.add(interval, "ver", new SingleElementPartitionChunk<>(selector));
  TimeBoundaryQuery query = Druids.newTimeBoundaryQueryBuilder()
      .dataSource(DATA_SOURCE)
      .intervals(new MultipleIntervalSegmentSpec(ImmutableList.of(interval)))
      .context(ImmutableMap.<String, Object>of("If-None-Match", "aVJV29CJY93rszVW/QBy0arWZo0="))
      .build();
  Map<String, String> responseContext = new HashMap<>();
  client.run(query, responseContext);
  Assert.assertEquals("Z/eS4rQz5v477iq7Aashr6JPZa0=", responseContext.get("ETag"));
}
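The test passes a previously seen ETag through the "If-None-Match" context key and reads the freshly computed value back from the "ETag" response-context key. The same handshake can in principle be attached to other query types; the sketch below reuses the builder pattern and the client/run call from the snippets on this page, with placeholder interval and ETag values rather than values taken from the test.

// Sketch only: attach an earlier ETag to a timeseries query and read the new one back.
TimeseriesQuery tsQuery = Druids.newTimeseriesQueryBuilder()
    .dataSource("dataSource")
    .intervals(new MultipleIntervalSegmentSpec(ImmutableList.of(new Interval("2016/2017"))))
    .aggregators(Arrays.<AggregatorFactory>asList(new CountAggregatorFactory("count")))
    .context(ImmutableMap.<String, Object>of("If-None-Match", "previouslyReturnedETag"))
    .build();
Map<String, String> ctx = new HashMap<>();
client.run(tsQuery, ctx);
String newETag = ctx.get("ETag");  // compare with the value that was sent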
Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.
The class CachingClusteredClientTest, method testQueryCachingWithFilter.
@SuppressWarnings("unchecked")
public void testQueryCachingWithFilter(final QueryRunner runner, final int numTimesToQuery,
    final Query query, final List<Iterable<Result<TimeseriesResultValue>>> filteredExpected,
    // does this assume query intervals must be ordered?
    Object... args) {
  final List<Interval> queryIntervals = Lists.newArrayListWithCapacity(args.length / 2);
  final List<List<Iterable<Result<Object>>>> expectedResults =
      Lists.newArrayListWithCapacity(queryIntervals.size());
  parseResults(queryIntervals, expectedResults, args);
  for (int i = 0; i < queryIntervals.size(); ++i) {
    List<Object> mocks = Lists.newArrayList();
    mocks.add(serverView);
    final Interval actualQueryInterval =
        new Interval(queryIntervals.get(0).getStart(), queryIntervals.get(i).getEnd());
    final List<Map<DruidServer, ServerExpectations>> serverExpectationList =
        populateTimeline(queryIntervals, expectedResults, i, mocks);
    final Map<DruidServer, ServerExpectations> finalExpectation =
        serverExpectationList.get(serverExpectationList.size() - 1);
    for (Map.Entry<DruidServer, ServerExpectations> entry : finalExpectation.entrySet()) {
      DruidServer server = entry.getKey();
      ServerExpectations expectations = entry.getValue();
      EasyMock.expect(serverView.getQueryRunner(server))
          .andReturn(expectations.getQueryRunner()).times(0, 1);
      final Capture<? extends Query> capture = new Capture();
      final Capture<? extends Map> context = new Capture();
      QueryRunner queryable = expectations.getQueryRunner();
      if (query instanceof TimeseriesQuery) {
        final List<String> segmentIds = Lists.newArrayList();
        final List<Iterable<Result<TimeseriesResultValue>>> results = Lists.newArrayList();
        for (ServerExpectation expectation : expectations) {
          segmentIds.add(expectation.getSegmentId());
          results.add(expectation.getResults());
        }
        EasyMock.expect(queryable.run(EasyMock.capture(capture), EasyMock.capture(context)))
            .andAnswer(new IAnswer<Sequence>() {
              @Override
              public Sequence answer() throws Throwable {
                return toFilteredQueryableTimeseriesResults(
                    (TimeseriesQuery) capture.getValue(), segmentIds, queryIntervals, results);
              }
            }).times(0, 1);
      } else {
        throw new ISE("Unknown query type[%s]", query.getClass());
      }
    }
    final Iterable<Result<Object>> expected = new ArrayList<>();
    for (int intervalNo = 0; intervalNo < i + 1; intervalNo++) {
      Iterables.addAll((List) expected, filteredExpected.get(intervalNo));
    }
    runWithMocks(new Runnable() {
      @Override
      public void run() {
        HashMap<String, List> context = new HashMap<String, List>();
        for (int i = 0; i < numTimesToQuery; ++i) {
          TestHelper.assertExpectedResults(expected,
              runner.run(query.withQuerySegmentSpec(
                  new MultipleIntervalSegmentSpec(ImmutableList.of(actualQueryInterval))),
                  context));
          if (queryCompletedCallback != null) {
            queryCompletedCallback.run();
          }
        }
      }
    }, mocks.toArray());
  }
}
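The central call in the inner loop is query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(...)), which re-scopes the same query to a progressively wider interval on each outer iteration. Pulled out of the test harness, the pattern reduces to the illustrative helper below; the helper itself is not part of the test class.

// Illustrative helper: re-scope an existing query to a single interval.
private static <T> Query<T> narrowTo(Query<T> query, Interval interval) {
  return query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(ImmutableList.of(interval)));
}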
Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.
The class TieredBrokerHostSelectorTest, method testSelectMultiInterval.
@Test
public void testSelectMultiInterval() throws Exception {
  String brokerName = (String) brokerSelector.select(
      Druids.newTimeseriesQueryBuilder()
          .dataSource("test")
          .aggregators(Arrays.<AggregatorFactory>asList(new CountAggregatorFactory("count")))
          .intervals(new MultipleIntervalSegmentSpec(Arrays.<Interval>asList(
              new Interval("2013-08-31/2013-09-01"),
              new Interval("2012-08-31/2012-09-01"),
              new Interval("2011-08-31/2011-09-01"))))
          .build()).lhs;
  Assert.assertEquals("coldBroker", brokerName);
}
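For reference, MultipleIntervalSegmentSpec is essentially a carrier for the interval list that the broker selector inspects. A minimal sketch is below; note that the spec may normalize (sort and merge) the intervals it is given, so code should not rely on the original ordering.

// Sketch: the spec exposes the intervals it was built from via getIntervals().
QuerySegmentSpec spec = new MultipleIntervalSegmentSpec(Arrays.asList(
    new Interval("2013-08-31/2013-09-01"),
    new Interval("2011-08-31/2011-09-01")));
for (Interval interval : spec.getIntervals()) {
  System.out.println(interval);
}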
Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.
The class GroupByTypeInterfaceBenchmark, method setupQueries.
private void setupQueries() {
  // queries for the basic schema
  Map<String, GroupByQuery> basicQueries = new LinkedHashMap<>();
  BenchmarkSchemaInfo basicSchema = BenchmarkSchemas.SCHEMA_MAP.get("basic");
  {
    // basic.A
    QuerySegmentSpec intervalSpec =
        new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));
    List<AggregatorFactory> queryAggs = new ArrayList<>();
    queryAggs.add(new LongSumAggregatorFactory("sumLongSequential", "sumLongSequential"));
    GroupByQuery queryString = GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(intervalSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(
            new DefaultDimensionSpec("dimSequential", null)))
        .setAggregatorSpecs(queryAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build();
    GroupByQuery queryLongFloat = GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(intervalSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(
            new DefaultDimensionSpec("metLongUniform", null),
            new DefaultDimensionSpec("metFloatNormal", null)))
        .setAggregatorSpecs(queryAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build();
    GroupByQuery queryLong = GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(intervalSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(
            new DefaultDimensionSpec("metLongUniform", null)))
        .setAggregatorSpecs(queryAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build();
    GroupByQuery queryFloat = GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(intervalSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(
            new DefaultDimensionSpec("metFloatNormal", null)))
        .setAggregatorSpecs(queryAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build();
    basicQueries.put("string", queryString);
    basicQueries.put("longFloat", queryLongFloat);
    basicQueries.put("long", queryLong);
    basicQueries.put("float", queryFloat);
  }
  {
    // basic.nested
    QuerySegmentSpec intervalSpec =
        new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));
    List<AggregatorFactory> queryAggs = new ArrayList<>();
    queryAggs.add(new LongSumAggregatorFactory("sumLongSequential", "sumLongSequential"));
    GroupByQuery subqueryA = GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(intervalSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(
            new DefaultDimensionSpec("dimSequential", null),
            new DefaultDimensionSpec("dimZipf", null)))
        .setAggregatorSpecs(queryAggs)
        .setGranularity(Granularities.DAY)
        .build();
    GroupByQuery queryA = GroupByQuery.builder()
        .setDataSource(subqueryA)
        .setQuerySegmentSpec(intervalSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(
            new DefaultDimensionSpec("dimSequential", null)))
        .setAggregatorSpecs(queryAggs)
        .setGranularity(Granularities.WEEK)
        .build();
    basicQueries.put("nested", queryA);
  }
  SCHEMA_QUERY_MAP.put("basic", basicQueries);
}
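The benchmark methods that consume SCHEMA_QUERY_MAP are not shown in this snippet; presumably they look up one of the prepared queries by schema name and query name before running it. A minimal, hypothetical lookup step is sketched below, using only the map and keys populated above (the local variable names are illustrative).

// Hypothetical follow-up step: pick one of the queries prepared in setupQueries().
Map<String, GroupByQuery> basicPrepared = SCHEMA_QUERY_MAP.get("basic");
GroupByQuery chosenQuery = basicPrepared.get("longFloat");  // "string", "long", "float" and "nested" also exist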