Search in sources:

Example 1 with MultipleIntervalSegmentSpec

Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project hive by apache.

Class DruidQueryBasedInputFormat, method splitSelectQuery.

/**
 * Splits a Druid Select query into several partial queries so that the read
 * can be parallelized. Only the Druid broker at {@code address} is contacted.
 *
 * <p>The query is returned as a single, unsplit split when it is flagged as a
 * fetch query, when the segment metadata reports no rows for the queried
 * intervals, or when the reported row count does not exceed the configured
 * select threshold. Otherwise the queried time range is divided into
 * {@code ceil(numRows / selectThreshold)} splits.
 *
 * @param conf      configuration providing the select threshold, the number of
 *                  HTTP connections and the HTTP read timeout
 * @param address   broker address the metadata and time boundary requests are
 *                  sent to; also recorded as the location of every split
 * @param query     the Select query to split
 * @param dummyPath path attached to every generated split
 * @return the splits, each carrying a JSON-serialized Select query
 * @throws IOException if the HTTP client cannot be started, a request to Druid
 *                     fails, a response cannot be parsed or has an unexpected
 *                     shape, or the select threshold is not positive when a
 *                     split is required
 */
private static HiveDruidSplit[] splitSelectQuery(Configuration conf, String address, SelectQuery query, Path dummyPath) throws IOException {
    final int selectThreshold = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_SELECT_THRESHOLD);
    final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
    final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
    final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
    if (isFetch) {
        // If it has a limit, we use it and we do not split the query
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    // We do not have the number of rows, thus we need to execute a
    // Segment Metadata query to obtain number of rows
    SegmentMetadataQueryBuilder metadataBuilder = new Druids.SegmentMetadataQueryBuilder();
    metadataBuilder.dataSource(query.getDataSource());
    metadataBuilder.intervals(query.getIntervals());
    metadataBuilder.merge(true);
    metadataBuilder.analysisTypes();
    SegmentMetadataQuery metadataQuery = metadataBuilder.build();
    Lifecycle lifecycle = new Lifecycle();
    HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(numConnection).withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
    try {
        lifecycle.start();
    } catch (Exception e) {
        LOG.error("Lifecycle start issue");
        // Keep the stringified trace as the message, but also chain the cause
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e), e);
    }
    InputStream response;
    try {
        response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(address, metadataQuery));
    } catch (Exception e) {
        lifecycle.stop();
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e), e);
    }
    // Retrieve results
    List<SegmentAnalysis> metadataList;
    // try-with-resources closes the stream on every path; a failure while
    // closing is attached as suppressed instead of hiding the parse error
    try (InputStream metadataStream = response) {
        metadataList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(metadataStream, new TypeReference<List<SegmentAnalysis>>() {
        });
    } catch (Exception e) {
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e), e);
    } finally {
        lifecycle.stop();
    }
    if (metadataList == null) {
        throw new IOException("Connected to Druid but could not retrieve datasource information");
    }
    if (metadataList.isEmpty()) {
        // There are no rows for that time range, we can submit query as it is
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    if (metadataList.size() != 1) {
        throw new IOException("Information about segments should have been merged");
    }
    final long numRows = metadataList.get(0).getNumRows();
    query = query.withPagingSpec(PagingSpec.newSpec(Integer.MAX_VALUE));
    if (numRows <= selectThreshold) {
        // We are not going to split it
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    if (selectThreshold <= 0) {
        // Dividing by a non-positive threshold would request Integer.MAX_VALUE
        // splits (threshold 0) or a negative-sized array (threshold < 0)
        throw new IOException("Druid select threshold must be positive to split the query, found " + selectThreshold);
    }
    // If the query does not specify a timestamp, we obtain the total time using
    // a Time Boundary query. Then, we use the information to split the query
    // following the Select threshold configuration property
    final List<Interval> intervals = new ArrayList<>();
    if (query.getIntervals().size() == 1 && query.getIntervals().get(0).withChronology(ISOChronology.getInstanceUTC()).equals(DruidTable.DEFAULT_INTERVAL)) {
        // Default max and min, we should execute a time boundary query to get a
        // more precise range
        TimeBoundaryQueryBuilder timeBuilder = new Druids.TimeBoundaryQueryBuilder();
        timeBuilder.dataSource(query.getDataSource());
        TimeBoundaryQuery timeQuery = timeBuilder.build();
        // The first lifecycle has been stopped; a fresh client is needed
        lifecycle = new Lifecycle();
        client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(numConnection).withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
        try {
            lifecycle.start();
        } catch (Exception e) {
            LOG.error("Lifecycle start issue");
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e), e);
        }
        try {
            response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(address, timeQuery));
        } catch (Exception e) {
            lifecycle.stop();
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e), e);
        }
        // Retrieve results
        List<Result<TimeBoundaryResultValue>> timeList;
        try (InputStream timeStream = response) {
            timeList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(timeStream, new TypeReference<List<Result<TimeBoundaryResultValue>>>() {
            });
        } catch (Exception e) {
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e), e);
        } finally {
            lifecycle.stop();
        }
        if (timeList == null || timeList.isEmpty()) {
            throw new IOException("Connected to Druid but could not retrieve time boundary information");
        }
        if (timeList.size() != 1) {
            throw new IOException("We should obtain a single time boundary");
        }
        intervals.add(new Interval(timeList.get(0).getValue().getMinTime().getMillis(), timeList.get(0).getValue().getMaxTime().getMillis(), ISOChronology.getInstanceUTC()));
    } else {
        intervals.addAll(query.getIntervals());
    }
    // Create (numRows/default threshold) input splits
    int numSplits = (int) Math.ceil((double) numRows / selectThreshold);
    List<List<Interval>> newIntervals = createSplitsIntervals(intervals, numSplits);
    HiveDruidSplit[] splits = new HiveDruidSplit[numSplits];
    for (int i = 0; i < numSplits; i++) {
        // Create partial Select query
        final SelectQuery partialQuery = query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(newIntervals.get(i)));
        splits[i] = new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery), dummyPath, new String[] { address });
    }
    return splits;
}
Also used : ArrayList(java.util.ArrayList) MultipleIntervalSegmentSpec(io.druid.query.spec.MultipleIntervalSegmentSpec) TimeBoundaryQuery(io.druid.query.timeboundary.TimeBoundaryQuery) Result(io.druid.query.Result) SegmentMetadataQuery(io.druid.query.metadata.metadata.SegmentMetadataQuery) SegmentMetadataQueryBuilder(io.druid.query.Druids.SegmentMetadataQueryBuilder) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) TimeBoundaryQueryBuilder(io.druid.query.Druids.TimeBoundaryQueryBuilder) List(java.util.List) ArrayList(java.util.ArrayList) TypeReference(com.fasterxml.jackson.core.type.TypeReference) InputStream(java.io.InputStream) Lifecycle(com.metamx.common.lifecycle.Lifecycle) Period(org.joda.time.Period) IOException(java.io.IOException) JsonParseException(com.fasterxml.jackson.core.JsonParseException) JsonMappingException(com.fasterxml.jackson.databind.JsonMappingException) IOException(java.io.IOException) SelectQuery(io.druid.query.select.SelectQuery) HttpClient(com.metamx.http.client.HttpClient) Interval(org.joda.time.Interval)

Example 2 with MultipleIntervalSegmentSpec

Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.

Class CachingClusteredClientTest, method testIfNoneMatch.

/**
 * Registers a single segment covering 2016/2017 in the timeline, runs a time
 * boundary query carrying an "If-None-Match" context entry, and checks the
 * "ETag" value written into the response context.
 */
@Test
public void testIfNoneMatch() throws Exception {
    final Interval queryInterval = new Interval("2016/2017");

    // One segment spanning the whole queried interval, served by servers[0]
    final Map<String, Object> loadSpec = ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp");
    final DataSegment segment = new DataSegment(
        "dataSource",
        queryInterval,
        "ver",
        loadSpec,
        ImmutableList.of("product"),
        ImmutableList.of("visited_sum"),
        NoneShardSpec.instance(),
        9,
        12334);
    final ServerSelector serverSelector = new ServerSelector(
        segment,
        new HighestPriorityTierSelectorStrategy(new RandomServerSelectorStrategy()));
    serverSelector.addServerAndUpdateSegment(new QueryableDruidServer(servers[0], null), segment);
    timeline.add(queryInterval, "ver", new SingleElementPartitionChunk<>(serverSelector));

    // Query over the same interval, carrying the ETag the caller already holds
    final Map<String, Object> queryContext = ImmutableMap.<String, Object>of("If-None-Match", "aVJV29CJY93rszVW/QBy0arWZo0=");
    final TimeBoundaryQuery timeBoundaryQuery = Druids.newTimeBoundaryQueryBuilder()
        .dataSource(DATA_SOURCE)
        .intervals(new MultipleIntervalSegmentSpec(ImmutableList.of(queryInterval)))
        .context(queryContext)
        .build();

    final Map<String, String> responseContext = new HashMap<>();
    client.run(timeBoundaryQuery, responseContext);

    Assert.assertEquals("Z/eS4rQz5v477iq7Aashr6JPZa0=", responseContext.get("ETag"));
}
Also used : HashMap(java.util.HashMap) MultipleIntervalSegmentSpec(io.druid.query.spec.MultipleIntervalSegmentSpec) TimeBoundaryQuery(io.druid.query.timeboundary.TimeBoundaryQuery) DataSegment(io.druid.timeline.DataSegment) QueryableDruidServer(io.druid.client.selector.QueryableDruidServer) ServerSelector(io.druid.client.selector.ServerSelector) HighestPriorityTierSelectorStrategy(io.druid.client.selector.HighestPriorityTierSelectorStrategy) RandomServerSelectorStrategy(io.druid.client.selector.RandomServerSelectorStrategy) Interval(org.joda.time.Interval) Test(org.junit.Test) GroupByQueryRunnerTest(io.druid.query.groupby.GroupByQueryRunnerTest)

Example 3 with MultipleIntervalSegmentSpec

Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.

Class CachingClusteredClientTest, method testQueryCachingWithFilter.

/**
 * Runs {@code query} through {@code runner} over progressively longer time
 * ranges and asserts the results against {@code filteredExpected}.
 *
 * <p>For each index {@code i} of the parsed query intervals, the query is
 * issued {@code numTimesToQuery} times over the interval running from the
 * start of the first parsed interval to the end of the {@code i}-th one, and
 * the expected result is the concatenation of {@code filteredExpected[0..i]}.
 * Only {@link TimeseriesQuery} is supported; any other query type raises an
 * {@link ISE}.
 *
 * @param runner           the query runner under test
 * @param numTimesToQuery  how many times the query is run for each interval range
 * @param query            the base query; its segment spec is replaced per run
 * @param filteredExpected expected results, one entry per parsed interval
 * @param args             passed to {@code parseResults} to fill the interval
 *                         and expected-result lists (presumably alternating
 *                         interval/result pairs, given the {@code args.length / 2}
 *                         capacity hint; confirm against {@code parseResults})
 */
@SuppressWarnings("unchecked")
public void testQueryCachingWithFilter(final QueryRunner runner, final int numTimesToQuery, final Query query, final List<Iterable<Result<TimeseriesResultValue>>> filteredExpected, // does this assume query intervals must be ordered?
Object... args) {
    final List<Interval> queryIntervals = Lists.newArrayListWithCapacity(args.length / 2);
    // NOTE(review): queryIntervals is still empty at this point, so the capacity
    // hint below is always 0; harmless, but probably meant args.length / 2.
    final List<List<Iterable<Result<Object>>>> expectedResults = Lists.newArrayListWithCapacity(queryIntervals.size());
    parseResults(queryIntervals, expectedResults, args);
    for (int i = 0; i < queryIntervals.size(); ++i) {
        // Mocks collected for this iteration and handed to runWithMocks below
        List<Object> mocks = Lists.newArrayList();
        mocks.add(serverView);
        // The queried range always starts at the first interval and grows to the end of interval i
        final Interval actualQueryInterval = new Interval(queryIntervals.get(0).getStart(), queryIntervals.get(i).getEnd());
        final List<Map<DruidServer, ServerExpectations>> serverExpectationList = populateTimeline(queryIntervals, expectedResults, i, mocks);
        // Only the last entry of the returned list is used to set up expectations
        final Map<DruidServer, ServerExpectations> finalExpectation = serverExpectationList.get(serverExpectationList.size() - 1);
        for (Map.Entry<DruidServer, ServerExpectations> entry : finalExpectation.entrySet()) {
            DruidServer server = entry.getKey();
            ServerExpectations expectations = entry.getValue();
            // times(0, 1): the lookup is expected at most once and may not happen at all
            EasyMock.expect(serverView.getQueryRunner(server)).andReturn(expectations.getQueryRunner()).times(0, 1);
            final Capture<? extends Query> capture = new Capture();
            final Capture<? extends Map> context = new Capture();
            QueryRunner queryable = expectations.getQueryRunner();
            if (query instanceof TimeseriesQuery) {
                final List<String> segmentIds = Lists.newArrayList();
                final List<Iterable<Result<TimeseriesResultValue>>> results = Lists.newArrayList();
                for (ServerExpectation expectation : expectations) {
                    segmentIds.add(expectation.getSegmentId());
                    results.add(expectation.getResults());
                }
                // The answer is computed lazily from the query captured at call time
                EasyMock.expect(queryable.run(EasyMock.capture(capture), EasyMock.capture(context))).andAnswer(new IAnswer<Sequence>() {

                    @Override
                    public Sequence answer() throws Throwable {
                        return toFilteredQueryableTimeseriesResults((TimeseriesQuery) capture.getValue(), segmentIds, queryIntervals, results);
                    }
                }).times(0, 1);
            } else {
                throw new ISE("Unknown query type[%s]", query.getClass());
            }
        }
        // Expected output: the filtered results of intervals 0..i, concatenated in order
        final Iterable<Result<Object>> expected = new ArrayList<>();
        for (int intervalNo = 0; intervalNo < i + 1; intervalNo++) {
            Iterables.addAll((List) expected, filteredExpected.get(intervalNo));
        }
        runWithMocks(new Runnable() {

            @Override
            public void run() {
                // The same context map is reused across the repeated runs
                HashMap<String, List> context = new HashMap<String, List>();
                // This i is a separate loop counter local to run(); it hides the outer interval index
                for (int i = 0; i < numTimesToQuery; ++i) {
                    TestHelper.assertExpectedResults(expected, runner.run(query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(ImmutableList.of(actualQueryInterval))), context));
                    if (queryCompletedCallback != null) {
                        queryCompletedCallback.run();
                    }
                }
            }
        }, mocks.toArray());
    }
}
Also used : TimeseriesResultValue(io.druid.query.timeseries.TimeseriesResultValue) MergeIterable(io.druid.java.util.common.guava.MergeIterable) FunctionalIterable(io.druid.java.util.common.guava.FunctionalIterable) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MultipleIntervalSegmentSpec(io.druid.query.spec.MultipleIntervalSegmentSpec) Capture(org.easymock.Capture) Result(io.druid.query.Result) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ISE(io.druid.java.util.common.ISE) TimeseriesQuery(io.druid.query.timeseries.TimeseriesQuery) QueryableDruidServer(io.druid.client.selector.QueryableDruidServer) FinalizeResultsQueryRunner(io.druid.query.FinalizeResultsQueryRunner) QueryRunner(io.druid.query.QueryRunner) IAnswer(org.easymock.IAnswer) Map(java.util.Map) TreeMap(java.util.TreeMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Interval(org.joda.time.Interval)

Example 4 with MultipleIntervalSegmentSpec

Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.

Class TieredBrokerHostSelectorTest, method testSelectMultiInterval.

/**
 * Selects a broker for a timeseries query spanning three disjoint one-day
 * intervals (2013, 2012 and 2011) and expects "coldBroker" to be chosen.
 */
@Test
public void testSelectMultiInterval() throws Exception {
    // Three non-contiguous intervals, listed newest first
    final MultipleIntervalSegmentSpec disjointIntervals = new MultipleIntervalSegmentSpec(
        Arrays.<Interval>asList(
            new Interval("2013-08-31/2013-09-01"),
            new Interval("2012-08-31/2012-09-01"),
            new Interval("2011-08-31/2011-09-01")));

    final String selectedBroker = (String) brokerSelector.select(
        Druids.newTimeseriesQueryBuilder()
            .dataSource("test")
            .aggregators(Arrays.<AggregatorFactory>asList(new CountAggregatorFactory("count")))
            .intervals(disjointIntervals)
            .build()).lhs;

    Assert.assertEquals("coldBroker", selectedBroker);
}
Also used : CountAggregatorFactory(io.druid.query.aggregation.CountAggregatorFactory) MultipleIntervalSegmentSpec(io.druid.query.spec.MultipleIntervalSegmentSpec) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(io.druid.query.aggregation.CountAggregatorFactory) Interval(org.joda.time.Interval) Test(org.junit.Test)

Example 5 with MultipleIntervalSegmentSpec

Use of io.druid.query.spec.MultipleIntervalSegmentSpec in project druid by druid-io.

Class GroupByTypeInterfaceBenchmark, method setupQueries.

/**
 * Builds the group-by queries for the "basic" benchmark schema and registers
 * them in {@code SCHEMA_QUERY_MAP} under the key "basic".
 *
 * <p>Five queries are registered, in this order: "string", "longFloat",
 * "long" and "float" (flat queries grouping on differently typed columns at
 * the configured {@code queryGranularity}), then "nested" (a weekly group-by
 * over a daily group-by subquery). All of them sum "sumLongSequential" over
 * the schema's data interval.
 */
private void setupQueries() {
    final BenchmarkSchemaInfo schemaInfo = BenchmarkSchemas.SCHEMA_MAP.get("basic");
    // Insertion order is preserved by the LinkedHashMap
    final Map<String, GroupByQuery> queriesByName = new LinkedHashMap<>();

    // basic.A: flat queries, one per dimension-type combination
    final QuerySegmentSpec flatSpec = new MultipleIntervalSegmentSpec(Arrays.asList(schemaInfo.getDataInterval()));
    final List<AggregatorFactory> flatAggs = new ArrayList<>();
    flatAggs.add(new LongSumAggregatorFactory("sumLongSequential", "sumLongSequential"));

    queriesByName.put("string", GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(flatSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("dimSequential", null)))
        .setAggregatorSpecs(flatAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build());
    queriesByName.put("longFloat", GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(flatSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("metLongUniform", null), new DefaultDimensionSpec("metFloatNormal", null)))
        .setAggregatorSpecs(flatAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build());
    queriesByName.put("long", GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(flatSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("metLongUniform", null)))
        .setAggregatorSpecs(flatAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build());
    queriesByName.put("float", GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(flatSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("metFloatNormal", null)))
        .setAggregatorSpecs(flatAggs)
        .setGranularity(Granularity.fromString(queryGranularity))
        .build());

    // basic.nested: weekly roll-up over a daily two-dimension subquery
    final QuerySegmentSpec nestedSpec = new MultipleIntervalSegmentSpec(Arrays.asList(schemaInfo.getDataInterval()));
    final List<AggregatorFactory> nestedAggs = new ArrayList<>();
    nestedAggs.add(new LongSumAggregatorFactory("sumLongSequential", "sumLongSequential"));

    final GroupByQuery dailySubquery = GroupByQuery.builder()
        .setDataSource("blah")
        .setQuerySegmentSpec(nestedSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("dimSequential", null), new DefaultDimensionSpec("dimZipf", null)))
        .setAggregatorSpecs(nestedAggs)
        .setGranularity(Granularities.DAY)
        .build();
    final GroupByQuery weeklyOuterQuery = GroupByQuery.builder()
        .setDataSource(dailySubquery)
        .setQuerySegmentSpec(nestedSpec)
        .setDimensions(Lists.<DimensionSpec>newArrayList(new DefaultDimensionSpec("dimSequential", null)))
        .setAggregatorSpecs(nestedAggs)
        .setGranularity(Granularities.WEEK)
        .build();
    queriesByName.put("nested", weeklyOuterQuery);

    SCHEMA_QUERY_MAP.put("basic", queriesByName);
}
Also used : GroupByQuery(io.druid.query.groupby.GroupByQuery) BenchmarkSchemaInfo(io.druid.benchmark.datagen.BenchmarkSchemaInfo) LongSumAggregatorFactory(io.druid.query.aggregation.LongSumAggregatorFactory) MultipleIntervalSegmentSpec(io.druid.query.spec.MultipleIntervalSegmentSpec) QuerySegmentSpec(io.druid.query.spec.QuerySegmentSpec) List(java.util.List) ArrayList(java.util.ArrayList) DefaultDimensionSpec(io.druid.query.dimension.DefaultDimensionSpec) LinkedHashMap(java.util.LinkedHashMap)

Aggregations

MultipleIntervalSegmentSpec (io.druid.query.spec.MultipleIntervalSegmentSpec): 37
Interval (org.joda.time.Interval): 26
Test (org.junit.Test): 20
LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory): 13
DefaultDimensionSpec (io.druid.query.dimension.DefaultDimensionSpec): 12
QuerySegmentSpec (io.druid.query.spec.QuerySegmentSpec): 10
Result (io.druid.query.Result): 9
ArrayList (java.util.ArrayList): 9
Row (io.druid.data.input.Row): 8
List (java.util.List): 8
BenchmarkSchemaInfo (io.druid.benchmark.datagen.BenchmarkSchemaInfo): 7
Sequence (io.druid.java.util.common.guava.Sequence): 7
Query (io.druid.query.Query): 7
QueryRunner (io.druid.query.QueryRunner): 7
TableDataSource (io.druid.query.TableDataSource): 7
FinalizeResultsQueryRunner (io.druid.query.FinalizeResultsQueryRunner): 6
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory): 6
CountAggregatorFactory (io.druid.query.aggregation.CountAggregatorFactory): 6
DimensionSpec (io.druid.query.dimension.DimensionSpec): 6
ExtractionDimensionSpec (io.druid.query.dimension.ExtractionDimensionSpec): 6