Use of io.druid.query.select.SelectQuery in project hive by apache.
The class DruidQueryBasedInputFormat, method splitSelectQuery.
/* Method that splits the Select query depending on the threshold so the read can be
 * parallelized. We will only contact the Druid broker to obtain all results. */
private static HiveDruidSplit[] splitSelectQuery(Configuration conf, String address,
    SelectQuery query, Path dummyPath) throws IOException {
  final int selectThreshold = (int) HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_SELECT_THRESHOLD);
  final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
  if (isFetch) {
    // If it has a limit, we use it and we do not split the query
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
  }
  // We do not have the number of rows, thus we need to execute a
  // Segment Metadata query to obtain number of rows
  SegmentMetadataQueryBuilder metadataBuilder = new Druids.SegmentMetadataQueryBuilder();
  metadataBuilder.dataSource(query.getDataSource());
  metadataBuilder.intervals(query.getIntervals());
  metadataBuilder.merge(true);
  metadataBuilder.analysisTypes();
  SegmentMetadataQuery metadataQuery = metadataBuilder.build();
  Lifecycle lifecycle = new Lifecycle();
  HttpClient client = HttpClientInit.createClient(
      HttpClientConfig.builder().withNumConnections(numConnection)
          .withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
  try {
    lifecycle.start();
  } catch (Exception e) {
    LOG.error("Lifecycle start issue");
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  InputStream response;
  try {
    response = DruidStorageHandlerUtils.submitRequest(client,
        DruidStorageHandlerUtils.createRequest(address, metadataQuery));
  } catch (Exception e) {
    lifecycle.stop();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  // Retrieve results
  List<SegmentAnalysis> metadataList;
  try {
    metadataList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
        new TypeReference<List<SegmentAnalysis>>() {
        });
  } catch (Exception e) {
    response.close();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  } finally {
    lifecycle.stop();
  }
  if (metadataList == null) {
    throw new IOException("Connected to Druid but could not retrieve datasource information");
  }
  if (metadataList.isEmpty()) {
    // There are no rows for that time range, we can submit query as it is
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
  }
  if (metadataList.size() != 1) {
    throw new IOException("Information about segments should have been merged");
  }
  final long numRows = metadataList.get(0).getNumRows();
  query = query.withPagingSpec(PagingSpec.newSpec(Integer.MAX_VALUE));
  if (numRows <= selectThreshold) {
    // We are not going to split it
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
  }
  // If the query does not specify a timestamp, we obtain the total time using
  // a Time Boundary query. Then, we use the information to split the query
  // following the Select threshold configuration property
  final List<Interval> intervals = new ArrayList<>();
  if (query.getIntervals().size() == 1 && query.getIntervals().get(0)
      .withChronology(ISOChronology.getInstanceUTC()).equals(DruidTable.DEFAULT_INTERVAL)) {
    // Default max and min, we should execute a time boundary query to get a
    // more precise range
    TimeBoundaryQueryBuilder timeBuilder = new Druids.TimeBoundaryQueryBuilder();
    timeBuilder.dataSource(query.getDataSource());
    TimeBoundaryQuery timeQuery = timeBuilder.build();
    lifecycle = new Lifecycle();
    client = HttpClientInit.createClient(
        HttpClientConfig.builder().withNumConnections(numConnection)
            .withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
    try {
      lifecycle.start();
    } catch (Exception e) {
      LOG.error("Lifecycle start issue");
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    try {
      response = DruidStorageHandlerUtils.submitRequest(client,
          DruidStorageHandlerUtils.createRequest(address, timeQuery));
    } catch (Exception e) {
      lifecycle.stop();
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    // Retrieve results
    List<Result<TimeBoundaryResultValue>> timeList;
    try {
      timeList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
          new TypeReference<List<Result<TimeBoundaryResultValue>>>() {
          });
    } catch (Exception e) {
      response.close();
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    } finally {
      lifecycle.stop();
    }
    if (timeList == null || timeList.isEmpty()) {
      throw new IOException("Connected to Druid but could not retrieve time boundary information");
    }
    if (timeList.size() != 1) {
      throw new IOException("We should obtain a single time boundary");
    }
    intervals.add(new Interval(timeList.get(0).getValue().getMinTime().getMillis(),
        timeList.get(0).getValue().getMaxTime().getMillis(), ISOChronology.getInstanceUTC()));
  } else {
    intervals.addAll(query.getIntervals());
  }
  // Create (numRows/default threshold) input splits
  int numSplits = (int) Math.ceil((double) numRows / selectThreshold);
  List<List<Interval>> newIntervals = createSplitsIntervals(intervals, numSplits);
  HiveDruidSplit[] splits = new HiveDruidSplit[numSplits];
  for (int i = 0; i < numSplits; i++) {
    // Create partial Select query
    final SelectQuery partialQuery = query.withQuerySegmentSpec(
        new MultipleIntervalSegmentSpec(newIntervals.get(i)));
    splits[i] = new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery),
        dummyPath, new String[] { address });
  }
  return splits;
}
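The helper createSplitsIntervals used above is not shown on this page. Below is a minimal sketch of what such a helper could look like, assuming the input intervals form one contiguous range (as they do after the Time Boundary query) and that equal-duration sub-ranges are acceptable; this is an illustration, not the actual Hive implementation.

// Hypothetical sketch: partition the overall time range into numSplits contiguous
// sub-ranges of roughly equal duration. Assumes the intervals are sorted and contiguous.
private static List<List<Interval>> createSplitsIntervals(List<Interval> intervals, int numSplits) {
  final long start = intervals.get(0).getStartMillis();
  final long end = intervals.get(intervals.size() - 1).getEndMillis();
  final long total = end - start;
  final List<List<Interval>> result = new ArrayList<>(numSplits);
  for (int i = 0; i < numSplits; i++) {
    // Integer arithmetic keeps the sub-ranges contiguous and covers [start, end) exactly
    final long splitStart = start + (total * i) / numSplits;
    final long splitEnd = start + (total * (i + 1)) / numSplits;
    result.add(Lists.newArrayList(new Interval(splitStart, splitEnd, ISOChronology.getInstanceUTC())));
  }
  return result;
}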
Use of io.druid.query.select.SelectQuery in project hive by apache.
The class DruidQueryBasedInputFormat, method distributeSelectQuery.
/* New method that distributes the Select query by creating splits containing
* information about different Druid nodes that have the data for the given
* query. */
private static HiveDruidSplit[] distributeSelectQuery(Configuration conf, String address,
    SelectQuery query, Path dummyPath) throws IOException {
  // If it has a limit, we use it and we do not distribute the query
  final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
  if (isFetch) {
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
  }
  // Properties from configuration
  final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  // Create request to obtain nodes that are holding data for the given datasource and intervals
  final Lifecycle lifecycle = new Lifecycle();
  final HttpClient client = HttpClientInit.createClient(
      HttpClientConfig.builder().withNumConnections(numConnection)
          .withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
  try {
    lifecycle.start();
  } catch (Exception e) {
    LOG.error("Lifecycle start issue");
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  // Comma-separated intervals without brackets
  final String intervals = StringUtils.join(query.getIntervals(), ",");
  final String request = String.format("http://%s/druid/v2/datasources/%s/candidates?intervals=%s",
      address, query.getDataSource().getNames().get(0), intervals);
  final InputStream response;
  try {
    response = DruidStorageHandlerUtils.submitRequest(client, new Request(HttpMethod.GET, new URL(request)));
  } catch (Exception e) {
    lifecycle.stop();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  // Retrieve results
  final List<LocatedSegmentDescriptor> segmentDescriptors;
  try {
    segmentDescriptors = DruidStorageHandlerUtils.JSON_MAPPER.readValue(response,
        new TypeReference<List<LocatedSegmentDescriptor>>() {
        });
  } catch (Exception e) {
    response.close();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  } finally {
    lifecycle.stop();
  }
  // Create one input split for each segment
  final int numSplits = segmentDescriptors.size();
  final HiveDruidSplit[] splits = new HiveDruidSplit[segmentDescriptors.size()];
  for (int i = 0; i < numSplits; i++) {
    final LocatedSegmentDescriptor locatedSD = segmentDescriptors.get(i);
    final String[] hosts = new String[locatedSD.getLocations().size()];
    for (int j = 0; j < locatedSD.getLocations().size(); j++) {
      hosts[j] = locatedSD.getLocations().get(j).getHost();
    }
    // Create partial Select query
    final SegmentDescriptor newSD = new SegmentDescriptor(
        locatedSD.getInterval(), locatedSD.getVersion(), locatedSD.getPartitionNumber());
    final SelectQuery partialQuery = query.withQuerySegmentSpec(
        new MultipleSpecificSegmentSpec(Lists.newArrayList(newSD)));
    splits[i] = new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery),
        dummyPath, hosts);
  }
  return splits;
}
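How a caller chooses between splitSelectQuery and distributeSelectQuery is not part of this snippet. A hedged sketch of how such a choice might be driven by a configuration flag; the property name hive.druid.select.distribute below is an assumption for illustration only, not a confirmed Hive setting.

// Hypothetical selection between the two split strategies; the flag name is assumed.
final boolean distribute = conf.getBoolean("hive.druid.select.distribute", true);
final HiveDruidSplit[] splits = distribute
    ? distributeSelectQuery(conf, address, selectQuery, dummyPath)
    : splitSelectQuery(conf, address, selectQuery, dummyPath);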
Use of io.druid.query.select.SelectQuery in project hive by apache.
The class DruidSerDe, method initialize.
@Override
public void initialize(Configuration configuration, Properties properties) throws SerDeException {
  // Init connection properties
  numConnection = HiveConf.getIntVar(configuration, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  readTimeout = new Period(HiveConf.getVar(configuration, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  final List<String> columnNames = new ArrayList<>();
  final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();
  List<ObjectInspector> inspectors = new ArrayList<>();
  // Druid query
  String druidQuery = properties.getProperty(Constants.DRUID_QUERY_JSON);
  if (druidQuery == null) {
    // No query specified. Either the columns are declared in the table definition, or we
    // issue a Segment Metadata query to retrieve the schema of
    // the data source (dimensions and metrics).
    if (!org.apache.commons.lang3.StringUtils.isEmpty(properties.getProperty(serdeConstants.LIST_COLUMNS))
        && !org.apache.commons.lang3.StringUtils.isEmpty(properties.getProperty(serdeConstants.LIST_COLUMN_TYPES))) {
      columnNames.addAll(Utilities.getColumnNames(properties));
      if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
        throw new SerDeException("Timestamp column ('" + DruidTable.DEFAULT_TIMESTAMP_COLUMN
            + "') not specified in create table; list of columns is: "
            + properties.getProperty(serdeConstants.LIST_COLUMNS));
      }
      columnTypes.addAll(Lists.transform(Utilities.getColumnTypes(properties),
          new Function<String, PrimitiveTypeInfo>() {
            @Override
            public PrimitiveTypeInfo apply(String type) {
              return TypeInfoFactory.getPrimitiveTypeInfo(type);
            }
          }));
      inspectors.addAll(Lists.transform(columnTypes,
          new Function<PrimitiveTypeInfo, ObjectInspector>() {
            @Override
            public ObjectInspector apply(PrimitiveTypeInfo type) {
              return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type);
            }
          }));
      columns = columnNames.toArray(new String[columnNames.size()]);
      types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
      inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
    } else {
      String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
      if (dataSource == null) {
        throw new SerDeException("Druid data source not specified; use "
            + Constants.DRUID_DATA_SOURCE + " in table properties");
      }
      SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
      builder.dataSource(dataSource);
      builder.merge(true);
      builder.analysisTypes();
      SegmentMetadataQuery query = builder.build();
      // Execute query in Druid
      String address = HiveConf.getVar(configuration, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
      if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
        throw new SerDeException("Druid broker address not specified in configuration");
      }
      // Infer schema
      SegmentAnalysis schemaInfo;
      try {
        schemaInfo = submitMetadataRequest(address, query);
      } catch (IOException e) {
        throw new SerDeException(e);
      }
      for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
        if (columnInfo.getKey().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
          // Special handling for timestamp column
          // field name
          columnNames.add(columnInfo.getKey());
          // field type
          PrimitiveTypeInfo type = TypeInfoFactory.timestampTypeInfo;
          columnTypes.add(type);
          inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
          continue;
        }
        // field name
        columnNames.add(columnInfo.getKey());
        // field type
        PrimitiveTypeInfo type = DruidSerDeUtils.convertDruidToHiveType(columnInfo.getValue().getType());
        columnTypes.add(type);
        inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
      }
      columns = columnNames.toArray(new String[columnNames.size()]);
      types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
      inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
    }
  } else {
    // Query is specified, we can extract the results schema from the query
    Query<?> query;
    try {
      query = DruidStorageHandlerUtils.JSON_MAPPER.readValue(druidQuery, Query.class);
      switch (query.getType()) {
        case Query.TIMESERIES:
          inferSchema((TimeseriesQuery) query, columnNames, columnTypes);
          break;
        case Query.TOPN:
          inferSchema((TopNQuery) query, columnNames, columnTypes);
          break;
        case Query.SELECT:
          String address = HiveConf.getVar(configuration, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
          if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
            throw new SerDeException("Druid broker address not specified in configuration");
          }
          inferSchema((SelectQuery) query, columnNames, columnTypes, address);
          break;
        case Query.GROUP_BY:
          inferSchema((GroupByQuery) query, columnNames, columnTypes);
          break;
        default:
          throw new SerDeException("Not supported Druid query");
      }
    } catch (Exception e) {
      throw new SerDeException(e);
    }
    columns = new String[columnNames.size()];
    types = new PrimitiveTypeInfo[columnNames.size()];
    for (int i = 0; i < columnTypes.size(); ++i) {
      columns[i] = columnNames.get(i);
      types[i] = columnTypes.get(i);
      inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(types[i]));
    }
    inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("DruidSerDe initialized with\n" + "\t columns: " + columnNames + "\n\t types: " + columnTypes);
  }
}
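The conversion DruidSerDeUtils.convertDruidToHiveType used during schema inference above is not shown here. The following is a rough sketch of what such a Druid-to-Hive type mapping might look like for the common Druid column types; the real Hive implementation may cover more cases.

// Hypothetical sketch of a Druid-to-Hive type mapping; not the actual Hive code.
public static PrimitiveTypeInfo convertDruidToHiveType(String druidType) {
  switch (druidType.toUpperCase()) {
    case "LONG":
      return TypeInfoFactory.longTypeInfo;
    case "FLOAT":
      return TypeInfoFactory.floatTypeInfo;
    case "STRING":
      return TypeInfoFactory.stringTypeInfo;
    default:
      // Fall back to string for unrecognized Druid types
      return TypeInfoFactory.stringTypeInfo;
  }
}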
Use of io.druid.query.select.SelectQuery in project druid by druid-io.
The class QueryMaker, method runQuery.
public Sequence<Object[]> runQuery(final DataSource dataSource, final RowSignature sourceRowSignature,
    final DruidQueryBuilder queryBuilder) {
  if (dataSource instanceof QueryDataSource) {
    final GroupByQuery outerQuery = queryBuilder.toGroupByQuery(dataSource, sourceRowSignature,
        plannerContext.getQueryContext());
    if (outerQuery == null) {
      // Bug in the planner rules. They shouldn't allow this to happen.
      throw new IllegalStateException("Can't use QueryDataSource without an outer groupBy query!");
    }
    return executeGroupBy(queryBuilder, outerQuery);
  }
  final TimeseriesQuery timeseriesQuery = queryBuilder.toTimeseriesQuery(dataSource, sourceRowSignature,
      plannerContext.getQueryContext());
  if (timeseriesQuery != null) {
    return executeTimeseries(queryBuilder, timeseriesQuery);
  }
  final TopNQuery topNQuery = queryBuilder.toTopNQuery(dataSource, sourceRowSignature,
      plannerContext.getQueryContext(), plannerContext.getPlannerConfig().getMaxTopNLimit(),
      plannerContext.getPlannerConfig().isUseApproximateTopN());
  if (topNQuery != null) {
    return executeTopN(queryBuilder, topNQuery);
  }
  final GroupByQuery groupByQuery = queryBuilder.toGroupByQuery(dataSource, sourceRowSignature,
      plannerContext.getQueryContext());
  if (groupByQuery != null) {
    return executeGroupBy(queryBuilder, groupByQuery);
  }
  final SelectQuery selectQuery = queryBuilder.toSelectQuery(dataSource, sourceRowSignature,
      plannerContext.getQueryContext());
  if (selectQuery != null) {
    return executeSelect(queryBuilder, selectQuery);
  }
  throw new IllegalStateException("WTF?! Cannot execute query even though we planned it?");
}
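A brief usage sketch for the method above, showing how the returned Sequence could be consumed; the QueryMaker instance and the planner objects (dataSource, rowSignature, queryBuilder) are assumed to have been created elsewhere by the SQL planner.

// Hedged usage sketch: materialize the result sequence into rows and inspect them.
final Sequence<Object[]> rows = queryMaker.runQuery(dataSource, rowSignature, queryBuilder);
final List<Object[]> materialized = Sequences.toList(rows, Lists.<Object[]>newArrayList());
for (Object[] row : materialized) {
  System.out.println(Arrays.toString(row));
}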
Use of io.druid.query.select.SelectQuery in project druid by druid-io.
The class SelectBenchmark, method queryMultiQueryableIndex.
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public void queryMultiQueryableIndex(Blackhole blackhole) throws Exception {
  SelectQuery queryCopy = query.withPagingSpec(PagingSpec.newSpec(pagingThreshold));
  String segmentName;
  List<QueryRunner<Result<SelectResultValue>>> singleSegmentRunners = Lists.newArrayList();
  QueryToolChest toolChest = factory.getToolchest();
  for (int i = 0; i < numSegments; i++) {
    segmentName = "qIndex" + i;
    QueryRunner<Result<SelectResultValue>> runner = QueryBenchmarkUtil.makeQueryRunner(
        factory, segmentName, new QueryableIndexSegment(segmentName, qIndexes.get(i)));
    singleSegmentRunners.add(toolChest.preMergeQueryDecoration(runner));
  }
  QueryRunner theRunner = toolChest.postMergeQueryDecoration(new FinalizeResultsQueryRunner<>(
      toolChest.mergeResults(factory.mergeRunners(executorService, singleSegmentRunners)), toolChest));
  boolean done = false;
  while (!done) {
    Sequence<Result<SelectResultValue>> queryResult = theRunner.run(queryCopy, Maps.<String, Object>newHashMap());
    List<Result<SelectResultValue>> results = Sequences.toList(queryResult,
        Lists.<Result<SelectResultValue>>newArrayList());
    SelectResultValue result = results.get(0).getValue();
    if (result.getEvents().size() == 0) {
      done = true;
    } else {
      for (EventHolder eh : result.getEvents()) {
        blackhole.consume(eh);
      }
      queryCopy = incrementQueryPagination(queryCopy, result);
    }
  }
}
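The pagination helper incrementQueryPagination is referenced above but not shown in this snippet. Below is a sketch of how such a helper typically advances a Druid Select query, assuming it bumps the per-segment paging offsets returned by the previous result; the actual benchmark code may differ in detail.

// Hypothetical sketch: advance per-segment paging offsets and build the next PagingSpec.
private SelectQuery incrementQueryPagination(SelectQuery query, SelectResultValue prevResult) {
  final Map<String, Integer> newPagingIdentifiers = new HashMap<>();
  for (Map.Entry<String, Integer> offset : prevResult.getPagingIdentifiers().entrySet()) {
    // Continue reading each segment right after the last row returned for it
    newPagingIdentifiers.put(offset.getKey(), offset.getValue() + 1);
  }
  return query.withPagingSpec(new PagingSpec(newPagingIdentifiers, pagingThreshold));
}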