Search in sources :

Example 11 with SegmentAnalysis

use of io.druid.query.metadata.metadata.SegmentAnalysis in project druid by druid-io.

the class DruidSchema method computeTable.

private DruidTable computeTable(final String dataSource) {
    final SegmentMetadataQuery segmentMetadataQuery = new SegmentMetadataQuery(new TableDataSource(dataSource), null, null, false, ImmutableMap.<String, Object>of("useCache", false, "populateCache", false), EnumSet.of(SegmentMetadataQuery.AnalysisType.INTERVAL), null, true);
    final Sequence<SegmentAnalysis> sequence = segmentMetadataQuery.run(walker, Maps.<String, Object>newHashMap());
    final List<SegmentAnalysis> results = Sequences.toList(sequence, Lists.<SegmentAnalysis>newArrayList());
    if (results.isEmpty()) {
        return null;
    }
    final Map<String, ValueType> columnTypes = Maps.newLinkedHashMap();
    // Resolve conflicts by taking the latest metadata. This aids in gradual schema evolution.
    long maxTimestamp = JodaUtils.MIN_INSTANT;
    for (SegmentAnalysis analysis : results) {
        final long timestamp;
        if (analysis.getIntervals() != null && analysis.getIntervals().size() > 0) {
            timestamp = analysis.getIntervals().get(analysis.getIntervals().size() - 1).getEndMillis();
        } else {
            timestamp = JodaUtils.MIN_INSTANT;
        }
        for (Map.Entry<String, ColumnAnalysis> entry : analysis.getColumns().entrySet()) {
            if (entry.getValue().isError()) {
                // Skip columns with analysis errors.
                continue;
            }
            if (!columnTypes.containsKey(entry.getKey()) || timestamp >= maxTimestamp) {
                ValueType valueType;
                try {
                    valueType = ValueType.valueOf(entry.getValue().getType().toUpperCase());
                } catch (IllegalArgumentException e) {
                    // Assume unrecognized types are some flavor of COMPLEX. This throws away information about exactly
                    // what kind of complex column it is, which we may want to preserve some day.
                    valueType = ValueType.COMPLEX;
                }
                columnTypes.put(entry.getKey(), valueType);
                maxTimestamp = timestamp;
            }
        }
    }
    final RowSignature.Builder rowSignature = RowSignature.builder();
    for (Map.Entry<String, ValueType> entry : columnTypes.entrySet()) {
        rowSignature.add(entry.getKey(), entry.getValue());
    }
    return new DruidTable(new TableDataSource(dataSource), rowSignature.build());
}
Also used : ValueType(io.druid.segment.column.ValueType) DruidTable(io.druid.sql.calcite.table.DruidTable) TableDataSource(io.druid.query.TableDataSource) SegmentMetadataQuery(io.druid.query.metadata.metadata.SegmentMetadataQuery) ColumnAnalysis(io.druid.query.metadata.metadata.ColumnAnalysis) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) ConcurrentMap(java.util.concurrent.ConcurrentMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) RowSignature(io.druid.sql.calcite.table.RowSignature)

Example 12 with SegmentAnalysis

use of io.druid.query.metadata.metadata.SegmentAnalysis in project hive by apache.

the class DruidQueryBasedInputFormat method splitSelectQuery.

/* Method that splits Select query depending on the threshold so read can be
   * parallelized. We will only contact the Druid broker to obtain all results. */
private static HiveDruidSplit[] splitSelectQuery(Configuration conf, String address, SelectQuery query, Path dummyPath) throws IOException {
    final int selectThreshold = (int) HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_SELECT_THRESHOLD);
    final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
    final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
    final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
    if (isFetch) {
        // If it has a limit, we use it and we do not split the query
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    // We do not have the number of rows, thus we need to execute a
    // Segment Metadata query to obtain number of rows
    SegmentMetadataQueryBuilder metadataBuilder = new Druids.SegmentMetadataQueryBuilder();
    metadataBuilder.dataSource(query.getDataSource());
    metadataBuilder.intervals(query.getIntervals());
    metadataBuilder.merge(true);
    metadataBuilder.analysisTypes();
    SegmentMetadataQuery metadataQuery = metadataBuilder.build();
    Lifecycle lifecycle = new Lifecycle();
    HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(numConnection).withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
    try {
        lifecycle.start();
    } catch (Exception e) {
        LOG.error("Lifecycle start issue");
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    InputStream response;
    try {
        response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(address, metadataQuery));
    } catch (Exception e) {
        lifecycle.stop();
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    // Retrieve results
    List<SegmentAnalysis> metadataList;
    try {
        metadataList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response, new TypeReference<List<SegmentAnalysis>>() {
        });
    } catch (Exception e) {
        response.close();
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    } finally {
        lifecycle.stop();
    }
    if (metadataList == null) {
        throw new IOException("Connected to Druid but could not retrieve datasource information");
    }
    if (metadataList.isEmpty()) {
        // There are no rows for that time range, we can submit query as it is
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    if (metadataList.size() != 1) {
        throw new IOException("Information about segments should have been merged");
    }
    final long numRows = metadataList.get(0).getNumRows();
    query = query.withPagingSpec(PagingSpec.newSpec(Integer.MAX_VALUE));
    if (numRows <= selectThreshold) {
        // We are not going to split it
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    // If the query does not specify a timestamp, we obtain the total time using
    // a Time Boundary query. Then, we use the information to split the query
    // following the Select threshold configuration property
    final List<Interval> intervals = new ArrayList<>();
    if (query.getIntervals().size() == 1 && query.getIntervals().get(0).withChronology(ISOChronology.getInstanceUTC()).equals(DruidTable.DEFAULT_INTERVAL)) {
        // Default max and min, we should execute a time boundary query to get a
        // more precise range
        TimeBoundaryQueryBuilder timeBuilder = new Druids.TimeBoundaryQueryBuilder();
        timeBuilder.dataSource(query.getDataSource());
        TimeBoundaryQuery timeQuery = timeBuilder.build();
        lifecycle = new Lifecycle();
        client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(numConnection).withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
        try {
            lifecycle.start();
        } catch (Exception e) {
            LOG.error("Lifecycle start issue");
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
        try {
            response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(address, timeQuery));
        } catch (Exception e) {
            lifecycle.stop();
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
        // Retrieve results
        List<Result<TimeBoundaryResultValue>> timeList;
        try {
            timeList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response, new TypeReference<List<Result<TimeBoundaryResultValue>>>() {
            });
        } catch (Exception e) {
            response.close();
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
        } finally {
            lifecycle.stop();
        }
        if (timeList == null || timeList.isEmpty()) {
            throw new IOException("Connected to Druid but could not retrieve time boundary information");
        }
        if (timeList.size() != 1) {
            throw new IOException("We should obtain a single time boundary");
        }
        intervals.add(new Interval(timeList.get(0).getValue().getMinTime().getMillis(), timeList.get(0).getValue().getMaxTime().getMillis(), ISOChronology.getInstanceUTC()));
    } else {
        intervals.addAll(query.getIntervals());
    }
    // Create (numRows/default threshold) input splits
    int numSplits = (int) Math.ceil((double) numRows / selectThreshold);
    List<List<Interval>> newIntervals = createSplitsIntervals(intervals, numSplits);
    HiveDruidSplit[] splits = new HiveDruidSplit[numSplits];
    for (int i = 0; i < numSplits; i++) {
        // Create partial Select query
        final SelectQuery partialQuery = query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(newIntervals.get(i)));
        splits[i] = new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery), dummyPath, new String[] { address });
    }
    return splits;
}
Also used : ArrayList(java.util.ArrayList) MultipleIntervalSegmentSpec(io.druid.query.spec.MultipleIntervalSegmentSpec) TimeBoundaryQuery(io.druid.query.timeboundary.TimeBoundaryQuery) Result(io.druid.query.Result) SegmentMetadataQuery(io.druid.query.metadata.metadata.SegmentMetadataQuery) SegmentMetadataQueryBuilder(io.druid.query.Druids.SegmentMetadataQueryBuilder) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) TimeBoundaryQueryBuilder(io.druid.query.Druids.TimeBoundaryQueryBuilder) List(java.util.List) ArrayList(java.util.ArrayList) TypeReference(com.fasterxml.jackson.core.type.TypeReference) InputStream(java.io.InputStream) Lifecycle(com.metamx.common.lifecycle.Lifecycle) Period(org.joda.time.Period) IOException(java.io.IOException) JsonParseException(com.fasterxml.jackson.core.JsonParseException) JsonMappingException(com.fasterxml.jackson.databind.JsonMappingException) IOException(java.io.IOException) SelectQuery(io.druid.query.select.SelectQuery) HttpClient(com.metamx.http.client.HttpClient) Interval(org.joda.time.Interval)

Example 13 with SegmentAnalysis

use of io.druid.query.metadata.metadata.SegmentAnalysis in project hive by apache.

the class DruidSerDe method inferSchema.

/* Select query */
private void inferSchema(SelectQuery query, List<String> columnNames, List<PrimitiveTypeInfo> columnTypes, String address) throws SerDeException {
    // Timestamp column
    columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
    columnTypes.add(TypeInfoFactory.timestampTypeInfo);
    // Dimension columns
    for (DimensionSpec ds : query.getDimensions()) {
        columnNames.add(ds.getOutputName());
        columnTypes.add(TypeInfoFactory.stringTypeInfo);
    }
    // The type for metric columns is not explicit in the query, thus in this case
    // we need to emit a metadata query to know their type
    SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
    builder.dataSource(query.getDataSource());
    builder.merge(true);
    builder.analysisTypes();
    SegmentMetadataQuery metadataQuery = builder.build();
    // Execute query in Druid
    SegmentAnalysis schemaInfo;
    try {
        schemaInfo = submitMetadataRequest(address, metadataQuery);
    } catch (IOException e) {
        throw new SerDeException(e);
    }
    if (schemaInfo == null) {
        throw new SerDeException("Connected to Druid but could not retrieve datasource information");
    }
    for (String metric : query.getMetrics()) {
        columnNames.add(metric);
        columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(schemaInfo.getColumns().get(metric).getType()));
    }
}
Also used : DimensionSpec(io.druid.query.dimension.DimensionSpec) SegmentMetadataQuery(io.druid.query.metadata.metadata.SegmentMetadataQuery) SegmentMetadataQueryBuilder(io.druid.query.Druids.SegmentMetadataQueryBuilder) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)

Example 14 with SegmentAnalysis

use of io.druid.query.metadata.metadata.SegmentAnalysis in project druid by druid-io.

the class SegmentMetadataQueryQueryToolChest method mergeAnalyses.

@VisibleForTesting
public static SegmentAnalysis mergeAnalyses(final SegmentAnalysis arg1, final SegmentAnalysis arg2, boolean lenientAggregatorMerge) {
    if (arg1 == null) {
        return arg2;
    }
    if (arg2 == null) {
        return arg1;
    }
    List<Interval> newIntervals = null;
    if (arg1.getIntervals() != null) {
        newIntervals = Lists.newArrayList();
        newIntervals.addAll(arg1.getIntervals());
    }
    if (arg2.getIntervals() != null) {
        if (newIntervals == null) {
            newIntervals = Lists.newArrayList();
        }
        newIntervals.addAll(arg2.getIntervals());
    }
    final Map<String, ColumnAnalysis> leftColumns = arg1.getColumns();
    final Map<String, ColumnAnalysis> rightColumns = arg2.getColumns();
    Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
    Set<String> rightColumnNames = Sets.newHashSet(rightColumns.keySet());
    for (Map.Entry<String, ColumnAnalysis> entry : leftColumns.entrySet()) {
        final String columnName = entry.getKey();
        columns.put(columnName, entry.getValue().fold(rightColumns.get(columnName)));
        rightColumnNames.remove(columnName);
    }
    for (String columnName : rightColumnNames) {
        columns.put(columnName, rightColumns.get(columnName));
    }
    final Map<String, AggregatorFactory> aggregators = Maps.newHashMap();
    if (lenientAggregatorMerge) {
        // Merge each aggregator individually, ignoring nulls
        for (SegmentAnalysis analysis : ImmutableList.of(arg1, arg2)) {
            if (analysis.getAggregators() != null) {
                for (Map.Entry<String, AggregatorFactory> entry : analysis.getAggregators().entrySet()) {
                    final String aggregatorName = entry.getKey();
                    final AggregatorFactory aggregator = entry.getValue();
                    AggregatorFactory merged = aggregators.get(aggregatorName);
                    if (merged != null) {
                        try {
                            merged = merged.getMergingFactory(aggregator);
                        } catch (AggregatorFactoryNotMergeableException e) {
                            merged = null;
                        }
                    } else {
                        merged = aggregator;
                    }
                    aggregators.put(aggregatorName, merged);
                }
            }
        }
    } else {
        final AggregatorFactory[] aggs1 = arg1.getAggregators() != null ? arg1.getAggregators().values().toArray(new AggregatorFactory[arg1.getAggregators().size()]) : null;
        final AggregatorFactory[] aggs2 = arg2.getAggregators() != null ? arg2.getAggregators().values().toArray(new AggregatorFactory[arg2.getAggregators().size()]) : null;
        final AggregatorFactory[] merged = AggregatorFactory.mergeAggregators(Arrays.asList(aggs1, aggs2));
        if (merged != null) {
            for (AggregatorFactory aggregator : merged) {
                aggregators.put(aggregator.getName(), aggregator);
            }
        }
    }
    final TimestampSpec timestampSpec = TimestampSpec.mergeTimestampSpec(Lists.newArrayList(arg1.getTimestampSpec(), arg2.getTimestampSpec()));
    final Granularity queryGranularity = Granularity.mergeGranularities(Lists.newArrayList(arg1.getQueryGranularity(), arg2.getQueryGranularity()));
    final String mergedId;
    if (arg1.getId() != null && arg2.getId() != null && arg1.getId().equals(arg2.getId())) {
        mergedId = arg1.getId();
    } else {
        mergedId = "merged";
    }
    final Boolean rollup;
    if (arg1.isRollup() != null && arg2.isRollup() != null && arg1.isRollup().equals(arg2.isRollup())) {
        rollup = arg1.isRollup();
    } else {
        rollup = null;
    }
    return new SegmentAnalysis(mergedId, newIntervals, columns, arg1.getSize() + arg2.getSize(), arg1.getNumRows() + arg2.getNumRows(), aggregators.isEmpty() ? null : aggregators, timestampSpec, queryGranularity, rollup);
}
Also used : AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) Granularity(io.druid.java.util.common.granularity.Granularity) AggregatorFactoryNotMergeableException(io.druid.query.aggregation.AggregatorFactoryNotMergeableException) ColumnAnalysis(io.druid.query.metadata.metadata.ColumnAnalysis) TimestampSpec(io.druid.data.input.impl.TimestampSpec) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) Map(java.util.Map) Interval(org.joda.time.Interval) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 15 with SegmentAnalysis

use of io.druid.query.metadata.metadata.SegmentAnalysis in project druid by druid-io.

the class SegmentMetadataQueryRunnerFactory method createRunner.

@Override
public QueryRunner<SegmentAnalysis> createRunner(final Segment segment) {
    return new QueryRunner<SegmentAnalysis>() {

        @Override
        public Sequence<SegmentAnalysis> run(Query<SegmentAnalysis> inQ, Map<String, Object> responseContext) {
            SegmentMetadataQuery query = (SegmentMetadataQuery) inQ;
            final SegmentAnalyzer analyzer = new SegmentAnalyzer(query.getAnalysisTypes());
            final Map<String, ColumnAnalysis> analyzedColumns = analyzer.analyze(segment);
            final long numRows = analyzer.numRows(segment);
            long totalSize = 0;
            if (analyzer.analyzingSize()) {
                // Initialize with the size of the whitespace, 1 byte per
                totalSize = analyzedColumns.size() * numRows;
            }
            Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
            ColumnIncluderator includerator = query.getToInclude();
            for (Map.Entry<String, ColumnAnalysis> entry : analyzedColumns.entrySet()) {
                final String columnName = entry.getKey();
                final ColumnAnalysis column = entry.getValue();
                if (!column.isError()) {
                    totalSize += column.getSize();
                }
                if (includerator.include(columnName)) {
                    columns.put(columnName, column);
                }
            }
            List<Interval> retIntervals = query.analyzingInterval() ? Arrays.asList(segment.getDataInterval()) : null;
            final Map<String, AggregatorFactory> aggregators;
            Metadata metadata = null;
            if (query.hasAggregators()) {
                metadata = segment.asStorageAdapter().getMetadata();
                if (metadata != null && metadata.getAggregators() != null) {
                    aggregators = Maps.newHashMap();
                    for (AggregatorFactory aggregator : metadata.getAggregators()) {
                        aggregators.put(aggregator.getName(), aggregator);
                    }
                } else {
                    aggregators = null;
                }
            } else {
                aggregators = null;
            }
            final TimestampSpec timestampSpec;
            if (query.hasTimestampSpec()) {
                if (metadata == null) {
                    metadata = segment.asStorageAdapter().getMetadata();
                }
                timestampSpec = metadata != null ? metadata.getTimestampSpec() : null;
            } else {
                timestampSpec = null;
            }
            final Granularity queryGranularity;
            if (query.hasQueryGranularity()) {
                if (metadata == null) {
                    metadata = segment.asStorageAdapter().getMetadata();
                }
                queryGranularity = metadata != null ? metadata.getQueryGranularity() : null;
            } else {
                queryGranularity = null;
            }
            Boolean rollup = null;
            if (query.hasRollup()) {
                if (metadata == null) {
                    metadata = segment.asStorageAdapter().getMetadata();
                }
                rollup = metadata != null ? metadata.isRollup() : null;
                if (rollup == null) {
                    // in this case, this segment is built before no-rollup function is coded,
                    // thus it is built with rollup
                    rollup = Boolean.TRUE;
                }
            }
            return Sequences.simple(Arrays.asList(new SegmentAnalysis(segment.getIdentifier(), retIntervals, columns, totalSize, numRows, aggregators, timestampSpec, queryGranularity, rollup)));
        }
    };
}
Also used : BaseQuery(io.druid.query.BaseQuery) SegmentMetadataQuery(io.druid.query.metadata.metadata.SegmentMetadataQuery) Query(io.druid.query.Query) Metadata(io.druid.segment.Metadata) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) Granularity(io.druid.java.util.common.granularity.Granularity) ColumnIncluderator(io.druid.query.metadata.metadata.ColumnIncluderator) QueryRunner(io.druid.query.QueryRunner) ConcatQueryRunner(io.druid.query.ConcatQueryRunner) SegmentMetadataQuery(io.druid.query.metadata.metadata.SegmentMetadataQuery) ColumnAnalysis(io.druid.query.metadata.metadata.ColumnAnalysis) TimestampSpec(io.druid.data.input.impl.TimestampSpec) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) Map(java.util.Map) Interval(org.joda.time.Interval)

Aggregations

SegmentAnalysis (io.druid.query.metadata.metadata.SegmentAnalysis)25 ColumnAnalysis (io.druid.query.metadata.metadata.ColumnAnalysis)15 Test (org.junit.Test)15 QueryRunner (io.druid.query.QueryRunner)11 ListColumnIncluderator (io.druid.query.metadata.metadata.ListColumnIncluderator)10 FinalizeResultsQueryRunner (io.druid.query.FinalizeResultsQueryRunner)9 QueryToolChest (io.druid.query.QueryToolChest)9 SegmentMetadataQuery (io.druid.query.metadata.metadata.SegmentMetadataQuery)9 ExecutorService (java.util.concurrent.ExecutorService)9 Interval (org.joda.time.Interval)5 AggregatorFactory (io.druid.query.aggregation.AggregatorFactory)4 TimestampSpec (io.druid.data.input.impl.TimestampSpec)3 Query (io.druid.query.Query)3 TableDataSource (io.druid.query.TableDataSource)3 DoubleSumAggregatorFactory (io.druid.query.aggregation.DoubleSumAggregatorFactory)3 LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory)3 IOException (java.io.IOException)3 Map (java.util.Map)3 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 DimensionSchema (io.druid.data.input.impl.DimensionSchema)2