
Example 16 with ColumnAnalysis

Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.

From class SegmentAnalyzer, method analyzeStringColumn:

private ColumnAnalysis analyzeStringColumn(final ColumnCapabilities capabilities, final StorageAdapter storageAdapter, final String columnName) {
    int cardinality = 0;
    long size = 0;
    Comparable min = null;
    Comparable max = null;
    if (analyzingCardinality()) {
        cardinality = storageAdapter.getDimensionCardinality(columnName);
    }
    if (analyzingSize()) {
        final DateTime start = storageAdapter.getMinTime();
        final DateTime end = storageAdapter.getMaxTime();
        final Sequence<Cursor> cursors = storageAdapter.makeCursors(null, new Interval(start, end), VirtualColumns.EMPTY, Granularities.ALL, false, null);
        size = cursors.accumulate(0L, new Accumulator<Long, Cursor>() {

            @Override
            public Long accumulate(Long accumulated, Cursor cursor) {
                DimensionSelector selector = cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec(columnName, columnName));
                if (selector == null) {
                    return accumulated;
                }
                long current = accumulated;
                while (!cursor.isDone()) {
                    final IndexedInts row = selector.getRow();
                    for (int i = 0, rowSize = row.size(); i < rowSize; ++i) {
                        final String dimVal = selector.lookupName(row.get(i));
                        if (dimVal != null && !dimVal.isEmpty()) {
                            current += StringUtils.estimatedBinaryLengthAsUTF8(dimVal);
                        }
                    }
                    cursor.advance();
                }
                return current;
            }
        });
    }
    if (analyzingMinMax()) {
        min = storageAdapter.getMinValue(columnName);
        max = storageAdapter.getMaxValue(columnName);
    }
    return new ColumnAnalysis(
            capabilities.toColumnType(),
            capabilities.getType().name(),
            capabilities.hasMultipleValues().isTrue(),
            // if we don't know for sure, then we should plan to check for nulls
            capabilities.hasNulls().isMaybeTrue(),
            size,
            cardinality,
            min,
            max,
            null
    );
}
Also used: Accumulator (org.apache.druid.java.util.common.guava.Accumulator), DimensionSelector (org.apache.druid.segment.DimensionSelector), Cursor (org.apache.druid.segment.Cursor), DateTime (org.joda.time.DateTime), DefaultDimensionSpec (org.apache.druid.query.dimension.DefaultDimensionSpec), IndexedInts (org.apache.druid.segment.data.IndexedInts), ColumnAnalysis (org.apache.druid.query.metadata.metadata.ColumnAnalysis), Interval (org.joda.time.Interval)
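
The analyzingCardinality(), analyzingSize(), and analyzingMinMax() guards correspond to the analysis types requested on the segmentMetadata query. A minimal sketch of building a query that enables exactly those branches (builder method names assumed from this Druid version; the "wikipedia" datasource is a hypothetical placeholder):

import org.apache.druid.query.Druids;
import org.apache.druid.query.metadata.metadata.SegmentMetadataQuery;

public class SegmentMetadataQuerySketch {
    public static SegmentMetadataQuery buildQuery() {
        // Request only the analyses that analyzeStringColumn acts on:
        // CARDINALITY, SIZE, and MINMAX.
        return Druids.newSegmentMetadataQueryBuilder()
                // hypothetical datasource name
                .dataSource("wikipedia")
                .merge(true)
                .analysisTypes(
                        SegmentMetadataQuery.AnalysisType.CARDINALITY,
                        SegmentMetadataQuery.AnalysisType.SIZE,
                        SegmentMetadataQuery.AnalysisType.MINMAX)
                .build();
    }
}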

Example 17 with ColumnAnalysis

Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project hive by apache.

From class DruidSerDe, method initFromMetaDataQuery:

private void initFromMetaDataQuery(final Configuration configuration, final Properties properties) throws SerDeException {
    final List<String> columnNames = new ArrayList<>();
    final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();
    final List<ObjectInspector> inspectors = new ArrayList<>();
    String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
    if (dataSource == null) {
        throw new SerDeException("Druid data source not specified; use " + Constants.DRUID_DATA_SOURCE + " in table properties");
    }
    SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
    builder.dataSource(dataSource);
    builder.merge(true);
    builder.analysisTypes();
    SegmentMetadataQuery query = builder.build();
    // Execute query in Druid
    String address = HiveConf.getVar(configuration, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
    if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
        throw new SerDeException("Druid broker address not specified in configuration");
    }
    // Infer schema
    SegmentAnalysis schemaInfo;
    try {
        schemaInfo = submitMetadataRequest(address, query);
    } catch (IOException e) {
        throw new SerDeException(e);
    }
    for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
        if (columnInfo.getKey().equals(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)) {
            // Special handling for timestamp column
            // field name
            columnNames.add(columnInfo.getKey());
            // field type
            PrimitiveTypeInfo type = tsTZTypeInfo;
            columnTypes.add(type);
            inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
            continue;
        }
        // field name
        columnNames.add(columnInfo.getKey());
        // field type
        PrimitiveTypeInfo type = DruidSerDeUtils.convertDruidToHiveType(columnInfo.getValue().getType());
        columnTypes.add(type instanceof TimestampLocalTZTypeInfo ? tsTZTypeInfo : type);
        inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
    }
    columns = columnNames.toArray(new String[0]);
    types = columnTypes.toArray(new PrimitiveTypeInfo[0]);
    inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
}
Also used: BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector), ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector), StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector), TimestampLocalTZObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampLocalTZObjectInspector), HiveVarcharObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector), HiveCharObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector), IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector), LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector), ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector), DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector), TimestampObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector), ArrayList (java.util.ArrayList), TimestampLocalTZTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TimestampLocalTZTypeInfo), IOException (java.io.IOException), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), SegmentMetadataQuery (org.apache.druid.query.metadata.metadata.SegmentMetadataQuery), ColumnAnalysis (org.apache.druid.query.metadata.metadata.ColumnAnalysis), SegmentMetadataQueryBuilder (org.apache.druid.query.Druids.SegmentMetadataQueryBuilder), SegmentAnalysis (org.apache.druid.query.metadata.metadata.SegmentAnalysis), SerDeException (org.apache.hadoop.hive.serde2.SerDeException)
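
The schema inference above leans on DruidSerDeUtils.convertDruidToHiveType to map Druid column types onto Hive type infos. A rough, illustrative approximation of that kind of mapping (not the actual Hive implementation, which covers more types and edge cases):

import java.util.Locale;

import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public final class DruidTypeMappingSketch {
    private DruidTypeMappingSketch() {}

    // Map a Druid type name onto a Hive primitive type info.
    static PrimitiveTypeInfo toHiveType(String druidType) {
        switch (druidType.toUpperCase(Locale.ROOT)) {
            case "LONG":
                return TypeInfoFactory.longTypeInfo;
            case "FLOAT":
                return TypeInfoFactory.floatTypeInfo;
            case "DOUBLE":
                return TypeInfoFactory.doubleTypeInfo;
            case "STRING":
            default:
                // Unknown Druid types are commonly surfaced to Hive as strings.
                return TypeInfoFactory.stringTypeInfo;
        }
    }
}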

Example 18 with ColumnAnalysis

Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.

From class SegmentAnalyzer, method analyzeComplexColumn:

private ColumnAnalysis analyzeComplexColumn(@Nullable final ColumnCapabilities capabilities, @Nullable final ColumnHolder columnHolder) {
    final TypeSignature<ValueType> typeSignature = capabilities == null ? ColumnType.UNKNOWN_COMPLEX : capabilities;
    final String typeName = typeSignature.getComplexTypeName();
    try (final ComplexColumn complexColumn = columnHolder != null ? (ComplexColumn) columnHolder.getColumn() : null) {
        final boolean hasMultipleValues = capabilities != null && capabilities.hasMultipleValues().isTrue();
        final boolean hasNulls = capabilities != null && capabilities.hasNulls().isMaybeTrue();
        long size = 0;
        if (analyzingSize() && complexColumn != null) {
            final ComplexMetricSerde serde = typeName == null ? null : ComplexMetrics.getSerdeForType(typeName);
            if (serde == null) {
                return ColumnAnalysis.error(StringUtils.format("unknown_complex_%s", typeName));
            }
            final Function<Object, Long> inputSizeFn = serde.inputSizeFn();
            if (inputSizeFn == null) {
                return new ColumnAnalysis(ColumnTypeFactory.ofType(typeSignature), typeName, hasMultipleValues, hasNulls, 0, null, null, null, null);
            }
            final int length = complexColumn.getLength();
            for (int i = 0; i < length; ++i) {
                size += inputSizeFn.apply(complexColumn.getRowValue(i));
            }
        }
        return new ColumnAnalysis(ColumnTypeFactory.ofType(typeSignature), typeName, hasMultipleValues, hasNulls, size, null, null, null, null);
    }
}
Also used: ComplexMetricSerde (org.apache.druid.segment.serde.ComplexMetricSerde), ValueType (org.apache.druid.segment.column.ValueType), ColumnAnalysis (org.apache.druid.query.metadata.metadata.ColumnAnalysis), ComplexColumn (org.apache.druid.segment.column.ComplexColumn)
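
Note the try-with-resources over a possibly null complexColumn. Java explicitly allows a null resource: the body still runs and close() is only invoked on non-null resources, which is what lets this method handle the missing-column case without a separate branch. A minimal standalone demonstration:

public class NullResourceDemo {
    public static void main(String[] args) throws Exception {
        // close() is skipped for a null resource; the body executes normally.
        try (AutoCloseable resource = null) {
            System.out.println("body ran; nothing to close");
        }
    }
}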

Example 19 with ColumnAnalysis

Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.

From class SegmentAnalyzer, method analyze:

public Map<String, ColumnAnalysis> analyze(Segment segment) {
    Preconditions.checkNotNull(segment, "segment");
    // index is null for incremental-index-based segments, but storageAdapter is always available
    final QueryableIndex index = segment.asQueryableIndex();
    final StorageAdapter storageAdapter = segment.asStorageAdapter();
    // get length and column names from storageAdapter
    final int length = storageAdapter.getNumRows();
    Map<String, ColumnAnalysis> columns = new TreeMap<>();
    final RowSignature rowSignature = storageAdapter.getRowSignature();
    for (String columnName : rowSignature.getColumnNames()) {
        final ColumnCapabilities capabilities;
        if (storageAdapter instanceof IncrementalIndexStorageAdapter) {
            // See javadocs for getSnapshotColumnCapabilities for a discussion of why we need to do this.
            capabilities = ((IncrementalIndexStorageAdapter) storageAdapter).getSnapshotColumnCapabilities(columnName);
        } else {
            capabilities = storageAdapter.getColumnCapabilities(columnName);
        }
        final ColumnAnalysis analysis;
        switch (capabilities.getType()) {
            case LONG:
                final int bytesPerRow = ColumnHolder.TIME_COLUMN_NAME.equals(columnName) ? NUM_BYTES_IN_TIMESTAMP : Long.BYTES;
                analysis = analyzeNumericColumn(capabilities, length, bytesPerRow);
                break;
            case FLOAT:
                analysis = analyzeNumericColumn(capabilities, length, NUM_BYTES_IN_TEXT_FLOAT);
                break;
            case DOUBLE:
                analysis = analyzeNumericColumn(capabilities, length, Double.BYTES);
                break;
            case STRING:
                if (index != null) {
                    analysis = analyzeStringColumn(capabilities, index.getColumnHolder(columnName));
                } else {
                    analysis = analyzeStringColumn(capabilities, storageAdapter, columnName);
                }
                break;
            case COMPLEX:
                final ColumnHolder columnHolder = index != null ? index.getColumnHolder(columnName) : null;
                analysis = analyzeComplexColumn(capabilities, columnHolder);
                break;
            default:
                log.warn("Unknown column type[%s].", capabilities.asTypeString());
                analysis = ColumnAnalysis.error(StringUtils.format("unknown_type_%s", capabilities.asTypeString()));
        }
        columns.put(columnName, analysis);
    }
    return columns;
}
Also used: ColumnHolder (org.apache.druid.segment.column.ColumnHolder), QueryableIndex (org.apache.druid.segment.QueryableIndex), ColumnAnalysis (org.apache.druid.query.metadata.metadata.ColumnAnalysis), IncrementalIndexStorageAdapter (org.apache.druid.segment.incremental.IncrementalIndexStorageAdapter), StorageAdapter (org.apache.druid.segment.StorageAdapter), TreeMap (java.util.TreeMap), RowSignature (org.apache.druid.segment.column.RowSignature), ColumnCapabilities (org.apache.druid.segment.column.ColumnCapabilities)
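
The analyzer can also be driven directly, outside the query path. A sketch under the assumption that SegmentAnalyzer's constructor takes the EnumSet of requested analysis types (as in this Druid version) and that `segment` is an already-opened Segment:

import java.util.EnumSet;
import java.util.Map;

import org.apache.druid.query.metadata.SegmentAnalyzer;
import org.apache.druid.query.metadata.metadata.ColumnAnalysis;
import org.apache.druid.query.metadata.metadata.SegmentMetadataQuery;
import org.apache.druid.segment.Segment;

public class AnalyzeSketch {
    // Analyze size and cardinality for every column of an open segment.
    public static Map<String, ColumnAnalysis> sizeAndCardinality(Segment segment) {
        SegmentAnalyzer analyzer = new SegmentAnalyzer(
                EnumSet.of(
                        SegmentMetadataQuery.AnalysisType.SIZE,
                        SegmentMetadataQuery.AnalysisType.CARDINALITY));
        return analyzer.analyze(segment);
    }
}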

Example 20 with ColumnAnalysis

Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.

From class SegmentMetadataQueryQueryToolChest, method mergeAnalyses:

@VisibleForTesting
public static SegmentAnalysis mergeAnalyses(final SegmentAnalysis arg1, final SegmentAnalysis arg2, boolean lenientAggregatorMerge) {
    if (arg1 == null) {
        return arg2;
    }
    if (arg2 == null) {
        return arg1;
    }
    List<Interval> newIntervals = null;
    if (arg1.getIntervals() != null) {
        newIntervals = new ArrayList<>(arg1.getIntervals());
    }
    if (arg2.getIntervals() != null) {
        if (newIntervals == null) {
            newIntervals = new ArrayList<>();
        }
        newIntervals.addAll(arg2.getIntervals());
    }
    final Map<String, ColumnAnalysis> leftColumns = arg1.getColumns();
    final Map<String, ColumnAnalysis> rightColumns = arg2.getColumns();
    Map<String, ColumnAnalysis> columns = new TreeMap<>();
    Set<String> rightColumnNames = Sets.newHashSet(rightColumns.keySet());
    for (Map.Entry<String, ColumnAnalysis> entry : leftColumns.entrySet()) {
        final String columnName = entry.getKey();
        columns.put(columnName, entry.getValue().fold(rightColumns.get(columnName)));
        rightColumnNames.remove(columnName);
    }
    for (String columnName : rightColumnNames) {
        columns.put(columnName, rightColumns.get(columnName));
    }
    final Map<String, AggregatorFactory> aggregators = new HashMap<>();
    if (lenientAggregatorMerge) {
        // Merge each aggregator individually, ignoring nulls
        for (SegmentAnalysis analysis : ImmutableList.of(arg1, arg2)) {
            if (analysis.getAggregators() != null) {
                for (Map.Entry<String, AggregatorFactory> entry : analysis.getAggregators().entrySet()) {
                    final String aggregatorName = entry.getKey();
                    final AggregatorFactory aggregator = entry.getValue();
                    AggregatorFactory merged = aggregators.get(aggregatorName);
                    if (merged != null) {
                        try {
                            merged = merged.getMergingFactory(aggregator);
                        } catch (AggregatorFactoryNotMergeableException e) {
                            merged = null;
                        }
                    } else {
                        merged = aggregator;
                    }
                    aggregators.put(aggregatorName, merged);
                }
            }
        }
    } else {
        final AggregatorFactory[] aggs1 = arg1.getAggregators() != null ? arg1.getAggregators().values().toArray(new AggregatorFactory[0]) : null;
        final AggregatorFactory[] aggs2 = arg2.getAggregators() != null ? arg2.getAggregators().values().toArray(new AggregatorFactory[0]) : null;
        final AggregatorFactory[] merged = AggregatorFactory.mergeAggregators(Arrays.asList(aggs1, aggs2));
        if (merged != null) {
            for (AggregatorFactory aggregator : merged) {
                aggregators.put(aggregator.getName(), aggregator);
            }
        }
    }
    final TimestampSpec timestampSpec = TimestampSpec.mergeTimestampSpec(Lists.newArrayList(arg1.getTimestampSpec(), arg2.getTimestampSpec()));
    final Granularity queryGranularity = Granularity.mergeGranularities(Lists.newArrayList(arg1.getQueryGranularity(), arg2.getQueryGranularity()));
    final String mergedId;
    if (arg1.getId() != null && arg2.getId() != null && arg1.getId().equals(arg2.getId())) {
        mergedId = arg1.getId();
    } else {
        mergedId = "merged";
    }
    final Boolean rollup;
    if (arg1.isRollup() != null && arg2.isRollup() != null && arg1.isRollup().equals(arg2.isRollup())) {
        rollup = arg1.isRollup();
    } else {
        rollup = null;
    }
    return new SegmentAnalysis(
            mergedId,
            newIntervals,
            columns,
            arg1.getSize() + arg2.getSize(),
            arg1.getNumRows() + arg2.getNumRows(),
            aggregators.isEmpty() ? null : aggregators,
            timestampSpec,
            queryGranularity,
            rollup
    );
}
Also used: HashMap (java.util.HashMap), TreeMap (java.util.TreeMap), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), Granularity (org.apache.druid.java.util.common.granularity.Granularity), AggregatorFactoryNotMergeableException (org.apache.druid.query.aggregation.AggregatorFactoryNotMergeableException), ColumnAnalysis (org.apache.druid.query.metadata.metadata.ColumnAnalysis), TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec), SegmentAnalysis (org.apache.druid.query.metadata.metadata.SegmentAnalysis), Map (java.util.Map), Interval (org.joda.time.Interval), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
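
The column-merging loop in mergeAnalyses is a null-tolerant pairwise fold: keys present on both sides are folded together, keys unique to either side are carried over unchanged. The same pattern in miniature over plain maps (illustrative only, not Druid API):

import java.util.Map;
import java.util.TreeMap;
import java.util.function.BinaryOperator;

public final class MergeSketch {
    // Fold values present on both sides; copy one-sided values through.
    static <V> Map<String, V> merge(Map<String, V> left, Map<String, V> right, BinaryOperator<V> fold) {
        Map<String, V> merged = new TreeMap<>(left);
        // Map.merge applies `fold` only when the key already exists.
        right.forEach((key, value) -> merged.merge(key, value, fold));
        return merged;
    }
}

With ColumnAnalysis::fold as the operator this matches the loop's behavior for shared keys; left-only keys are copied through here, where the original loop achieves the same result by calling fold(null).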

Aggregations

ColumnAnalysis (org.apache.druid.query.metadata.metadata.ColumnAnalysis): 30 usages
SegmentAnalysis (org.apache.druid.query.metadata.metadata.SegmentAnalysis): 19 usages
Test (org.junit.Test): 18 usages
SegmentMetadataQuery (org.apache.druid.query.metadata.metadata.SegmentMetadataQuery): 12 usages
QueryRunner (org.apache.druid.query.QueryRunner): 9 usages
ListColumnIncluderator (org.apache.druid.query.metadata.metadata.ListColumnIncluderator): 9 usages
ExecutorService (java.util.concurrent.ExecutorService): 8 usages
FinalizeResultsQueryRunner (org.apache.druid.query.FinalizeResultsQueryRunner): 8 usages
QueryToolChest (org.apache.druid.query.QueryToolChest): 8 usages
Map (java.util.Map): 4 usages
TreeMap (java.util.TreeMap): 4 usages
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 4 usages
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 4 usages
RowSignature (org.apache.druid.segment.column.RowSignature): 4 usages
HashMap (java.util.HashMap): 3 usages
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 2 usages
IOException (java.io.IOException): 2 usages
List (java.util.List): 2 usages
DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema): 2 usages
Granularity (org.apache.druid.java.util.common.granularity.Granularity): 2 usages