Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.
The class SegmentAnalyzer, method analyzeStringColumn.
private ColumnAnalysis analyzeStringColumn(
    final ColumnCapabilities capabilities,
    final StorageAdapter storageAdapter,
    final String columnName
)
{
  int cardinality = 0;
  long size = 0;
  Comparable min = null;
  Comparable max = null;

  if (analyzingCardinality()) {
    cardinality = storageAdapter.getDimensionCardinality(columnName);
  }

  if (analyzingSize()) {
    final DateTime start = storageAdapter.getMinTime();
    final DateTime end = storageAdapter.getMaxTime();

    final Sequence<Cursor> cursors = storageAdapter.makeCursors(
        null,
        new Interval(start, end),
        VirtualColumns.EMPTY,
        Granularities.ALL,
        false,
        null
    );

    size = cursors.accumulate(
        0L,
        new Accumulator<Long, Cursor>()
        {
          @Override
          public Long accumulate(Long accumulated, Cursor cursor)
          {
            DimensionSelector selector = cursor.getColumnSelectorFactory()
                                               .makeDimensionSelector(new DefaultDimensionSpec(columnName, columnName));
            if (selector == null) {
              return accumulated;
            }
            long current = accumulated;
            while (!cursor.isDone()) {
              final IndexedInts row = selector.getRow();
              for (int i = 0, rowSize = row.size(); i < rowSize; ++i) {
                final String dimVal = selector.lookupName(row.get(i));
                if (dimVal != null && !dimVal.isEmpty()) {
                  current += StringUtils.estimatedBinaryLengthAsUTF8(dimVal);
                }
              }
              cursor.advance();
            }
            return current;
          }
        }
    );
  }

  if (analyzingMinMax()) {
    min = storageAdapter.getMinValue(columnName);
    max = storageAdapter.getMaxValue(columnName);
  }

  return new ColumnAnalysis(
      capabilities.toColumnType(),
      capabilities.getType().name(),
      capabilities.hasMultipleValues().isTrue(),
      // if we don't know for sure, then we should plan to check for nulls
      capabilities.hasNulls().isMaybeTrue(),
      size,
      cardinality,
      min,
      max,
      null
  );
}
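For reference, the nine-argument ColumnAnalysis constructor used in the return statement above can be filled in directly; a minimal sketch with made-up values (the literals below are illustrative only, not taken from any real segment):

// Hypothetical string column: three distinct values, ~120 bytes of UTF-8
// dimension data, known min/max, and no error. The argument order matches
// the constructor call in analyzeStringColumn above.
ColumnAnalysis stringAnalysis = new ColumnAnalysis(
    ColumnType.STRING,  // logical column type
    "STRING",           // type name
    false,              // hasMultipleValues
    true,               // hasNulls (isMaybeTrue() reports true when unsure)
    120L,               // estimated size in bytes
    3,                  // cardinality
    "apple",            // min value
    "cherry",           // max value
    null                // error message
);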
Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project hive by apache.
The class DruidSerDe, method initFromMetaDataQuery.
private void initFromMetaDataQuery(final Configuration configuration, final Properties properties)
    throws SerDeException
{
  final List<String> columnNames = new ArrayList<>();
  final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();
  final List<ObjectInspector> inspectors = new ArrayList<>();

  String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
  if (dataSource == null) {
    throw new SerDeException("Druid data source not specified; use " + Constants.DRUID_DATA_SOURCE + " in table properties");
  }
  SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
  builder.dataSource(dataSource);
  builder.merge(true);
  builder.analysisTypes();
  SegmentMetadataQuery query = builder.build();

  // Execute query in Druid
  String address = HiveConf.getVar(configuration, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
  if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
    throw new SerDeException("Druid broker address not specified in configuration");
  }

  // Infer schema
  SegmentAnalysis schemaInfo;
  try {
    schemaInfo = submitMetadataRequest(address, query);
  } catch (IOException e) {
    throw new SerDeException(e);
  }

  for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
    if (columnInfo.getKey().equals(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)) {
      // Special handling for timestamp column
      // field name
      columnNames.add(columnInfo.getKey());
      // field type
      PrimitiveTypeInfo type = tsTZTypeInfo;
      columnTypes.add(type);
      inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
      continue;
    }
    // field name
    columnNames.add(columnInfo.getKey());
    // field type
    PrimitiveTypeInfo type = DruidSerDeUtils.convertDruidToHiveType(columnInfo.getValue().getType());
    columnTypes.add(type instanceof TimestampLocalTZTypeInfo ? tsTZTypeInfo : type);
    inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
  }

  columns = columnNames.toArray(new String[0]);
  types = columnTypes.toArray(new PrimitiveTypeInfo[0]);
  inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
}
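The builder calls above can also be chained; a minimal sketch of the metadata query this SerDe submits, assuming the builder's fluent style and a placeholder datasource name "wikipedia":

// "wikipedia" is a placeholder datasource. analysisTypes() with no
// arguments requests a schema-only analysis, and merge(true) collapses
// all segments into a single SegmentAnalysis.
SegmentMetadataQuery query = new Druids.SegmentMetadataQueryBuilder()
    .dataSource("wikipedia")
    .merge(true)
    .analysisTypes()
    .build();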
Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.
The class SegmentAnalyzer, method analyzeComplexColumn.
private ColumnAnalysis analyzeComplexColumn(
    @Nullable final ColumnCapabilities capabilities,
    @Nullable final ColumnHolder columnHolder
)
{
  final TypeSignature<ValueType> typeSignature = capabilities == null ? ColumnType.UNKNOWN_COMPLEX : capabilities;
  final String typeName = typeSignature.getComplexTypeName();

  try (final ComplexColumn complexColumn = columnHolder != null ? (ComplexColumn) columnHolder.getColumn() : null) {
    final boolean hasMultipleValues = capabilities != null && capabilities.hasMultipleValues().isTrue();
    final boolean hasNulls = capabilities != null && capabilities.hasNulls().isMaybeTrue();
    long size = 0;

    if (analyzingSize() && complexColumn != null) {
      final ComplexMetricSerde serde = typeName == null ? null : ComplexMetrics.getSerdeForType(typeName);
      if (serde == null) {
        return ColumnAnalysis.error(StringUtils.format("unknown_complex_%s", typeName));
      }

      final Function<Object, Long> inputSizeFn = serde.inputSizeFn();
      if (inputSizeFn == null) {
        return new ColumnAnalysis(ColumnTypeFactory.ofType(typeSignature), typeName, hasMultipleValues, hasNulls, 0, null, null, null, null);
      }

      final int length = complexColumn.getLength();
      for (int i = 0; i < length; ++i) {
        size += inputSizeFn.apply(complexColumn.getRowValue(i));
      }
    }

    return new ColumnAnalysis(ColumnTypeFactory.ofType(typeSignature), typeName, hasMultipleValues, hasNulls, size, null, null, null, null);
  }
}
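Note the two early returns above: an unregistered serde produces an error analysis, while a serde without an inputSizeFn produces a normal zero-size analysis. A sketch of how a caller might tell the error case apart, assuming ColumnAnalysis exposes isError() and getErrorMessage() accessors to go with its error(...) factory:

// ColumnAnalysis.error(...) carries only an error string; isError() and
// getErrorMessage() are assumed accessors for inspecting it.
ColumnAnalysis complexAnalysis = ColumnAnalysis.error("unknown_complex_hyperUnique");
if (complexAnalysis.isError()) {
  System.out.println("column not analyzable: " + complexAnalysis.getErrorMessage());
}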
Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.
The class SegmentAnalyzer, method analyze.
public Map<String, ColumnAnalysis> analyze(Segment segment)
{
  Preconditions.checkNotNull(segment, "segment");

  // index is null for incremental-index-based segments, but storageAdapter is always available
  final QueryableIndex index = segment.asQueryableIndex();
  final StorageAdapter storageAdapter = segment.asStorageAdapter();

  // get length and column names from storageAdapter
  final int length = storageAdapter.getNumRows();

  Map<String, ColumnAnalysis> columns = new TreeMap<>();

  final RowSignature rowSignature = storageAdapter.getRowSignature();
  for (String columnName : rowSignature.getColumnNames()) {
    final ColumnCapabilities capabilities;

    if (storageAdapter instanceof IncrementalIndexStorageAdapter) {
      // See javadocs for getSnapshotColumnCapabilities for a discussion of why we need to do this.
      capabilities = ((IncrementalIndexStorageAdapter) storageAdapter).getSnapshotColumnCapabilities(columnName);
    } else {
      capabilities = storageAdapter.getColumnCapabilities(columnName);
    }

    final ColumnAnalysis analysis;

    switch (capabilities.getType()) {
      case LONG:
        final int bytesPerRow =
            ColumnHolder.TIME_COLUMN_NAME.equals(columnName) ? NUM_BYTES_IN_TIMESTAMP : Long.BYTES;
        analysis = analyzeNumericColumn(capabilities, length, bytesPerRow);
        break;
      case FLOAT:
        analysis = analyzeNumericColumn(capabilities, length, NUM_BYTES_IN_TEXT_FLOAT);
        break;
      case DOUBLE:
        analysis = analyzeNumericColumn(capabilities, length, Double.BYTES);
        break;
      case STRING:
        if (index != null) {
          analysis = analyzeStringColumn(capabilities, index.getColumnHolder(columnName));
        } else {
          analysis = analyzeStringColumn(capabilities, storageAdapter, columnName);
        }
        break;
      case COMPLEX:
        final ColumnHolder columnHolder = index != null ? index.getColumnHolder(columnName) : null;
        analysis = analyzeComplexColumn(capabilities, columnHolder);
        break;
      default:
        log.warn("Unknown column type[%s].", capabilities.asTypeString());
        analysis = ColumnAnalysis.error(StringUtils.format("unknown_type_%s", capabilities.asTypeString()));
    }

    columns.put(columnName, analysis);
  }

  return columns;
}
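A caller constructs the analyzer with the analysis types it wants and feeds it a segment; a minimal sketch, assuming a Segment instance named segment is already at hand and that SegmentAnalyzer takes an EnumSet of SegmentMetadataQuery.AnalysisType in this version:

// segment is assumed to be an already-acquired org.apache.druid.segment.Segment.
SegmentAnalyzer analyzer = new SegmentAnalyzer(
    EnumSet.of(SegmentMetadataQuery.AnalysisType.CARDINALITY, SegmentMetadataQuery.AnalysisType.SIZE)
);
Map<String, ColumnAnalysis> byColumn = analyzer.analyze(segment);
// One entry per column in the storage adapter's row signature.
byColumn.forEach((name, analysis) -> System.out.println(name + " -> " + analysis.getType()));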
Use of org.apache.druid.query.metadata.metadata.ColumnAnalysis in project druid by druid-io.
The class SegmentMetadataQueryQueryToolChest, method mergeAnalyses.
@VisibleForTesting
public static SegmentAnalysis mergeAnalyses(
    final SegmentAnalysis arg1,
    final SegmentAnalysis arg2,
    boolean lenientAggregatorMerge
)
{
  if (arg1 == null) {
    return arg2;
  }
  if (arg2 == null) {
    return arg1;
  }

  List<Interval> newIntervals = null;
  if (arg1.getIntervals() != null) {
    newIntervals = new ArrayList<>(arg1.getIntervals());
  }
  if (arg2.getIntervals() != null) {
    if (newIntervals == null) {
      newIntervals = new ArrayList<>();
    }
    newIntervals.addAll(arg2.getIntervals());
  }

  final Map<String, ColumnAnalysis> leftColumns = arg1.getColumns();
  final Map<String, ColumnAnalysis> rightColumns = arg2.getColumns();
  Map<String, ColumnAnalysis> columns = new TreeMap<>();

  Set<String> rightColumnNames = Sets.newHashSet(rightColumns.keySet());
  for (Map.Entry<String, ColumnAnalysis> entry : leftColumns.entrySet()) {
    final String columnName = entry.getKey();
    columns.put(columnName, entry.getValue().fold(rightColumns.get(columnName)));
    rightColumnNames.remove(columnName);
  }

  for (String columnName : rightColumnNames) {
    columns.put(columnName, rightColumns.get(columnName));
  }

  final Map<String, AggregatorFactory> aggregators = new HashMap<>();

  if (lenientAggregatorMerge) {
    // Merge each aggregator individually, ignoring nulls
    for (SegmentAnalysis analysis : ImmutableList.of(arg1, arg2)) {
      if (analysis.getAggregators() != null) {
        for (Map.Entry<String, AggregatorFactory> entry : analysis.getAggregators().entrySet()) {
          final String aggregatorName = entry.getKey();
          final AggregatorFactory aggregator = entry.getValue();
          AggregatorFactory merged = aggregators.get(aggregatorName);
          if (merged != null) {
            try {
              merged = merged.getMergingFactory(aggregator);
            } catch (AggregatorFactoryNotMergeableException e) {
              merged = null;
            }
          } else {
            merged = aggregator;
          }
          aggregators.put(aggregatorName, merged);
        }
      }
    }
  } else {
    final AggregatorFactory[] aggs1 = arg1.getAggregators() != null
        ? arg1.getAggregators().values().toArray(new AggregatorFactory[0])
        : null;
    final AggregatorFactory[] aggs2 = arg2.getAggregators() != null
        ? arg2.getAggregators().values().toArray(new AggregatorFactory[0])
        : null;
    final AggregatorFactory[] merged = AggregatorFactory.mergeAggregators(Arrays.asList(aggs1, aggs2));
    if (merged != null) {
      for (AggregatorFactory aggregator : merged) {
        aggregators.put(aggregator.getName(), aggregator);
      }
    }
  }

  final TimestampSpec timestampSpec = TimestampSpec.mergeTimestampSpec(
      Lists.newArrayList(arg1.getTimestampSpec(), arg2.getTimestampSpec())
  );
  final Granularity queryGranularity = Granularity.mergeGranularities(
      Lists.newArrayList(arg1.getQueryGranularity(), arg2.getQueryGranularity())
  );

  final String mergedId;
  if (arg1.getId() != null && arg2.getId() != null && arg1.getId().equals(arg2.getId())) {
    mergedId = arg1.getId();
  } else {
    mergedId = "merged";
  }

  final Boolean rollup;
  if (arg1.isRollup() != null && arg2.isRollup() != null && arg1.isRollup().equals(arg2.isRollup())) {
    rollup = arg1.isRollup();
  } else {
    rollup = null;
  }

  return new SegmentAnalysis(
      mergedId,
      newIntervals,
      columns,
      arg1.getSize() + arg2.getSize(),
      arg1.getNumRows() + arg2.getNumRows(),
      aggregators.isEmpty() ? null : aggregators,
      timestampSpec,
      queryGranularity,
      rollup
  );
}
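Since the method is @VisibleForTesting, it can be exercised directly; a sketch assuming two per-segment results named left and right for the same datasource:

// left and right are assumed per-segment SegmentAnalysis results.
// With lenientAggregatorMerge = true, an aggregator seen on only one side
// is kept and an unmergeable pair is recorded as null; with false, a single
// unmergeable pair discards the merged aggregator map entirely.
SegmentAnalysis merged = SegmentMetadataQueryQueryToolChest.mergeAnalyses(left, right, true);
// Columns present on both sides were folded; sizes and row counts were summed.
ColumnAnalysis pageColumn = merged.getColumns().get("page");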