Search in sources :

Example 26 with TupleReader

use of org.apache.drill.exec.vector.accessor.TupleReader in project drill by apache.

the class MetadataControllerBatch method getColumnStatistics.

private Map<SchemaPath, ColumnStatistics<?>> getColumnStatistics(TupleReader reader, TupleMetadata columnMetadata, Long rowCount) {
    Multimap<String, StatisticsHolder<?>> columnStatistics = ArrayListMultimap.create();
    Map<String, TypeProtos.MinorType> columnTypes = new HashMap<>();
    for (ColumnMetadata column : columnMetadata) {
        if (AnalyzeColumnUtils.isColumnStatisticsField(column.name())) {
            String fieldName = AnalyzeColumnUtils.getColumnName(column.name());
            StatisticsKind<?> statisticsKind = AnalyzeColumnUtils.getStatisticsKind(column.name());
            columnStatistics.put(fieldName, new StatisticsHolder<>(getConvertedColumnValue(reader.column(column.name())), statisticsKind));
            if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.MIN_VALUE.getName()) || statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.MAX_VALUE.getName())) {
                columnTypes.putIfAbsent(fieldName, column.type());
            }
        }
    }
    // adds NON_NULL_COUNT to use it during filter pushdown
    if (rowCount != null) {
        Map<String, StatisticsHolder<?>> nullsCountColumnStatistics = new HashMap<>();
        columnStatistics.asMap().forEach((key, value) -> value.stream().filter(statisticsHolder -> statisticsHolder.getStatisticsKind() == ColumnStatisticsKind.NON_NULL_VALUES_COUNT).findAny().map(statisticsHolder -> (Long) statisticsHolder.getStatisticsValue()).ifPresent(nonNullCount -> nullsCountColumnStatistics.put(key, new StatisticsHolder<>(rowCount - nonNullCount, ColumnStatisticsKind.NULLS_COUNT))));
        nullsCountColumnStatistics.forEach(columnStatistics::put);
    }
    Map<SchemaPath, ColumnStatistics<?>> resultingStats = new HashMap<>();
    columnStatistics.asMap().forEach((fieldName, statisticsHolders) -> resultingStats.put(SchemaPath.parseFromString(fieldName), new ColumnStatistics<>(statisticsHolders, columnTypes.get(fieldName))));
    return resultingStats;
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) BaseStatisticsKind(org.apache.drill.metastore.statistics.BaseStatisticsKind) MetastoreColumn(org.apache.drill.metastore.MetastoreColumn) UserException(org.apache.drill.common.exceptions.UserException) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) RowSetReader(org.apache.drill.exec.physical.rowSet.RowSetReader) VectorContainer(org.apache.drill.exec.record.VectorContainer) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) StringUtils(org.apache.commons.lang3.StringUtils) ArrayReader(org.apache.drill.exec.vector.accessor.ArrayReader) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) StatisticsRecordWriterImpl(org.apache.drill.exec.store.StatisticsRecordWriterImpl) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) Map(java.util.Map) FieldConverter(org.apache.drill.exec.store.EventBasedRecordWriter.FieldConverter) Path(org.apache.hadoop.fs.Path) BatchSchema(org.apache.drill.exec.record.BatchSchema) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Delete(org.apache.drill.metastore.operate.Delete) TableMetadataUnit(org.apache.drill.metastore.components.tables.TableMetadataUnit) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordBatch(org.apache.drill.exec.record.RecordBatch) Set(java.util.Set) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) FieldReader(org.apache.drill.exec.vector.complex.reader.FieldReader) TypeProtos(org.apache.drill.common.types.TypeProtos) List(java.util.List) AbstractBinaryRecordBatch(org.apache.drill.exec.record.AbstractBinaryRecordBatch) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) ObjectReader(org.apache.drill.exec.vector.accessor.ObjectReader) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetadataIdentifierUtils(org.apache.drill.exec.metastore.analyze.MetadataIdentifierUtils) TupleReader(org.apache.drill.exec.vector.accessor.TupleReader) Modify(org.apache.drill.metastore.operate.Modify) MetadataControllerContext(org.apache.drill.exec.metastore.analyze.MetadataControllerContext) HashMap(java.util.HashMap) BitVector(org.apache.drill.exec.vector.BitVector) Function(java.util.function.Function) VectorWrapper(org.apache.drill.exec.record.VectorWrapper) ArrayList(java.util.ArrayList) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) HashSet(java.util.HashSet) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) DirectRowSet(org.apache.drill.exec.physical.rowSet.DirectRowSet) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) WriterPrel(org.apache.drill.exec.planner.physical.WriterPrel) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FragmentContext(org.apache.drill.exec.ops.FragmentContext) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) StatisticsRecordCollector(org.apache.drill.exec.store.StatisticsRecordCollector) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) IOException(java.io.IOException) FilterExpression(org.apache.drill.metastore.expressions.FilterExpression) StatisticsCollectorImpl(org.apache.drill.exec.store.easy.json.StatisticsCollectorImpl) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) ParquetTableMetadataUtils(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils) VarCharVector(org.apache.drill.exec.vector.VarCharVector) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataControllerPOP(org.apache.drill.exec.physical.config.MetadataControllerPOP) Tables(org.apache.drill.metastore.components.tables.Tables) Collections(java.util.Collections) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) ObjectType(org.apache.drill.exec.vector.accessor.ObjectType) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) HashMap(java.util.HashMap) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath)

Aggregations

TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)26 TupleReader (org.apache.drill.exec.vector.accessor.TupleReader)26 SubOperatorTest (org.apache.drill.test.SubOperatorTest)21 Test (org.junit.Test)21 TupleWriter (org.apache.drill.exec.vector.accessor.TupleWriter)20 ArrayReader (org.apache.drill.exec.vector.accessor.ArrayReader)17 ScalarWriter (org.apache.drill.exec.vector.accessor.ScalarWriter)17 SingleRowSet (org.apache.drill.exec.physical.rowSet.RowSet.SingleRowSet)14 SchemaBuilder (org.apache.drill.exec.record.metadata.SchemaBuilder)14 ArrayWriter (org.apache.drill.exec.vector.accessor.ArrayWriter)11 RowSetReader (org.apache.drill.exec.physical.rowSet.RowSetReader)10 BatchSchema (org.apache.drill.exec.record.BatchSchema)7 ObjectReader (org.apache.drill.exec.vector.accessor.ObjectReader)7 ScalarReader (org.apache.drill.exec.vector.accessor.ScalarReader)6 IOException (java.io.IOException)5 ArrayList (java.util.ArrayList)5 Collections (java.util.Collections)5 HashMap (java.util.HashMap)5 HashSet (java.util.HashSet)5 List (java.util.List)5