Search in sources :

Example 26 with StatisticsHolder

use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.

the class MetadataControllerBatch method getMetadataStatistics.

/**
 * Gathers metadata-level statistics from the current row of the specified reader.
 * <p>
 * Every column recognized by {@link AnalyzeColumnUtils#isMetadataStatisticsField} is added
 * with the kind resolved from its name. Other columns are considered only when their value
 * is not null: the row group start and row group length columns are parsed as longs and
 * added as exact {@code START} / {@code LENGTH} statistics; anything else is ignored.
 *
 * @param reader         reader positioned on the row to extract values from
 * @param columnMetadata schema whose columns are examined
 * @return list of collected statistics holders, in schema iteration order
 */
private List<StatisticsHolder<?>> getMetadataStatistics(TupleReader reader, TupleMetadata columnMetadata) {
    List<StatisticsHolder<?>> collected = new ArrayList<>();
    String rowGroupStartColumn = columnNamesOptions.rowGroupStart();
    String rowGroupLengthColumn = columnNamesOptions.rowGroupLength();
    for (ColumnMetadata field : columnMetadata) {
        String name = field.name();
        ObjectReader valueReader = reader.column(name);

        if (AnalyzeColumnUtils.isMetadataStatisticsField(name)) {
            Object value = valueReader.getObject();
            StatisticsKind<?> kind = AnalyzeColumnUtils.getStatisticsKind(name);
            collected.add(new StatisticsHolder<>(value, kind));
            continue;
        }

        // Remaining columns contribute only when they actually hold a value.
        if (valueReader.isNull()) {
            continue;
        }

        if (name.equals(rowGroupStartColumn)) {
            long start = Long.parseLong(valueReader.scalar().getString());
            collected.add(new StatisticsHolder<>(start, new BaseStatisticsKind<>(ExactStatisticsConstants.START, true)));
        } else if (name.equals(rowGroupLengthColumn)) {
            long length = Long.parseLong(valueReader.scalar().getString());
            collected.add(new StatisticsHolder<>(length, new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)));
        }
    }
    return collected;
}
Also used : StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) ArrayList(java.util.ArrayList) ObjectReader(org.apache.drill.exec.vector.accessor.ObjectReader) BaseStatisticsKind(org.apache.drill.metastore.statistics.BaseStatisticsKind)

Example 27 with StatisticsHolder

use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.

the class MetadataControllerBatch method getTableMetadata.

/**
 * Assembles table-level metadata from the current row of the specified reader.
 * <p>
 * The supplied metadata statistics are copied and extended with the analyze metadata level
 * taken from the operator config. Last modified time and schema are read from the row as
 * strings and parsed. When the {@code PlannerSettings.STATISTICS_USE} option is enabled,
 * the result is a clone of the built metadata created with column statistics and estimated
 * table statistics derived from the statistics collector.
 *
 * @param reader             reader positioned on the table-level row
 * @param metadataStatistics metadata statistics collected for the row; not modified
 * @param columnStatistics   column statistics collected for the row
 * @return table metadata instance
 */
private BaseTableMetadata getTableMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics) {
    // Work on a copy so the list passed by the caller stays untouched.
    List<StatisticsHolder<?>> tableLevelStatistics = new ArrayList<>(metadataStatistics);
    tableLevelStatistics.add(
        new StatisticsHolder<>(popConfig.getContext().analyzeMetadataLevel(), TableStatisticsKind.ANALYZE_METADATA_LEVEL));

    MetadataInfo generalInfo = MetadataInfo.builder()
        .type(MetadataType.TABLE)
        .key(MetadataInfo.GENERAL_INFO_KEY)
        .build();

    BaseTableMetadata collectedMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(generalInfo)
        .columnsStatistics(columnStatistics)
        .metadataStatistics(tableLevelStatistics)
        .partitionKeys(Collections.emptyMap())
        .interestingColumns(popConfig.getContext().interestingColumns())
        .location(popConfig.getContext().location())
        .lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString()))
        .schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString()))
        .build();

    if (!context.getOptions().getOption(PlannerSettings.STATISTICS_USE)) {
        return collectedMetadata;
    }

    // Statistics usage is enabled: derive column and table statistics from the collector output.
    DrillStatsTable statsTable = new DrillStatsTable(statisticsCollector.getStatistics());
    Map<SchemaPath, ColumnStatistics<?>> statsTableColumnStatistics =
        ParquetTableMetadataUtils.getColumnStatistics(collectedMetadata.getSchema(), statsTable);
    return collectedMetadata.cloneWithStats(statsTableColumnStatistics, DrillStatsTable.getEstimatedTableStats(statsTable));
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) ArrayList(java.util.ArrayList)

Example 28 with StatisticsHolder

use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.

the class MetadataControllerBatch method getMetadataUnits.

/**
 * Converts the current row of the specified reader, together with any nested metadata rows
 * it carries, into metastore metadata units.
 * <p>
 * Rows nested in the collected-map column are processed first (recursively, with the nesting
 * level increased by one), so their units precede the unit produced for the current row.
 *
 * @param reader       reader positioned on the row to convert
 * @param nestingLevel nesting level of the current row; passed to the segment, partition,
 *                     file and row group converters
 * @return metadata units for nested rows followed by the unit for the current row
 * @throws IllegalStateException         if the collected-map column is present but is not an array
 * @throws UnsupportedOperationException if the metadata type of the row is not supported
 */
private List<TableMetadataUnit> getMetadataUnits(TupleReader reader, int nestingLevel) {
    List<TableMetadataUnit> units = new ArrayList<>();
    TupleMetadata rowSchema = reader.tupleSchema();

    ObjectReader typeReader = reader.column(MetastoreAnalyzeConstants.METADATA_TYPE);
    Preconditions.checkNotNull(typeReader, "metadataType column wasn't found");

    ObjectReader nestedReader = reader.column(MetastoreAnalyzeConstants.COLLECTED_MAP_FIELD);
    if (nestedReader != null) {
        if (!nestedReader.schema().isArray()) {
            throw new IllegalStateException("Incoming vector with name `collected_map` should be repeated map");
        }
        // The row carries metadata of the underlying level: convert every nested entry first.
        ArrayReader nestedRows = nestedReader.array();
        while (nestedRows.next()) {
            units.addAll(getMetadataUnits(nestedRows.tuple(), nestingLevel + 1));
        }
    }

    List<StatisticsHolder<?>> metadataStatistics = getMetadataStatistics(reader, rowSchema);

    // Row count (if it was collected) is needed to derive per-column statistics.
    Long rowCount = metadataStatistics.stream()
        .filter(holder -> holder.getStatisticsKind() == TableStatisticsKind.ROW_COUNT)
        .findAny()
        .map(holder -> (Long) holder.getStatisticsValue())
        .orElse(null);

    Map<SchemaPath, ColumnStatistics<?>> columnStatistics = getColumnStatistics(reader, rowSchema, rowCount);
    MetadataType metadataType = MetadataType.valueOf(typeReader.scalar().getString());

    BaseMetadata converted;
    switch (metadataType) {
        case TABLE:
            converted = getTableMetadata(reader, metadataStatistics, columnStatistics);
            break;
        case SEGMENT:
            converted = getSegmentMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
            break;
        case PARTITION:
            converted = getPartitionMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
            break;
        case FILE:
            converted = getFileMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
            break;
        case ROW_GROUP:
            converted = getRowGroupMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
            break;
        default:
            throw new UnsupportedOperationException("Unsupported metadata type: " + metadataType);
    }
    units.add(converted.toMetadataUnit());
    return units;
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) TableMetadataUnit(org.apache.drill.metastore.components.tables.TableMetadataUnit) ArrayList(java.util.ArrayList) MetadataType(org.apache.drill.metastore.metadata.MetadataType) ArrayReader(org.apache.drill.exec.vector.accessor.ArrayReader) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) ObjectReader(org.apache.drill.exec.vector.accessor.ObjectReader)

Example 29 with StatisticsHolder

use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.

the class MetadataControllerBatch method getColumnStatistics.

/**
 * Collects per-column statistics from the current row of the specified reader.
 * <p>
 * Columns recognized by {@link AnalyzeColumnUtils#isColumnStatisticsField} are grouped by the
 * column name they refer to. The type of a column is taken from the first min-value or
 * max-value statistics field seen for it. When {@code rowCount} is known, a
 * {@code NULLS_COUNT} statistic is derived for every column that has a non-null
 * {@code NON_NULL_VALUES_COUNT} value.
 *
 * @param reader         reader positioned on the row to extract values from
 * @param columnMetadata schema whose columns are examined
 * @param rowCount       row count for the current row, or {@code null} if it is unknown
 * @return column statistics keyed by column path
 */
private Map<SchemaPath, ColumnStatistics<?>> getColumnStatistics(TupleReader reader, TupleMetadata columnMetadata, Long rowCount) {
    Multimap<String, StatisticsHolder<?>> statsByField = ArrayListMultimap.create();
    Map<String, TypeProtos.MinorType> typesByField = new HashMap<>();

    for (ColumnMetadata column : columnMetadata) {
        String rawName = column.name();
        if (!AnalyzeColumnUtils.isColumnStatisticsField(rawName)) {
            continue;
        }
        String field = AnalyzeColumnUtils.getColumnName(rawName);
        StatisticsKind<?> kind = AnalyzeColumnUtils.getStatisticsKind(rawName);
        statsByField.put(field, new StatisticsHolder<>(getConvertedColumnValue(reader.column(rawName)), kind));

        String kindName = kind.getName();
        boolean minOrMax = kindName.equalsIgnoreCase(ColumnStatisticsKind.MIN_VALUE.getName())
            || kindName.equalsIgnoreCase(ColumnStatisticsKind.MAX_VALUE.getName());
        if (minOrMax) {
            typesByField.putIfAbsent(field, column.type());
        }
    }

    // Derives NULLS_COUNT (row count minus non-null values count) to use it during filter pushdown.
    if (rowCount != null) {
        // Derived holders are staged separately so the multimap is not modified while it is iterated.
        Map<String, StatisticsHolder<?>> derivedNullsCounts = new HashMap<>();
        for (String field : statsByField.keySet()) {
            for (StatisticsHolder<?> holder : statsByField.get(field)) {
                if (holder.getStatisticsKind() != ColumnStatisticsKind.NON_NULL_VALUES_COUNT) {
                    continue;
                }
                Long nonNullCount = (Long) holder.getStatisticsValue();
                if (nonNullCount != null) {
                    derivedNullsCounts.put(field, new StatisticsHolder<>(rowCount - nonNullCount, ColumnStatisticsKind.NULLS_COUNT));
                }
                // Only the first matching holder is considered, whether or not it has a value.
                break;
            }
        }
        for (Map.Entry<String, StatisticsHolder<?>> derived : derivedNullsCounts.entrySet()) {
            statsByField.put(derived.getKey(), derived.getValue());
        }
    }

    Map<SchemaPath, ColumnStatistics<?>> result = new HashMap<>();
    for (String field : statsByField.keySet()) {
        result.put(SchemaPath.parseFromString(field), new ColumnStatistics<>(statsByField.get(field), typesByField.get(field)));
    }
    return result;
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) BaseStatisticsKind(org.apache.drill.metastore.statistics.BaseStatisticsKind) MetastoreColumn(org.apache.drill.metastore.MetastoreColumn) UserException(org.apache.drill.common.exceptions.UserException) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) RowSetReader(org.apache.drill.exec.physical.rowSet.RowSetReader) VectorContainer(org.apache.drill.exec.record.VectorContainer) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) StringUtils(org.apache.commons.lang3.StringUtils) ArrayReader(org.apache.drill.exec.vector.accessor.ArrayReader) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) StatisticsRecordWriterImpl(org.apache.drill.exec.store.StatisticsRecordWriterImpl) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) Map(java.util.Map) FieldConverter(org.apache.drill.exec.store.EventBasedRecordWriter.FieldConverter) Path(org.apache.hadoop.fs.Path) BatchSchema(org.apache.drill.exec.record.BatchSchema) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Delete(org.apache.drill.metastore.operate.Delete) TableMetadataUnit(org.apache.drill.metastore.components.tables.TableMetadataUnit) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordBatch(org.apache.drill.exec.record.RecordBatch) Set(java.util.Set) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) 
FieldReader(org.apache.drill.exec.vector.complex.reader.FieldReader) TypeProtos(org.apache.drill.common.types.TypeProtos) List(java.util.List) AbstractBinaryRecordBatch(org.apache.drill.exec.record.AbstractBinaryRecordBatch) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) ObjectReader(org.apache.drill.exec.vector.accessor.ObjectReader) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetadataIdentifierUtils(org.apache.drill.exec.metastore.analyze.MetadataIdentifierUtils) TupleReader(org.apache.drill.exec.vector.accessor.TupleReader) Modify(org.apache.drill.metastore.operate.Modify) MetadataControllerContext(org.apache.drill.exec.metastore.analyze.MetadataControllerContext) HashMap(java.util.HashMap) BitVector(org.apache.drill.exec.vector.BitVector) Function(java.util.function.Function) VectorWrapper(org.apache.drill.exec.record.VectorWrapper) ArrayList(java.util.ArrayList) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) HashSet(java.util.HashSet) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) DirectRowSet(org.apache.drill.exec.physical.rowSet.DirectRowSet) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) WriterPrel(org.apache.drill.exec.planner.physical.WriterPrel) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FragmentContext(org.apache.drill.exec.ops.FragmentContext) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) StatisticsRecordCollector(org.apache.drill.exec.store.StatisticsRecordCollector) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) IOException(java.io.IOException) 
FilterExpression(org.apache.drill.metastore.expressions.FilterExpression) StatisticsCollectorImpl(org.apache.drill.exec.store.easy.json.StatisticsCollectorImpl) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) ParquetTableMetadataUtils(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils) VarCharVector(org.apache.drill.exec.vector.VarCharVector) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataControllerPOP(org.apache.drill.exec.physical.config.MetadataControllerPOP) Tables(org.apache.drill.metastore.components.tables.Tables) Collections(java.util.Collections) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) ObjectType(org.apache.drill.exec.vector.accessor.ObjectType) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) HashMap(java.util.HashMap) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath)

Example 30 with StatisticsHolder

use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.

the class ParquetTableMetadataUtils method getRowGroupColumnStatistics.

/**
 * Builds {@link ColumnStatistics} for every column of the specified
 * {@link MetadataBase.RowGroupMetadata}, keyed by the column's compound {@link SchemaPath}.
 * <p>
 * Each entry holds min value, max value and nulls count statistics. For columns reported by
 * {@code hasInvalidStatistics}, the nulls count is replaced with
 * {@code Statistic.NO_COLUMN_STATS}.
 *
 * @param tableMetadata    the source of column types
 * @param rowGroupMetadata metadata to convert
 * @return map with converted row group metadata
 */
public static Map<SchemaPath, ColumnStatistics<?>> getRowGroupColumnStatistics(MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata) {
    Map<SchemaPath, ColumnStatistics<?>> statsByPath = new HashMap<>();
    for (MetadataBase.ColumnMetadata columnMeta : rowGroupMetadata.getColumns()) {
        SchemaPath path = SchemaPath.getCompoundPath(columnMeta.getName());

        // Kept as an if-assignment: a long/Long conditional expression would unbox the nulls count.
        Long nullsCount = columnMeta.getNulls();
        if (hasInvalidStatistics(columnMeta, tableMetadata)) {
            nullsCount = Statistic.NO_COLUMN_STATS;
        }

        PrimitiveType.PrimitiveTypeName primitive = getPrimitiveTypeName(tableMetadata, columnMeta);
        OriginalType original = getOriginalType(tableMetadata, columnMeta);
        TypeProtos.MinorType minorType = ParquetReaderUtility.getMinorType(primitive, original);

        Object minValue = getValue(columnMeta.getMinValue(), primitive, original);
        Object maxValue = getValue(columnMeta.getMaxValue(), primitive, original);

        List<StatisticsHolder<?>> holders = new ArrayList<>();
        holders.add(new StatisticsHolder<>(minValue, ColumnStatisticsKind.MIN_VALUE));
        holders.add(new StatisticsHolder<>(maxValue, ColumnStatisticsKind.MAX_VALUE));
        holders.add(new StatisticsHolder<>(nullsCount, ColumnStatisticsKind.NULLS_COUNT));

        statsByPath.put(path, new ColumnStatistics<>(holders, minorType));
    }
    return statsByPath;
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) TypeProtos(org.apache.drill.common.types.TypeProtos) OriginalType(org.apache.parquet.schema.OriginalType) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Aggregations

StatisticsHolder (org.apache.drill.metastore.statistics.StatisticsHolder)34 SchemaPath (org.apache.drill.common.expression.SchemaPath)31 ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)28 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)24 HashMap (java.util.HashMap)23 Path (org.apache.hadoop.fs.Path)23 MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo)21 TableInfo (org.apache.drill.metastore.metadata.TableInfo)21 ArrayList (java.util.ArrayList)16 MetastoreTest (org.apache.drill.categories.MetastoreTest)16 ClusterTest (org.apache.drill.test.ClusterTest)16 Test (org.junit.Test)16 File (java.io.File)15 SlowTest (org.apache.drill.categories.SlowTest)15 SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata)15 MetadataInfo (org.apache.drill.metastore.metadata.MetadataInfo)14 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)13 TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)12 CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString)12 TypeProtos (org.apache.drill.common.types.TypeProtos)11