Use of org.apache.drill.metastore.metadata.TableMetadata in the Apache Drill project.
Class BaseParquetMetadataProvider, method init.
/**
 * Initializes the metadata held by this provider: either builds it through
 * {@code initializeMetadata()} or reuses the files and row groups of the supplied provider.
 *
 * @param metadataProvider existing provider whose metadata may be reused; when {@code null},
 *                         or when its populated collections do not cover every location of
 *                         this table, metadata is initialized from scratch
 * @throws IOException propagated from the initialization steps invoked here
 */
protected void init(BaseParquetMetadataProvider metadataProvider) throws IOException {
  // Once deserialization for metadata is provided, initInternal() call should be removed
  // and only files list is deserialized based on specified locations
  initInternal();

  assert parquetTableMetadata != null;

  if (fileSet == null) {
    fileSet = new HashSet<>();
    for (MetadataBase.ParquetFileMetadata parquetFile : parquetTableMetadata.getFiles()) {
      fileSet.add(parquetFile.getPath());
    }
  }

  List<Path> fileLocations = getLocations();

  // The supplied provider is reusable only if it exists and each of its non-null
  // collections (row groups, files) contains all of the locations of this table.
  boolean providerCoversLocations = metadataProvider != null
      && (metadataProvider.rowGroups == null
          || metadataProvider.rowGroups.keySet().containsAll(fileLocations))
      && (metadataProvider.files == null
          || metadataProvider.files.keySet().containsAll(fileLocations));

  if (!providerCoversLocations) {
    // obtains metadata from cache files or table footers
    initializeMetadata();
    return;
  }

  // reuse metadata from existing TableMetadataProvider
  if (metadataProvider.files != null && metadataProvider.files.size() != files.size()) {
    // keep only the file entries which belong to the locations of this table
    files = metadataProvider.files.entrySet().stream()
        .filter(fileEntry -> fileLocations.contains(fileEntry.getKey()))
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
  }

  if (metadataProvider.rowGroups != null) {
    // copy only the row groups which belong to the locations of this table
    rowGroups = LinkedListMultimap.create();
    metadataProvider.rowGroups.entries().stream()
        .filter(rowGroupEntry -> fileLocations.contains(rowGroupEntry.getKey()))
        .forEach(rowGroupEntry -> rowGroups.put(rowGroupEntry.getKey(), rowGroupEntry.getValue()));
  }

  // NOTE(review): the getters below are invoked with their results discarded, presumably to
  // populate lazily built state while parquetTableMetadata is still available - confirm.
  // Their order is preserved on purpose.
  TableMetadata currentTableMetadata = getTableMetadata();
  getSegmentsMetadataMap();
  getPartitionsMetadata();
  getRowGroupsMeta();
  getNonInterestingColumnsMetadata();

  this.tableMetadata = TableMetadataUtils.updateRowCount(currentTableMetadata, getRowGroupsMeta());
  parquetTableMetadata = null;
}
Use of org.apache.drill.metastore.metadata.TableMetadata in the Apache Drill project.
Class DrillRelMdDistinctRowCount, method getDistinctRowCountInternal.
/**
 * Estimates the number of rows which would be produced by a GROUP BY on the
 * set of columns indicated by {@code groupKey}.
 *
 * <p>The estimate is the product of the NDV statistics of the grouping columns, each scaled
 * by the selectivity of the predicate on that column when that selectivity is positive, and
 * capped by the filtered row count ({@code selectivity * rowCount}).
 *
 * @param scan      table scan the estimate is computed for
 * @param mq        metadata query used to obtain selectivity and row count
 * @param table     Drill table whose group scan supplies the table metadata
 * @param groupKey  bit set of the grouping column ordinals
 * @param type      row type used to resolve column names by ordinal
 * @param predicate filter predicate applied to the scan
 * @return the filtered row count when {@code groupKey} is empty; 10% of
 *         {@code scan.estimateRowCount(mq)} when table metadata cannot be read or the NDV
 *         of a grouping column is unavailable; otherwise the capped NDV-based estimate
 */
private Double getDistinctRowCountInternal(TableScan scan, RelMetadataQuery mq, DrillTable table, ImmutableBitSet groupKey, RelDataType type, RexNode predicate) {
  /* If predicate is present, determine its selectivity to estimate filtered rows.
   * Thereafter, compute the number of distinct rows.
   */
  double selectivity = mq.getSelectivity(scan, predicate);
  double rowCount = mq.getRowCount(scan);
  double filteredRowCount = selectivity * rowCount;

  if (groupKey.length() == 0) {
    return filteredRowCount;
  }

  TableMetadata tableMetadata;
  try {
    tableMetadata = table.getGroupScan().getTableMetadata();
  } catch (IOException e) {
    // Statistics cannot be obtained, use default behaviour
    return scan.estimateRowCount(mq) * 0.1;
  }

  double estRowCnt = 1.0;
  for (int ordinal = 0; ordinal < groupKey.length(); ordinal++) {
    String colName = type.getFieldNames().get(ordinal);
    if (!groupKey.get(ordinal)) {
      continue;
    }

    ColumnStatistics<?> columnStatistics = null;
    if (tableMetadata != null) {
      columnStatistics = tableMetadata.getColumnStatistics(SchemaPath.getSimplePath(colName));
    }
    Double ndv = null;
    if (columnStatistics != null) {
      ndv = ColumnStatisticsKind.NDV.getFrom(columnStatistics);
    }

    if (ndv == null) {
      // NDV is missing for this grouping column, so no statistics-based estimate is possible
      if (logger.isDebugEnabled()) {
        logger.debug(String.format("NDV not available for %s(%s). Using default rowcount for group-by %s",
            (tableMetadata != null ? tableMetadata.getTableInfo().name() : ""),
            colName,
            groupKey.toString()));
      }
      // Could not get any NDV estimate from stats - probably stats not present for GBY cols. So Guess!
      return scan.estimateRowCount(mq) * 0.1;
    }

    estRowCnt *= ndv;

    /* If predicate is on group-by column, scale down the NDV by selectivity. Consider the query
     * select a, b from t where a = 10 group by a, b. Here, NDV(a) will be scaled down by SEL(a)
     * whereas NDV(b) will not.
     */
    double gbyColPredSel = getPredSelectivityContainingInputRef(predicate, ordinal, mq, scan);
    if (gbyColPredSel > 0) {
      estRowCnt *= gbyColPredSel;
    }
  }

  // Estimated NDV should not exceed number of rows after applying the filters
  // (rowCount may be less than NDV since they can come from different sources)
  return Math.min(estRowCnt, filteredRowCount);
}
Use of org.apache.drill.metastore.metadata.TableMetadata in the Apache Drill project.
Class DrillRelMdSelectivity, method getScanSelectivity.
/**
 * Computes the selectivity of {@code predicate} for a scan relational node.
 *
 * <p>Resolution order:
 * <ol>
 *   <li>If {@code rel} is a {@code DrillScanRel} or {@code ScanPrel} whose group scan is a
 *       {@code DbGroupScan} and statistics are enabled, the ratio of filtered to total rows
 *       reported by the group scan is used (capped at 1.0), provided both counts are known
 *       and the total is positive.</li>
 *   <li>If {@code rel} is a {@code TableScan} for which {@code DrillRelOptUtil.guessRows}
 *       is false and the table metadata reports descriptive statistics, the result of
 *       {@code getScanSelectivityInternal} is returned.</li>
 *   <li>Otherwise the selectivity computed by the superclass is returned. This includes the
 *       case where the table metadata cannot be read.</li>
 * </ol>
 *
 * @param rel       relational node to compute the selectivity for
 * @param mq        metadata query passed to the superclass computation
 * @param predicate filter predicate whose selectivity is estimated
 * @return the estimated selectivity
 */
private Double getScanSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) {
  // Sentinel which the group scan row counts are compared against to detect "unknown"
  final double rowCountUnknown = -1.0;

  PlannerSettings settings = PrelUtil.getPlannerSettings(rel.getCluster().getPlanner());
  final RexBuilder rexBuilder = rel.getCluster().getRexBuilder();

  GroupScan scan = null;
  if (rel instanceof DrillScanRel) {
    scan = ((DrillScanRel) rel).getGroupScan();
  } else if (rel instanceof ScanPrel) {
    scan = ((ScanPrel) rel).getGroupScan();
  }

  // instanceof is false for null, so no separate null check on scan is required
  if (scan instanceof DbGroupScan && settings.isStatisticsEnabled()) {
    DbGroupScan dbGroupScan = (DbGroupScan) scan;
    double filterRows = dbGroupScan.getRowCount(predicate, rel);
    double totalRows = dbGroupScan.getRowCount(null, rel);
    if (filterRows != rowCountUnknown && totalRows != rowCountUnknown && totalRows > 0) {
      return Math.min(1.0, filterRows / totalRows);
    }
  }

  // Do not mess with statistics used for DBGroupScans.
  if (rel instanceof TableScan) {
    if (DrillRelOptUtil.guessRows(rel)) {
      return super.getSelectivity(rel, mq, predicate);
    }
    DrillTable table = Utilities.getDrillTable(rel.getTable());
    try {
      TableMetadata tableMetadata;
      if (table != null
          && (tableMetadata = table.getGroupScan().getTableMetadata()) != null
          && TableStatisticsKind.HAS_DESCRIPTIVE_STATISTICS.getValue(tableMetadata)) {
        List<SchemaPath> fieldNames;
        if (rel instanceof DrillScanRelBase) {
          fieldNames = ((DrillScanRelBase) rel).getGroupScan().getColumns();
        } else {
          fieldNames = rel.getRowType().getFieldNames().stream()
              .map(SchemaPath::getSimplePath)
              .collect(Collectors.toList());
        }
        return getScanSelectivityInternal(tableMetadata, predicate, fieldNames, rexBuilder);
      }
    } catch (IOException ignored) {
      // Table metadata could not be read: deliberately fall through to the default
      // selectivity below instead of computing it here and discarding the result.
    }
  }

  return super.getSelectivity(rel, mq, predicate);
}
Use of org.apache.drill.metastore.metadata.TableMetadata in the Apache Drill project.
Class BaseParquetMetadataProvider, method getPartitionsMetadata.
/**
 * Returns the partitions metadata of the table, building it on the first call and returning
 * the stored list afterwards.
 *
 * <p>When {@code collectMetadata} is set, files are grouped by partition column and partition
 * value and one partition metadata is produced per group through
 * {@code ParquetTableMetadataUtils.getPartitionMetadata}. Otherwise one partition metadata is
 * built per partition column and value from the partition paths, with the partition value
 * used as both min and max and with no nulls count or row count statistics.
 *
 * @return list of partitions metadata
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
  if (partitions != null) {
    return partitions;
  }

  partitions = new ArrayList<>();

  if (collectMetadata) {
    // files grouped by (partition column, partition value)
    Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
    Collection<FileMetadata> filesMetadata = getFilesMetadataMap().values();
    partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();

    for (FileMetadata fileMetadata : filesMetadata) {
      for (SchemaPath partitionColumn : partitionColumns) {
        Object partitionValue = getParquetGroupScanStatistics().getPartitionValue(fileMetadata.getPath(), partitionColumn);
        // Table cannot contain nulls
        if (partitionValue == null) {
          partitionValue = NULL_VALUE;
        }
        List<FileMetadata> groupedFiles = filesByColumnAndValue.get(partitionColumn, partitionValue);
        if (groupedFiles == null) {
          groupedFiles = new ArrayList<>();
          filesByColumnAndValue.put(partitionColumn, partitionValue, groupedFiles);
        }
        groupedFiles.add(fileMetadata);
      }
    }

    for (SchemaPath column : filesByColumnAndValue.rowKeySet()) {
      for (List<FileMetadata> filesForValue : filesByColumnAndValue.row(column).values()) {
        partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(column, filesForValue));
      }
    }
    return partitions;
  }

  for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
    Map<Path, Object> partitionPaths = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);

    // invert path -> value into value -> paths
    Multimap<Object, Path> pathsByValue = HashMultimap.create();
    for (Map.Entry<Path, Object> pathAndValue : partitionPaths.entrySet()) {
      pathsByValue.put(pathAndValue.getValue(), pathAndValue.getKey());
    }

    for (Map.Entry<Object, Collection<Path>> valueAndPaths : pathsByValue.asMap().entrySet()) {
      Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
      List<StatisticsHolder<?>> statistics = new ArrayList<>();

      // the NULL_VALUE placeholder is turned back into null for the statistics
      Object partitionKey = valueAndPaths.getKey();
      if (partitionKey == NULL_VALUE) {
        partitionKey = null;
      }

      statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
      statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
      statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
      statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));

      columnsStatistics.put(
          partitionColumn,
          new ColumnStatistics<>(statistics, getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));

      MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
      TableMetadata tableMetadata = getTableMetadata();

      PartitionMetadata partitionMetadata = PartitionMetadata.builder()
          .tableInfo(tableMetadata.getTableInfo())
          .metadataInfo(metadataInfo)
          .column(partitionColumn)
          .schema(tableMetadata.getSchema())
          .columnsStatistics(columnsStatistics)
          .metadataStatistics(statistics)
          .partitionValues(Collections.emptyList())
          .locations(new HashSet<>(valueAndPaths.getValue()))
          .build();

      partitions.add(partitionMetadata);
    }
  }

  return partitions;
}
Aggregations