Search in sources :

Example 21 with RowGroupMetadata

use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class AbstractParquetGroupScan, method pruneRowGroupsForFiles.

/**
 * Collects the row groups that belong to the files present in the given map.
 * Files are visited in the iteration order of the map's key set; a file for which
 * no row group metadata is found is skipped.
 *
 * @param filteredFileMetadata file metadata keyed by file path; only the keys are used
 * @return multimap from file path to the row groups found for that path
 */
protected Multimap<Path, RowGroupMetadata> pruneRowGroupsForFiles(Map<Path, FileMetadata> filteredFileMetadata) {
    Multimap<Path, RowGroupMetadata> prunedRowGroups = LinkedListMultimap.create();
    if (filteredFileMetadata.isEmpty()) {
        // Nothing to look up, so row groups metadata is not requested at all.
        return prunedRowGroups;
    }
    // The lookup source does not depend on the loop variable, so fetch it once.
    Multimap<Path, RowGroupMetadata> rowGroupsMetadata = getRowGroupsMetadata();
    for (Path filePath : filteredFileMetadata.keySet()) {
        Collection<RowGroupMetadata> fileRowGroups = rowGroupsMetadata.get(filePath);
        if (CollectionUtils.isNotEmpty(fileRowGroups)) {
            prunedRowGroups.putAll(filePath, fileRowGroups);
        }
    }
    return prunedRowGroups;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)

Example 22 with RowGroupMetadata

use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class AbstractParquetGroupScan, method getRowGroupInfos.

/**
 * Returns the row group infos for this scan, building and caching them in
 * {@code rowGroupInfos} on the first call.
 * <p>
 * One {@link RowGroupInfo} is created per row group metadata entry. Its endpoint byte map
 * gets an entry for every host in the row group's host affinity that matches the address
 * of a known drillbit endpoint, weighted by {@code affinity * length}.
 *
 * @return cached list of row group infos
 */
private List<RowGroupInfo> getRowGroupInfos() {
    if (rowGroupInfos != null) {
        return rowGroupInfos;
    }

    Map<String, CoordinationProtos.DrillbitEndpoint> hostEndpointMap = new HashMap<>();
    for (CoordinationProtos.DrillbitEndpoint endpoint : getDrillbits()) {
        hostEndpointMap.put(endpoint.getAddress(), endpoint);
    }

    // Build into a local list and publish it only when complete, so that a failure
    // part-way through does not leave a partially filled list cached in the field.
    List<RowGroupInfo> infos = new ArrayList<>();
    for (RowGroupMetadata rowGroupMetadata : getRowGroupsMetadata().values()) {
        // Row group length is needed for the constructor and for every host weight; read it once.
        long length = (long) rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH);
        RowGroupInfo rowGroupInfo = new RowGroupInfo(
            rowGroupMetadata.getPath(),
            rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START),
            length,
            rowGroupMetadata.getRowGroupIndex(),
            TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata));
        rowGroupInfo.setNumRecordsToRead(rowGroupInfo.getRowCount());

        EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
        for (Map.Entry<String, Float> affinity : rowGroupMetadata.getHostAffinity().entrySet()) {
            CoordinationProtos.DrillbitEndpoint endpoint = hostEndpointMap.get(affinity.getKey());
            if (endpoint != null) {
                endpointByteMap.add(endpoint, (long) (affinity.getValue() * length));
            }
        }
        rowGroupInfo.setEndpointByteMap(endpointByteMap);
        infos.add(rowGroupInfo);
    }
    rowGroupInfos = infos;
    return rowGroupInfos;
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CoordinationProtos(org.apache.drill.exec.proto.CoordinationProtos) EndpointByteMap(org.apache.drill.exec.store.schedule.EndpointByteMap) EndpointByteMapImpl(org.apache.drill.exec.store.schedule.EndpointByteMapImpl) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)

Example 23 with RowGroupMetadata

use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class ParquetTableMetadataUtils, method getRowGroupsMetadata.

/**
 * Returns a multimap of {@link RowGroupMetadata}, keyed by file path, obtained by converting
 * the parquet row groups metadata taken from the specified tableMetadata.
 * The key is the file path with scheme and authority stripped.
 * Assigns index to row groups based on their position in files metadata.
 * For empty / fake row groups assigns '-1' index.
 *
 * @param tableMetadata the source of row groups to be converted
 * @return multimap from file path to the {@link RowGroupMetadata} of that file
 * @throws IllegalStateException if a file contains an empty / fake row group
 *         together with other row groups
 */
public static Multimap<Path, RowGroupMetadata> getRowGroupsMetadata(MetadataBase.ParquetTableMetadataBase tableMetadata) {
    Multimap<Path, RowGroupMetadata> rowGroups = LinkedListMultimap.create();
    for (MetadataBase.ParquetFileMetadata file : tableMetadata.getFiles()) {
        // The path depends only on the file, so compute it once per file rather than per row group.
        Path filePath = Path.getPathWithoutSchemeAndAuthority(file.getPath());
        int index = 0;
        for (MetadataBase.RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
            int newIndex;
            if (rowGroupMetadata.isEmpty()) {
                Preconditions.checkState(file.getRowGroups().size() == 1, "Only one empty / fake row group is allowed per file");
                newIndex = -1;
            } else {
                // Only non-empty row groups consume an index.
                newIndex = index++;
            }
            rowGroups.put(filePath, getRowGroupMetadata(tableMetadata, rowGroupMetadata, newIndex, filePath));
        }
    }
    return rowGroups;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)

Example 24 with RowGroupMetadata

use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class ParquetTableMetadataUtils, method getFileMetadata.

/**
 * Builds a single {@link FileMetadata} by merging the given {@link RowGroupMetadata} collection.
 * Table info, path, schema and the set of columns are taken from the first row group
 * returned by the collection's iterator; row count and column statistics are merged
 * across all row groups.
 *
 * @param rowGroups collection of {@link RowGroupMetadata} to be merged
 * @return merged {@link FileMetadata} instance, or {@code null} if {@code rowGroups} is empty
 */
public static FileMetadata getFileMetadata(Collection<RowGroupMetadata> rowGroups) {
    if (rowGroups.isEmpty()) {
        return null;
    }

    List<StatisticsHolder<?>> mergedStatistics = new ArrayList<>();
    mergedStatistics.add(
        new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups), TableStatisticsKind.ROW_COUNT));

    // The first row group supplies the file-level attributes.
    RowGroupMetadata first = rowGroups.iterator().next();
    TupleMetadata fileSchema = first.getSchema();
    Set<SchemaPath> columnPaths = first.getColumnsStatistics().keySet();
    MetadataInfo fileInfo = MetadataInfo.builder()
        .type(MetadataType.FILE)
        .build();

    return FileMetadata.builder()
        .tableInfo(first.getTableInfo())
        .metadataInfo(fileInfo)
        .path(first.getPath())
        .schema(fileSchema)
        .columnsStatistics(TableMetadataUtils.mergeColumnsStatistics(rowGroups, columnPaths, PARQUET_COLUMN_STATISTICS))
        .metadataStatistics(mergedStatistics)
        .build();
}
Also used : StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) ArrayList(java.util.ArrayList) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)

Aggregations

RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata): 24, SchemaPath (org.apache.drill.common.expression.SchemaPath): 18, Path (org.apache.hadoop.fs.Path): 16, FileMetadata (org.apache.drill.metastore.metadata.FileMetadata): 15, BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata): 13, TableInfo (org.apache.drill.metastore.metadata.TableInfo): 13, MetastoreTest (org.apache.drill.categories.MetastoreTest): 12, MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo): 12, Test (org.junit.Test): 12, SlowTest (org.apache.drill.categories.SlowTest): 11, SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata): 11, ClusterTest (org.apache.drill.test.ClusterTest): 11, CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString): 11, File (java.io.File): 10, ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics): 9, ArrayList (java.util.ArrayList): 7, HashMap (java.util.HashMap): 7, TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata): 7, MetadataType (org.apache.drill.metastore.metadata.MetadataType): 7, List (java.util.List): 6