Example 1 with ColumnMetadata

use of org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata in project drill by axbaretto.

In class ParquetGroupScan, the method updatePartitionColTypeMap:

private void updatePartitionColTypeMap() {
    columnValueCounts = Maps.newHashMap();
    this.rowCount = 0;
    boolean first = true;
    for (RowGroupInfo rowGroup : this.rowGroupInfos) {
        long rowCount = rowGroup.getRowCount();
        for (ColumnMetadata column : rowGroup.getColumns()) {
            SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
            Long previousCount = columnValueCounts.get(schemaPath);
            if (previousCount != null) {
                if (previousCount != GroupScan.NO_COLUMN_STATS) {
                    if (column.getNulls() != null) {
                        Long newCount = rowCount - column.getNulls();
                        columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
                    }
                }
            } else {
                if (column.getNulls() != null) {
                    Long newCount = rowCount - column.getNulls();
                    columnValueCounts.put(schemaPath, newCount);
                } else {
                    columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
                }
            }
            boolean partitionColumn = checkForPartitionColumn(column, first, rowCount);
            if (partitionColumn) {
                Map<SchemaPath, Object> map = partitionValueMap.get(rowGroup.getPath());
                if (map == null) {
                    map = Maps.newHashMap();
                    partitionValueMap.put(rowGroup.getPath(), map);
                }
                Object value = map.get(schemaPath);
                Object currentValue = column.getMaxValue();
                if (value != null) {
                    if (value != currentValue) {
                        partitionColTypeMap.remove(schemaPath);
                    }
                } else {
                    // No value recorded yet: if every row in the group is null, record an
                    // explicit null partition value; otherwise record the column's max value.
                    if (rowCount == column.getNulls()) {
                        map.put(schemaPath, null);
                    } else {
                        map.put(schemaPath, currentValue);
                    }
                }
            } else {
                partitionColTypeMap.remove(schemaPath);
            }
        }
        this.rowCount += rowGroup.getRowCount();
        first = false;
    }
}
Also used : ColumnMetadata(org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath)
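
The essence of updatePartitionColTypeMap is the running per-column count of non-null values, merged across row groups, with GroupScan.NO_COLUMN_STATS marking columns whose footers carry no null statistics. Below is a minimal, self-contained sketch of that merge logic using plain JDK collections; the ColumnStats record and the NO_COLUMN_STATS constant are illustrative stand-ins for Drill's ColumnMetadata.getNulls() and GroupScan.NO_COLUMN_STATS, not Drill's actual API.

import java.util.HashMap;
import java.util.Map;

public class ValueCountSketch {

    static final long NO_COLUMN_STATS = -1L;

    // Stand-in for ColumnMetadata: a column name plus an optional null count.
    record ColumnStats(String name, Long nulls) {}

    static void mergeRowGroup(Map<String, Long> counts, long rowCount, ColumnStats column) {
        Long previous = counts.get(column.name());
        if (previous != null) {
            // Once a column is marked NO_COLUMN_STATS it stays that way;
            // otherwise add this row group's non-null count.
            if (previous != NO_COLUMN_STATS && column.nulls() != null) {
                counts.put(column.name(), previous + (rowCount - column.nulls()));
            }
        } else if (column.nulls() != null) {
            counts.put(column.name(), rowCount - column.nulls());
        } else {
            counts.put(column.name(), NO_COLUMN_STATS);
        }
    }

    public static void main(String[] args) {
        Map<String, Long> columnValueCounts = new HashMap<>();
        // Two row groups for column "a": (100 - 10) + (50 - 5) = 135 non-null values.
        mergeRowGroup(columnValueCounts, 100, new ColumnStats("a", 10L));
        mergeRowGroup(columnValueCounts, 50, new ColumnStats("a", 5L));
        // A column whose footer carries no null count gets the sentinel.
        mergeRowGroup(columnValueCounts, 50, new ColumnStats("b", null));
        System.out.println(columnValueCounts); // prints {a=135, b=-1} (order may vary)
    }
}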

Example 2 with ColumnMetadata

use of org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata in project drill by apache.

In class ParquetGroupScan, the method init:

private void init(MetadataContext metaContext) throws IOException {
    if (entries.size() == 1 && parquetTableMetadata == null) {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
        Path metaPath = null;
        if (fs.isDirectory(p)) {
            // The metadata cache file only makes sense when querying a directory;
            // for a single file we can read the metadata directly from the file itself.
            metaPath = new Path(p, Metadata.METADATA_FILENAME);
        }
        if (metaPath != null && fs.exists(metaPath)) {
            usedMetadataCache = true;
            parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
        } else {
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
        }
    } else {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
        Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
        if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
            usedMetadataCache = true;
            if (parquetTableMetadata == null) {
                parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
            }
            if (fileSet != null) {
                parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
            }
        } else {
            final List<FileStatus> fileStatuses = Lists.newArrayList();
            for (ReadEntryWithPath entry : entries) {
                getFiles(entry.getPath(), fileStatuses);
            }
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses, formatConfig);
        }
    }
    if (fileSet == null) {
        fileSet = Sets.newHashSet();
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }
    }
    Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();
    for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
        hostEndpointMap.put(endpoint.getAddress(), endpoint);
    }
    rowGroupInfos = Lists.newArrayList();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        int rgIndex = 0;
        for (RowGroupMetadata rg : file.getRowGroups()) {
            RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex, rg.getRowCount());
            EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
            for (String host : rg.getHostAffinity().keySet()) {
                if (hostEndpointMap.containsKey(host)) {
                    endpointByteMap.add(hostEndpointMap.get(host), (long) (rg.getHostAffinity().get(host) * rg.getLength()));
                }
            }
            rowGroupInfo.setEndpointByteMap(endpointByteMap);
            rgIndex++;
            rowGroupInfos.add(rowGroupInfo);
        }
    }
    this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);
    columnValueCounts = Maps.newHashMap();
    this.rowCount = 0;
    boolean first = true;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            long rowCount = rowGroup.getRowCount();
            for (ColumnMetadata column : rowGroup.getColumns()) {
                SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
                Long previousCount = columnValueCounts.get(schemaPath);
                if (previousCount != null) {
                    if (previousCount != GroupScan.NO_COLUMN_STATS) {
                        if (column.getNulls() != null) {
                            Long newCount = rowCount - column.getNulls();
                            columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
                        }
                    }
                } else {
                    if (column.getNulls() != null) {
                        Long newCount = rowCount - column.getNulls();
                        columnValueCounts.put(schemaPath, newCount);
                    } else {
                        columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
                    }
                }
                boolean partitionColumn = checkForPartitionColumn(column, first);
                if (partitionColumn) {
                    Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
                    if (map == null) {
                        map = Maps.newHashMap();
                        partitionValueMap.put(file.getPath(), map);
                    }
                    Object value = map.get(schemaPath);
                    Object currentValue = column.getMaxValue();
                    if (value != null) {
                        if (value != currentValue) {
                            partitionColTypeMap.remove(schemaPath);
                        }
                    } else {
                        map.put(schemaPath, currentValue);
                    }
                } else {
                    partitionColTypeMap.remove(schemaPath);
                }
            }
            this.rowCount += rowGroup.getRowCount();
            first = false;
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ColumnMetadata(org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) EndpointByteMap(org.apache.drill.exec.store.schedule.EndpointByteMap) EndpointByteMapImpl(org.apache.drill.exec.store.schedule.EndpointByteMapImpl) RowGroupMetadata(org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)
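
A notable part of init() is the host-affinity accounting: each row group credits every host that stores part of it with affinity × length bytes, but only when that host also runs a drillbit. Below is a hedged sketch of that arithmetic using plain maps; the host and endpoint names are made up for illustration, and the endpointBytes map is a simplification of Drill's EndpointByteMap, not its API.

import java.util.HashMap;
import java.util.Map;

public class AffinitySketch {

    public static void main(String[] args) {
        // host -> fraction of the row group stored on that host
        Map<String, Double> hostAffinity = Map.of("node1", 0.5, "node2", 0.25, "node9", 0.25);
        long rowGroupLength = 1_000_000L;

        // Hosts that actually run a drillbit (stand-in for formatPlugin.getContext().getBits()).
        Map<String, String> hostEndpointMap = Map.of("node1", "endpoint-1", "node2", "endpoint-2");

        Map<String, Long> endpointBytes = new HashMap<>();
        for (Map.Entry<String, Double> e : hostAffinity.entrySet()) {
            if (hostEndpointMap.containsKey(e.getKey())) {
                // Same arithmetic as the loop in init(): affinity fraction times row group length.
                endpointBytes.merge(hostEndpointMap.get(e.getKey()),
                        (long) (e.getValue() * rowGroupLength), Long::sum);
            }
        }
        // node9 holds data but runs no drillbit, so its bytes are simply not credited.
        System.out.println(endpointBytes); // e.g. {endpoint-1=500000, endpoint-2=250000}
    }
}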

Example 3 with ColumnMetadata

use of org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata in project drill by axbaretto.

In class ParquetGroupScan, the method checkForPartitionColumn:

/**
 * When reading the very first footer, any column is a potential partition column. So for the first footer, we check
 * every column to see if it is single valued, and if so, add it to the list of potential partition columns. For the
 * remaining footers, we will not find any new partition columns, but we may discover that what was previously a
 * potential partition column now no longer qualifies, so it needs to be removed from the list.
 * @return whether the column is a potential partition column
 */
private boolean checkForPartitionColumn(ColumnMetadata columnMetadata, boolean first, long rowCount) {
    SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
    final PrimitiveTypeName primitiveType;
    final OriginalType originalType;
    int precision = 0;
    int scale = 0;
    if (this.parquetTableMetadata.hasColumnMetadata()) {
        // only ColumnTypeMetadata_v3 stores information about scale and precision
        if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3) {
            Metadata.ColumnTypeMetadata_v3 columnTypeInfo = ((Metadata.ParquetTableMetadata_v3) parquetTableMetadata).getColumnTypeInfo(columnMetadata.getName());
            scale = columnTypeInfo.scale;
            precision = columnTypeInfo.precision;
        }
        primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
        originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
    } else {
        primitiveType = columnMetadata.getPrimitiveType();
        originalType = columnMetadata.getOriginalType();
    }
    if (first) {
        if (hasSingleValue(columnMetadata, rowCount)) {
            partitionColTypeMap.put(schemaPath, getType(primitiveType, originalType, scale, precision));
            return true;
        } else {
            return false;
        }
    } else {
        if (!partitionColTypeMap.containsKey(schemaPath)) {
            return false;
        } else {
            if (!hasSingleValue(columnMetadata, rowCount)) {
                partitionColTypeMap.remove(schemaPath);
                return false;
            }
            if (!getType(primitiveType, originalType, scale, precision).equals(partitionColTypeMap.get(schemaPath))) {
                partitionColTypeMap.remove(schemaPath);
                return false;
            }
        }
    }
    return true;
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnMetadata(org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) RowGroupMetadata(org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)
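
As the javadoc explains, the first footer seeds the candidate set with every single-valued column, and later footers can only shrink it, pruning a candidate on a second distinct value or a type mismatch. The simplified sketch below captures that one-way pruning; the String-typed column names and types are stand-ins for Drill's SchemaPath and type objects, and hasSingleValue is reduced to a boolean parameter.

import java.util.HashMap;
import java.util.Map;

public class PartitionColumnSketch {

    // candidate column -> its (simplified) type
    private final Map<String, String> partitionColTypeMap = new HashMap<>();

    boolean checkForPartitionColumn(String name, String type, boolean singleValued, boolean first) {
        if (first) {
            // First footer: every single-valued column becomes a candidate.
            if (singleValued) {
                partitionColTypeMap.put(name, type);
                return true;
            }
            return false;
        }
        if (!partitionColTypeMap.containsKey(name)) {
            return false; // never was a candidate
        }
        // Later footers can only disqualify: a column that is no longer
        // single-valued, or whose type changed, leaves the candidate set.
        if (!singleValued || !type.equals(partitionColTypeMap.get(name))) {
            partitionColTypeMap.remove(name);
            return false;
        }
        return true;
    }

    public static void main(String[] args) {
        PartitionColumnSketch scan = new PartitionColumnSketch();
        System.out.println(scan.checkForPartitionColumn("dir0", "VARCHAR", true, true));   // true: seeded
        System.out.println(scan.checkForPartitionColumn("dir0", "VARCHAR", false, false)); // false: pruned
        System.out.println(scan.checkForPartitionColumn("dir0", "VARCHAR", true, false));  // false: stays pruned
    }
}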

Aggregations

SchemaPath (org.apache.drill.common.expression.SchemaPath) ×3
ColumnMetadata (org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata) ×3
DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) ×2
ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) ×2
RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata) ×2
ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath) ×1
EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap) ×1
EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl) ×1
FileStatus (org.apache.hadoop.fs.FileStatus) ×1
Path (org.apache.hadoop.fs.Path) ×1
OriginalType (org.apache.parquet.schema.OriginalType) ×1
PrimitiveTypeName (org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) ×1