
Example 1 with EndpointByteMap

Use of org.apache.drill.exec.store.schedule.EndpointByteMap in project drill by apache.

The class ParquetGroupScan, method init:

private void init(MetadataContext metaContext) throws IOException {
    // Resolve table metadata, preferring the metadata cache file when one exists.
    if (entries.size() == 1 && parquetTableMetadata == null) {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
        Path metaPath = null;
        if (fs.isDirectory(p)) {
            // Using the metadata file makes sense when querying a directory; otherwise
            // if querying a single file we can look up the metadata directly from the file
            metaPath = new Path(p, Metadata.METADATA_FILENAME);
        }
        if (metaPath != null && fs.exists(metaPath)) {
            usedMetadataCache = true;
            parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
        } else {
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
        }
    } else {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
        Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
        if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
            usedMetadataCache = true;
            if (parquetTableMetadata == null) {
                parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
            }
            if (fileSet != null) {
                parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
            }
        } else {
            final List<FileStatus> fileStatuses = Lists.newArrayList();
            for (ReadEntryWithPath entry : entries) {
                getFiles(entry.getPath(), fileStatuses);
            }
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses, formatConfig);
        }
    }
    if (fileSet == null) {
        fileSet = Sets.newHashSet();
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }
    }
    // Index the active Drillbits by host name so row-group host affinity
    // can be resolved to concrete endpoints.
    Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();
    for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
        hostEndpointMap.put(endpoint.getAddress(), endpoint);
    }
    // Build one RowGroupInfo per row group, attaching its locality information.
    rowGroupInfos = Lists.newArrayList();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        int rgIndex = 0;
        for (RowGroupMetadata rg : file.getRowGroups()) {
            RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex, rg.getRowCount());
            EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
            // Convert each host's affinity fraction into an absolute byte count.
            for (String host : rg.getHostAffinity().keySet()) {
                if (hostEndpointMap.containsKey(host)) {
                    endpointByteMap.add(hostEndpointMap.get(host), (long) (rg.getHostAffinity().get(host) * rg.getLength()));
                }
            }
            rowGroupInfo.setEndpointByteMap(endpointByteMap);
            rgIndex++;
            rowGroupInfos.add(rowGroupInfo);
        }
    }
    this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);
    // Second pass: accumulate per-column non-null value counts, the total row
    // count, and candidate partition columns.
    columnValueCounts = Maps.newHashMap();
    this.rowCount = 0;
    boolean first = true;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            long rowCount = rowGroup.getRowCount();
            for (ColumnMetadata column : rowGroup.getColumns()) {
                SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
                Long previousCount = columnValueCounts.get(schemaPath);
                if (previousCount != null) {
                    if (previousCount != GroupScan.NO_COLUMN_STATS) {
                        if (column.getNulls() != null) {
                            Long newCount = rowCount - column.getNulls();
                            columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
                        }
                    }
                } else {
                    if (column.getNulls() != null) {
                        Long newCount = rowCount - column.getNulls();
                        columnValueCounts.put(schemaPath, newCount);
                    } else {
                        columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
                    }
                }
                boolean partitionColumn = checkForPartitionColumn(column, first);
                if (partitionColumn) {
                    Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
                    if (map == null) {
                        map = Maps.newHashMap();
                        partitionValueMap.put(file.getPath(), map);
                    }
                    Object value = map.get(schemaPath);
                    Object currentValue = column.getMaxValue();
                    if (value != null) {
                        // Reference (not equals()) comparison: any mismatch disqualifies
                        // the column as a single-valued partition column.
                        if (value != currentValue) {
                            partitionColTypeMap.remove(schemaPath);
                        }
                    } else {
                        map.put(schemaPath, currentValue);
                    }
                } else {
                    partitionColTypeMap.remove(schemaPath);
                }
            }
            this.rowCount += rowGroup.getRowCount();
            first = false;
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath), ColumnMetadata (org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata), DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl)
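
The conversion at the heart of this method recurs throughout these examples: a row group reports a host-to-affinity-fraction map, and init() turns it into absolute byte counts per Drillbit before AffinityCreator reduces them. Below is a minimal standalone sketch of that conversion, using only the EndpointByteMapImpl constructor and EndpointByteMap.add(DrillbitEndpoint, long) seen above; the helper name toByteMap and its Map-typed parameters are illustrative, not Drill API.

import java.util.Map;

import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
import org.apache.drill.exec.store.schedule.EndpointByteMap;
import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;

public class ByteMapSketch {

    // Hypothetical helper: weight a row group's byte length by each host's
    // affinity fraction and credit the product to the matching endpoint.
    static EndpointByteMap toByteMap(Map<String, Float> hostAffinity, long rowGroupLength,
            Map<String, DrillbitEndpoint> hostEndpointMap) {
        EndpointByteMap byteMap = new EndpointByteMapImpl();
        for (Map.Entry<String, Float> entry : hostAffinity.entrySet()) {
            DrillbitEndpoint endpoint = hostEndpointMap.get(entry.getKey());
            // Hosts without a running Drillbit contribute no locality.
            if (endpoint != null) {
                byteMap.add(endpoint, (long) (entry.getValue() * rowGroupLength));
            }
        }
        return byteMap;
    }
}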

Example 2 with EndpointByteMap

Use of org.apache.drill.exec.store.schedule.EndpointByteMap in project drill by axbaretto.

The class ParquetGroupScan, method init:

private void init() throws IOException {
    Path metaPath = null;
    if (entries.size() == 1 && parquetTableMetadata == null) {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
        if (fs.isDirectory(p)) {
            // Using the metadata file makes sense when querying a directory; otherwise
            // if querying a single file we can look up the metadata directly from the file
            metaPath = new Path(p, Metadata.METADATA_FILENAME);
        }
        // Skip the cache when it is flagged as corrupted; a null result from
        // readBlockMeta below likewise forces the fall-back to reading footers.
        if (!metaContext.isMetadataCacheCorrupted() && metaPath != null && fs.exists(metaPath)) {
            parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath, metaContext, formatConfig);
            if (parquetTableMetadata != null) {
                usedMetadataCache = true;
            }
        }
        if (!usedMetadataCache) {
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
        }
    } else {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
        metaPath = new Path(p, Metadata.METADATA_FILENAME);
        if (!metaContext.isMetadataCacheCorrupted() && fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
            if (parquetTableMetadata == null) {
                parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath, metaContext, formatConfig);
            }
            if (parquetTableMetadata != null) {
                usedMetadataCache = true;
                if (fileSet != null) {
                    parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
                }
            }
        }
        if (!usedMetadataCache) {
            final List<FileStatus> fileStatuses = Lists.newArrayList();
            for (ReadEntryWithPath entry : entries) {
                fileStatuses.addAll(DrillFileSystemUtil.listFiles(fs, Path.getPathWithoutSchemeAndAuthority(new Path(entry.getPath())), true));
            }
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses, formatConfig);
        }
    }
    if (fileSet == null) {
        fileSet = Sets.newHashSet();
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }
    }
    Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();
    for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
        hostEndpointMap.put(endpoint.getAddress(), endpoint);
    }
    rowGroupInfos = Lists.newArrayList();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        int rgIndex = 0;
        for (RowGroupMetadata rg : file.getRowGroups()) {
            RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex, rg.getRowCount());
            EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
            for (String host : rg.getHostAffinity().keySet()) {
                if (hostEndpointMap.containsKey(host)) {
                    endpointByteMap.add(hostEndpointMap.get(host), (long) (rg.getHostAffinity().get(host) * rg.getLength()));
                }
            }
            rowGroupInfo.setEndpointByteMap(endpointByteMap);
            rowGroupInfo.setColumns(rg.getColumns());
            rgIndex++;
            rowGroupInfos.add(rowGroupInfo);
        }
    }
    this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);
    updatePartitionColTypeMap();
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata), DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl)
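
Compared with Example 1, this revision hardens the metadata-cache path: it consults metaContext.isMetadataCacheCorrupted() before trusting the cache, treats a null result from Metadata.readBlockMeta as a cache miss rather than assuming success, and falls back to listing files with DrillFileSystemUtil.listFiles. The column-statistics and partition bookkeeping from Example 1 is also gone from init(), replaced by a call to updatePartitionColTypeMap().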

Example 3 with EndpointByteMap

Use of org.apache.drill.exec.store.schedule.EndpointByteMap in project drill by axbaretto.

The class TestAssignment, method createByteMap:

private EndpointByteMap createByteMap() {
    EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
    Set<DrillbitEndpoint> usedEndpoints = Sets.newHashSet();
    while (usedEndpoints.size() < 3) {
        usedEndpoints.add(getRandom(endpoints));
    }
    for (DrillbitEndpoint ep : usedEndpoints) {
        endpointByteMap.add(ep, FILE_SIZE);
    }
    return endpointByteMap;
}
Also used: DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl)
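
A companion assertion makes the helper's contract visible. Only add() appears on this page; getMaxBytes() and its semantics (the largest per-endpoint byte count) are assumed from the EndpointByteMap interface, so treat this as a sketch meant to sit next to createByteMap in TestAssignment, not a quote from it.

@Test
public void byteMapCreditsEachEndpoint() {
    EndpointByteMap byteMap = createByteMap();
    // Three distinct endpoints were each credited FILE_SIZE bytes exactly once,
    // so the largest per-endpoint count should be FILE_SIZE (assumed semantics).
    Assert.assertEquals(FILE_SIZE, byteMap.getMaxBytes());
}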

Example 4 with EndpointByteMap

Use of org.apache.drill.exec.store.schedule.EndpointByteMap in project drill by apache.

The class TestAssignment, method createByteMap:

private EndpointByteMap createByteMap() {
    EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
    Set<DrillbitEndpoint> usedEndpoints = Sets.newHashSet();
    while (usedEndpoints.size() < 3) {
        usedEndpoints.add(getRandom(endpoints));
    }
    for (DrillbitEndpoint ep : usedEndpoints) {
        endpointByteMap.add(ep, FILE_SIZE);
    }
    return endpointByteMap;
}
Also used: DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl)
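
Examples 3 and 4 are byte-for-byte identical: axbaretto/drill appears to be a fork of apache/drill, so the index picks up the same TestAssignment helper from both projects.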

Example 5 with EndpointByteMap

Use of org.apache.drill.exec.store.schedule.EndpointByteMap in project drill by apache.

The class AbstractParquetGroupScan, method getRowGroupInfos:

private List<RowGroupInfo> getRowGroupInfos() {
    // Lazily built and cached on first call.
    if (rowGroupInfos == null) {
        Map<String, CoordinationProtos.DrillbitEndpoint> hostEndpointMap = new HashMap<>();
        for (CoordinationProtos.DrillbitEndpoint endpoint : getDrillbits()) {
            hostEndpointMap.put(endpoint.getAddress(), endpoint);
        }
        rowGroupInfos = new ArrayList<>();
        for (RowGroupMetadata rowGroupMetadata : getRowGroupsMetadata().values()) {
            RowGroupInfo rowGroupInfo = new RowGroupInfo(
                    rowGroupMetadata.getPath(),
                    rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START),
                    rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH),
                    rowGroupMetadata.getRowGroupIndex(),
                    TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata));
            rowGroupInfo.setNumRecordsToRead(rowGroupInfo.getRowCount());
            EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
            for (String host : rowGroupMetadata.getHostAffinity().keySet()) {
                if (hostEndpointMap.containsKey(host)) {
                    endpointByteMap.add(hostEndpointMap.get(host),
                            (long) (rowGroupMetadata.getHostAffinity().get(host)
                                    * (long) rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
                }
            }
            rowGroupInfo.setEndpointByteMap(endpointByteMap);
            rowGroupInfos.add(rowGroupInfo);
        }
    }
    return rowGroupInfos;
}
Also used: HashMap (java.util.HashMap), LinkedHashMap (java.util.LinkedHashMap), CoordinationProtos (org.apache.drill.exec.proto.CoordinationProtos), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl), RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata)
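
These byte maps exist to feed the affinity reduction (AffinityCreator.getAffinityMap in Examples 1 and 2): sum each endpoint's local bytes across all work units and normalize by the grand total. Below is an illustrative version using plain java.util maps in place of EndpointByteMap; AffinityCreator's actual implementation and weighting may differ.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;

public class AffinitySketch {

    // Each endpoint's share of the total bytes across all work units, in [0, 1].
    static Map<DrillbitEndpoint, Double> affinity(List<Map<DrillbitEndpoint, Long>> byteMaps) {
        Map<DrillbitEndpoint, Long> perEndpoint = new HashMap<>();
        long total = 0;
        for (Map<DrillbitEndpoint, Long> byteMap : byteMaps) {
            for (Map.Entry<DrillbitEndpoint, Long> e : byteMap.entrySet()) {
                perEndpoint.merge(e.getKey(), e.getValue(), Long::sum);
                total += e.getValue();
            }
        }
        Map<DrillbitEndpoint, Double> result = new HashMap<>();
        for (Map.Entry<DrillbitEndpoint, Long> e : perEndpoint.entrySet()) {
            // Guard against an empty scan: no bytes means no affinity to report.
            result.put(e.getKey(), total == 0 ? 0.0 : e.getValue() / (double) total);
        }
        return result;
    }
}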

Aggregations

EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap): 5 uses
EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl): 5 uses
DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint): 4 uses
SchemaPath (org.apache.drill.common.expression.SchemaPath): 2 uses
ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath): 2 uses
ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata): 2 uses
RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata): 2 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 2 uses
Path (org.apache.hadoop.fs.Path): 2 uses
HashMap (java.util.HashMap): 1 use
LinkedHashMap (java.util.LinkedHashMap): 1 use
CoordinationProtos (org.apache.drill.exec.proto.CoordinationProtos): 1 use
ColumnMetadata (org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata): 1 use
RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata): 1 use