
Example 1 with EndpointByteMapImpl

Use of org.apache.drill.exec.store.schedule.EndpointByteMapImpl in project drill by apache.

From the class ParquetGroupScan, method init:

private void init(MetadataContext metaContext) throws IOException {
    if (entries.size() == 1 && parquetTableMetadata == null) {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
        Path metaPath = null;
        if (fs.isDirectory(p)) {
            // When querying a directory, prefer the metadata cache file if one exists;
            // when querying a single file, the metadata can be read directly from that file.
            metaPath = new Path(p, Metadata.METADATA_FILENAME);
        }
        if (metaPath != null && fs.exists(metaPath)) {
            usedMetadataCache = true;
            parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
        } else {
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
        }
    } else {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
        Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
        if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
            usedMetadataCache = true;
            if (parquetTableMetadata == null) {
                parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
            }
            if (fileSet != null) {
                parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
            }
        } else {
            final List<FileStatus> fileStatuses = Lists.newArrayList();
            for (ReadEntryWithPath entry : entries) {
                getFiles(entry.getPath(), fileStatuses);
            }
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses, formatConfig);
        }
    }
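    // Populate the file set from the metadata just read when it was not supplied up front.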
    if (fileSet == null) {
        fileSet = Sets.newHashSet();
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }
    }
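    // Map each drillbit host name to its endpoint so row-group host affinity can be resolved below.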
    Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();
    for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
        hostEndpointMap.put(endpoint.getAddress(), endpoint);
    }
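    // Build one RowGroupInfo per row group, recording how many of its bytes are local to each drillbit.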
    rowGroupInfos = Lists.newArrayList();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        int rgIndex = 0;
        for (RowGroupMetadata rg : file.getRowGroups()) {
            RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex, rg.getRowCount());
            EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
            for (String host : rg.getHostAffinity().keySet()) {
                if (hostEndpointMap.containsKey(host)) {
                    endpointByteMap.add(hostEndpointMap.get(host), (long) (rg.getHostAffinity().get(host) * rg.getLength()));
                }
            }
            rowGroupInfo.setEndpointByteMap(endpointByteMap);
            rgIndex++;
            rowGroupInfos.add(rowGroupInfo);
        }
    }
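    // Derive overall endpoint affinities, per-column value counts, and partition column information.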
    this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);
    columnValueCounts = Maps.newHashMap();
    this.rowCount = 0;
    boolean first = true;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            long rowCount = rowGroup.getRowCount();
            for (ColumnMetadata column : rowGroup.getColumns()) {
                SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
                Long previousCount = columnValueCounts.get(schemaPath);
                if (previousCount != null) {
                    if (previousCount != GroupScan.NO_COLUMN_STATS) {
                        if (column.getNulls() != null) {
                            Long newCount = rowCount - column.getNulls();
                            columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
                        }
                    }
                } else {
                    if (column.getNulls() != null) {
                        Long newCount = rowCount - column.getNulls();
                        columnValueCounts.put(schemaPath, newCount);
                    } else {
                        columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
                    }
                }
                boolean partitionColumn = checkForPartitionColumn(column, first);
                if (partitionColumn) {
                    Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
                    if (map == null) {
                        map = Maps.newHashMap();
                        partitionValueMap.put(file.getPath(), map);
                    }
                    Object value = map.get(schemaPath);
                    Object currentValue = column.getMaxValue();
                    if (value != null) {
                        if (value != currentValue) {
                            partitionColTypeMap.remove(schemaPath);
                        }
                    } else {
                        map.put(schemaPath, currentValue);
                    }
                } else {
                    partitionColTypeMap.remove(schemaPath);
                }
            }
            this.rowCount += rowGroup.getRowCount();
            first = false;
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath), ColumnMetadata (org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata), DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl)
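
For context, here is a minimal, self-contained sketch of the byte-affinity arithmetic that the row-group loop above feeds into EndpointByteMapImpl. The host names, affinity fractions, and row group size are made-up illustration values, and plain java.util maps stand in for Drill's metadata and EndpointByteMap classes:

import java.util.HashMap;
import java.util.Map;

// Illustration only: mirrors the per-row-group arithmetic in init(), using plain
// maps instead of Drill's RowGroupMetadata and EndpointByteMapImpl classes.
public class RowGroupAffinitySketch {

    public static void main(String[] args) {
        // Hypothetical host affinity: fraction of the row group's bytes stored on each host.
        Map<String, Float> hostAffinity = new HashMap<>();
        hostAffinity.put("node1.example.com", 0.75f);
        hostAffinity.put("node2.example.com", 0.25f);

        // Assumed row group length of 128 MB.
        long rowGroupLength = 128L * 1024 * 1024;

        // Hosts that actually run a drillbit; hosts without one are skipped, as in init().
        Map<String, String> hostEndpointMap = new HashMap<>();
        hostEndpointMap.put("node1.example.com", "drillbit@node1");

        // Accumulate bytes per endpoint: the same value init() passes to
        // endpointByteMap.add(endpoint, bytes).
        Map<String, Long> endpointBytes = new HashMap<>();
        for (Map.Entry<String, Float> e : hostAffinity.entrySet()) {
            String endpoint = hostEndpointMap.get(e.getKey());
            if (endpoint != null) {
                long bytes = (long) (e.getValue() * rowGroupLength);
                endpointBytes.merge(endpoint, bytes, Long::sum);
            }
        }
        System.out.println(endpointBytes); // prints {drillbit@node1=100663296}
    }
}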

Example 2 with EndpointByteMapImpl

Use of org.apache.drill.exec.store.schedule.EndpointByteMapImpl in project drill by apache.

From the class TestAssignment, method createByteMap:

private EndpointByteMap createByteMap() {
    EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
    Set<DrillbitEndpoint> usedEndpoints = Sets.newHashSet();
    while (usedEndpoints.size() < 3) {
        usedEndpoints.add(getRandom(endpoints));
    }
    for (DrillbitEndpoint ep : usedEndpoints) {
        endpointByteMap.add(ep, FILE_SIZE);
    }
    return endpointByteMap;
}
Also used: DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint), EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap), EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl)
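
As a rough illustration of how such byte maps are consumed downstream (for example by AffinityCreator.getAffinityMap in Example 1), the following sketch aggregates per-endpoint byte counts across several work units and normalizes them into affinity fractions. The types and endpoint names are simplified stand-ins, not Drill's actual classes:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustration only: aggregates per-endpoint byte counts across work units and
// normalizes them into affinity fractions, roughly the role AffinityCreator plays
// for the byte maps built in createByteMap(). Plain maps and string endpoint
// names are simplified stand-ins for Drill's EndpointByteMap and DrillbitEndpoint.
public class AffinitySketch {

    static Map<String, Double> affinityFractions(List<Map<String, Long>> byteMaps) {
        Map<String, Long> totals = new HashMap<>();
        long grandTotal = 0;
        for (Map<String, Long> byteMap : byteMaps) {
            for (Map.Entry<String, Long> e : byteMap.entrySet()) {
                totals.merge(e.getKey(), e.getValue(), Long::sum);
                grandTotal += e.getValue();
            }
        }
        Map<String, Double> fractions = new HashMap<>();
        for (Map.Entry<String, Long> e : totals.entrySet()) {
            fractions.put(e.getKey(), e.getValue() / (double) grandTotal);
        }
        return fractions;
    }

    public static void main(String[] args) {
        long fileSize = 1024; // stands in for the test's FILE_SIZE constant
        Map<String, Long> unit1 = Map.of("ep1", fileSize, "ep2", fileSize);
        Map<String, Long> unit2 = Map.of("ep2", fileSize, "ep3", fileSize);
        System.out.println(affinityFractions(List.of(unit1, unit2)));
        // prints {ep1=0.25, ep2=0.5, ep3=0.25}
    }
}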

Aggregations

DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint): 2 usages
EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap): 2 usages
EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl): 2 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 1 usage
ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath): 1 usage
ColumnMetadata (org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata): 1 usage
ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata): 1 usage
RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata): 1 usage
FileStatus (org.apache.hadoop.fs.FileStatus): 1 usage
Path (org.apache.hadoop.fs.Path): 1 usage