
Example 1 with TableScan

Use of org.apache.iceberg.TableScan in project presto by prestodb.

The class FilesTable, method buildPages:

private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable, Optional<Long> snapshotId) {
    PageListBuilder pagesBuilder = forTable(tableMetadata);
    TableScan tableScan = getTableScan(TupleDomain.all(), snapshotId, icebergTable).includeColumnStats();
    Map<Integer, Type> idToTypeMap = getIdToTypeMap(icebergTable.schema());
    tableScan.planFiles().forEach(fileScanTask -> {
        DataFile dataFile = fileScanTask.file();
        pagesBuilder.beginRow();
        pagesBuilder.appendVarchar(dataFile.path().toString());
        pagesBuilder.appendVarchar(dataFile.format().name());
        pagesBuilder.appendBigint(dataFile.recordCount());
        pagesBuilder.appendBigint(dataFile.fileSizeInBytes());
        if (checkNonNull(dataFile.columnSizes(), pagesBuilder)) {
            pagesBuilder.appendIntegerBigintMap(dataFile.columnSizes());
        }
        if (checkNonNull(dataFile.valueCounts(), pagesBuilder)) {
            pagesBuilder.appendIntegerBigintMap(dataFile.valueCounts());
        }
        if (checkNonNull(dataFile.nullValueCounts(), pagesBuilder)) {
            pagesBuilder.appendIntegerBigintMap(dataFile.nullValueCounts());
        }
        if (checkNonNull(dataFile.lowerBounds(), pagesBuilder)) {
            pagesBuilder.appendIntegerVarcharMap(dataFile.lowerBounds().entrySet().stream()
                    .collect(toImmutableMap(
                            Map.Entry<Integer, ByteBuffer>::getKey,
                            entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                                    .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
        }
        if (checkNonNull(dataFile.upperBounds(), pagesBuilder)) {
            pagesBuilder.appendIntegerVarcharMap(dataFile.upperBounds().entrySet().stream()
                    .collect(toImmutableMap(
                            Map.Entry<Integer, ByteBuffer>::getKey,
                            entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                                    .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
        }
        if (checkNonNull(dataFile.keyMetadata(), pagesBuilder)) {
            pagesBuilder.appendVarbinary(Slices.wrappedBuffer(dataFile.keyMetadata()));
        }
        if (checkNonNull(dataFile.splitOffsets(), pagesBuilder)) {
            pagesBuilder.appendBigintArray(dataFile.splitOffsets());
        }
        pagesBuilder.endRow();
    });
    return pagesBuilder.build();
}
Also used : DataFile(org.apache.iceberg.DataFile) PageListBuilder(com.facebook.presto.iceberg.util.PageListBuilder) IcebergUtil.getTableScan(com.facebook.presto.iceberg.IcebergUtil.getTableScan) TableScan(org.apache.iceberg.TableScan) ArrayType(com.facebook.presto.common.type.ArrayType) Type(org.apache.iceberg.types.Type)
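
As a companion to the listing above, here is a minimal sketch of the underlying Iceberg pattern: plan the data files of a scan with column statistics attached and read per-file metadata. The class name FileMetadataDump is illustrative, and the Table instance is assumed to be loaded already; only the TableScan calls mirror buildPages.

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;

public final class FileMetadataDump {
    // Prints basic metadata for every data file in the current snapshot.
    // includeColumnStats() keeps per-column metrics (sizes, value counts,
    // bounds) on the returned DataFiles, as buildPages() requires above.
    public static void dump(Table table) {
        TableScan scan = table.newScan().includeColumnStats();
        try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
            for (FileScanTask task : tasks) {
                DataFile file = task.file();
                System.out.printf("%s %s %d records %d bytes%n",
                        file.path(), file.format(), file.recordCount(), file.fileSizeInBytes());
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}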

Example 2 with TableScan

Use of org.apache.iceberg.TableScan in project presto by prestodb.

The class IcebergUtil, method getTableScan:

public static TableScan getTableScan(TupleDomain<IcebergColumnHandle> predicates, Optional<Long> snapshotId, Table icebergTable) {
    Expression expression = ExpressionConverter.toIcebergExpression(predicates);
    TableScan tableScan = icebergTable.newScan().filter(expression);
    return snapshotId.map(id -> isSnapshot(icebergTable, id) ? tableScan.useSnapshot(id) : tableScan.asOfTime(id)).orElse(tableScan);
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) MetastoreContext(com.facebook.presto.hive.metastore.MetastoreContext) ICEBERG_TABLE_TYPE_VALUE(org.apache.iceberg.BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE) PrestoException(com.facebook.presto.spi.PrestoException) WRITE_LOCATION_PROVIDER_IMPL(org.apache.iceberg.TableProperties.WRITE_LOCATION_PROVIDER_IMPL) TABLE_TYPE_PROP(org.apache.iceberg.BaseMetastoreTableOperations.TABLE_TYPE_PROP) PartitionField(org.apache.iceberg.PartitionField) LocationProvider(org.apache.iceberg.io.LocationProvider) TableOperations(org.apache.iceberg.TableOperations) SchemaTableName(com.facebook.presto.spi.SchemaTableName) Expression(org.apache.iceberg.expressions.Expression) ExtendedHiveMetastore(com.facebook.presto.hive.metastore.ExtendedHiveMetastore) HistoryEntry(org.apache.iceberg.HistoryEntry) Locale(java.util.Locale) TypeManager(com.facebook.presto.common.type.TypeManager) Map(java.util.Map) TABLE_COMMENT(com.facebook.presto.hive.HiveMetadata.TABLE_COMMENT) DEFAULT_FILE_FORMAT(org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT) HdfsContext(com.facebook.presto.hive.HdfsContext) LocationProviders.locationsFor(org.apache.iceberg.LocationProviders.locationsFor) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) HiveColumnConverterProvider(com.facebook.presto.hive.HiveColumnConverterProvider) BaseTable(org.apache.iceberg.BaseTable) ImmutableMap(com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TableScan(org.apache.iceberg.TableScan) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) String.format(java.lang.String.format) ConnectorSession(com.facebook.presto.spi.ConnectorSession) Streams.stream(com.google.common.collect.Streams.stream) List(java.util.List) IcebergPrestoModelConverters.toIcebergTableIdentifier(com.facebook.presto.iceberg.util.IcebergPrestoModelConverters.toIcebergTableIdentifier) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) PartitionSpec(org.apache.iceberg.PartitionSpec) Optional(java.util.Optional) Pattern(java.util.regex.Pattern) DEFAULT_FILE_FORMAT_DEFAULT(org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) ICEBERG_INVALID_SNAPSHOT_ID(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_INVALID_SNAPSHOT_ID) Lists.reverse(com.google.common.collect.Lists.reverse) Snapshot(org.apache.iceberg.Snapshot)
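
getTableScan folds two time-travel entry points into a single Optional<Long>: a value that matches a known snapshot ID goes to useSnapshot, anything else is treated as an epoch-millisecond timestamp for asOfTime. A hedged sketch of the two underlying Iceberg calls (TimeTravelScans is an illustrative name; the Table instance is assumed to be loaded):

import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;

public final class TimeTravelScans {
    // Scan the table exactly as of a given snapshot ID.
    public static TableScan atSnapshot(Table table, long snapshotId) {
        return table.newScan().useSnapshot(snapshotId);
    }

    // Scan the most recent snapshot committed at or before the given time.
    public static TableScan atTime(Table table, long epochMillis) {
        return table.newScan().asOfTime(epochMillis);
    }
}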

Example 3 with TableScan

Use of org.apache.iceberg.TableScan in project hive by apache.

The class IcebergInputFormat, method getSplits:

@Override
public List<InputSplit> getSplits(JobContext context) {
    Configuration conf = context.getConfiguration();
    Table table = Optional.ofNullable(HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER))).orElseGet(() -> Catalogs.loadTable(conf));
    TableScan scan = createTableScan(table, conf);
    List<InputSplit> splits = Lists.newArrayList();
    boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
    InputFormatConfig.InMemoryDataModel model = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC);
    try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
        Table serializableTable = SerializableTable.copyOf(table);
        tasksIterable.forEach(task -> {
            if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE || model == InputFormatConfig.InMemoryDataModel.PIG)) {
                // TODO: We do not support residual evaluation for the HIVE and PIG in-memory data models yet
                checkResiduals(task);
            }
            splits.add(new IcebergSplit(serializableTable, conf, task));
        });
    } catch (IOException e) {
        throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
    }
    // For plain data table scans, skip serializing the IO config into the splits to keep them small. Metadata
    // table scans are excluded because some of their tasks cache the IO object, so we
    // wouldn't be able to inject the config into these tasks on the deserializer-side, unlike for standard queries.
    if (scan instanceof DataTableScan) {
        HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table);
    }
    return splits;
}
Also used : TableScan(org.apache.iceberg.TableScan) DataTableScan(org.apache.iceberg.DataTableScan) Table(org.apache.iceberg.Table) SerializableTable(org.apache.iceberg.SerializableTable) CombinedScanTask(org.apache.iceberg.CombinedScanTask) Configuration(org.apache.hadoop.conf.Configuration) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
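
Each CombinedScanTask returned by planTasks() bundles one or more FileScanTasks into a unit shaped by the split-size options, and getSplits wraps each bundle in an IcebergSplit. A minimal sketch of walking the bundles directly (TaskWalker is an illustrative name; table loading is assumed):

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public final class TaskWalker {
    public static void walk(Table table) {
        // planTasks() groups file scan tasks into combined tasks; the
        // iterable holds resources and must be closed when done.
        try (CloseableIterable<CombinedScanTask> tasks = table.newScan().planTasks()) {
            for (CombinedScanTask combined : tasks) {
                for (FileScanTask file : combined.files()) {
                    System.out.println(file.file().path());
                }
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}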

Example 4 with TableScan

Use of org.apache.iceberg.TableScan in project hive by apache.

The class IcebergInputFormat, method createTableScan:

private static TableScan createTableScan(Table table, Configuration conf) {
    TableScan scan = table.newScan().caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT));
    long snapshotId = conf.getLong(InputFormatConfig.SNAPSHOT_ID, -1);
    if (snapshotId != -1) {
        scan = scan.useSnapshot(snapshotId);
    }
    long asOfTime = conf.getLong(InputFormatConfig.AS_OF_TIMESTAMP, -1);
    if (asOfTime != -1) {
        scan = scan.asOfTime(asOfTime);
    }
    long splitSize = conf.getLong(InputFormatConfig.SPLIT_SIZE, 0);
    if (splitSize > 0) {
        scan = scan.option(TableProperties.SPLIT_SIZE, String.valueOf(splitSize));
    }
    // In case of LLAP-based execution we ask Iceberg not to combine multiple fileScanTasks into one split.
    // This is so that cache affinity can work, and each file (split) is always executed/cached on the same LLAP daemon.
    MapWork mapWork = LlapHiveUtils.findMapWork((JobConf) conf);
    if (mapWork != null && mapWork.getCacheAffinity()) {
        // Iceberg splits logically consist of buckets, where the bucket size equals the openFileCost setting if the files
        // assigned to such a bucket are smaller. This is how Iceberg combines multiple files into one split, so here
        // we need to force the bucket size to equal the split size to avoid file combination.
        Long openFileCost = splitSize > 0 ? splitSize : TableProperties.SPLIT_SIZE_DEFAULT;
        scan = scan.option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(openFileCost));
    }
    String schemaStr = conf.get(InputFormatConfig.READ_SCHEMA);
    if (schemaStr != null) {
        // project() returns a new immutable scan, so the result must be reassigned
        scan = scan.project(SchemaParser.fromJson(schemaStr));
    }
    String[] selectedColumns = conf.getStrings(InputFormatConfig.SELECTED_COLUMNS);
    if (selectedColumns != null) {
        scan = scan.select(selectedColumns);
    }
    // TODO add a filter parser to get rid of Serialization
    Expression filter = SerializationUtil.deserializeFromBase64(conf.get(InputFormatConfig.FILTER_EXPRESSION));
    if (filter != null) {
        scan = scan.filter(filter);
    }
    return scan;
}
Also used : TableScan(org.apache.iceberg.TableScan) DataTableScan(org.apache.iceberg.DataTableScan) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Expression(org.apache.iceberg.expressions.Expression)
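
Every TableScan refinement (filter, select, project, option, useSnapshot, asOfTime) returns a new immutable scan rather than mutating the receiver, which is why createTableScan reassigns scan at each step. A two-line illustration, assuming a loaded Table named table:

TableScan scan = table.newScan();
scan.select("id");        // no effect: the refined scan is discarded
scan = scan.select("id"); // correct: keep the returned scan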

Example 5 with TableScan

Use of org.apache.iceberg.TableScan in project presto by prestodb.

The class TableStatisticsMaker, method makeTableStatistics:

private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (!tableHandle.getSnapshotId().isPresent() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    List<Types.NestedField> columns = icebergTable.schema().columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream()
            .filter(column -> column.type().isPrimitiveType())
            .collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream()
            .map(PartitionField::sourceId)
            .collect(toSet());
    List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream()
            .filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType())
            .collect(toImmutableList());
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toPrestoType(type, typeManager), type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
    TableScan tableScan = icebergTable.newScan().filter(toIcebergExpression(intersection)).useSnapshot(tableHandle.getSnapshotId().get()).includeColumnStats();
    Partition summary = null;
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
                continue;
            }
            if (summary == null) {
                summary = new Partition(idToTypeMapping, nonPartitionPrimitiveColumns, dataFile.partition(), dataFile.recordCount(), dataFile.fileSizeInBytes(), toMap(idToTypeMapping, dataFile.lowerBounds()), toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.columnSizes());
            } else {
                summary.incrementFileCount();
                summary.incrementRecordCount(dataFile.recordCount());
                summary.incrementSize(dataFile.fileSizeInBytes());
                updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                summary.updateNullCount(dataFile.nullValueCounts());
                updateColumnSizes(summary, dataFile.columnSizes());
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    if (summary == null) {
        return TableStatistics.empty();
    }
    double recordCount = summary.getRecordCount();
    TableStatistics.Builder result = TableStatistics.builder();
    result.setRowCount(Estimate.of(recordCount));
    result.setTotalSize(Estimate.of(summary.getSize()));
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min instanceof Number && max instanceof Number) {
            columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
        }
        result.setColumnStatistics(columnHandle, columnBuilder.build());
    }
    return result.build();
}
Also used : Types(org.apache.iceberg.types.Types) ColumnStatistics(com.facebook.presto.spi.statistics.ColumnStatistics) TableStatistics(com.facebook.presto.spi.statistics.TableStatistics) PartitionField(org.apache.iceberg.PartitionField) DoubleRange(com.facebook.presto.spi.statistics.DoubleRange) ImmutableList(com.google.common.collect.ImmutableList) Partition.toMap(com.facebook.presto.iceberg.Partition.toMap) TypeManager(com.facebook.presto.common.type.TypeManager) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) IcebergUtil.getIdentityPartitions(com.facebook.presto.iceberg.IcebergUtil.getIdentityPartitions) FileScanTask(org.apache.iceberg.FileScanTask) DataFile(org.apache.iceberg.DataFile) ExpressionConverter.toIcebergExpression(com.facebook.presto.iceberg.ExpressionConverter.toIcebergExpression) IcebergUtil.getColumns(com.facebook.presto.iceberg.IcebergUtil.getColumns) Collectors.toSet(java.util.stream.Collectors.toSet) Comparators(org.apache.iceberg.types.Comparators) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) CloseableIterable(org.apache.iceberg.io.CloseableIterable) NullableValue(com.facebook.presto.common.predicate.NullableValue) ImmutableMap(com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) Predicate(java.util.function.Predicate) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Constraint(com.facebook.presto.spi.Constraint) TableScan(org.apache.iceberg.TableScan) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) Type(org.apache.iceberg.types.Type) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Estimate(com.facebook.presto.spi.statistics.Estimate) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) Comparator(java.util.Comparator)
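
The toMap(idToTypeMapping, ...) calls above turn the raw ByteBuffer bounds stored on each DataFile into typed Java values. The Presto helper itself is not shown in this listing, but a minimal sketch of that conversion using Iceberg's Conversions class might look as follows (BoundsDecoder is an illustrative name):

import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;

public final class BoundsDecoder {
    // Decode per-column lower/upper bounds (field ID -> serialized value)
    // into Java objects, given the field ID -> primitive type mapping
    // derived from the table schema.
    public static Map<Integer, Object> decode(Map<Integer, Type.PrimitiveType> idToType,
                                              Map<Integer, ByteBuffer> bounds) {
        Map<Integer, Object> decoded = new HashMap<>();
        for (Map.Entry<Integer, ByteBuffer> entry : bounds.entrySet()) {
            Type.PrimitiveType type = idToType.get(entry.getKey());
            if (type != null) {
                decoded.put(entry.getKey(), Conversions.fromByteBuffer(type, entry.getValue()));
            }
        }
        return decoded;
    }
}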

Aggregations

TableScan (org.apache.iceberg.TableScan) 7
Table (org.apache.iceberg.Table) 4
Map (java.util.Map) 3
TupleDomain (com.facebook.presto.common.predicate.TupleDomain) 2
TypeManager (com.facebook.presto.common.type.TypeManager) 2
ExtendedHiveMetastore (com.facebook.presto.hive.metastore.ExtendedHiveMetastore) 2
TypeConverter.toPrestoType (com.facebook.presto.iceberg.TypeConverter.toPrestoType) 2
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 2
IOException (java.io.IOException) 2
UncheckedIOException (java.io.UncheckedIOException) 2
List (java.util.List) 2
Optional (java.util.Optional) 2
DataFile (org.apache.iceberg.DataFile) 2
Expression (org.apache.iceberg.expressions.Expression) 2
NullableValue (com.facebook.presto.common.predicate.NullableValue) 1
ArrayType (com.facebook.presto.common.type.ArrayType) 1
HdfsContext (com.facebook.presto.hive.HdfsContext) 1
HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment) 1
HiveColumnConverterProvider (com.facebook.presto.hive.HiveColumnConverterProvider) 1