Use of org.apache.iceberg.TableScan in project presto by prestodb.
The class FilesTable, method buildPages.
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable, Optional<Long> snapshotId) {
    PageListBuilder pagesBuilder = forTable(tableMetadata);
    TableScan tableScan = getTableScan(TupleDomain.all(), snapshotId, icebergTable).includeColumnStats();
    Map<Integer, Type> idToTypeMap = getIdToTypeMap(icebergTable.schema());
    tableScan.planFiles().forEach(fileScanTask -> {
        DataFile dataFile = fileScanTask.file();
        pagesBuilder.beginRow();
        pagesBuilder.appendVarchar(dataFile.path().toString());
        pagesBuilder.appendVarchar(dataFile.format().name());
        pagesBuilder.appendBigint(dataFile.recordCount());
        pagesBuilder.appendBigint(dataFile.fileSizeInBytes());
        if (checkNonNull(dataFile.columnSizes(), pagesBuilder)) {
            pagesBuilder.appendIntegerBigintMap(dataFile.columnSizes());
        }
        if (checkNonNull(dataFile.valueCounts(), pagesBuilder)) {
            pagesBuilder.appendIntegerBigintMap(dataFile.valueCounts());
        }
        if (checkNonNull(dataFile.nullValueCounts(), pagesBuilder)) {
            pagesBuilder.appendIntegerBigintMap(dataFile.nullValueCounts());
        }
        if (checkNonNull(dataFile.lowerBounds(), pagesBuilder)) {
            pagesBuilder.appendIntegerVarcharMap(dataFile.lowerBounds().entrySet().stream()
                    .collect(toImmutableMap(
                            Map.Entry<Integer, ByteBuffer>::getKey,
                            entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                                    .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
        }
        if (checkNonNull(dataFile.upperBounds(), pagesBuilder)) {
            pagesBuilder.appendIntegerVarcharMap(dataFile.upperBounds().entrySet().stream()
                    .collect(toImmutableMap(
                            Map.Entry<Integer, ByteBuffer>::getKey,
                            entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                                    .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
        }
        if (checkNonNull(dataFile.keyMetadata(), pagesBuilder)) {
            pagesBuilder.appendVarbinary(Slices.wrappedBuffer(dataFile.keyMetadata()));
        }
        if (checkNonNull(dataFile.splitOffsets(), pagesBuilder)) {
            pagesBuilder.appendBigintArray(dataFile.splitOffsets());
        }
        pagesBuilder.endRow();
    });
    return pagesBuilder.build();
}
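For comparison, the same file-level metadata can be read with the bare Iceberg API, outside of Presto's PageListBuilder. This is a minimal sketch, not the Presto code path: the table location is a placeholder, and planFiles() is closed explicitly with try-with-resources since it returns a CloseableIterable.

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.CloseableIterable;

public class ListDataFiles {
    public static void main(String[] args) {
        // Placeholder table location; any Iceberg table reachable through HadoopTables works here.
        Table table = new HadoopTables(new Configuration()).load("/tmp/warehouse/db/events");
        // includeColumnStats() keeps column sizes, value counts and bounds on the returned DataFiles.
        TableScan scan = table.newScan().includeColumnStats();
        try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
            for (FileScanTask task : tasks) {
                DataFile file = task.file();
                System.out.printf("%s format=%s records=%d bytes=%d%n",
                        file.path(), file.format(), file.recordCount(), file.fileSizeInBytes());
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}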
Use of org.apache.iceberg.TableScan in project presto by prestodb.
The class IcebergUtil, method getTableScan.
public static TableScan getTableScan(TupleDomain<IcebergColumnHandle> predicates, Optional<Long> snapshotId, Table icebergTable) {
    Expression expression = ExpressionConverter.toIcebergExpression(predicates);
    TableScan tableScan = icebergTable.newScan().filter(expression);
    return snapshotId
            .map(id -> isSnapshot(icebergTable, id) ? tableScan.useSnapshot(id) : tableScan.asOfTime(id))
            .orElse(tableScan);
}
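A hedged sketch of the same snapshot-versus-timestamp decision against a plain Iceberg Table, without the Presto helpers. scanAt and idOrTimestamp are hypothetical names, and table.snapshot(id) != null stands in for the isSnapshot check used above.

// Sketch: choose between snapshot-id and timestamp time travel for a TableScan.
static TableScan scanAt(Table table, Long idOrTimestamp) {
    TableScan scan = table.newScan();
    if (idOrTimestamp == null) {
        return scan;                            // read the current snapshot
    }
    if (table.snapshot(idOrTimestamp) != null) {
        return scan.useSnapshot(idOrTimestamp); // the value is a known snapshot id
    }
    return scan.asOfTime(idOrTimestamp);        // otherwise treat it as epoch millis
}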
Use of org.apache.iceberg.TableScan in project hive by apache.
The class IcebergInputFormat, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext context) {
    Configuration conf = context.getConfiguration();
    Table table = Optional.ofNullable(HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER)))
            .orElseGet(() -> Catalogs.loadTable(conf));
    TableScan scan = createTableScan(table, conf);
    List<InputSplit> splits = Lists.newArrayList();
    boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
    InputFormatConfig.InMemoryDataModel model =
            conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC);
    try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
        Table serializableTable = SerializableTable.copyOf(table);
        tasksIterable.forEach(task -> {
            if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE || model == InputFormatConfig.InMemoryDataModel.PIG)) {
                // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
                checkResiduals(task);
            }
            splits.add(new IcebergSplit(serializableTable, conf, task));
        });
    } catch (IOException e) {
        throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
    }
    // Only skip IO config serialization for data table scans; for other scan types Hive
    // wouldn't be able to inject the config into these tasks on the deserializer-side, unlike for standard queries
    if (scan instanceof DataTableScan) {
        HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table);
    }
    return splits;
}
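Underneath the Hive wrapper classes, planTasks() is what groups individual FileScanTasks into CombinedScanTasks according to the split size and open-file cost, and each FileScanTask carries the residual filter that still has to be evaluated against rows. A small sketch with the core API only, assuming table was loaded elsewhere and event_ts is a hypothetical column.

// Needs org.apache.iceberg.{CombinedScanTask, FileScanTask, Table}, org.apache.iceberg.expressions.Expressions,
// org.apache.iceberg.io.CloseableIterable, java.io.IOException, java.io.UncheckedIOException.
try (CloseableIterable<CombinedScanTask> tasks = table.newScan()
        .filter(Expressions.greaterThanOrEqual("event_ts", "2024-01-01T00:00:00"))
        .planTasks()) {
    for (CombinedScanTask combined : tasks) {
        for (FileScanTask fileTask : combined.files()) {
            // residual() is the part of the filter not already guaranteed by partition and file metadata
            System.out.printf("%s [%d, +%d) residual=%s%n",
                    fileTask.file().path(), fileTask.start(), fileTask.length(), fileTask.residual());
        }
    }
} catch (IOException e) {
    throw new UncheckedIOException(e);
}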
Use of org.apache.iceberg.TableScan in project hive by apache.
The class IcebergInputFormat, method createTableScan.
private static TableScan createTableScan(Table table, Configuration conf) {
    TableScan scan = table.newScan()
            .caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT));
    long snapshotId = conf.getLong(InputFormatConfig.SNAPSHOT_ID, -1);
    if (snapshotId != -1) {
        scan = scan.useSnapshot(snapshotId);
    }
    long asOfTime = conf.getLong(InputFormatConfig.AS_OF_TIMESTAMP, -1);
    if (asOfTime != -1) {
        scan = scan.asOfTime(asOfTime);
    }
    long splitSize = conf.getLong(InputFormatConfig.SPLIT_SIZE, 0);
    if (splitSize > 0) {
        scan = scan.option(TableProperties.SPLIT_SIZE, String.valueOf(splitSize));
    }
    // In case of LLAP-based execution we ask Iceberg not to combine multiple fileScanTasks into one split.
    // This is so that cache affinity can work, and each file (split) is always executed/cached on the same LLAP daemon.
    MapWork mapWork = LlapHiveUtils.findMapWork((JobConf) conf);
    if (mapWork != null && mapWork.getCacheAffinity()) {
        // Iceberg splits logically consist of buckets, where the bucket size equals the openFileCost setting if the files
        // assigned to such a bucket are smaller. This is how Iceberg would combine multiple files into one split, so here
        // we need to enforce the bucket size to be equal to the split size to avoid file combination.
        long openFileCost = splitSize > 0 ? splitSize : TableProperties.SPLIT_SIZE_DEFAULT;
        scan = scan.option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(openFileCost));
    }
    String schemaStr = conf.get(InputFormatConfig.READ_SCHEMA);
    if (schemaStr != null) {
        // TableScan is immutable, so the projected/selected scan must be reassigned
        scan = scan.project(SchemaParser.fromJson(schemaStr));
    }
    String[] selectedColumns = conf.getStrings(InputFormatConfig.SELECTED_COLUMNS);
    if (selectedColumns != null) {
        scan = scan.select(selectedColumns);
    }
    // TODO add a filter parser to get rid of Serialization
    Expression filter = SerializationUtil.deserializeFromBase64(conf.get(InputFormatConfig.FILTER_EXPRESSION));
    if (filter != null) {
        scan = scan.filter(filter);
    }
    return scan;
}
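The two scan options set above correspond to Iceberg's read.split.* table properties; the LLAP branch raises the open-file cost to the split size so that no two files ever share a split bucket. A hedged, standalone sketch of setting them directly on a scan (the 128 MB value is illustrative, not a recommendation).

// Sketch: target ~128 MB splits and effectively disable file combining by making each file
// "cost" a whole split (TableProperties.SPLIT_OPEN_FILE_COST == read.split.open-file-cost).
long targetSplitSize = 128L * 1024 * 1024;
TableScan sizedScan = table.newScan()
        .option(TableProperties.SPLIT_SIZE, String.valueOf(targetSplitSize))
        .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(targetSplitSize));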
Use of org.apache.iceberg.TableScan in project presto by prestodb.
The class TableStatisticsMaker, method makeTableStatistics.
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (!tableHandle.getSnapshotId().isPresent() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary()
            .transform(IcebergColumnHandle.class::cast)
            .intersect(tableHandle.getPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    List<Types.NestedField> columns = icebergTable.schema().columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream()
            .filter(column -> column.type().isPrimitiveType())
            .collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream()
            .map(PartitionField::sourceId)
            .collect(toSet());
    List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream()
            .filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType())
            .collect(toImmutableList());
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream()
            .collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(
                field,
                idToColumnHandle.get(field.sourceId()),
                type,
                toPrestoType(type, typeManager),
                type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
    TableScan tableScan = icebergTable.newScan()
            .filter(toIcebergExpression(intersection))
            .useSnapshot(tableHandle.getSnapshotId().get())
            .includeColumnStats();
    Partition summary = null;
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
                continue;
            }
            if (summary == null) {
                summary = new Partition(
                        idToTypeMapping,
                        nonPartitionPrimitiveColumns,
                        dataFile.partition(),
                        dataFile.recordCount(),
                        dataFile.fileSizeInBytes(),
                        toMap(idToTypeMapping, dataFile.lowerBounds()),
                        toMap(idToTypeMapping, dataFile.upperBounds()),
                        dataFile.nullValueCounts(),
                        dataFile.columnSizes());
            } else {
                summary.incrementFileCount();
                summary.incrementRecordCount(dataFile.recordCount());
                summary.incrementSize(dataFile.fileSizeInBytes());
                updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                summary.updateNullCount(dataFile.nullValueCounts());
                updateColumnSizes(summary, dataFile.columnSizes());
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    if (summary == null) {
        return TableStatistics.empty();
    }
    double recordCount = summary.getRecordCount();
    TableStatistics.Builder result = TableStatistics.builder();
    result.setRowCount(Estimate.of(recordCount));
    result.setTotalSize(Estimate.of(summary.getSize()));
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min instanceof Number && max instanceof Number) {
            columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
        }
        result.setColumnStatistics(columnHandle, columnBuilder.build());
    }
    return result.build();
}
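The column statistics above are derived purely from DataFile metadata; no data files are read. A reduced sketch of the same idea with only the Iceberg API, computing the null fraction of a single column: table is assumed to be loaded elsewhere, and user_id is a hypothetical column name.

// Needs org.apache.iceberg.{DataFile, FileScanTask, Table}, org.apache.iceberg.io.CloseableIterable,
// java.util.Map, java.io.IOException, java.io.UncheckedIOException.
int fieldId = table.schema().findField("user_id").fieldId();
long rows = 0;
long nulls = 0;
try (CloseableIterable<FileScanTask> files = table.newScan().includeColumnStats().planFiles()) {
    for (FileScanTask task : files) {
        DataFile dataFile = task.file();
        rows += dataFile.recordCount();
        Map<Integer, Long> nullCounts = dataFile.nullValueCounts();
        if (nullCounts != null && nullCounts.containsKey(fieldId)) {
            nulls += nullCounts.get(fieldId);
        }
    }
} catch (IOException e) {
    throw new UncheckedIOException(e);
}
double nullsFraction = rows == 0 ? 0.0 : (double) nulls / rows;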