Use of org.apache.iceberg.PartitionField in project presto by prestodb.
The class PartitionTable, method partitionTypes.
private List<Type> partitionTypes(List<PartitionField> partitionFields) {
    ImmutableList.Builder<Type> partitionTypeBuilder = ImmutableList.builder();
    for (PartitionField partitionField : partitionFields) {
        Type.PrimitiveType sourceType = idToTypeMapping.get(partitionField.sourceId());
        // The partition value type is the transform's result type for the source column's type
        Type type = partitionField.transform().getResultType(sourceType);
        partitionTypeBuilder.add(type);
    }
    return partitionTypeBuilder.build();
}
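A minimal, self-contained sketch of the same idea (the schema, column names, and class name below are made up for illustration and are not taken from the Presto code): a transform's result type is derived from the type of its source column.

import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

public class PartitionTypeSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()));
        PartitionSpec spec = PartitionSpec.builderFor(schema)
                .day("ts")         // day(ts) yields a date-typed partition value
                .bucket("id", 16)  // bucket(id, 16) yields an int-typed partition value
                .build();
        for (PartitionField field : spec.fields()) {
            Type sourceType = schema.findType(field.sourceId());
            Type resultType = field.transform().getResultType(sourceType);
            System.out.println(field.name() + ": " + sourceType + " -> " + resultType);
        }
    }
}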
Use of org.apache.iceberg.PartitionField in project hive by apache.
The class TestHiveIcebergStorageHandlerNoScan, method testAlterTableRenamePartitionColumn.
@Test
public void testAlterTableRenamePartitionColumn() throws Exception {
    TableIdentifier identifier = TableIdentifier.of("default", "customers");
    testTables.createTable(shell, identifier.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, SPEC,
        FileFormat.PARQUET, ImmutableList.of());
    shell.executeStatement("ALTER TABLE default.customers SET PARTITION SPEC (last_name)");
    // Renaming (and reordering) a partition column
    shell.executeStatement("ALTER TABLE default.customers CHANGE last_name family_name string FIRST");
    List<PartitionField> partitionFields = testTables.loadTable(identifier).spec().fields();
    Assert.assertEquals(1, partitionFields.size());
    Assert.assertEquals("family_name", partitionFields.get(0).name());
    // Adding new columns, assigning them as partition columns, then removing one partition column
    shell.executeStatement("ALTER TABLE default.customers ADD COLUMNS (p1 string, p2 string)");
    shell.executeStatement("ALTER TABLE default.customers SET PARTITION SPEC (family_name, p1, p2)");
    shell.executeStatement("ALTER TABLE default.customers CHANGE p1 region string");
    shell.executeStatement("ALTER TABLE default.customers CHANGE p2 city string");
    shell.executeStatement("ALTER TABLE default.customers SET PARTITION SPEC (region, city)");
    List<Object[]> result = shell.executeStatement("DESCRIBE default.customers");
    Assert.assertArrayEquals(new String[] { "family_name", "VOID", null }, result.get(8));
    Assert.assertArrayEquals(new String[] { "region", "IDENTITY", null }, result.get(9));
    Assert.assertArrayEquals(new String[] { "city", "IDENTITY", null }, result.get(10));
}
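The VOID/IDENTITY rows in the DESCRIBE output reflect Iceberg partition spec evolution: on a v1 table, a dropped identity partition field is retained with a void transform. A rough sketch of the same evolution through the Iceberg Java API (assuming table is an org.apache.iceberg.Table already partitioned by an identity field named family_name; this is only an illustration, not the Hive handler's implementation):

// Sketch only: "table" is an assumed org.apache.iceberg.Table handle
table.updateSpec()
        .removeField("family_name")  // on a v1 table the removed field is kept as a void transform
        .addField("region")          // identity partitioning on the new columns
        .addField("city")
        .commit();
table.refresh();
for (PartitionField field : table.spec().fields()) {
    System.out.println(field.name() + " uses transform " + field.transform());
}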
Use of org.apache.iceberg.PartitionField in project hive by apache.
The class HiveIcebergSerDe, method initialize.
@Override
public void initialize(@Nullable Configuration configuration, Properties serDeProperties, Properties partitionProperties) throws SerDeException {
    super.initialize(configuration, serDeProperties, partitionProperties);
    if (serDeProperties.get(InputFormatConfig.TABLE_SCHEMA) != null) {
        this.tableSchema = SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA));
        if (serDeProperties.get(InputFormatConfig.PARTITION_SPEC) != null) {
            PartitionSpec spec = PartitionSpecParser.fromJson(tableSchema, serDeProperties.getProperty(InputFormatConfig.PARTITION_SPEC));
            this.partitionColumns = spec.fields().stream().map(PartitionField::name).collect(Collectors.toList());
        } else {
            this.partitionColumns = ImmutableList.of();
        }
    } else {
        try {
            Table table = IcebergTableUtil.getTable(configuration, serDeProperties);
            // Always prefer the original table schema if there is one
            this.tableSchema = table.schema();
            this.partitionColumns = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toList());
            LOG.info("Using schema from existing table {}", SchemaParser.toJson(tableSchema));
        } catch (Exception e) {
            // During table creation we might not have the schema information from the Iceberg table, nor from the HMS
            // table. In this case we have to generate the schema using the serDeProperties, which contain the info
            // provided in the CREATE TABLE query.
            boolean autoConversion = configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false);
            // If we cannot load the table, try the provided Hive schema
            this.tableSchema = hiveSchemaOrThrow(e, autoConversion);
            // This is only for table creation, so it is ok to have an empty partition column list
            this.partitionColumns = ImmutableList.of();
            // Create the table for CTAS
            if (e instanceof NoSuchTableException &&
                Boolean.parseBoolean(serDeProperties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS))) {
                if (!Catalogs.hiveCatalog(configuration, serDeProperties)) {
                    throw new SerDeException(CTAS_EXCEPTION_MSG);
                }
                createTableForCTAS(configuration, serDeProperties);
            }
        }
    }
    Schema projectedSchema;
    if (serDeProperties.get(HiveIcebergStorageHandler.WRITE_KEY) != null) {
        // When writing out data, we should not do projection pushdown
        projectedSchema = tableSchema;
    } else {
        configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false);
        String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration);
        // When the same table is joined multiple times, some selected columns may be duplicated; in that case a wrong
        // recordStructField position leads to a wrong value or an ArrayIndexOutOfBoundsException
        String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);
        projectedSchema = distinctSelectedColumns.length > 0 ? tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema;
        // Fall back to the full table schema if the selectOperator's columns cannot all be found in the inspector
        if (projectedSchema.columns().size() != distinctSelectedColumns.length) {
            projectedSchema = tableSchema;
        }
    }
    try {
        this.inspector = IcebergObjectInspector.create(projectedSchema);
    } catch (Exception e) {
        throw new SerDeException(e);
    }
}
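The TABLE_SCHEMA and PARTITION_SPEC serde properties carry the schema and partition spec as JSON. A minimal sketch of that round trip (the schema, column names, and class name below are made up for illustration):

import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.types.Types;

public class PartitionSpecJsonSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "customer_id", Types.LongType.get()),
                Types.NestedField.optional(2, "last_name", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("last_name").build();

        // What would be stored in the serde properties: both pieces serialized as JSON
        String schemaJson = SchemaParser.toJson(schema);
        String specJson = PartitionSpecParser.toJson(spec);

        // What initialize() does with them: parse back and keep only the partition field names
        Schema parsedSchema = SchemaParser.fromJson(schemaJson);
        PartitionSpec parsedSpec = PartitionSpecParser.fromJson(parsedSchema, specJson);
        List<String> partitionColumns = parsedSpec.fields().stream()
                .map(PartitionField::name)
                .collect(Collectors.toList());
        System.out.println(partitionColumns);  // [last_name]
    }
}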
Use of org.apache.iceberg.PartitionField in project presto by prestodb.
The class PartitionTable, method buildRecordCursor.
private RecordCursor buildRecordCursor(Map<StructLikeWrapper, Partition> partitions, List<PartitionField> partitionFields) {
    List<Type> partitionTypes = partitionTypes(partitionFields);
    List<? extends Class<?>> partitionColumnClass = partitionTypes.stream().map(type -> type.typeId().javaClass()).collect(toImmutableList());
    int columnCounts = partitionColumnTypes.size() + 3 + columnMetricTypes.size();
    ImmutableList.Builder<List<Object>> records = ImmutableList.builder();
    for (Partition partition : partitions.values()) {
        List<Object> row = new ArrayList<>(columnCounts);
        // add data for partition columns
        for (int i = 0; i < partitionColumnTypes.size(); i++) {
            row.add(convert(partition.getValues().get(i, partitionColumnClass.get(i)), partitionTypes.get(i)));
        }
        // add the top level metrics.
        row.add(partition.getRecordCount());
        row.add(partition.getFileCount());
        row.add(partition.getSize());
        // add column level metrics
        for (int i = 0; i < columnMetricTypes.size(); i++) {
            if (!partition.hasValidColumnMetrics()) {
                row.add(null);
                continue;
            }
            Integer fieldId = nonPartitionPrimitiveColumns.get(i).fieldId();
            Type.PrimitiveType type = idToTypeMapping.get(fieldId);
            Object min = convert(partition.getMinValues().get(fieldId), type);
            Object max = convert(partition.getMaxValues().get(fieldId), type);
            Long nullCount = partition.getNullCounts().get(fieldId);
            row.add(getColumnMetricBlock(columnMetricTypes.get(i), min, max, nullCount));
        }
        records.add(row);
    }
    return new InMemoryRecordSet(resultTypes, records.build()).cursor();
}
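Partition values arrive as an Iceberg StructLike, and each position is read with the Java class that matches the partition type, which is what partition.getValues().get(i, partitionColumnClass.get(i)) does above. A self-contained sketch (the schema, values, and class name are made up for illustration):

import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

public class PartitionValueSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "category", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("category").bucket("id", 4).build();

        // A trivial StructLike holding the partition tuple (category, id_bucket)
        Object[] values = new Object[] { "books", 3 };
        StructLike partitionData = new StructLike() {
            @Override
            public int size() {
                return values.length;
            }

            @Override
            public <T> T get(int pos, Class<T> javaClass) {
                return javaClass.cast(values[pos]);
            }

            @Override
            public <T> void set(int pos, T value) {
                values[pos] = value;
            }
        };

        for (int i = 0; i < spec.fields().size(); i++) {
            PartitionField field = spec.fields().get(i);
            Type resultType = field.transform().getResultType(schema.findType(field.sourceId()));
            // e.g. CharSequence for string values, Integer for bucket values
            Class<?> javaClass = resultType.typeId().javaClass();
            System.out.println(field.name() + " = " + partitionData.get(i, javaClass));
        }
    }
}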
Use of org.apache.iceberg.PartitionField in project presto by prestodb.
The class TableStatisticsMaker, method makeTableStatistics.
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (!tableHandle.getSnapshotId().isPresent() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    List<Types.NestedField> columns = icebergTable.schema().columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream()
            .filter(column -> column.type().isPrimitiveType())
            .collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream()
            .map(PartitionField::sourceId)
            .collect(toSet());
    List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream()
            .filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType())
            .collect(toImmutableList());
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toPrestoType(type, typeManager), type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
    TableScan tableScan = icebergTable.newScan()
            .filter(toIcebergExpression(intersection))
            .useSnapshot(tableHandle.getSnapshotId().get())
            .includeColumnStats();
    Partition summary = null;
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
                continue;
            }
            if (summary == null) {
                summary = new Partition(
                        idToTypeMapping,
                        nonPartitionPrimitiveColumns,
                        dataFile.partition(),
                        dataFile.recordCount(),
                        dataFile.fileSizeInBytes(),
                        toMap(idToTypeMapping, dataFile.lowerBounds()),
                        toMap(idToTypeMapping, dataFile.upperBounds()),
                        dataFile.nullValueCounts(),
                        dataFile.columnSizes());
            } else {
                summary.incrementFileCount();
                summary.incrementRecordCount(dataFile.recordCount());
                summary.incrementSize(dataFile.fileSizeInBytes());
                updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                summary.updateNullCount(dataFile.nullValueCounts());
                updateColumnSizes(summary, dataFile.columnSizes());
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    if (summary == null) {
        return TableStatistics.empty();
    }
    double recordCount = summary.getRecordCount();
    TableStatistics.Builder result = TableStatistics.builder();
    result.setRowCount(Estimate.of(recordCount));
    result.setTotalSize(Estimate.of(summary.getSize()));
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min instanceof Number && max instanceof Number) {
            columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
        }
        result.setColumnStatistics(columnHandle, columnBuilder.build());
    }
    return result.build();
}
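The statistics above are derived entirely from per-file metadata that Iceberg keeps in its manifests. A rough sketch of reading that metadata directly (assuming table is an org.apache.iceberg.Table and snapshotId a valid snapshot id; this is an illustration, not the Presto implementation):

// Sketch only: "table" and "snapshotId" are assumed inputs
TableScan scan = table.newScan()
        .useSnapshot(snapshotId)
        .filter(Expressions.alwaysTrue())
        .includeColumnStats();
try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
    for (FileScanTask task : tasks) {
        DataFile file = task.file();
        System.out.printf("%s: %d rows, %d bytes, null counts %s%n",
                file.path(), file.recordCount(), file.fileSizeInBytes(), file.nullValueCounts());
        if (file.lowerBounds() != null) {
            // Decode each column's lower bound, roughly what toMap(idToTypeMapping, dataFile.lowerBounds()) does above
            for (Map.Entry<Integer, ByteBuffer> entry : file.lowerBounds().entrySet()) {
                Types.NestedField column = table.schema().findField(entry.getKey());
                Object min = Conversions.fromByteBuffer(column.type(), entry.getValue());
                System.out.println("  min(" + column.name() + ") = " + min);
            }
        }
    }
} catch (IOException e) {
    throw new UncheckedIOException(e);
}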