Use of org.apache.iceberg.PartitionField in project trino by trinodb.
The class PartitionTable, method partitionTypes:
private List<Type> partitionTypes(List<PartitionField> partitionFields) {
    ImmutableList.Builder<Type> partitionTypeBuilder = ImmutableList.builder();
    for (PartitionField partitionField : partitionFields) {
        Type.PrimitiveType sourceType = idToTypeMapping.get(partitionField.sourceId());
        Type type = partitionField.transform().getResultType(sourceType);
        partitionTypeBuilder.add(type);
    }
    return partitionTypeBuilder.build();
}
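The method relies on Iceberg's Transform API: each PartitionField carries a transform, and Transform.getResultType maps the source column's type to the type of the partition value (an identity transform keeps the source type, while a transform such as bucket produces an integer regardless of the source type). Below is a minimal, self-contained sketch of the same idea, assuming only the public Iceberg API; the class name PartitionResultTypes and the helper resultTypes are illustrative, not part of Trino:
import java.util.ArrayList;
import java.util.List;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Type;

public final class PartitionResultTypes {
    // Hypothetical helper: derive the result type of every partition field in a spec.
    public static List<Type> resultTypes(PartitionSpec spec, Schema schema) {
        List<Type> types = new ArrayList<>();
        for (PartitionField field : spec.fields()) {
            // Look up the source column's type by field id in the table schema.
            Type sourceType = schema.findType(field.sourceId());
            // The transform (identity, bucket, truncate, day, ...) maps it to the partition value type.
            types.add(field.transform().getResultType(sourceType));
        }
        return types;
    }
}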
Use of org.apache.iceberg.PartitionField in project trino by trinodb.
The class TableStatisticsMaker, method makeTableStatistics:
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (tableHandle.getSnapshotId().isEmpty() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary()
            .transformKeys(IcebergColumnHandle.class::cast)
            .intersect(tableHandle.getEnforcedPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    Schema icebergTableSchema = icebergTable.schema();
    List<Types.NestedField> columns = icebergTableSchema.columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = primitiveFieldTypes(icebergTableSchema);
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTableSchema, typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream()
            .collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.fieldId(), new ColumnFieldDetails(
                field,
                idToColumnHandle.get(field.sourceId()),
                type,
                toTrinoType(type, typeManager),
                type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.buildOrThrow();
    TableScan tableScan = icebergTable.newScan()
            .filter(toIcebergExpression(intersection))
            .useSnapshot(tableHandle.getSnapshotId().get())
            .includeColumnStats();
    IcebergStatistics.Builder icebergStatisticsBuilder = new IcebergStatistics.Builder(columns, typeManager);
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, partitionFields, idToDetails)) {
                continue;
            }
            icebergStatisticsBuilder.acceptDataFile(dataFile, fileScanTask.spec());
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    IcebergStatistics summary = icebergStatisticsBuilder.build();
    if (summary.getFileCount() == 0) {
        return TableStatistics.empty();
    }
    ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder();
    double recordCount = summary.getRecordCount();
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min != null && max != null) {
            columnBuilder.setRange(DoubleRange.from(columnHandle.getType(), min, max));
        }
        columnHandleBuilder.put(columnHandle, columnBuilder.build());
    }
    return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.buildOrThrow());
}
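Most of the heavy lifting above is Iceberg's scan planning: newScan().useSnapshot(...).includeColumnStats().planFiles() yields one FileScanTask per data file, and each DataFile carries per-column metrics (record count, null counts, sizes, bounds) that the statistics builder aggregates. A reduced sketch of that aggregation, assuming only the public Iceberg API; NullCountSketch and nullCountsBySourceId are illustrative names, not Trino code:
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public final class NullCountSketch {
    // Hypothetical helper: sum per-field null counts across the data files
    // of one snapshot, using only the Iceberg scan-planning API.
    public static Map<Integer, Long> nullCountsBySourceId(Table table, long snapshotId) {
        Map<Integer, Long> nullCounts = new HashMap<>();
        try (CloseableIterable<FileScanTask> tasks = table.newScan()
                .useSnapshot(snapshotId)
                .includeColumnStats()   // keep column-level metrics on the returned DataFiles
                .planFiles()) {
            for (FileScanTask task : tasks) {
                DataFile file = task.file();
                if (file.nullValueCounts() == null) {
                    continue;           // metrics may be missing for some files
                }
                file.nullValueCounts().forEach((fieldId, count) ->
                        nullCounts.merge(fieldId, count, Long::sum));
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return nullCounts;
    }
}
Column metrics can be absent or partial depending on how the files were written, which is why both this sketch and the Trino method treat missing maps or entries as unknown rather than zero.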
Use of org.apache.iceberg.PartitionField in project trino by trinodb.
The class IcebergMetadata, method getWriteLayout:
private Optional<ConnectorTableLayout> getWriteLayout(Schema tableSchema, PartitionSpec partitionSpec, boolean forceRepartitioning) {
    if (partitionSpec.isUnpartitioned()) {
        return Optional.empty();
    }
    Map<Integer, IcebergColumnHandle> columnById = getColumns(tableSchema, typeManager).stream()
            .collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
    List<IcebergColumnHandle> partitioningColumns = partitionSpec.fields().stream()
            .sorted(Comparator.comparing(PartitionField::sourceId))
            .map(field -> requireNonNull(columnById.get(field.sourceId()), () -> "Cannot find source column for partitioning field " + field))
            .distinct()
            .collect(toImmutableList());
    List<String> partitioningColumnNames = partitioningColumns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(toImmutableList());
    if (!forceRepartitioning && partitionSpec.fields().stream().allMatch(field -> field.transform().isIdentity())) {
        // Do not set partitioningHandle, to let the engine decide whether to repartition data based on statistics.
        return Optional.of(new ConnectorTableLayout(partitioningColumnNames));
    }
    IcebergPartitioningHandle partitioningHandle = new IcebergPartitioningHandle(toPartitionFields(partitionSpec), partitioningColumns);
    return Optional.of(new ConnectorTableLayout(partitioningHandle, partitioningColumnNames));
}
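The decision above hinges on whether every partition field is an identity transform: if so, writes can be clustered on the source columns directly and no Iceberg-specific partitioning handle is needed. A small sketch of that check and of resolving partition fields back to source column names, assuming only the public Iceberg API; WriteLayoutSketch and its method names are illustrative:
import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;

public final class WriteLayoutSketch {
    // Hypothetical helper: true when the spec is partitioned and every partition field
    // is an identity transform, i.e. data can be clustered on the source columns as-is.
    public static boolean isIdentityPartitioned(PartitionSpec spec) {
        return !spec.isUnpartitioned()
                && spec.fields().stream().allMatch(field -> field.transform().isIdentity());
    }

    // Hypothetical helper: names of the source columns backing the partition fields, looked up by id.
    public static List<String> partitionSourceColumnNames(PartitionSpec spec, Schema schema) {
        return spec.fields().stream()
                .map(PartitionField::sourceId)
                .distinct()
                .map(schema::findColumnName)
                .collect(Collectors.toList());
    }
}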
Use of org.apache.iceberg.PartitionField in project hive by apache.
The class HiveVectorizedReader, method reader:
public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant, TaskAttemptContext context) {
    // Tweaks on jobConf here are relevant for this task only, so we need to copy it first, as the context's conf is reused.
    JobConf job = new JobConf((JobConf) context.getConfiguration());
    Path path = new Path(inputFile.location());
    FileFormat format = task.file().format();
    Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
    // Hive by default requires partition columns to be read too. This is not required for identity partition
    // columns, as we will add these as constants later.
    int[] partitionColIndices = null;
    Object[] partitionValues = null;
    PartitionSpec partitionSpec = task.spec();
    List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
    if (!partitionSpec.isUnpartitioned()) {
        List<PartitionField> fields = partitionSpec.fields();
        List<Integer> partitionColIndicesList = Lists.newLinkedList();
        List<Object> partitionValuesList = Lists.newLinkedList();
        for (PartitionField partitionField : fields) {
            if (partitionField.transform().isIdentity()) {
                // Get columns in read schema order (which matches the order of readColumnIds) to find partition column indices
                List<Types.NestedField> columns = task.spec().schema().columns();
                for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
                    if (columns.get(colIdx).fieldId() == partitionField.sourceId()) {
                        // Skip reading identity partition columns from the source file...
                        readColumnIds.remove((Integer) colIdx);
                        // ...and use the corresponding constant value instead
                        partitionColIndicesList.add(colIdx);
                        partitionValuesList.add(idToConstant.get(partitionField.sourceId()));
                        break;
                    }
                }
            }
        }
        partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
        partitionValues = partitionValuesList.toArray(new Object[0]);
        ColumnProjectionUtils.setReadColumns(job, readColumnIds);
    }
    try {
        long start = task.start();
        long length = task.length();
        // TODO: Iceberg currently does not track the last modification time of a file. Until that's added,
        // we need to set Long.MIN_VALUE as the last modification time in the fileId triplet.
        SyntheticFileId fileId = new SyntheticFileId(path, task.file().fileSizeInBytes(), Long.MIN_VALUE);
        RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
        switch (format) {
            case ORC:
                recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
                break;
            case PARQUET:
                recordReader = parquetRecordReader(job, reporter, task, path, start, length);
                break;
            default:
                throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
        }
        return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
    } catch (IOException ioe) {
        throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
    }
}
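The core of the identity-partition handling is independent of Hive: for each identity PartitionField, find the position of its source column among the schema's top-level columns and substitute the constant from idToConstant instead of reading it from the file. A condensed sketch of just that bookkeeping, assuming only the public Iceberg API; IdentityPartitionConstants and ConstantColumn are illustrative names, and the record syntax assumes a recent JDK:
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.types.Types;

public final class IdentityPartitionConstants {
    // Hypothetical holder for a column position in the read schema and its constant value.
    public record ConstantColumn(int position, Object value) {}

    // Hypothetical helper: for each identity partition field, find the position of its source
    // column in the schema's top-level columns and pair it with the supplied constant value.
    public static List<ConstantColumn> identityConstants(PartitionSpec spec, Map<Integer, ?> idToConstant) {
        List<Types.NestedField> columns = spec.schema().columns();
        List<ConstantColumn> result = new ArrayList<>();
        for (PartitionField field : spec.fields()) {
            if (!field.transform().isIdentity()) {
                continue;
            }
            for (int pos = 0; pos < columns.size(); pos++) {
                if (columns.get(pos).fieldId() == field.sourceId()) {
                    result.add(new ConstantColumn(pos, idToConstant.get(field.sourceId())));
                    break;
                }
            }
        }
        return result;
    }
}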
Use of org.apache.iceberg.PartitionField in project presto by prestodb.
The class PartitionTable, method buildRecordCursor:
private RecordCursor buildRecordCursor(Map<StructLikeWrapper, Partition> partitions, List<PartitionField> partitionFields) {
    List<Type> partitionTypes = partitionTypes(partitionFields);
    List<? extends Class<?>> partitionColumnClass = partitionTypes.stream()
            .map(type -> type.typeId().javaClass())
            .collect(toImmutableList());
    int columnCounts = partitionColumnTypes.size() + 3 + columnMetricTypes.size();
    ImmutableList.Builder<List<Object>> records = ImmutableList.builder();
    for (Partition partition : partitions.values()) {
        List<Object> row = new ArrayList<>(columnCounts);
        // add data for partition columns
        for (int i = 0; i < partitionColumnTypes.size(); i++) {
            row.add(convert(partition.getValues().get(i, partitionColumnClass.get(i)), partitionTypes.get(i)));
        }
        // add the top level metrics
        row.add(partition.getRecordCount());
        row.add(partition.getFileCount());
        row.add(partition.getSize());
        // add column level metrics
        for (int i = 0; i < columnMetricTypes.size(); i++) {
            if (!partition.hasValidColumnMetrics()) {
                row.add(null);
                continue;
            }
            Integer fieldId = nonPartitionPrimitiveColumns.get(i).fieldId();
            Type.PrimitiveType type = idToTypeMapping.get(fieldId);
            Object min = convert(partition.getMinValues().get(fieldId), type);
            Object max = convert(partition.getMaxValues().get(fieldId), type);
            Long nullCount = partition.getNullCounts().get(fieldId);
            row.add(getColumnMetricBlock(columnMetricTypes.get(i), min, max, nullCount));
        }
        records.add(row);
    }
    return new InMemoryRecordSet(resultTypes, records.build()).cursor();
}
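Each Partition's values are an Iceberg StructLike, so the cursor reads position i with the Java class that corresponds to the partition value type; that is exactly what partitionColumnClass precomputes via typeId().javaClass(). A standalone sketch of that conversion, assuming only the public Iceberg API; PartitionTupleSketch and partitionValues are illustrative names:
import java.util.ArrayList;
import java.util.List;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.types.Type;

public final class PartitionTupleSketch {
    // Hypothetical helper: read each position of a partition tuple with the Java class
    // that corresponds to the partition field's result type.
    public static List<Object> partitionValues(PartitionSpec spec, Schema schema, StructLike partition) {
        List<PartitionField> fields = spec.fields();
        List<Object> values = new ArrayList<>(fields.size());
        for (int pos = 0; pos < fields.size(); pos++) {
            Type resultType = fields.get(pos).transform()
                    .getResultType(schema.findType(fields.get(pos).sourceId()));
            // StructLike.get(pos, Class) returns the value boxed as that Java class.
            values.add(partition.get(pos, resultType.typeId().javaClass()));
        }
        return values;
    }
}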