Example 21 with PartitionField

Use of org.apache.iceberg.PartitionField in project urban-eureka by errir503.

From the class IcebergSplitSource, method getPartitionKeys:

private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask) {
    StructLike partition = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
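    // maps each identity partition field to its position in the partition tuple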
    Map<Integer, String> partitionKeys = new HashMap<>();
    fieldToIndex.forEach((field, index) -> {
        int id = field.sourceId();
        Type type = spec.schema().findType(id);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partition.get(index, javaClass);
        if (value == null) {
            partitionKeys.put(id, null);
        } else {
            String partitionValue;
            if (type.typeId() == FIXED || type.typeId() == BINARY) {
                // this is safe because Iceberg PartitionData directly wraps the byte array
                partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
            } else {
                partitionValue = value.toString();
            }
            partitionKeys.put(id, partitionValue);
        }
    });
    return Collections.unmodifiableMap(partitionKeys);
}
Also used: HashMap(java.util.HashMap), StructLike(org.apache.iceberg.StructLike), PartitionSpec(org.apache.iceberg.PartitionSpec), ByteBuffer(java.nio.ByteBuffer), PartitionField(org.apache.iceberg.PartitionField), Type(org.apache.iceberg.types.Type)
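The getIdentityPartitions helper is not shown in this excerpt. A minimal sketch of what it plausibly does, assuming it indexes identity partition fields by their position in the partition tuple (the shape is implied by how fieldToIndex is consumed above; this is not the connector's exact code):

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;

private static Map<PartitionField, Integer> getIdentityPartitions(PartitionSpec spec) {
    Map<PartitionField, Integer> fieldToIndex = new LinkedHashMap<>();
    List<PartitionField> fields = spec.fields();
    for (int i = 0; i < fields.size(); i++) {
        // identity transforms copy the source value unchanged, so the value can
        // later be read back from the partition tuple with StructLike.get(index, javaClass)
        if (fields.get(i).transform().isIdentity()) {
            fieldToIndex.put(fields.get(i), i);
        }
    }
    return fieldToIndex;
}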

Example 22 with PartitionField

Use of org.apache.iceberg.PartitionField in project urban-eureka by errir503.

From the class PartitionTable, method partitionTypes:

private List<Type> partitionTypes(List<PartitionField> partitionFields) {
    ImmutableList.Builder<Type> partitionTypeBuilder = ImmutableList.builder();
    for (PartitionField partitionField : partitionFields) {
        Type.PrimitiveType sourceType = idToTypeMapping.get(partitionField.sourceId());
        Type type = partitionField.transform().getResultType(sourceType);
        partitionTypeBuilder.add(type);
    }
    return partitionTypeBuilder.build();
}
Also used: TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType), Type(org.apache.iceberg.types.Type), RowType(com.facebook.presto.common.type.RowType), PartitionField(org.apache.iceberg.PartitionField), ImmutableList(com.google.common.collect.ImmutableList), ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList)
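Each partition column's type is derived from the source column's type via the transform. A small, hedged illustration with an invented schema (the Iceberg calls themselves are standard API):

import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

Schema schema = new Schema(
        Types.NestedField.required(1, "region", Types.StringType.get()),
        Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()),
        Types.NestedField.required(3, "user_id", Types.LongType.get()));
PartitionSpec spec = PartitionSpec.builderFor(schema)
        .identity("region")    // result type: string (same as source)
        .day("event_time")     // result type: date
        .bucket("user_id", 16) // result type: int
        .build();
for (PartitionField field : spec.fields()) {
    Type sourceType = schema.findType(field.sourceId());
    System.out.println(field.name() + " -> " + field.transform().getResultType(sourceType));
}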

Example 23 with PartitionField

Use of org.apache.iceberg.PartitionField in project urban-eureka by errir503.

From the class TableStatisticsMaker, method updatePartitionedStats:

private void updatePartitionedStats(Partition summary, List<PartitionField> partitionFields, Map<Integer, Object> current, Map<Integer, Object> newStats, Predicate<Integer> predicate) {
    for (PartitionField field : partitionFields) {
        int id = field.sourceId();
        if (summary.getCorruptedStats().contains(id)) {
            continue;
        }
        Object newValue = newStats.get(id);
        if (newValue == null) {
            continue;
        }
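        // putIfAbsent returns the previously stored value, or null on first insert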
        Object oldValue = current.putIfAbsent(id, newValue);
        if (oldValue != null) {
            Comparator<Object> comparator = Comparators.forType(summary.getIdToTypeMapping().get(id));
            if (predicate.test(comparator.compare(oldValue, newValue))) {
                current.put(id, newValue);
            }
        }
    }
}
Also used: PartitionField(org.apache.iceberg.PartitionField), Constraint(com.facebook.presto.spi.Constraint)
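The predicate receives the result of comparator.compare(oldValue, newValue) and decides whether the new value should replace the stored one. A minimal sketch of the two call sites such a method typically has; the variable names here (partitionMinValues, newMinValues, and so on) are illustrative, not necessarily the exact ones in TableStatisticsMaker:

// Minimums: replace when the stored value compares greater than the new value.
updatePartitionedStats(summary, partitionFields, partitionMinValues, newMinValues, i -> i > 0);
// Maximums: replace when the stored value compares less than the new value.
updatePartitionedStats(summary, partitionFields, partitionMaxValues, newMaxValues, i -> i < 0);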

Example 24 with PartitionField

Use of org.apache.iceberg.PartitionField in project iceberg by apache.

From the class HiveVectorizedReader, method reader:

public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant, TaskAttemptContext context) {
    JobConf job = (JobConf) context.getConfiguration();
    Path path = new Path(inputFile.location());
    FileFormat format = task.file().format();
    Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
    // Hive by default requires partition columns to be read too. This is not required for identity
    // partition columns, as we will add these as constants later.
    int[] partitionColIndices = null;
    Object[] partitionValues = null;
    PartitionSpec partitionSpec = task.spec();
    if (!partitionSpec.isUnpartitioned()) {
        List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
        List<PartitionField> fields = partitionSpec.fields();
        List<Integer> partitionColIndicesList = Lists.newLinkedList();
        List<Object> partitionValuesList = Lists.newLinkedList();
        for (PartitionField field : fields) {
            if (field.transform().isIdentity()) {
                // Skip reading identity partition columns from source file...
                int hiveColIndex = field.sourceId() - 1;
                readColumnIds.remove((Integer) hiveColIndex);
                // ...and use the corresponding constant value instead
                partitionColIndicesList.add(hiveColIndex);
                partitionValuesList.add(idToConstant.get(field.sourceId()));
            }
        }
        partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
        partitionValues = partitionValuesList.toArray(new Object[0]);
        ColumnProjectionUtils.setReadColumns(job, readColumnIds);
    }
    try {
        long start = task.start();
        long length = task.length();
        RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
        switch(format) {
            case ORC:
                recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length);
                break;
            case PARQUET:
                recordReader = parquetRecordReader(job, reporter, task, path, start, length);
                break;
            default:
                throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
        }
        return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
    } catch (IOException ioe) {
        throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
    }
}
Also used: Path(org.apache.hadoop.fs.Path), Reporter(org.apache.hadoop.mapred.Reporter), IOException(java.io.IOException), FileFormat(org.apache.iceberg.FileFormat), PartitionSpec(org.apache.iceberg.PartitionSpec), NullWritable(org.apache.hadoop.io.NullWritable), VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), PartitionField(org.apache.iceberg.PartitionField), JobConf(org.apache.hadoop.mapred.JobConf)
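The field.sourceId() - 1 arithmetic in the identity-column loop assumes the Iceberg field ids still line up with the Hive column order: for a freshly created flat schema, Iceberg assigns top-level field ids 1..n in column order, while Hive numbers the same columns 0..n-1 (this correspondence is not guaranteed after column drops). A minimal illustration with an invented schema:

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),        // Hive column 0
        Types.NestedField.required(2, "region", Types.StringType.get()),  // Hive column 1
        Types.NestedField.required(3, "amount", Types.DoubleType.get())); // Hive column 2
int hiveColIndex = schema.findField("region").fieldId() - 1; // == 1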

Example 25 with PartitionField

Use of org.apache.iceberg.PartitionField in project iceberg by apache.

From the class SparkBatchQueryScan, method filterAttributes:

@Override
public NamedReference[] filterAttributes() {
    Set<Integer> partitionFieldSourceIds = Sets.newHashSet();
    for (Integer specId : specIds()) {
        PartitionSpec spec = table().specs().get(specId);
        for (PartitionField field : spec.fields()) {
            partitionFieldSourceIds.add(field.sourceId());
        }
    }
    Map<Integer, String> quotedNameById = SparkSchemaUtil.indexQuotedNameById(expectedSchema());
    return partitionFieldSourceIds.stream()
            .filter(fieldId -> expectedSchema().findField(fieldId) != null)
            .map(fieldId -> Spark3Util.toNamedReference(quotedNameById.get(fieldId)))
            .toArray(NamedReference[]::new);
}
Also used: Statistics(org.apache.spark.sql.connector.read.Statistics), LoggerFactory(org.slf4j.LoggerFactory), SparkFilters(org.apache.iceberg.spark.SparkFilters), Spark3Util(org.apache.iceberg.spark.Spark3Util), TableScanUtil(org.apache.iceberg.util.TableScanUtil), PartitionField(org.apache.iceberg.PartitionField), Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists), Expression(org.apache.iceberg.expressions.Expression), Map(java.util.Map), FileScanTask(org.apache.iceberg.FileScanTask), SparkSession(org.apache.spark.sql.SparkSession), NamedReference(org.apache.spark.sql.connector.expressions.NamedReference), Logger(org.slf4j.Logger), Binder(org.apache.iceberg.expressions.Binder), CloseableIterable(org.apache.iceberg.io.CloseableIterable), Table(org.apache.iceberg.Table), SnapshotUtil(org.apache.iceberg.util.SnapshotUtil), Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps), Set(java.util.Set), IOException(java.io.IOException), TableScan(org.apache.iceberg.TableScan), SupportsRuntimeFiltering(org.apache.spark.sql.connector.read.SupportsRuntimeFiltering), Schema(org.apache.iceberg.Schema), SparkSchemaUtil(org.apache.iceberg.spark.SparkSchemaUtil), Collectors(java.util.stream.Collectors), CombinedScanTask(org.apache.iceberg.CombinedScanTask), UncheckedIOException(java.io.UncheckedIOException), Objects(java.util.Objects), ValidationException(org.apache.iceberg.exceptions.ValidationException), Evaluator(org.apache.iceberg.expressions.Evaluator), Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets), List(java.util.List), PartitionSpec(org.apache.iceberg.PartitionSpec), Projections(org.apache.iceberg.expressions.Projections), Filter(org.apache.spark.sql.sources.Filter), Expressions(org.apache.iceberg.expressions.Expressions), SparkReadConf(org.apache.iceberg.spark.SparkReadConf), Collections(java.util.Collections), Snapshot(org.apache.iceberg.Snapshot)
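A hedged illustration of the name-quoting step; indexQuotedNameById and the Schema/Types classes are the same Iceberg utilities used above, while the schema itself is invented:

import java.util.Map;
import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;

Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "location", Types.StructType.of(
                Types.NestedField.required(3, "zip.code", Types.StringType.get()))));
Map<Integer, String> quotedNameById = SparkSchemaUtil.indexQuotedNameById(schema);
// Nested and dotted names come back ready for Spark column resolution,
// e.g. field id 3 maps to something like location.`zip.code`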

Aggregations

PartitionField (org.apache.iceberg.PartitionField): 30 usages
Type (org.apache.iceberg.types.Type): 18 usages
ImmutableList (com.google.common.collect.ImmutableList): 13 usages
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 11 usages
IOException (java.io.IOException): 11 usages
UncheckedIOException (java.io.UncheckedIOException): 9 usages
List (java.util.List): 9 usages
PartitionSpec (org.apache.iceberg.PartitionSpec): 9 usages
TypeConverter.toPrestoType (com.facebook.presto.iceberg.TypeConverter.toPrestoType): 8 usages
Map (java.util.Map): 8 usages
DataFile (org.apache.iceberg.DataFile): 8 usages
FileScanTask (org.apache.iceberg.FileScanTask): 8 usages
Schema (org.apache.iceberg.Schema): 8 usages
Table (org.apache.iceberg.Table): 8 usages
TableScan (org.apache.iceberg.TableScan): 8 usages
Set (java.util.Set): 7 usages
Collectors (java.util.stream.Collectors): 7 usages
CloseableIterable (org.apache.iceberg.io.CloseableIterable): 7 usages
RowType (com.facebook.presto.common.type.RowType): 6 usages
HashMap (java.util.HashMap): 6 usages