Use of org.apache.iceberg.PartitionField in project urban-eureka by errir503.
The class IcebergSplitSource, method getPartitionKeys:
private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask) {
    StructLike partition = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
    Map<Integer, String> partitionKeys = new HashMap<>();

    fieldToIndex.forEach((field, index) -> {
        int id = field.sourceId();
        Type type = spec.schema().findType(id);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partition.get(index, javaClass);
        if (value == null) {
            partitionKeys.put(id, null);
        } else {
            String partitionValue;
            if (type.typeId() == FIXED || type.typeId() == BINARY) {
                // this is safe because Iceberg PartitionData directly wraps the byte array
                partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
            } else {
                partitionValue = value.toString();
            }
            partitionKeys.put(id, partitionValue);
        }
    });

    return Collections.unmodifiableMap(partitionKeys);
}
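The getIdentityPartitions(spec) helper used above is not part of this excerpt. As a rough sketch only (the class name and implementation here are invented, not the project's actual code), it can be thought of as mapping each identity-transformed PartitionField to its position in the partition tuple, so the value can later be fetched with StructLike.get(index, javaClass):

// Illustrative sketch; may differ from the real getIdentityPartitions implementation.
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;

public final class IdentityPartitionsSketch {
    private IdentityPartitionsSketch() {}

    // Map each identity partition field to its index in the partition tuple.
    public static Map<PartitionField, Integer> getIdentityPartitions(PartitionSpec spec) {
        Map<PartitionField, Integer> fieldToIndex = new LinkedHashMap<>();
        List<PartitionField> fields = spec.fields();
        for (int i = 0; i < fields.size(); i++) {
            PartitionField field = fields.get(i);
            if (field.transform().isIdentity()) {
                fieldToIndex.put(field, i);
            }
        }
        return fieldToIndex;
    }
}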
Use of org.apache.iceberg.PartitionField in project urban-eureka by errir503.
The class PartitionTable, method partitionTypes:
private List<Type> partitionTypes(List<PartitionField> partitionFields) {
    ImmutableList.Builder<Type> partitionTypeBuilder = ImmutableList.builder();
    for (PartitionField partitionField : partitionFields) {
        Type.PrimitiveType sourceType = idToTypeMapping.get(partitionField.sourceId());
        Type type = partitionField.transform().getResultType(sourceType);
        partitionTypeBuilder.add(type);
    }
    return partitionTypeBuilder.build();
}
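To make the transform-to-result-type mapping concrete, here is a small, self-contained example (the schema, column names, and spec are made up for illustration) showing how transform().getResultType(sourceType) resolves types such as bucket(long) -> int and day(timestamp) -> date:

// Stand-alone example; schema and partitioning are invented for illustration.
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

public final class PartitionTypeExample {
    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()));
        PartitionSpec spec = PartitionSpec.builderFor(schema)
                .bucket("id", 16)   // bucket(long) -> int
                .day("event_time")  // day(timestamp) -> date
                .build();

        for (PartitionField field : spec.fields()) {
            Type sourceType = schema.findType(field.sourceId());
            Type resultType = field.transform().getResultType(sourceType);
            System.out.println(field.name() + ": " + sourceType + " -> " + resultType);
        }
    }
}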
Use of org.apache.iceberg.PartitionField in project urban-eureka by errir503.
The class TableStatisticsMaker, method updatePartitionedStats:
private void updatePartitionedStats(Partition summary, List<PartitionField> partitionFields, Map<Integer, Object> current, Map<Integer, Object> newStats, Predicate<Integer> predicate) {
    for (PartitionField field : partitionFields) {
        int id = field.sourceId();
        if (summary.getCorruptedStats().contains(id)) {
            continue;
        }
        Object newValue = newStats.get(id);
        if (newValue == null) {
            continue;
        }
        Object oldValue = current.putIfAbsent(id, newValue);
        if (oldValue != null) {
            Comparator<Object> comparator = Comparators.forType(summary.getIdToTypeMapping().get(id));
            if (predicate.test(comparator.compare(oldValue, newValue))) {
                current.put(id, newValue);
            }
        }
    }
}
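The predicate decides when an already-stored value should be replaced: presumably the caller passes something like cmp -> cmp > 0 when merging minimums and cmp -> cmp < 0 when merging maximums. The following stand-alone sketch (ids, values, and types are invented; this is not code from TableStatisticsMaker) shows the same comparator-plus-predicate merge pattern:

// Illustration of merging min/max statistics with a typed comparator and a replacement predicate.
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Predicate;
import org.apache.iceberg.types.Comparators;
import org.apache.iceberg.types.Types;

public final class MinMaxMergeSketch {
    public static void main(String[] args) {
        Comparator<Object> comparator = Comparators.forType(Types.IntegerType.get());

        Map<Integer, Object> currentMin = new HashMap<>();
        currentMin.put(1, 10);
        Map<Integer, Object> newStats = new HashMap<>();
        newStats.put(1, 7);

        // Minimums: replace the stored value when it compares greater than the new one.
        merge(currentMin, newStats, comparator, cmp -> cmp > 0);
        System.out.println(currentMin.get(1)); // 7

        Map<Integer, Object> currentMax = new HashMap<>();
        currentMax.put(1, 10);

        // Maximums: replace the stored value when it compares less than the new one.
        merge(currentMax, newStats, comparator, cmp -> cmp < 0);
        System.out.println(currentMax.get(1)); // 10
    }

    private static void merge(Map<Integer, Object> current, Map<Integer, Object> newStats,
            Comparator<Object> comparator, Predicate<Integer> predicate) {
        newStats.forEach((id, newValue) -> {
            Object oldValue = current.putIfAbsent(id, newValue);
            if (oldValue != null && predicate.test(comparator.compare(oldValue, newValue))) {
                current.put(id, newValue);
            }
        });
    }
}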
Use of org.apache.iceberg.PartitionField in project iceberg by apache.
The class HiveVectorizedReader, method reader:
public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant, TaskAttemptContext context) {
    JobConf job = (JobConf) context.getConfiguration();
    Path path = new Path(inputFile.location());
    FileFormat format = task.file().format();
    Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();

    // Hive by default requires partition columns to be read too. This is not required for identity partition
    // columns, as we will add these as constants later.
    int[] partitionColIndices = null;
    Object[] partitionValues = null;
    PartitionSpec partitionSpec = task.spec();

    if (!partitionSpec.isUnpartitioned()) {
        List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
        List<PartitionField> fields = partitionSpec.fields();
        List<Integer> partitionColIndicesList = Lists.newLinkedList();
        List<Object> partitionValuesList = Lists.newLinkedList();

        for (PartitionField field : fields) {
            if (field.transform().isIdentity()) {
                // Skip reading identity partition columns from source file...
                int hiveColIndex = field.sourceId() - 1;
                readColumnIds.remove((Integer) hiveColIndex);

                // ...and use the corresponding constant value instead
                partitionColIndicesList.add(hiveColIndex);
                partitionValuesList.add(idToConstant.get(field.sourceId()));
            }
        }

        partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
        partitionValues = partitionValuesList.toArray(new Object[0]);
        ColumnProjectionUtils.setReadColumns(job, readColumnIds);
    }

    try {
        long start = task.start();
        long length = task.length();

        RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
        switch (format) {
            case ORC:
                recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length);
                break;
            case PARQUET:
                recordReader = parquetRecordReader(job, reporter, task, path, start, length);
                break;
            default:
                throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
        }

        return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
    } catch (IOException ioe) {
        throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
    }
}
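One detail worth calling out in the loop above is the cast in readColumnIds.remove((Integer) hiveColIndex): List.remove(int) removes by position while List.remove(Object) removes by value, so the index has to be boxed to remove the column id itself. A minimal, self-contained illustration (the values are invented):

// Demonstrates why the (Integer) cast is needed when removing a column id by value.
import java.util.ArrayList;
import java.util.List;

public final class RemoveByValueSketch {
    public static void main(String[] args) {
        List<Integer> readColumnIds = new ArrayList<>(List.of(0, 2, 5));
        int hiveColIndex = 2;

        // Boxing selects List.remove(Object): the value 2 is removed.
        // Without the cast, List.remove(int) would drop the element at position 2 (the value 5).
        readColumnIds.remove((Integer) hiveColIndex);

        System.out.println(readColumnIds); // [0, 5]
    }
}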
Use of org.apache.iceberg.PartitionField in project iceberg by apache.
The class SparkBatchQueryScan, method filterAttributes:
@Override
public NamedReference[] filterAttributes() {
    Set<Integer> partitionFieldSourceIds = Sets.newHashSet();
    for (Integer specId : specIds()) {
        PartitionSpec spec = table().specs().get(specId);
        for (PartitionField field : spec.fields()) {
            partitionFieldSourceIds.add(field.sourceId());
        }
    }

    Map<Integer, String> quotedNameById = SparkSchemaUtil.indexQuotedNameById(expectedSchema());

    return partitionFieldSourceIds.stream()
        .filter(fieldId -> expectedSchema().findField(fieldId) != null)
        .map(fieldId -> Spark3Util.toNamedReference(quotedNameById.get(fieldId)))
        .toArray(NamedReference[]::new);
}
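The filter on expectedSchema().findField(fieldId) != null drops partition source columns that are no longer present in the schema being read, for example after schema or partition evolution. A small, self-contained sketch (the schema here is invented) of how a projection makes findField return null for such an id:

// Shows that a projected schema returns null for field ids it no longer contains.
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public final class ProjectedSchemaFilterSketch {
    public static void main(String[] args) {
        Schema full = new Schema(
                Types.NestedField.required(1, "category", Types.StringType.get()),
                Types.NestedField.required(2, "value", Types.LongType.get()));

        // Project only "value"; field id 1 is no longer part of the read schema.
        Schema projected = full.select("value");

        System.out.println(projected.findField(1)); // null -> this source id would be filtered out
        System.out.println(projected.findField(2)); // the NestedField for "value"
    }
}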