Use of org.apache.iceberg.CombinedScanTask in project hive by apache.
The example below is the getSplits method of the IcebergInputFormat class.
@Override
public List<InputSplit> getSplits(JobContext context) {
  Configuration conf = context.getConfiguration();
  Table table = Optional
      .ofNullable(HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER)))
      .orElseGet(() -> Catalogs.loadTable(conf));

  TableScan scan = createTableScan(table, conf);

  List<InputSplit> splits = Lists.newArrayList();
  boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
  InputFormatConfig.InMemoryDataModel model = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL,
      InputFormatConfig.InMemoryDataModel.GENERIC);

  try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
    Table serializableTable = SerializableTable.copyOf(table);
    tasksIterable.forEach(task -> {
      if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
          model == InputFormatConfig.InMemoryDataModel.PIG)) {
        // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
        checkResiduals(task);
      }
      splits.add(new IcebergSplit(serializableTable, conf, task));
    });
  } catch (IOException e) {
    throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
  }
  // If enabled, skip serializing the FileIO Hadoop config to keep the split size down. However, do not skip it
  // for metadata-table queries, because some metadata tasks cache the IO object and we
  // wouldn't be able to inject the config into these tasks on the deserializer side, unlike for standard queries
  if (scan instanceof DataTableScan) {
    HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table);
  }

  return splits;
}
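
For context, CombinedScanTask is the unit that each IcebergSplit wraps: planTasks() groups one or more FileScanTasks (a data file plus the residual filter left to evaluate at read time) into a bundle sized for a single reader. Below is a minimal sketch, using only the core Iceberg API, of how such tasks can be planned and inspected outside of Hive. The class and method names (CombinedScanTaskExample, inspectTasks) and the printing are illustrative assumptions; the table is assumed to be loaded already, for example via Catalogs.loadTable(conf) as in the method above.

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;

public class CombinedScanTaskExample {

  // Plans a scan of an already-loaded table and walks the resulting task bundles.
  static void inspectTasks(Table table) {
    TableScan scan = table.newScan();
    try (CloseableIterable<CombinedScanTask> tasks = scan.planTasks()) {
      for (CombinedScanTask combined : tasks) {
        // Each CombinedScanTask is one unit of work; in the Hive code above it becomes one IcebergSplit.
        for (FileScanTask fileTask : combined.files()) {
          // A FileScanTask pairs a data file with the residual expression still to be applied when reading.
          System.out.println(fileTask.file().path() + " residual=" + fileTask.residual());
        }
      }
    } catch (IOException e) {
      // planTasks() returns a closeable iterable, so closing it can surface an IOException,
      // mirroring the error handling in getSplits above.
      throw new UncheckedIOException("Failed to close table scan: " + scan, e);
    }
  }
}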