Use of org.apache.iceberg.SerializableTable in project Hive by Apache.
From the class IcebergInputFormat, method getSplits:
@Override
public List<InputSplit> getSplits(JobContext context) {
  Configuration conf = context.getConfiguration();
  Table table = Optional
      .ofNullable(HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER)))
      .orElseGet(() -> Catalogs.loadTable(conf));
  TableScan scan = createTableScan(table, conf);
  List<InputSplit> splits = Lists.newArrayList();
  boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false);
  InputFormatConfig.InMemoryDataModel model =
      conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC);
  try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
    Table serializableTable = SerializableTable.copyOf(table);
    tasksIterable.forEach(task -> {
      if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
          model == InputFormatConfig.InMemoryDataModel.PIG)) {
        // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
        checkResiduals(task);
      }
      splits.add(new IcebergSplit(serializableTable, conf, task));
    });
  } catch (IOException e) {
    throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
  }
  // Skip serializing the FileIO hadoop config only for standard (data table) scans to decrease split size;
  // metadata table tasks cache the IO object, so we wouldn't be able to inject the config into these tasks
  // on the deserializer-side, unlike for standard queries
  if (scan instanceof DataTableScan) {
    HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table);
  }
  return splits;
}
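The point of the SerializableTable copy created above is that, unlike the catalog-backed Table, it survives a Java-serialization round trip and still exposes schema, location and FileIO on the task side without catalog access. Below is a minimal sketch of that round trip using only public Iceberg utilities; the Hadoop-tables warehouse path is hypothetical, and this is an illustration rather than the Hive code itself.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.SerializableTable;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.util.SerializationUtil;

public class SerializableTableRoundTrip {
  public static void main(String[] args) {
    // Hypothetical table location; any catalog-loaded table would work the same way
    Table table = new HadoopTables(new Configuration()).load("/tmp/warehouse/db/tbl");

    // Create a lightweight, serializable snapshot of the table metadata
    Table serializableTable = SerializableTable.copyOf(table);

    // Round-trip through base64, as the split/config serialization does
    String encoded = SerializationUtil.serializeToBase64(serializableTable);
    Table onTaskSide = SerializationUtil.deserializeFromBase64(encoded);

    // The deserialized copy exposes schema and location without talking to a catalog
    System.out.println(onTaskSide.schema().asStruct());
    System.out.println(onTaskSide.location());
  }
}

Note that getSplits above calls SerializableTable.copyOf once, before the task loop, so every IcebergSplit references the same table copy instead of re-copying the metadata per task.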
Use of org.apache.iceberg.SerializableTable in project Hive by Apache.
From the class HiveIcebergStorageHandler, method overlayTableProperties:
/**
 * Stores the serializable table data in the configuration.
 * Currently the following is handled:
 * <ul>
 *   <li>Table - in case the table is serializable</li>
 *   <li>Location</li>
 *   <li>Schema</li>
 *   <li>Partition specification</li>
 *   <li>FileIO for handling table files</li>
 *   <li>Location provider used for file generation</li>
 *   <li>Encryption manager for encryption handling</li>
 * </ul>
 * @param configuration The configuration storing the catalog information
 * @param tableDesc The table descriptor which we want to store in the configuration
 * @param map The map of configuration properties to which we append the serialized data
 */
@VisibleForTesting
static void overlayTableProperties(Configuration configuration, TableDesc tableDesc, Map<String, String> map) {
  Properties props = tableDesc.getProperties();
  Table table = IcebergTableUtil.getTable(configuration, props);
  String schemaJson = SchemaParser.toJson(table.schema());

  Maps.fromProperties(props).entrySet().stream()
      .filter(entry -> !map.containsKey(entry.getKey())) // map overrides tableDesc properties
      .forEach(entry -> map.put(entry.getKey(), entry.getValue()));

  map.put(InputFormatConfig.TABLE_IDENTIFIER, props.getProperty(Catalogs.NAME));
  map.put(InputFormatConfig.TABLE_LOCATION, table.location());
  map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
  props.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(table.spec()));

  // serialize table object into config
  Table serializableTable = SerializableTable.copyOf(table);
  checkAndSkipIoConfigSerialization(configuration, serializableTable);
  map.put(InputFormatConfig.SERIALIZED_TABLE_PREFIX + tableDesc.getTableName(),
      SerializationUtil.serializeToBase64(serializableTable));

  // We need to remove this, otherwise the job.xml will be invalid as column comments are separated with '\0' and
  // the serialization utils fail to serialize this character
  map.remove("columns.comments");

  // save schema into table props as well to avoid repeatedly hitting the HMS during serde initializations
  // this is an exception to the interface documentation, but it's a safe operation to add this property
  props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
}
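The consumer of the SERIALIZED_TABLE_PREFIX property written here is the lookup seen in getSplits above (HiveIcebergStorageHandler.table(conf, ...)), which rebuilds the Table on the reader side. The following is only a rough sketch of what that lookup amounts to, assuming nothing beyond the property key convention used in this method; the class and helper names are hypothetical, not the actual Hive implementation.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.util.SerializationUtil;

public class SerializedTableReader {
  // Hypothetical helper mirroring the read-back path: fetch the base64 payload written by
  // overlayTableProperties and deserialize it into a SerializableTable-backed Table
  static Table tableFromConfig(Configuration conf, String tableName) {
    String encoded = conf.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + tableName);
    return encoded == null ? null : SerializationUtil.deserializeFromBase64(encoded);
  }
}

Keying the payload by table name (SERIALIZED_TABLE_PREFIX + tableDesc.getTableName()) is what lets several Iceberg tables coexist in one job configuration, each with its own serialized copy.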