Use of org.apache.iceberg.catalog.TableIdentifier in project incubator-gobblin by apache.
The class IcebergMetadataWriter, method write().
/**
 * The write method is responsible for processing a GMCE and aggregating the metadata.
 * The logic of this method is:
 * 1. Check whether the table exists; if not, create an Iceberg table
 *    - If completeness is enabled, add the new partition column
 *      {@link #NEW_PARTITION_KEY} to the table
 * 2. Compute the schema from the GMCE and update the cache of candidate schemas
 * 3. Perform the operation requested by the GMCE, i.e. addFile, rewriteFile, dropFile or change_property.
 *
 * Note: this method only aggregates the metadata in the cache without committing;
 * the actual commit is done in the flush method (except for rewrite and drop operations, where preserving
 * the older file information would increase the memory footprint, so we flush them eagerly).
 */
public void write(GobblinMetadataChangeEvent gmce, Map<String, Collection<HiveSpec>> newSpecsMap,
    Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
  TableIdentifier tid = TableIdentifier.of(tableSpec.getTable().getDbName(), tableSpec.getTable().getTableName());
  TableMetadata tableMetadata = tableMetadataMap.computeIfAbsent(tid, t -> new TableMetadata());
  Table table;
  try {
    table = getIcebergTable(tid);
  } catch (NoSuchTableException e) {
    try {
      if (gmce.getOperationType() == OperationType.drop_files
          || gmce.getOperationType() == OperationType.change_property) {
        log.warn("Table {} does not exist, skip processing this {} event", tid.toString(), gmce.getOperationType());
        return;
      }
      table = createTable(gmce, tableSpec);
      tableMetadata.table = Optional.of(table);
    } catch (Exception e1) {
      log.error("skip processing {} for table {}.{} due to error when creating table", gmce.toString(),
          tableSpec.getTable().getDbName(), tableSpec.getTable().getTableName());
      log.debug(e1.toString());
      return;
    }
  }
  computeCandidateSchema(gmce, tid, tableSpec);
  tableMetadata.ensureTxnInit();
  tableMetadata.lowestGMCEEmittedTime = Long.min(tableMetadata.lowestGMCEEmittedTime, gmce.getGMCEmittedTime());
  switch (gmce.getOperationType()) {
    case add_files: {
      updateTableProperty(tableSpec, tid);
      addFiles(gmce, newSpecsMap, table, tableMetadata);
      if (gmce.getTopicPartitionOffsetsRange() != null) {
        mergeOffsets(gmce, tid);
      }
      // compute topic name
      if (!tableMetadata.newProperties.get().containsKey(TOPIC_NAME_KEY)
          && tableMetadata.dataOffsetRange.isPresent() && !tableMetadata.dataOffsetRange.get().isEmpty()) {
        String topicPartition = tableMetadata.dataOffsetRange.get().keySet().iterator().next();
        tableMetadata.newProperties.get()
            .put(TOPIC_NAME_KEY, topicPartition.substring(0, topicPartition.lastIndexOf("-")));
      }
      break;
    }
    case rewrite_files: {
      updateTableProperty(tableSpec, tid);
      rewriteFiles(gmce, newSpecsMap, oldSpecsMap, table, tableMetadata);
      break;
    }
    case drop_files: {
      dropFiles(gmce, oldSpecsMap, table, tableMetadata, tid);
      break;
    }
    case change_property: {
      updateTableProperty(tableSpec, tid);
      if (gmce.getTopicPartitionOffsetsRange() != null) {
        mergeOffsets(gmce, tid);
      }
      log.info("No file operation need to be performed by Iceberg Metadata Writer at this point.");
      break;
    }
    default: {
      log.error("unsupported operation {}", gmce.getOperationType().toString());
      return;
    }
  }
}
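To see the load-or-create pattern from the try/catch above in isolation, here is a minimal, self-contained sketch. It is not Gobblin code: the HadoopCatalog, warehouse path, database, table name, and schema are illustrative assumptions, whereas the real writer resolves its table through getIcebergTable and createTable against its configured catalog.

// Minimal sketch of the load-or-create pattern used by write() above.
// All names below (warehouse path, db/table names, schema) are assumptions for illustration.
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.types.Types;

public class LoadOrCreateExample {
  public static void main(String[] args) {
    HadoopCatalog catalog = new HadoopCatalog(new Configuration(), "/tmp/iceberg-warehouse");
    TableIdentifier tid = TableIdentifier.of("example_db", "example_table");
    Table table;
    try {
      // Corresponds to getIcebergTable(tid): load the table if it already exists.
      table = catalog.loadTable(tid);
    } catch (NoSuchTableException e) {
      // Corresponds to createTable(gmce, tableSpec): create the table on first sight.
      Schema schema = new Schema(
          Types.NestedField.required(1, "id", Types.LongType.get()),
          Types.NestedField.optional(2, "payload", Types.StringType.get()));
      table = catalog.createTable(tid, schema);
    }
    System.out.println("Using table: " + table.name());
  }
}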
Use of org.apache.iceberg.catalog.TableIdentifier in project incubator-gobblin by apache.
The class IcebergMetadataWriter, method writeEnvelope().
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope, Map<String, Collection<HiveSpec>> newSpecsMap,
    Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
  Lock readLock = readWriteLock.readLock();
  readLock.lock();
  try {
    GenericRecord genericRecord = recordEnvelope.getRecord();
    GobblinMetadataChangeEvent gmce =
        (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
    String dbName = tableSpec.getTable().getDbName();
    String tableName = tableSpec.getTable().getTableName();
    if (whitelistBlacklist.acceptTable(dbName, tableName)) {
      TableIdentifier tid = TableIdentifier.of(dbName, tableName);
      String topicPartition =
          tableTopicPartitionMap.computeIfAbsent(tid, t -> recordEnvelope.getWatermark().getSource());
      Long currentWatermark = getAndPersistCurrentWatermark(tid, topicPartition);
      Long currentOffset = ((LongWatermark) recordEnvelope.getWatermark().getWatermark()).getValue();
      if (currentOffset > currentWatermark) {
        if (!tableMetadataMap.computeIfAbsent(tid, t -> new TableMetadata()).lowWatermark.isPresent()) {
          // We haven't registered this table yet, or hit an error before, so reset the low watermark.
          tableMetadataMap.get(tid).lowWatermark = Optional.of(currentOffset - 1);
          tableMetadataMap.get(tid).setDatasetName(gmce.getDatasetIdentifier().getNativeName());
          if (this.newPartitionEnabled && this.newPartitionTableWhitelistBlacklist.acceptTable(dbName, tableName)) {
            tableMetadataMap.get(tid).newPartitionColumnEnabled = true;
            if (this.completenessEnabled && this.completenessWhitelistBlacklist.acceptTable(dbName, tableName)) {
              tableMetadataMap.get(tid).completenessEnabled = true;
            }
          }
        }
        write(gmce, newSpecsMap, oldSpecsMap, tableSpec);
        tableCurrentWatermarkMap.put(tid, currentOffset);
      } else {
        log.warn(String.format("Skip processing record %s since it has lower watermark", genericRecord.toString()));
      }
    } else {
      log.info(String.format("Skip table %s.%s since it's not selected", tableSpec.getTable().getDbName(),
          tableSpec.getTable().getTableName()));
    }
  } finally {
    readLock.unlock();
  }
}
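writeEnvelope gates each record twice before delegating to write(): it holds the shared read lock so that record processing does not race a concurrent flush (which would take the write lock), and it skips records whose offset is not above the table's current watermark. Below is a minimal sketch of that gating logic, assuming plain JDK locks and a simple in-memory watermark map rather than Gobblin's classes; the class, field, and method names here are illustrative assumptions.

// Minimal sketch of the read-lock plus watermark gate applied by writeEnvelope() above.
// Names (WatermarkGateExample, currentWatermarks, process, doWrite) are assumptions for illustration.
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.iceberg.catalog.TableIdentifier;

public class WatermarkGateExample {
  private final ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock();
  private final Map<TableIdentifier, Long> currentWatermarks = new ConcurrentHashMap<>();

  /** Returns true when the record at {@code offset} was processed, false when it was skipped. */
  public boolean process(TableIdentifier tid, long offset, Runnable doWrite) {
    Lock readLock = readWriteLock.readLock();
    readLock.lock();
    try {
      long watermark = currentWatermarks.getOrDefault(tid, -1L);
      if (offset <= watermark) {
        return false;               // stale record: at or below the current watermark, skip it
      }
      doWrite.run();                // corresponds to write(gmce, newSpecsMap, oldSpecsMap, tableSpec)
      currentWatermarks.put(tid, offset);
      return true;
    } finally {
      readLock.unlock();
    }
  }
}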