
Example 86 with TableIdentifier

Use of org.apache.iceberg.catalog.TableIdentifier in project incubator-gobblin by apache.

The class IcebergMetadataWriter, method write.

/**
 * The write method is responsible for processing the gmce and aggregating the metadata.
 * The logic of this function is:
 * 1. Check whether the table exists; if not, create an Iceberg table
 *    - If completeness is enabled, add the new partition column
 *      {@link #NEW_PARTITION_KEY} to the table
 * 2. Compute the schema from the gmce and update the cache of candidate schemas
 * 3. Perform the operation required by the gmce, i.e. addFile, rewriteFile, dropFile or change_property.
 *
 * Note: this method only aggregates the metadata in cache without committing;
 * the actual commit is done in the flush method (except for rewrite and drop operations, where preserving older
 * file information would increase the memory footprint, so we flush them eagerly).
 */
public void write(GobblinMetadataChangeEvent gmce, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
    TableIdentifier tid = TableIdentifier.of(tableSpec.getTable().getDbName(), tableSpec.getTable().getTableName());
    TableMetadata tableMetadata = tableMetadataMap.computeIfAbsent(tid, t -> new TableMetadata());
    Table table;
    try {
        table = getIcebergTable(tid);
    } catch (NoSuchTableException e) {
        try {
            if (gmce.getOperationType() == OperationType.drop_files || gmce.getOperationType() == OperationType.change_property) {
                log.warn("Table {} does not exist, skip processing this {} event", tid.toString(), gmce.getOperationType());
                return;
            }
            table = createTable(gmce, tableSpec);
            tableMetadata.table = Optional.of(table);
        } catch (Exception e1) {
            log.error("skip processing {} for table {}.{} due to error when creating table", gmce.toString(), tableSpec.getTable().getDbName(), tableSpec.getTable().getTableName());
            log.debug(e1.toString());
            return;
        }
    }
    computeCandidateSchema(gmce, tid, tableSpec);
    tableMetadata.ensureTxnInit();
    tableMetadata.lowestGMCEEmittedTime = Long.min(tableMetadata.lowestGMCEEmittedTime, gmce.getGMCEmittedTime());
    switch(gmce.getOperationType()) {
        case add_files:
            {
                updateTableProperty(tableSpec, tid);
                addFiles(gmce, newSpecsMap, table, tableMetadata);
                if (gmce.getTopicPartitionOffsetsRange() != null) {
                    mergeOffsets(gmce, tid);
                }
                // Derive the topic name from the first topic-partition key, e.g. "myTopic-3" -> "myTopic"
                if (!tableMetadata.newProperties.get().containsKey(TOPIC_NAME_KEY) && tableMetadata.dataOffsetRange.isPresent() && !tableMetadata.dataOffsetRange.get().isEmpty()) {
                    String topicPartition = tableMetadata.dataOffsetRange.get().keySet().iterator().next();
                    tableMetadata.newProperties.get().put(TOPIC_NAME_KEY, topicPartition.substring(0, topicPartition.lastIndexOf("-")));
                }
                break;
            }
        case rewrite_files:
            {
                updateTableProperty(tableSpec, tid);
                rewriteFiles(gmce, newSpecsMap, oldSpecsMap, table, tableMetadata);
                break;
            }
        case drop_files:
            {
                dropFiles(gmce, oldSpecsMap, table, tableMetadata, tid);
                break;
            }
        case change_property:
            {
                updateTableProperty(tableSpec, tid);
                if (gmce.getTopicPartitionOffsetsRange() != null) {
                    mergeOffsets(gmce, tid);
                }
                log.info("No file operation need to be performed by Iceberg Metadata Writer at this point.");
                break;
            }
        default:
            {
                log.error("unsupported operation {}", gmce.getOperationType().toString());
                return;
            }
    }
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) AlreadyExistsException(org.apache.iceberg.exceptions.AlreadyExistsException) SchemaRegistryException(org.apache.gobblin.metrics.kafka.SchemaRegistryException) IOException(java.io.IOException)
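
The write method above resolves the table through getIcebergTable(tid), which is not shown in the snippet. Below is a minimal sketch of such a lookup, assuming the HiveCatalogs entry point that this class imports; the class name IcebergTableLookup and the wrapper shape are illustrative, not Gobblin's actual helper.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.hive.HiveCatalogs;

public class IcebergTableLookup {
    private final HiveCatalog catalog;

    public IcebergTableLookup(Configuration conf) {
        // HiveCatalogs caches a HiveCatalog instance per metastore URI.
        this.catalog = HiveCatalogs.loadCatalog(conf);
    }

    // Loads the Iceberg table for the given identifier. loadTable throws the
    // (unchecked) NoSuchTableException that write() catches when the table is missing.
    public Table getIcebergTable(TableIdentifier tid) {
        return catalog.loadTable(tid);
    }
}

write() catches NoSuchTableException from this lookup and falls back to createTable, except for drop_files and change_property events, which are skipped when the table does not exist.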

Example 87 with TableIdentifier

Use of org.apache.iceberg.catalog.TableIdentifier in project incubator-gobblin by apache.

The class IcebergMetadataWriter, method writeEnvelope.

@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
    Lock readLock = readWriteLock.readLock();
    readLock.lock();
    try {
        GenericRecord genericRecord = recordEnvelope.getRecord();
        GobblinMetadataChangeEvent gmce = (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
        String dbName = tableSpec.getTable().getDbName();
        String tableName = tableSpec.getTable().getTableName();
        if (whitelistBlacklist.acceptTable(dbName, tableName)) {
            TableIdentifier tid = TableIdentifier.of(dbName, tableName);
            String topicPartition = tableTopicPartitionMap.computeIfAbsent(tid, t -> recordEnvelope.getWatermark().getSource());
            Long currentWatermark = getAndPersistCurrentWatermark(tid, topicPartition);
            Long currentOffset = ((LongWatermark) recordEnvelope.getWatermark().getWatermark()).getValue();
            if (currentOffset > currentWatermark) {
                if (!tableMetadataMap.computeIfAbsent(tid, t -> new TableMetadata()).lowWatermark.isPresent()) {
                    // This means we haven't registered this table yet or we hit an error before, so we need to reset the low watermark
                    tableMetadataMap.get(tid).lowWatermark = Optional.of(currentOffset - 1);
                    tableMetadataMap.get(tid).setDatasetName(gmce.getDatasetIdentifier().getNativeName());
                    if (this.newPartitionEnabled && this.newPartitionTableWhitelistBlacklist.acceptTable(dbName, tableName)) {
                        tableMetadataMap.get(tid).newPartitionColumnEnabled = true;
                        if (this.completenessEnabled && this.completenessWhitelistBlacklist.acceptTable(dbName, tableName)) {
                            tableMetadataMap.get(tid).completenessEnabled = true;
                        }
                    }
                }
                write(gmce, newSpecsMap, oldSpecsMap, tableSpec);
                tableCurrentWatermarkMap.put(tid, currentOffset);
            } else {
                log.warn(String.format("Skip processing record %s since it has lower watermark", genericRecord.toString()));
            }
        } else {
            log.info(String.format("Skip table %s.%s since it's not selected", tableSpec.getTable().getDbName(), tableSpec.getTable().getTableName()));
        }
    } finally {
        readLock.unlock();
    }
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) GenericRecord(org.apache.avro.generic.GenericRecord) SpecificData(org.apache.avro.specific.SpecificData) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) WhitelistBlacklist(org.apache.gobblin.data.management.copy.hive.WhitelistBlacklist) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) Lock(java.util.concurrent.locks.Lock) ReadWriteLock(java.util.concurrent.locks.ReadWriteLock) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) Optional(com.google.common.base.Optional) Map(java.util.Map) Collection(java.util.Collection) IOException(java.io.IOException)
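
writeEnvelope forwards a record to write only when its offset is strictly greater than the table's persisted watermark, so replayed records are dropped. Below is the gating logic in isolation as a minimal hypothetical sketch; WatermarkGate and its field are illustrative stand-ins for the writer's tableCurrentWatermarkMap bookkeeping, not the actual class.

import java.util.HashMap;
import java.util.Map;
import org.apache.iceberg.catalog.TableIdentifier;

public class WatermarkGate {
    // Last offset accepted per table; stand-in for tableCurrentWatermarkMap.
    private final Map<TableIdentifier, Long> currentWatermarks = new HashMap<>();

    // Returns true if a record at this offset is new for the table and records it,
    // mirroring the currentOffset > currentWatermark check in writeEnvelope.
    public boolean accept(TableIdentifier tid, long offset) {
        long watermark = currentWatermarks.getOrDefault(tid, -1L);
        if (offset <= watermark) {
            return false; // Already processed: skip, as writeEnvelope logs and drops it.
        }
        currentWatermarks.put(tid, offset);
        return true;
    }
}

For example, accept(TableIdentifier.of("db", "events"), 42L) returns true the first time and false on a replay of the same offset.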

Aggregations

TableIdentifier (org.apache.iceberg.catalog.TableIdentifier) 87
Test (org.junit.Test) 69
Table (org.apache.iceberg.Table) 56
PartitionSpec (org.apache.iceberg.PartitionSpec) 27
Schema (org.apache.iceberg.Schema) 25
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 16
BaseTable (org.apache.iceberg.BaseTable) 15
UpdateSchema (org.apache.iceberg.UpdateSchema) 15
List (java.util.List) 13
NoSuchTableException (org.apache.iceberg.exceptions.NoSuchTableException) 13
ArrayList (java.util.ArrayList) 11
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) 11
IOException (java.io.IOException) 10
Map (java.util.Map) 10
Types (org.apache.iceberg.types.Types) 10
HashMap (java.util.HashMap) 9
Path (org.apache.hadoop.fs.Path) 9
TableProperties (org.apache.iceberg.TableProperties) 9
Collections (java.util.Collections) 8
Properties (java.util.Properties) 8