Search in sources:

Example 6 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

The method writeEnvelope of the class IcebergMetadataWriter.

/**
 * Processes one metadata-change record: deep-copies it into a {@link GobblinMetadataChangeEvent},
 * checks the table against the whitelist/blacklist, and — if the record's offset is above the
 * table's current watermark — registers first-seen table metadata, writes the GMCE, and advances
 * the in-memory watermark for the table.
 *
 * @param recordEnvelope envelope carrying the GMCE as a {@link GenericRecord} plus its watermark
 * @param newSpecsMap    hive specs for paths being added, keyed by path
 * @param oldSpecsMap    hive specs for paths being removed, keyed by path
 * @param tableSpec      hive spec identifying the target db/table
 * @throws IOException if persisting the watermark or writing the GMCE fails
 */
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
    // Read lock: record-level writes may proceed concurrently; exclusive operations take the write lock.
    Lock readLock = readWriteLock.readLock();
    readLock.lock();
    try {
        GenericRecord genericRecord = recordEnvelope.getRecord();
        GobblinMetadataChangeEvent gmce = (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
        String dbName = tableSpec.getTable().getDbName();
        String tableName = tableSpec.getTable().getTableName();
        if (!whitelistBlacklist.acceptTable(dbName, tableName)) {
            // Parameterized logging: the message is only built if INFO is enabled.
            log.info("Skip table {}.{} since it's not selected", dbName, tableName);
            return;
        }
        TableIdentifier tid = TableIdentifier.of(dbName, tableName);
        String topicPartition = tableTopicPartitionMap.computeIfAbsent(tid, t -> recordEnvelope.getWatermark().getSource());
        Long currentWatermark = getAndPersistCurrentWatermark(tid, topicPartition);
        Long currentOffset = ((LongWatermark) recordEnvelope.getWatermark().getWatermark()).getValue();
        if (currentOffset <= currentWatermark) {
            log.warn("Skip processing record {} since it has lower watermark", genericRecord);
            return;
        }
        // Cache the per-table metadata once instead of re-fetching it from the map on every access.
        TableMetadata tableMetadata = tableMetadataMap.computeIfAbsent(tid, t -> new TableMetadata());
        if (!tableMetadata.lowWatermark.isPresent()) {
            // We haven't registered this table yet (or hit an error before), so reset the low watermark.
            tableMetadata.lowWatermark = Optional.of(currentOffset - 1);
            tableMetadata.setDatasetName(gmce.getDatasetIdentifier().getNativeName());
            if (this.newPartitionEnabled && this.newPartitionTableWhitelistBlacklist.acceptTable(dbName, tableName)) {
                tableMetadata.newPartitionColumnEnabled = true;
                // NOTE(review): completeness is only enabled when the new-partition column is — preserved as-is; confirm intentional.
                if (this.completenessEnabled && this.completenessWhitelistBlacklist.acceptTable(dbName, tableName)) {
                    tableMetadata.completenessEnabled = true;
                }
            }
        }
        write(gmce, newSpecsMap, oldSpecsMap, tableSpec);
        tableCurrentWatermarkMap.put(tid, currentOffset);
    } finally {
        readLock.unlock();
    }
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Arrays(java.util.Arrays) AlreadyExistsException(org.apache.iceberg.exceptions.AlreadyExistsException) WhitelistBlacklist(org.apache.gobblin.data.management.copy.hive.WhitelistBlacklist) SchemaRegistryException(org.apache.gobblin.metrics.kafka.SchemaRegistryException) FileSystem(org.apache.hadoop.fs.FileSystem) ZonedDateTime(java.time.ZonedDateTime) IcebergUtils(org.apache.gobblin.iceberg.Utils.IcebergUtils) DeleteFiles(org.apache.iceberg.DeleteFiles) Optional(com.google.common.base.Optional) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) IcebergMetadataWriterConfigKeys(org.apache.gobblin.iceberg.writer.IcebergMetadataWriterConfigKeys) ReadWriteLock(java.util.concurrent.locks.ReadWriteLock) HiveCatalogs(org.apache.iceberg.hive.HiveCatalogs) Range(com.google.common.collect.Range) Set(java.util.Set) Schema(org.apache.iceberg.Schema) ZoneId(java.time.ZoneId) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) PeriodFormatterBuilder(org.joda.time.format.PeriodFormatterBuilder) HiveMetaStoreUtils(org.apache.gobblin.hive.metastore.HiveMetaStoreUtils) KafkaAuditCountVerifier(org.apache.gobblin.completeness.verifier.KafkaAuditCountVerifier) Timer(com.codahale.metrics.Timer) Joiner(com.google.common.base.Joiner) MetricContext(org.apache.gobblin.metrics.MetricContext) Callable(java.util.concurrent.Callable) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) PeriodFormatter(org.joda.time.format.PeriodFormatter) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) Lists(com.google.common.collect.Lists) Closer(com.google.common.io.Closer) ParallelRunner(org.apache.gobblin.util.ParallelRunner) HadoopUtils(org.apache.gobblin.util.HadoopUtils) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) 
HiveCatalog(org.apache.iceberg.hive.HiveCatalog) KafkaSchemaRegistry(org.apache.gobblin.metrics.kafka.KafkaSchemaRegistry) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) Lock(java.util.concurrent.locks.Lock) Preconditions(com.google.common.base.Preconditions) WriterUtils(org.apache.gobblin.util.WriterUtils) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Tag(org.apache.gobblin.metrics.Tag) Types(org.apache.iceberg.types.Types) SortedSet(java.util.SortedSet) ExpireSnapshots(org.apache.iceberg.ExpireSnapshots) AutoCloseableHiveLock(org.apache.gobblin.hive.AutoCloseableHiveLock) AppendFiles(org.apache.iceberg.AppendFiles) StructLike(org.apache.iceberg.StructLike) FsPermission(org.apache.hadoop.fs.permission.FsPermission) Expression(org.apache.iceberg.expressions.Expression) ClustersNames(org.apache.gobblin.util.ClustersNames) TimeIterator(org.apache.gobblin.time.TimeIterator) Path(org.apache.hadoop.fs.Path) DataFile(org.apache.iceberg.DataFile) Splitter(com.google.common.base.Splitter) SpecificData(org.apache.avro.specific.SpecificData) Collection(java.util.Collection) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) State(org.apache.gobblin.configuration.State) HiveLock(org.apache.gobblin.hive.HiveLock) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) List(java.util.List) UpdateProperties(org.apache.iceberg.UpdateProperties) AvroSchemaUtil(org.apache.iceberg.avro.AvroSchemaUtil) LocalDate(java.time.LocalDate) FindFiles(org.apache.iceberg.FindFiles) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) CacheBuilder(com.google.common.cache.CacheBuilder) Expressions(org.apache.iceberg.expressions.Expressions) 
HivePartition(org.apache.gobblin.hive.HivePartition) GobblinMetricsRegistry(org.apache.gobblin.metrics.GobblinMetricsRegistry) Setter(lombok.Setter) Getter(lombok.Getter) HashMap(java.util.HashMap) HashSet(java.util.HashSet) MetadataWriter(org.apache.gobblin.hive.writer.MetadataWriter) GenericRecord(org.apache.avro.generic.GenericRecord) DateTime(org.joda.time.DateTime) Maps(com.google.common.collect.Maps) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) EventSubmitter(org.apache.gobblin.metrics.event.EventSubmitter) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) AvroUtils(org.apache.gobblin.util.AvroUtils) Transaction(org.apache.iceberg.Transaction) DateTimeFormatter(java.time.format.DateTimeFormatter) OperationType(org.apache.gobblin.metadata.OperationType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Comparator(java.util.Comparator) Cache(com.google.common.cache.Cache) Collections(java.util.Collections) Snapshot(org.apache.iceberg.Snapshot) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) GenericRecord(org.apache.avro.generic.GenericRecord) ReadWriteLock(java.util.concurrent.locks.ReadWriteLock) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) Lock(java.util.concurrent.locks.Lock) AutoCloseableHiveLock(org.apache.gobblin.hive.AutoCloseableHiveLock) HiveLock(org.apache.gobblin.hive.HiveLock) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Example 7 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

The method testPublishGMCEForORC of the class GobblinMCEPublisherTest.

/**
 * Verifies that publishing an ORC file produces a GMCE containing exactly one new file whose
 * path and per-column string bounds (lower "Alyssa", upper "Bob") match the source ORC file.
 */
@Test
public void testPublishGMCEForORC() throws IOException {
    GobblinMCEProducer producer = Mockito.mock(GobblinMCEProducer.class);
    Mockito.doCallRealMethod().when(producer).getGobblinMetadataChangeEvent(anyMap(), anyList(), anyList(), anyMap(), any(), any());
    Mockito.doAnswer(new Answer() {

        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            // sendGMCE(newFiles, oldFiles, oldFilePrefixes, offsets, operationType, schemaSource):
            // the offset map is the 4th argument (args[3]). The previous cast of args[1] — the
            // old-files List — to Map<String, String> only survived because the publisher passes
            // null for that argument; use the correct index.
            GobblinMetadataChangeEvent gmce = producer.getGobblinMetadataChangeEvent((Map<Path, Metrics>) args[0], null, null, (Map<String, String>) args[3], OperationType.add_files, SchemaSource.SCHEMAREGISTRY);
            Assert.assertEquals(gmce.getNewFiles().size(), 1);
            FileSystem fs = FileSystem.get(new Configuration());
            // ORC stores string column bounds as UTF-8 bytes, so compare against encoded buffers.
            Charset charset = Charset.forName("UTF-8");
            CharsetEncoder encoder = charset.newEncoder();
            Assert.assertEquals(gmce.getNewFiles().get(0).getFilePath(), orcFilePath.makeQualified(fs.getUri(), new Path("/")).toString());
            Assert.assertEquals(gmce.getNewFiles().get(0).getFileMetrics().getLowerBounds().get(1).getValue(), encoder.encode(CharBuffer.wrap("Alyssa")));
            Assert.assertEquals(gmce.getNewFiles().get(0).getFileMetrics().getUpperBounds().get(1).getValue(), encoder.encode(CharBuffer.wrap("Bob")));
            return null;
        }
    }).when(producer).sendGMCE(anyMap(), anyList(), anyList(), anyMap(), any(), any());
    WorkUnitState state = new WorkUnitState();
    setGMCEPublisherStateForOrcFile(state);
    // setState is mocked by default; route this one call to the real implementation.
    Mockito.doCallRealMethod().when(producer).setState(state);
    producer.setState(state);
    GobblinMCEPublisher publisher = new GobblinMCEPublisher(state, producer);
    publisher.publishData(Arrays.asList(state));
}
Also used : Path(org.apache.hadoop.fs.Path) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) Configuration(org.apache.hadoop.conf.Configuration) WorkUnitState(gobblin.configuration.WorkUnitState) GobblinMCEProducer(org.apache.gobblin.iceberg.GobblinMCEProducer) Charset(java.nio.charset.Charset) CharsetEncoder(java.nio.charset.CharsetEncoder) Answer(org.mockito.stubbing.Answer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) FileSystem(org.apache.hadoop.fs.FileSystem) Map(java.util.Map) Test(org.testng.annotations.Test)

Aggregations

GobblinMetadataChangeEvent (org.apache.gobblin.metadata.GobblinMetadataChangeEvent)7 Map (java.util.Map)5 FileSystem (org.apache.hadoop.fs.FileSystem)4 Path (org.apache.hadoop.fs.Path)4 WorkUnitState (gobblin.configuration.WorkUnitState)3 Joiner (com.google.common.base.Joiner)2 Preconditions (com.google.common.base.Preconditions)2 Splitter (com.google.common.base.Splitter)2 Cache (com.google.common.cache.Cache)2 CacheBuilder (com.google.common.cache.CacheBuilder)2 Lists (com.google.common.collect.Lists)2 Closer (com.google.common.io.Closer)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Collection (java.util.Collection)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Set (java.util.Set)2 Callable (java.util.concurrent.Callable)2 TimeUnit (java.util.concurrent.TimeUnit)2