Search in sources :

Example 1 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

the class GobblinMCEPublisherTest method testPublishGMCEWithoutFile.

/**
 * Publishes a work unit state prepared by {@code setGMCEPublisherStateWithoutNewFile} and checks the
 * GMCE that would be emitted for it.
 *
 * <p>{@code sendGMCE} is stubbed on a mocked producer. The stub rebuilds the event through the real
 * {@code getGobblinMetadataChangeEvent} (with {@code change_property} / {@code SchemaSource.NONE})
 * and asserts on the new files, old files, old file prefixes and operation type of the result.
 *
 * @throws IOException propagated from {@code publishData}
 */
@Test(dependsOnMethods = { "testPublishGMCEForAvro" })
public void testPublishGMCEWithoutFile() throws IOException {
    GobblinMCEProducer producer = Mockito.mock(GobblinMCEProducer.class);
    Mockito.doCallRealMethod().when(producer).getGobblinMetadataChangeEvent(anyMap(), anyList(), anyList(), anyMap(), any(), any());
    Mockito.doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            // Argument layout follows the stub below: (map, list, list, map, any, any).
            // The offset-range map is therefore args[3]; args[1] is a list and must not be cast to Map.
            @SuppressWarnings("unchecked")
            Map<Path, Metrics> newFiles = (Map<Path, Metrics>) args[0];
            @SuppressWarnings("unchecked")
            Map<String, String> offsetRange = (Map<String, String>) args[3];
            GobblinMetadataChangeEvent gmce = producer.getGobblinMetadataChangeEvent(newFiles, null, null, offsetRange, OperationType.change_property, SchemaSource.NONE);
            Assert.assertEquals(gmce.getNewFiles().size(), 1);
            Assert.assertNull(gmce.getOldFiles());
            Assert.assertNull(gmce.getOldFilePrefixes());
            Assert.assertEquals(gmce.getOperationType(), OperationType.change_property);
            return null;
        }
    }).when(producer).sendGMCE(anyMap(), anyList(), anyList(), anyMap(), any(), any());
    WorkUnitState state = new WorkUnitState();
    setGMCEPublisherStateWithoutNewFile(state);
    // setState must run for real on the mock so the producer actually holds the state.
    Mockito.doCallRealMethod().when(producer).setState(state);
    producer.setState(state);
    GobblinMCEPublisher publisher = new GobblinMCEPublisher(state, producer);
    publisher.publishData(Arrays.asList(state));
}
Also used : Answer(org.mockito.stubbing.Answer) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) InvocationOnMock(org.mockito.invocation.InvocationOnMock) WorkUnitState(gobblin.configuration.WorkUnitState) GobblinMCEProducer(org.apache.gobblin.iceberg.GobblinMCEProducer) Map(java.util.Map) Test(org.testng.annotations.Test)

Example 2 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

the class GobblinMCEPublisherTest method testPublishGMCEForAvro.

/**
 * Publishes a work unit state prepared by {@code setGMCEPublisherStateForAvroFile} and checks the
 * GMCE that would be emitted for it.
 *
 * <p>{@code sendGMCE} is stubbed on a mocked producer. The stub rebuilds the event through the real
 * {@code getGobblinMetadataChangeEvent} (with {@code add_files} / {@code SchemaSource.SCHEMAREGISTRY})
 * and asserts that it carries exactly one new file whose path equals the fully qualified path of
 * {@code dataFile}.
 *
 * @throws IOException propagated from {@code publishData}
 */
@Test
public void testPublishGMCEForAvro() throws IOException {
    GobblinMCEProducer producer = Mockito.mock(GobblinMCEProducer.class);
    Mockito.doCallRealMethod().when(producer).getGobblinMetadataChangeEvent(anyMap(), anyList(), anyList(), anyMap(), any(), any());
    Mockito.doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            // Argument layout follows the stub below: (map, list, list, map, any, any).
            // The offset-range map is therefore args[3]; args[1] is a list and must not be cast to Map.
            @SuppressWarnings("unchecked")
            Map<Path, Metrics> newFiles = (Map<Path, Metrics>) args[0];
            @SuppressWarnings("unchecked")
            Map<String, String> offsetRange = (Map<String, String>) args[3];
            GobblinMetadataChangeEvent gmce = producer.getGobblinMetadataChangeEvent(newFiles, null, null, offsetRange, OperationType.add_files, SchemaSource.SCHEMAREGISTRY);
            Assert.assertEquals(gmce.getNewFiles().size(), 1);
            // Qualify the expected path against the default file system so it matches the form stored in the event.
            FileSystem fs = FileSystem.get(new Configuration());
            String expectedPath = new Path(dataFile.getAbsolutePath()).makeQualified(fs.getUri(), new Path("/")).toString();
            Assert.assertEquals(gmce.getNewFiles().get(0).getFilePath(), expectedPath);
            return null;
        }
    }).when(producer).sendGMCE(anyMap(), anyList(), anyList(), anyMap(), any(), any());
    WorkUnitState state = new WorkUnitState();
    setGMCEPublisherStateForAvroFile(state);
    // setState must run for real on the mock so the producer actually holds the state.
    Mockito.doCallRealMethod().when(producer).setState(state);
    producer.setState(state);
    GobblinMCEPublisher publisher = new GobblinMCEPublisher(state, producer);
    publisher.publishData(Arrays.asList(state));
}
Also used : Path(org.apache.hadoop.fs.Path) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) Configuration(org.apache.hadoop.conf.Configuration) WorkUnitState(gobblin.configuration.WorkUnitState) GobblinMCEProducer(org.apache.gobblin.iceberg.GobblinMCEProducer) Answer(org.mockito.stubbing.Answer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) FileSystem(org.apache.hadoop.fs.FileSystem) Map(java.util.Map) Test(org.testng.annotations.Test)

Example 3 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

the class GobblinMCEWriter method writeEnvelope.

/**
 * Handles one GMCE record envelope.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Require a non-null watermark and skip records whose {@code "cluster"} field is not in
 *       {@code acceptedClusters}.</li>
 *   <li>Deep-copy the record into a {@link GobblinMetadataChangeEvent}.</li>
 *   <li>Compute the path-to-HiveSpec maps for the new files and for the old file prefixes (or, if
 *       there are no prefixes, the old files).</li>
 *   <li>Take the HiveSpecs of one sampled path and, for each target table, update the tracked
 *       {@code TableStatus} (flushing first when the operation type changed) and call
 *       {@code write}.</li>
 * </ol>
 * Records that yield no HiveSpec at all are ignored without incrementing {@code recordCount}.
 *
 * @param recordEnvelope envelope holding the GMCE record and its watermark
 * @throws IOException declared by this method; presumably raised by the helpers it calls
 */
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope) throws IOException {
    GenericRecord avroRecord = recordEnvelope.getRecord();
    CheckpointableWatermark watermark = recordEnvelope.getWatermark();
    Preconditions.checkNotNull(watermark);
    // Drop events that were not emitted by one of the accepted clusters.
    boolean fromAcceptedCluster = acceptedClusters.contains(avroRecord.get("cluster"));
    if (!fromAcceptedCluster) {
        return;
    }
    // Copy using the schema carried by the record itself to avoid problems under schema evolution.
    GobblinMetadataChangeEvent changeEvent = (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(avroRecord.getSchema(), avroRecord);
    String datasetName = changeEvent.getDatasetIdentifier().toString();
    // Discard the cached old hive specs after flush.
    // Assumption: the new hive spec for a given path is always the same (ingestion flow registers to the same tables).
    oldSpecsMaps.remove(datasetName);
    // Both maps go from the URI of a file path to the HiveSpec objects for that path.
    // Assumption: within one operation interval the table properties of a dataset do not change,
    // which is what makes caching the computed HiveSpecs worthwhile.
    ConcurrentHashMap<String, Collection<HiveSpec>> newSpecsMap = new ConcurrentHashMap<>();
    ConcurrentHashMap<String, Collection<HiveSpec>> oldSpecsMap = new ConcurrentHashMap<>();
    if (changeEvent.getNewFiles() != null) {
        State newFilesState = setHiveRegProperties(state, changeEvent, true);
        computeSpecMap(
            Lists.newArrayList(Iterables.transform(changeEvent.getNewFiles(), DataFile::getFilePath)),
            newSpecsMap,
            newSpecsMaps.computeIfAbsent(datasetName, unused -> CacheBuilder.newBuilder().expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS).build()),
            newFilesState,
            false);
    }
    // Old file prefixes take precedence over the explicit old file list; the trailing flag passed to
    // computeSpecMap is true only in the prefix case.
    boolean usePrefixes = changeEvent.getOldFilePrefixes() != null;
    if (usePrefixes || changeEvent.getOldFiles() != null) {
        State oldFilesState = setHiveRegProperties(state, changeEvent, false);
        computeSpecMap(
            usePrefixes ? changeEvent.getOldFilePrefixes() : changeEvent.getOldFiles(),
            oldSpecsMap,
            oldSpecsMaps.computeIfAbsent(datasetName, unused -> CacheBuilder.newBuilder().expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS).build()),
            oldFilesState,
            usePrefixes);
    }
    if (newSpecsMap.isEmpty() && oldSpecsMap.isEmpty()) {
        return;
    }
    // Sampling a single "Path <--> List<HiveSpec>" entry is sufficient:
    // 0. the goal is to run metadata registration for every target table of the dataset,
    // 1. a GMCE guarantees all its paths belong to one dataset (not necessarily one Hive partition),
    // 2. the HiveSpecs of paths from one dataset should point at the same set of target tables,
    // 3. so iterating over one path's HiveSpecs covers all target tables.
    Collection<HiveSpec> specs;
    if (newSpecsMap.isEmpty()) {
        specs = oldSpecsMap.values().iterator().next();
    } else {
        specs = newSpecsMap.values().iterator().next();
    }
    OperationType incomingOp = changeEvent.getOperationType();
    for (HiveSpec spec : specs) {
        String dbName = spec.getTable().getDbName();
        String tableName = spec.getTable().getTableName();
        String tableKey = Joiner.on(TABLE_NAME_DELIMITER).join(dbName, tableName);
        boolean needsFreshStatus;
        if (!tableOperationTypeMap.containsKey(tableKey)) {
            // First event seen for this table.
            needsFreshStatus = true;
        } else if (tableOperationTypeMap.get(tableKey).operationType != incomingOp && incomingOp != OperationType.change_property) {
            // Operation type changed (and is not a pure property change): flush before starting over.
            flush(dbName, tableName);
            needsFreshStatus = true;
        } else {
            needsFreshStatus = false;
        }
        if (needsFreshStatus) {
            tableOperationTypeMap.put(tableKey, new TableStatus(incomingOp, changeEvent.getDatasetIdentifier().getNativeName(), watermark.getSource(), ((LongWatermark) watermark.getWatermark()).getValue() - 1, ((LongWatermark) watermark.getWatermark()).getValue()));
        }
        tableOperationTypeMap.get(tableKey).gmceHighWatermark = ((LongWatermark) watermark.getWatermark()).getValue();
        write(recordEnvelope, newSpecsMap, oldSpecsMap, spec);
    }
    this.recordCount.incrementAndGet();
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) GobblinConstructorUtils(org.apache.gobblin.util.reflection.GobblinConstructorUtils) ClustersNames(org.apache.gobblin.util.ClustersNames) Map(java.util.Map) AvroSerdeUtils(org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils) Path(org.apache.hadoop.fs.Path) Splitter(com.google.common.base.Splitter) SpecificData(org.apache.avro.specific.SpecificData) DataFile(org.apache.gobblin.metadata.DataFile) Schema(org.apache.avro.Schema) Descriptor(org.apache.gobblin.dataset.Descriptor) Collection(java.util.Collection) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) Instrumented(org.apache.gobblin.instrumented.Instrumented) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) State(org.apache.gobblin.configuration.State) HiveRegistrationPolicyBase(org.apache.gobblin.hive.policy.HiveRegistrationPolicyBase) Set(java.util.Set) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) CacheBuilder(com.google.common.cache.CacheBuilder) Joiner(com.google.common.base.Joiner) Iterables(com.google.common.collect.Iterables) Setter(lombok.Setter) Getter(lombok.Getter) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) MetricContext(org.apache.gobblin.metrics.MetricContext) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) ArrayList(java.util.ArrayList) HiveRegistrationPolicy(org.apache.gobblin.hive.policy.HiveRegistrationPolicy) MetadataWriter(org.apache.gobblin.hive.writer.MetadataWriter) Lists(com.google.common.collect.Lists) Closer(com.google.common.io.Closer) DataWriterBuilder(org.apache.gobblin.writer.DataWriterBuilder) ParallelRunner(org.apache.gobblin.util.ParallelRunner) HadoopUtils(org.apache.gobblin.util.HadoopUtils) GenericRecord(org.apache.avro.generic.GenericRecord) IOException(java.io.IOException) 
ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) DataWriter(org.apache.gobblin.writer.DataWriter) TimeUnit(java.util.concurrent.TimeUnit) EventSubmitter(org.apache.gobblin.metrics.event.EventSubmitter) AtomicLong(java.util.concurrent.atomic.AtomicLong) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) OperationType(org.apache.gobblin.metadata.OperationType) Preconditions(com.google.common.base.Preconditions) Cache(com.google.common.cache.Cache) AllArgsConstructor(lombok.AllArgsConstructor) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) DataFile(org.apache.gobblin.metadata.DataFile) State(org.apache.gobblin.configuration.State) Collection(java.util.Collection) GenericRecord(org.apache.avro.generic.GenericRecord) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Example 4 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

the class GobblinMCEProducer method sendGMCE.

/**
 * Builds a {@link GobblinMetadataChangeEvent} from the supplied file information (via
 * {@code getGobblinMetadataChangeEvent}) and emits it through {@code underlyingSendGMCE}, so that the
 * metadata ingestion pipeline can use it to register metadata.
 *
 * @param newFiles map from each new file's path to its metrics
 * @param oldFiles list of old files to be dropped
 * @param oldFilePrefixes prefixes of old files; forwarded unchanged to {@code getGobblinMetadataChangeEvent}
 * @param offsetRange offset range of the new data, can be null
 * @param operationType the opcode of the GMCE emitted by this method
 * @param schemaSource schema source; forwarded unchanged to {@code getGobblinMetadataChangeEvent}
 * @throws IOException declared by this method; presumably raised while building or emitting the event
 */
public void sendGMCE(Map<Path, Metrics> newFiles, List<String> oldFiles, List<String> oldFilePrefixes, Map<String, String> offsetRange, OperationType operationType, SchemaSource schemaSource) throws IOException {
    underlyingSendGMCE(
        getGobblinMetadataChangeEvent(newFiles, oldFiles, oldFilePrefixes, offsetRange, operationType, schemaSource));
}
Also used : GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent)

Example 5 with GobblinMetadataChangeEvent

use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.

the class HiveMetadataWriter method writeEnvelope.

/**
 * Writes the metadata change carried by {@code recordEnvelope} for the table described by
 * {@code tableSpec}, provided that table is accepted by {@code whitelistBlacklist}. Tables that are
 * not accepted are skipped with a debug log entry.
 *
 * @param recordEnvelope envelope whose record is converted into a {@link GobblinMetadataChangeEvent}
 * @param newSpecsMap passed through to {@code write}
 * @param oldSpecsMap passed through to {@code write}
 * @param tableSpec spec identifying the target table (database name and table name)
 * @throws IOException declared by this method; presumably raised by {@code write}
 */
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
    String dbName = tableSpec.getTable().getDbName();
    String tableName = tableSpec.getTable().getTableName();
    if (!whitelistBlacklist.acceptTable(dbName, tableName)) {
        log.debug(String.format("Skip table %s.%s since it's not selected", dbName, tableName));
        return;
    }
    // Deep-copy the record only once the table is known to be selected; the copy is not needed
    // for tables that are filtered out.
    GenericRecord genericRecord = recordEnvelope.getRecord();
    GobblinMetadataChangeEvent gmce = (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
    write(gmce, newSpecsMap, oldSpecsMap, tableSpec);
}
Also used : GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

GobblinMetadataChangeEvent (org.apache.gobblin.metadata.GobblinMetadataChangeEvent)7 Map (java.util.Map)5 FileSystem (org.apache.hadoop.fs.FileSystem)4 Path (org.apache.hadoop.fs.Path)4 WorkUnitState (gobblin.configuration.WorkUnitState)3 Joiner (com.google.common.base.Joiner)2 Preconditions (com.google.common.base.Preconditions)2 Splitter (com.google.common.base.Splitter)2 Cache (com.google.common.cache.Cache)2 CacheBuilder (com.google.common.cache.CacheBuilder)2 Lists (com.google.common.collect.Lists)2 Closer (com.google.common.io.Closer)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Collection (java.util.Collection)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Set (java.util.Set)2 Callable (java.util.concurrent.Callable)2 TimeUnit (java.util.concurrent.TimeUnit)2