Search in sources :

Example 1 with DataFile

use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.

the class GobblinMCEWriter method writeEnvelope.

@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope) throws IOException {
    GenericRecord genericRecord = recordEnvelope.getRecord();
    CheckpointableWatermark watermark = recordEnvelope.getWatermark();
    Preconditions.checkNotNull(watermark);
    // Filter out events that were not emitted by accepted clusters
    if (!acceptedClusters.contains(genericRecord.get("cluster"))) {
        return;
    }
    // Use the schema from the record to avoid issues during schema evolution
    GobblinMetadataChangeEvent gmce = (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
    String datasetName = gmce.getDatasetIdentifier().toString();
    // Remove the old hive spec cache after flush.
    // Here we assume the new hive spec for a given path is always the same (the ingestion flow registers to the same tables)
    oldSpecsMaps.remove(datasetName);
    // Mapping from the URI of each arrived file's path to its list of HiveSpec objects.
    // We assume that within one operation interval the table properties of a dataset do not change, which reduces the time spent computing HiveSpecs.
    ConcurrentHashMap<String, Collection<HiveSpec>> newSpecsMap = new ConcurrentHashMap<>();
    ConcurrentHashMap<String, Collection<HiveSpec>> oldSpecsMap = new ConcurrentHashMap<>();
    if (gmce.getNewFiles() != null) {
        State registerState = setHiveRegProperties(state, gmce, true);
        computeSpecMap(Lists.newArrayList(Iterables.transform(gmce.getNewFiles(), DataFile::getFilePath)), newSpecsMap, newSpecsMaps.computeIfAbsent(datasetName, t -> CacheBuilder.newBuilder().expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS).build()), registerState, false);
    }
    if (gmce.getOldFilePrefixes() != null) {
        State registerState = setHiveRegProperties(state, gmce, false);
        computeSpecMap(gmce.getOldFilePrefixes(), oldSpecsMap, oldSpecsMaps.computeIfAbsent(datasetName, t -> CacheBuilder.newBuilder().expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS).build()), registerState, true);
    } else if (gmce.getOldFiles() != null) {
        State registerState = setHiveRegProperties(state, gmce, false);
        computeSpecMap(gmce.getOldFiles(), oldSpecsMap, oldSpecsMaps.computeIfAbsent(datasetName, t -> CacheBuilder.newBuilder().expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS).build()), registerState, false);
    }
    if (newSpecsMap.isEmpty() && oldSpecsMap.isEmpty()) {
        return;
    }
    // Sampling one entry among all "Path <--> List<HiveSpec>" pairs is good enough; reasoning:
    // 0. The objective here is to execute metadata registration for all target table destinations of a dataset,
    // 1. GMCE guarantees all paths come from a single dataset (but not necessarily a single "partition" in Hive's layout),
    // 2. HiveSpecs of paths from a dataset should target the same set of table destinations,
    // 3. therefore fetching one path's HiveSpec and iterating through it is enough to cover all table destinations.
    Collection<HiveSpec> specs = newSpecsMap.isEmpty() ? oldSpecsMap.values().iterator().next() : newSpecsMap.values().iterator().next();
    for (HiveSpec spec : specs) {
        String dbName = spec.getTable().getDbName();
        String tableName = spec.getTable().getTableName();
        String tableString = Joiner.on(TABLE_NAME_DELIMITER).join(dbName, tableName);
        if (!tableOperationTypeMap.containsKey(tableString)) {
            tableOperationTypeMap.put(tableString, new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(), ((LongWatermark) watermark.getWatermark()).getValue() - 1, ((LongWatermark) watermark.getWatermark()).getValue()));
        } else if (tableOperationTypeMap.get(tableString).operationType != gmce.getOperationType() && gmce.getOperationType() != OperationType.change_property) {
            flush(dbName, tableName);
            tableOperationTypeMap.put(tableString, new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(), ((LongWatermark) watermark.getWatermark()).getValue() - 1, ((LongWatermark) watermark.getWatermark()).getValue()));
        }
        tableOperationTypeMap.get(tableString).gmceHighWatermark = ((LongWatermark) watermark.getWatermark()).getValue();
        write(recordEnvelope, newSpecsMap, oldSpecsMap, spec);
    }
    this.recordCount.incrementAndGet();
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) GobblinConstructorUtils(org.apache.gobblin.util.reflection.GobblinConstructorUtils) ClustersNames(org.apache.gobblin.util.ClustersNames) Map(java.util.Map) AvroSerdeUtils(org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils) Path(org.apache.hadoop.fs.Path) Splitter(com.google.common.base.Splitter) SpecificData(org.apache.avro.specific.SpecificData) DataFile(org.apache.gobblin.metadata.DataFile) Schema(org.apache.avro.Schema) Descriptor(org.apache.gobblin.dataset.Descriptor) Collection(java.util.Collection) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) Instrumented(org.apache.gobblin.instrumented.Instrumented) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) State(org.apache.gobblin.configuration.State) HiveRegistrationPolicyBase(org.apache.gobblin.hive.policy.HiveRegistrationPolicyBase) Set(java.util.Set) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) CacheBuilder(com.google.common.cache.CacheBuilder) Joiner(com.google.common.base.Joiner) Iterables(com.google.common.collect.Iterables) Setter(lombok.Setter) Getter(lombok.Getter) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) MetricContext(org.apache.gobblin.metrics.MetricContext) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) ArrayList(java.util.ArrayList) HiveRegistrationPolicy(org.apache.gobblin.hive.policy.HiveRegistrationPolicy) MetadataWriter(org.apache.gobblin.hive.writer.MetadataWriter) Lists(com.google.common.collect.Lists) Closer(com.google.common.io.Closer) DataWriterBuilder(org.apache.gobblin.writer.DataWriterBuilder) ParallelRunner(org.apache.gobblin.util.ParallelRunner) HadoopUtils(org.apache.gobblin.util.HadoopUtils) GenericRecord(org.apache.avro.generic.GenericRecord) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) DataWriter(org.apache.gobblin.writer.DataWriter) TimeUnit(java.util.concurrent.TimeUnit) EventSubmitter(org.apache.gobblin.metrics.event.EventSubmitter) AtomicLong(java.util.concurrent.atomic.AtomicLong) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) OperationType(org.apache.gobblin.metadata.OperationType) Preconditions(com.google.common.base.Preconditions) Cache(com.google.common.cache.Cache) AllArgsConstructor(lombok.AllArgsConstructor) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)
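
The per-dataset HiveSpec cache above is built inline three times (new files, old file prefixes, old files) with the same CacheBuilder chain. Below is a minimal sketch of how that construction could be factored out; the class SpecCacheHelper and the method getOrCreateSpecCache are hypothetical names, not part of GobblinMCEWriter, but the constants and Guava calls are the same ones the method uses.

import java.util.Collection;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.hive.spec.HiveSpec;
import org.apache.gobblin.hive.writer.MetadataWriter;

public class SpecCacheHelper {

    private final State state;
    // One "path -> HiveSpecs" cache per dataset, mirroring newSpecsMaps/oldSpecsMaps in the writer.
    private final Map<String, Cache<String, Collection<HiveSpec>>> specCaches = new ConcurrentHashMap<>();

    public SpecCacheHelper(State state) {
        this.state = state;
    }

    // Returns the dataset's cache, creating it with the configured expiry on first access.
    public Cache<String, Collection<HiveSpec>> getOrCreateSpecCache(String datasetName) {
        return specCaches.computeIfAbsent(datasetName, key -> CacheBuilder.newBuilder()
            .expireAfterAccess(
                state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                TimeUnit.HOURS)
            .build());
    }
}

With such a helper, each of the three computeSpecMap calls would receive getOrCreateSpecCache(datasetName) instead of repeating the CacheBuilder chain.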

Example 2 with DataFile

use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.

the class HiveMetadataWriterTest method testHiveWriteRewriteFileGMCE.

@Test(dependsOnMethods = { "testHiveWriteAddFileGMCE" }, groups = { "hiveMetadataWriterTest" })
public void testHiveWriteRewriteFileGMCE() throws IOException {
    gmce.setTopicPartitionOffsetsRange(null);
    Map<String, String> registrationState = gmce.getRegistrationProperties();
    registrationState.put("additional.hive.database.names", dedupedDbName);
    registrationState.put(HiveMetaStoreBasedRegister.SCHEMA_SOURCE_DB, dbName);
    gmce.setRegistrationProperties(registrationState);
    gmce.setSchemaSource(SchemaSource.NONE);
    FileSystem fs = FileSystem.get(new Configuration());
    String filePath = new Path(hourlyDataFile_1.getParentFile().getAbsolutePath()).toString();
    String filePath_1 = new Path(hourlyDataFile_2.getParentFile().getAbsolutePath()).toString();
    DataFile dailyFile = DataFile.newBuilder().setFilePath(dailyDataFile.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build();
    gmce.setNewFiles(Lists.newArrayList(dailyFile));
    gmce.setOldFilePrefixes(Lists.newArrayList(filePath, filePath_1));
    gmce.setOperationType(OperationType.rewrite_files);
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(gmce, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(40L))));
    gobblinMCEWriter.flush();
    // Test that the hive writer re-write operation can de-register old partitions and register the new one
    try {
        Assert.assertTrue(client.getPartition("hivedb", "testTable", Lists.newArrayList("2020-03-17-00")) != null);
        // Test that the additional table has been registered
        Assert.assertTrue(client.tableExists(dedupedDbName, "testTable"));
    } catch (TException e) {
        throw new IOException(e);
    }
    Assert.assertThrows(new Assert.ThrowingRunnable() {

        @Override
        public void run() throws Throwable {
            client.getPartition("hivedb", "testTable", Lists.newArrayList("2020-03-17-08"));
        }
    });
}
Also used : Path(org.apache.hadoop.fs.Path) TException(org.apache.thrift.TException) Configuration(org.apache.hadoop.conf.Configuration) SchemaBuilder(org.apache.avro.SchemaBuilder) IOException(java.io.IOException) DataFile(org.apache.gobblin.metadata.DataFile) Assert(org.testng.Assert) FileSystem(org.apache.hadoop.fs.FileSystem) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test) HiveMetastoreTest(org.apache.iceberg.hive.HiveMetastoreTest)
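
The DataFile built in this test (and again in the Iceberg tests below) uses the same three-step builder chain each time. A minimal sketch of a factory for that chain is shown below; the class DataFileFactory and method newAvroDataFile are hypothetical, and it is assumed that DataMetrics lives in the same org.apache.gobblin.metadata package as DataFile.

import org.apache.gobblin.metadata.DataFile;
import org.apache.gobblin.metadata.DataMetrics;

public class DataFileFactory {

    // Builds a DataFile record for an Avro file at the given path with the given record count,
    // matching the builder chain used in the tests on this page.
    public static DataFile newAvroDataFile(String filePath, long recordCount) {
        return DataFile.newBuilder()
            .setFilePath(filePath)
            .setFileFormat("avro")
            .setFileMetrics(DataMetrics.newBuilder().setRecordCount(recordCount).build())
            .build();
    }
}

With this helper, the inline chain becomes gmce.setNewFiles(Lists.newArrayList(DataFileFactory.newAvroDataFile(dailyDataFile.toString(), 10L))).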

Example 3 with DataFile

use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.

the class IcebergMetadataWriterTest method testChangeProperty.

@Test(dependsOnMethods = { "testWriteRewriteFileGMCE" }, groups = { "icebergMetadataWriterTest" })
public void testChangeProperty() throws IOException {
    Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-3000");
    Assert.assertEquals(table.currentSnapshot().allManifests().size(), 3);
    Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "30");
    Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "40");
    gmce.setOldFilePrefixes(null);
    DataFile dailyFile = DataFile.newBuilder().setFilePath(dailyDataFile.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(0L).build()).build();
    gmce.setNewFiles(Lists.newArrayList(dailyFile));
    gmce.setOperationType(OperationType.change_property);
    gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "2000-4000").build());
    GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(45L))));
    gobblinMCEWriter.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    // Assert the offset has been updated
    Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-4000");
    Assert.assertEquals(table.currentSnapshot().allManifests().size(), 3);
    // Assert that the low and high watermarks are set properly
    Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "40");
    Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "45");
}
Also used : DataFile(org.apache.gobblin.metadata.DataFile) HiveTable(org.apache.gobblin.hive.HiveTable) Table(org.apache.iceberg.Table) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) SchemaBuilder(org.apache.avro.SchemaBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test) HiveMetastoreTest(org.apache.iceberg.hive.HiveMetastoreTest)

Example 4 with DataFile

use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.

the class IcebergMetadataWriterTest method testWriteRewriteFileGMCE.

// Make sure the hive test executes later and closes the metastore
@Test(dependsOnMethods = { "testWriteAddFileGMCE" }, groups = { "icebergMetadataWriterTest" })
public void testWriteRewriteFileGMCE() throws IOException {
    gmce.setTopicPartitionOffsetsRange(null);
    FileSystem fs = FileSystem.get(new Configuration());
    String filePath = new Path(hourlyDataFile_1.getParentFile().getAbsolutePath()).toString();
    String filePath_1 = new Path(hourlyDataFile_2.getParentFile().getAbsolutePath()).toString();
    DataFile dailyFile = DataFile.newBuilder().setFilePath(dailyDataFile.toString()).setFileFormat("avro").setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build()).build();
    gmce.setNewFiles(Lists.newArrayList(dailyFile));
    gmce.setOldFilePrefixes(Lists.newArrayList(filePath, filePath_1));
    gmce.setOperationType(OperationType.rewrite_files);
    Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    Iterator<org.apache.iceberg.DataFile> result = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", filePath_1)).collect().iterator();
    Assert.assertEquals(table.currentSnapshot().allManifests().size(), 2);
    Assert.assertTrue(result.hasNext());
    GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
    gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce, new KafkaStreamingExtractor.KafkaWatermark(new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(), new LongWatermark(40L))));
    gobblinMCEWriter.flush();
    table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
    String dailyFilePath = new Path(dailyDataFile.toString()).toString();
    result = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", dailyFilePath)).collect().iterator();
    Assert.assertEquals(result.next().path(), dailyFilePath);
    Assert.assertFalse(result.hasNext());
    result = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", filePath)).collect().iterator();
    Assert.assertFalse(result.hasNext());
    result = FindFiles.in(table).withMetadataMatching(Expressions.startsWith("file_path", filePath_1)).collect().iterator();
    Assert.assertFalse(result.hasNext());
}
Also used : Path(org.apache.hadoop.fs.Path) HiveTable(org.apache.gobblin.hive.HiveTable) Table(org.apache.iceberg.Table) Configuration(org.apache.hadoop.conf.Configuration) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) SchemaBuilder(org.apache.avro.SchemaBuilder) DataFile(org.apache.gobblin.metadata.DataFile) FileSystem(org.apache.hadoop.fs.FileSystem) GenericRecord(org.apache.avro.generic.GenericRecord) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test) HiveMetastoreTest(org.apache.iceberg.hive.HiveMetastoreTest)
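
The FindFiles lookups above are how this test verifies which data files the Iceberg table still references after the rewrite. Below is a minimal standalone sketch of the same check, reusing only calls that appear in the test; the class IcebergFileChecks and method isPathPrefixReferenced are hypothetical names.

import java.util.Iterator;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FindFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

public class IcebergFileChecks {

    // Returns true if any data file in the table's current snapshot has a file_path
    // starting with the given prefix, mirroring the FindFiles usage in the test above.
    public static boolean isPathPrefixReferenced(Table table, String pathPrefix) {
        Iterator<DataFile> files = FindFiles.in(table)
            .withMetadataMatching(Expressions.startsWith("file_path", pathPrefix))
            .collect()
            .iterator();
        return files.hasNext();
    }
}

In the test above, this would read as Assert.assertFalse(IcebergFileChecks.isPathPrefixReferenced(table, filePath)) after the rewrite is flushed.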

Aggregations

DataFile (org.apache.gobblin.metadata.DataFile) 4
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark) 4
SchemaBuilder (org.apache.avro.SchemaBuilder) 3
GenericRecord (org.apache.avro.generic.GenericRecord) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 3
Path (org.apache.hadoop.fs.Path) 3
IOException (java.io.IOException) 2
HiveTable (org.apache.gobblin.hive.HiveTable) 2
GobblinEventBuilder (org.apache.gobblin.metrics.event.GobblinEventBuilder) 2
Table (org.apache.iceberg.Table) 2
HiveMetastoreTest (org.apache.iceberg.hive.HiveMetastoreTest) 2
Test (org.testng.annotations.Test) 2
Joiner (com.google.common.base.Joiner) 1
Preconditions (com.google.common.base.Preconditions) 1
Splitter (com.google.common.base.Splitter) 1
Cache (com.google.common.cache.Cache) 1
CacheBuilder (com.google.common.cache.CacheBuilder) 1
Iterables (com.google.common.collect.Iterables) 1
Lists (com.google.common.collect.Lists) 1
Closer (com.google.common.io.Closer) 1