Use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.
The class GobblinMCEWriter, method writeEnvelope.
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope) throws IOException {
  GenericRecord genericRecord = recordEnvelope.getRecord();
  CheckpointableWatermark watermark = recordEnvelope.getWatermark();
  Preconditions.checkNotNull(watermark);
  // Filter out events that were not emitted by an accepted cluster.
  if (!acceptedClusters.contains(genericRecord.get("cluster"))) {
    return;
  }
  // Use the schema from the record to avoid issues when the schema evolves.
  GobblinMetadataChangeEvent gmce =
      (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
  String datasetName = gmce.getDatasetIdentifier().toString();
  // Remove the old hive spec cache after flush.
  // Here we assume that the new hive spec for a given path is always the same
  // (the ingestion flow registers to the same tables).
  oldSpecsMaps.remove(datasetName);
  // Mapping from the URI of an arrival file's path to the list of HiveSpec objects.
  // To reduce the time spent computing HiveSpecs, we assume that within one operation
  // interval the table properties of a dataset do not change.
  ConcurrentHashMap<String, Collection<HiveSpec>> newSpecsMap = new ConcurrentHashMap<>();
  ConcurrentHashMap<String, Collection<HiveSpec>> oldSpecsMap = new ConcurrentHashMap<>();
  if (gmce.getNewFiles() != null) {
    State registerState = setHiveRegProperties(state, gmce, true);
    computeSpecMap(Lists.newArrayList(Iterables.transform(gmce.getNewFiles(), DataFile::getFilePath)),
        newSpecsMap,
        newSpecsMaps.computeIfAbsent(datasetName,
            t -> CacheBuilder.newBuilder()
                .expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME,
                    MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS)
                .build()),
        registerState, false);
  }
  if (gmce.getOldFilePrefixes() != null) {
    State registerState = setHiveRegProperties(state, gmce, false);
    computeSpecMap(gmce.getOldFilePrefixes(), oldSpecsMap,
        oldSpecsMaps.computeIfAbsent(datasetName,
            t -> CacheBuilder.newBuilder()
                .expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME,
                    MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS)
                .build()),
        registerState, true);
  } else if (gmce.getOldFiles() != null) {
    State registerState = setHiveRegProperties(state, gmce, false);
    computeSpecMap(gmce.getOldFiles(), oldSpecsMap,
        oldSpecsMaps.computeIfAbsent(datasetName,
            t -> CacheBuilder.newBuilder()
                .expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME,
                    MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS)
                .build()),
        registerState, false);
  }
  if (newSpecsMap.isEmpty() && oldSpecsMap.isEmpty()) {
    return;
  }
  // Sampling one entry among all "Path <--> List<HiveSpec>" pairs is good enough. Reasoning:
  // 0. The objective here is to execute metadata registration for all target table destinations of a dataset.
  // 1. A GMCE guarantees that all paths come from a single dataset (but not necessarily a single "partition" in Hive's layout).
  // 2. The HiveSpecs of paths from one dataset should target the same set of table destinations.
  // 3. Therefore fetching one path's HiveSpec and iterating through it covers all table destinations.
  Collection<HiveSpec> specs =
      newSpecsMap.isEmpty() ? oldSpecsMap.values().iterator().next() : newSpecsMap.values().iterator().next();
  for (HiveSpec spec : specs) {
    String dbName = spec.getTable().getDbName();
    String tableName = spec.getTable().getTableName();
    String tableString = Joiner.on(TABLE_NAME_DELIMITER).join(dbName, tableName);
    if (!tableOperationTypeMap.containsKey(tableString)) {
      tableOperationTypeMap.put(tableString,
          new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(),
              watermark.getSource(), ((LongWatermark) watermark.getWatermark()).getValue() - 1,
              ((LongWatermark) watermark.getWatermark()).getValue()));
    } else if (tableOperationTypeMap.get(tableString).operationType != gmce.getOperationType()
        && gmce.getOperationType() != OperationType.change_property) {
      // The operation type changed for this table: flush pending metadata before switching.
      flush(dbName, tableName);
      tableOperationTypeMap.put(tableString,
          new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(),
              watermark.getSource(), ((LongWatermark) watermark.getWatermark()).getValue() - 1,
              ((LongWatermark) watermark.getWatermark()).getValue()));
    }
    tableOperationTypeMap.get(tableString).gmceHighWatermark = ((LongWatermark) watermark.getWatermark()).getValue();
    write(recordEnvelope, newSpecsMap, oldSpecsMap, spec);
  }
  this.recordCount.incrementAndGet();
}
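For orientation, here is a minimal driver sketch for writeEnvelope, assuming the same imports as the snippets on this page, a pre-built GobblinMetadataChangeEvent gmce, and a GobblinMCEWriter gobblinMCEWriter as in the tests below. The DataFile builder and watermark construction mirror the calls used in those tests; the file path is hypothetical, and the add_files operation type is inferred from the testWriteAddFileGMCE test name rather than shown in this page's snippets.

// Minimal sketch (illustrative values): attach one new DataFile to a GMCE and
// hand it to the writer together with a Kafka watermark.
DataFile newFile = DataFile.newBuilder()
    .setFilePath("/data/testTopic/hourly/2020/03/17/08/part-0.avro") // hypothetical path
    .setFileFormat("avro")
    .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
    .build();
gmce.setNewFiles(Lists.newArrayList(newFile));
gmce.setOperationType(OperationType.add_files); // operation type inferred from the add-file tests
// Wrap the event as a GenericRecord, as the tests below do before calling writeEnvelope.
GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce,
    new KafkaStreamingExtractor.KafkaWatermark(
        new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
        new LongWatermark(40L))));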
Use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.
The class HiveMetadataWriterTest, method testHiveWriteRewriteFileGMCE.
@Test(dependsOnMethods = { "testHiveWriteAddFileGMCE" }, groups = { "hiveMetadataWriterTest" })
public void testHiveWriteRewriteFileGMCE() throws IOException {
  gmce.setTopicPartitionOffsetsRange(null);
  Map<String, String> registrationState = gmce.getRegistrationProperties();
  registrationState.put("additional.hive.database.names", dedupedDbName);
  registrationState.put(HiveMetaStoreBasedRegister.SCHEMA_SOURCE_DB, dbName);
  gmce.setRegistrationProperties(registrationState);
  gmce.setSchemaSource(SchemaSource.NONE);
  FileSystem fs = FileSystem.get(new Configuration());
  String filePath = new Path(hourlyDataFile_1.getParentFile().getAbsolutePath()).toString();
  String filePath_1 = new Path(hourlyDataFile_2.getParentFile().getAbsolutePath()).toString();
  DataFile dailyFile = DataFile.newBuilder()
      .setFilePath(dailyDataFile.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
      .build();
  gmce.setNewFiles(Lists.newArrayList(dailyFile));
  gmce.setOldFilePrefixes(Lists.newArrayList(filePath, filePath_1));
  gmce.setOperationType(OperationType.rewrite_files);
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(gmce,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(40L))));
  gobblinMCEWriter.flush();
  // Verify that the hive writer's rewrite operation de-registers the old partitions and registers the new one.
  try {
    Assert.assertNotNull(client.getPartition("hivedb", "testTable", Lists.newArrayList("2020-03-17-00")));
    // Verify that the additional table has been registered.
    Assert.assertTrue(client.tableExists(dedupedDbName, "testTable"));
  } catch (TException e) {
    throw new IOException(e);
  }
  // The old hourly partition must be gone.
  Assert.assertThrows(() -> client.getPartition("hivedb", "testTable", Lists.newArrayList("2020-03-17-08")));
}
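The try/catch above exists because the metastore client's calls throw the checked TException. Where this pattern recurs, the "partition not found" signal can be folded into a boolean helper; a sketch against the standard Hive IMetaStoreClient API, where getPartition throws NoSuchObjectException for a missing partition (the helper name is ours, not from the source):

// Sketch: probe the metastore for a partition; a missing partition surfaces
// as NoSuchObjectException, which we translate into false.
private static boolean partitionExists(IMetaStoreClient client, String db, String table,
    List<String> partitionValues) throws TException {
  try {
    client.getPartition(db, table, partitionValues);
    return true;
  } catch (NoSuchObjectException e) {
    return false;
  }
}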
Use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.
The class IcebergMetadataWriterTest, method testChangeProperty.
@Test(dependsOnMethods = { "testWriteRewriteFileGMCE" }, groups = { "icebergMetadataWriterTest" })
public void testChangeProperty() throws IOException {
  Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-3000");
  Assert.assertEquals(table.currentSnapshot().allManifests().size(), 3);
  Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "30");
  Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "40");
  gmce.setOldFilePrefixes(null);
  DataFile dailyFile = DataFile.newBuilder()
      .setFilePath(dailyDataFile.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(0L).build())
      .build();
  gmce.setNewFiles(Lists.newArrayList(dailyFile));
  gmce.setOperationType(OperationType.change_property);
  gmce.setTopicPartitionOffsetsRange(ImmutableMap.<String, String>builder().put("testTopic-1", "2000-4000").build());
  GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(45L))));
  gobblinMCEWriter.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  // Assert that the offset range has been updated.
  Assert.assertEquals(table.properties().get("offset.range.testTopic-1"), "0-4000");
  Assert.assertEquals(table.currentSnapshot().allManifests().size(), 3);
  // Assert that the low and high watermarks are set properly.
  Assert.assertEquals(table.properties().get("gmce.low.watermark.GobblinMetadataChangeEvent_test-1"), "40");
  Assert.assertEquals(table.properties().get("gmce.high.watermark.GobblinMetadataChangeEvent_test-1"), "45");
}
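Note what the assertions imply: the incoming range "2000-4000" does not overwrite the stored "0-3000" but is unioned with it into "0-4000". A deliberately simplified sketch of that merging behavior, derived only from these assertions (the writer's actual bookkeeping is more involved and is not shown in this snippet):

// Simplified sketch of the offset-range union implied by the assertions above,
// e.g. mergeOffsetRange("0-3000", "2000-4000") -> "0-4000".
static String mergeOffsetRange(String stored, String incoming) {
  long low = Math.min(Long.parseLong(stored.split("-")[0]), Long.parseLong(incoming.split("-")[0]));
  long high = Math.max(Long.parseLong(stored.split("-")[1]), Long.parseLong(incoming.split("-")[1]));
  return low + "-" + high;
}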
Use of org.apache.gobblin.metadata.DataFile in project incubator-gobblin by apache.
The class IcebergMetadataWriterTest, method testWriteRewriteFileGMCE.
// Make sure the hive test executes later and closes the metastore.
@Test(dependsOnMethods = { "testWriteAddFileGMCE" }, groups = { "icebergMetadataWriterTest" })
public void testWriteRewriteFileGMCE() throws IOException {
  gmce.setTopicPartitionOffsetsRange(null);
  FileSystem fs = FileSystem.get(new Configuration());
  String filePath = new Path(hourlyDataFile_1.getParentFile().getAbsolutePath()).toString();
  String filePath_1 = new Path(hourlyDataFile_2.getParentFile().getAbsolutePath()).toString();
  DataFile dailyFile = DataFile.newBuilder()
      .setFilePath(dailyDataFile.toString())
      .setFileFormat("avro")
      .setFileMetrics(DataMetrics.newBuilder().setRecordCount(10L).build())
      .build();
  gmce.setNewFiles(Lists.newArrayList(dailyFile));
  gmce.setOldFilePrefixes(Lists.newArrayList(filePath, filePath_1));
  gmce.setOperationType(OperationType.rewrite_files);
  Table table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  Iterator<org.apache.iceberg.DataFile> result = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", filePath_1))
      .collect()
      .iterator();
  Assert.assertEquals(table.currentSnapshot().allManifests().size(), 2);
  Assert.assertTrue(result.hasNext());
  GenericRecord genericGmce = GenericData.get().deepCopy(gmce.getSchema(), gmce);
  gobblinMCEWriter.writeEnvelope(new RecordEnvelope<>(genericGmce,
      new KafkaStreamingExtractor.KafkaWatermark(
          new KafkaPartition.Builder().withTopicName("GobblinMetadataChangeEvent_test").withId(1).build(),
          new LongWatermark(40L))));
  gobblinMCEWriter.flush();
  table = catalog.loadTable(catalog.listTables(Namespace.of(dbName)).get(0));
  // The rewrite should have replaced the two hourly file prefixes with the single daily file.
  String dailyFilePath = new Path(dailyDataFile.toString()).toString();
  result = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", dailyFilePath))
      .collect()
      .iterator();
  Assert.assertEquals(result.next().path(), dailyFilePath);
  Assert.assertFalse(result.hasNext());
  result = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", filePath))
      .collect()
      .iterator();
  Assert.assertFalse(result.hasNext());
  result = FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", filePath_1))
      .collect()
      .iterator();
  Assert.assertFalse(result.hasNext());
}
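The repeated FindFiles queries above are Iceberg's generic way to ask which live data files match a path prefix; they can be folded into a small helper built only from the calls already used in this test (the helper name is ours, not from the source):

// Sketch: does the table still reference any live data file under the prefix?
static boolean hasLiveFilesUnder(Table table, String prefix) {
  return FindFiles.in(table)
      .withMetadataMatching(Expressions.startsWith("file_path", prefix))
      .collect()
      .iterator()
      .hasNext();
}

With it, the closing assertions reduce to hasLiveFilesUnder checks on dailyFilePath, filePath, and filePath_1.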