use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.
the class GobblinMCEPublisherTest method testPublishGMCEWithoutFile.
/**
 * Publishes a work unit state prepared by {@code setGMCEPublisherStateWithoutNewFile} and checks the
 * event that the publisher asks the producer to send.
 *
 * <p>{@code sendGMCE} is stubbed so that nothing is actually emitted; instead the stub rebuilds the
 * event through the real {@code getGobblinMetadataChangeEvent} and asserts that it carries exactly one
 * new-file entry, no old files, no old file prefixes, and the {@code change_property} operation type.
 *
 * @throws IOException if publishing fails
 */
@Test(dependsOnMethods = { "testPublishGMCEForAvro" })
public void testPublishGMCEWithoutFile() throws IOException {
  GobblinMCEProducer producer = Mockito.mock(GobblinMCEProducer.class);
  // The producer is a mock, so the event-building logic has to be switched back to the real implementation.
  Mockito.doCallRealMethod().when(producer)
      .getGobblinMetadataChangeEvent(anyMap(), anyList(), anyList(), anyMap(), any(), any());
  Mockito.doAnswer(new Answer<Void>() {
    @Override
    public Void answer(InvocationOnMock invocation) throws Throwable {
      Object[] args = invocation.getArguments();
      // sendGMCE is declared as (newFiles, oldFiles, oldFilePrefixes, offsetRange, operationType, schemaSource),
      // so the new files are argument 0 and the offset range is argument 3 (argument 1 is the old-file list).
      Map<Path, Metrics> newFiles = (Map<Path, Metrics>) args[0];
      Map<String, String> offsetRange = (Map<String, String>) args[3];
      GobblinMetadataChangeEvent gmce = producer.getGobblinMetadataChangeEvent(
          newFiles, null, null, offsetRange, OperationType.change_property, SchemaSource.NONE);
      Assert.assertEquals(gmce.getNewFiles().size(), 1);
      Assert.assertNull(gmce.getOldFiles());
      Assert.assertNull(gmce.getOldFilePrefixes());
      Assert.assertEquals(gmce.getOperationType(), OperationType.change_property);
      return null;
    }
  }).when(producer).sendGMCE(anyMap(), anyList(), anyList(), anyMap(), any(), any());

  WorkUnitState state = new WorkUnitState();
  setGMCEPublisherStateWithoutNewFile(state);
  // setState must also run for real so the mocked producer actually holds the state.
  Mockito.doCallRealMethod().when(producer).setState(state);
  producer.setState(state);

  GobblinMCEPublisher publisher = new GobblinMCEPublisher(state, producer);
  publisher.publishData(Arrays.asList(state));
}
use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.
the class GobblinMCEPublisherTest method testPublishGMCEForAvro.
/**
 * Publishes a work unit state prepared by {@code setGMCEPublisherStateForAvroFile} and checks the
 * event that the publisher asks the producer to send.
 *
 * <p>{@code sendGMCE} is stubbed so that nothing is actually emitted; instead the stub rebuilds the
 * event through the real {@code getGobblinMetadataChangeEvent} and asserts that it lists exactly one
 * new file whose path equals the fully qualified path of {@code dataFile}.
 *
 * @throws IOException if publishing fails
 */
@Test
public void testPublishGMCEForAvro() throws IOException {
  GobblinMCEProducer producer = Mockito.mock(GobblinMCEProducer.class);
  // The producer is a mock, so the event-building logic has to be switched back to the real implementation.
  Mockito.doCallRealMethod().when(producer)
      .getGobblinMetadataChangeEvent(anyMap(), anyList(), anyList(), anyMap(), any(), any());
  Mockito.doAnswer(new Answer<Void>() {
    @Override
    public Void answer(InvocationOnMock invocation) throws Throwable {
      Object[] args = invocation.getArguments();
      // sendGMCE is declared as (newFiles, oldFiles, oldFilePrefixes, offsetRange, operationType, schemaSource),
      // so the new files are argument 0 and the offset range is argument 3 (argument 1 is the old-file list).
      Map<Path, Metrics> newFiles = (Map<Path, Metrics>) args[0];
      Map<String, String> offsetRange = (Map<String, String>) args[3];
      GobblinMetadataChangeEvent gmce = producer.getGobblinMetadataChangeEvent(
          newFiles, null, null, offsetRange, OperationType.add_files, SchemaSource.SCHEMAREGISTRY);
      Assert.assertEquals(gmce.getNewFiles().size(), 1);
      // The event is expected to report the data file with a path qualified against the default file system.
      FileSystem fs = FileSystem.get(new Configuration());
      Assert.assertEquals(gmce.getNewFiles().get(0).getFilePath(),
          new Path(dataFile.getAbsolutePath()).makeQualified(fs.getUri(), new Path("/")).toString());
      return null;
    }
  }).when(producer).sendGMCE(anyMap(), anyList(), anyList(), anyMap(), any(), any());

  WorkUnitState state = new WorkUnitState();
  setGMCEPublisherStateForAvroFile(state);
  // setState must also run for real so the mocked producer actually holds the state.
  Mockito.doCallRealMethod().when(producer).setState(state);
  producer.setState(state);

  GobblinMCEPublisher publisher = new GobblinMCEPublisher(state, producer);
  publisher.publishData(Arrays.asList(state));
}
use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.
the class GobblinMCEWriter method writeEnvelope.
/**
 * Handles one GMCE record: computes the Hive specs for the files it mentions and forwards the record
 * to {@code write} once per target table, flushing a table first when the operation type changes.
 *
 * <p>Records whose {@code cluster} field is not in {@code acceptedClusters}, and records that yield no
 * Hive specs at all, are ignored and do not increase the record count.
 *
 * @param recordEnvelope envelope holding the GMCE record and its watermark; the watermark must not be null
 * @throws IOException propagated from the spec computation, flush, or write steps
 */
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope) throws IOException {
  GenericRecord avroRecord = recordEnvelope.getRecord();
  CheckpointableWatermark watermark = recordEnvelope.getWatermark();
  Preconditions.checkNotNull(watermark);

  // Only events emitted by an accepted cluster are processed.
  boolean fromAcceptedCluster = acceptedClusters.contains(avroRecord.get("cluster"));
  if (!fromAcceptedCluster) {
    return;
  }

  // Copy using the record's own schema so that schema evolution does not cause problems.
  GobblinMetadataChangeEvent gmce =
      (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(avroRecord.getSchema(), avroRecord);
  String datasetKey = gmce.getDatasetIdentifier().toString();

  // Drop the cached old Hive specs of this dataset after flush.
  // The new Hive specs for a path are assumed to stay the same (the ingestion flow registers to the same tables).
  oldSpecsMaps.remove(datasetKey);

  // Both maps go from the URI of an arriving file path to the HiveSpec objects computed for it.
  // Within one operation interval the table properties of a dataset are assumed not to change, which
  // saves time when computing the specs.
  ConcurrentHashMap<String, Collection<HiveSpec>> specsForNewFiles = new ConcurrentHashMap<>();
  ConcurrentHashMap<String, Collection<HiveSpec>> specsForOldFiles = new ConcurrentHashMap<>();

  if (gmce.getNewFiles() != null) {
    State newFileRegState = setHiveRegProperties(state, gmce, true);
    computeSpecMap(
        Lists.newArrayList(Iterables.transform(gmce.getNewFiles(), DataFile::getFilePath)),
        specsForNewFiles,
        newSpecsMaps.computeIfAbsent(datasetKey, key -> CacheBuilder.newBuilder()
            .expireAfterAccess(
                state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                TimeUnit.HOURS)
            .build()),
        newFileRegState,
        false);
  }

  // Old file prefixes take precedence over the explicit old-file list.
  if (gmce.getOldFilePrefixes() != null) {
    State prefixRegState = setHiveRegProperties(state, gmce, false);
    computeSpecMap(
        gmce.getOldFilePrefixes(),
        specsForOldFiles,
        oldSpecsMaps.computeIfAbsent(datasetKey, key -> CacheBuilder.newBuilder()
            .expireAfterAccess(
                state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                TimeUnit.HOURS)
            .build()),
        prefixRegState,
        true);
  } else if (gmce.getOldFiles() != null) {
    State oldFileRegState = setHiveRegProperties(state, gmce, false);
    computeSpecMap(
        gmce.getOldFiles(),
        specsForOldFiles,
        oldSpecsMaps.computeIfAbsent(datasetKey, key -> CacheBuilder.newBuilder()
            .expireAfterAccess(
                state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                TimeUnit.HOURS)
            .build()),
        oldFileRegState,
        false);
  }

  if (specsForNewFiles.isEmpty() && specsForOldFiles.isEmpty()) {
    return;
  }

  // One "path <--> specs" entry is a sufficient sample, because:
  // 0. the goal is to run metadata registration for every target table of a dataset,
  // 1. a GMCE guarantees that all paths come from a single dataset (though not necessarily one Hive partition),
  // 2. paths of one dataset should target the same set of tables,
  // 3. so iterating over the specs of any single path covers every target table.
  Collection<HiveSpec> sampledSpecs;
  if (specsForNewFiles.isEmpty()) {
    sampledSpecs = specsForOldFiles.values().iterator().next();
  } else {
    sampledSpecs = specsForNewFiles.values().iterator().next();
  }

  for (HiveSpec spec : sampledSpecs) {
    String dbName = spec.getTable().getDbName();
    String tableName = spec.getTable().getTableName();
    String tableKey = Joiner.on(TABLE_NAME_DELIMITER).join(dbName, tableName);

    // A fresh status is needed for a table seen for the first time, or after flushing a table whose
    // pending operation type differs from this event's (change_property events never trigger that flush).
    boolean startNewStatus;
    if (!tableOperationTypeMap.containsKey(tableKey)) {
      startNewStatus = true;
    } else if (tableOperationTypeMap.get(tableKey).operationType != gmce.getOperationType()
        && gmce.getOperationType() != OperationType.change_property) {
      flush(dbName, tableName);
      startNewStatus = true;
    } else {
      startNewStatus = false;
    }
    if (startNewStatus) {
      tableOperationTypeMap.put(tableKey, new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(), ((LongWatermark) watermark.getWatermark()).getValue() - 1, ((LongWatermark) watermark.getWatermark()).getValue()));
    }

    tableOperationTypeMap.get(tableKey).gmceHighWatermark = ((LongWatermark) watermark.getWatermark()).getValue();
    write(recordEnvelope, specsForNewFiles, specsForOldFiles, spec);
  }
  this.recordCount.incrementAndGet();
}
use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.
the class GobblinMCEProducer method sendGMCE.
/**
 * Builds a {@link GobblinMetadataChangeEvent} from the given file and offset information and passes it
 * to {@code underlyingSendGMCE} for emission.
 *
 * @param newFiles map from each new file's path to its metrics
 * @param oldFiles list of old files to be dropped
 * @param oldFilePrefixes list of old file prefixes, forwarded to the event builder together with {@code oldFiles}
 * @param offsetRange offset range of the new data, can be null
 * @param operationType the opcode of the GMCE emitted by this method
 * @param schemaSource schema source forwarded to the event builder
 * @throws IOException propagated from building or sending the event
 */
public void sendGMCE(Map<Path, Metrics> newFiles, List<String> oldFiles, List<String> oldFilePrefixes, Map<String, String> offsetRange, OperationType operationType, SchemaSource schemaSource) throws IOException {
  underlyingSendGMCE(
      getGobblinMetadataChangeEvent(newFiles, oldFiles, oldFilePrefixes, offsetRange, operationType, schemaSource));
}
use of org.apache.gobblin.metadata.GobblinMetadataChangeEvent in project incubator-gobblin by apache.
the class HiveMetadataWriter method writeEnvelope.
/**
 * Writes the GMCE carried by the envelope for the given table, provided that
 * {@code whitelistBlacklist} accepts the table; otherwise the table is skipped with a debug log entry.
 *
 * @param recordEnvelope envelope holding the GMCE as a generic Avro record
 * @param newSpecsMap Hive specs computed for the new files, forwarded to {@code write}
 * @param oldSpecsMap Hive specs computed for the old files, forwarded to {@code write}
 * @param tableSpec spec identifying the target database and table
 * @throws IOException propagated from {@code write}
 */
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, HiveSpec tableSpec) throws IOException {
  GenericRecord avroRecord = recordEnvelope.getRecord();
  // Copy using the record's own schema to obtain the specific GMCE representation.
  GobblinMetadataChangeEvent gmce =
      (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(avroRecord.getSchema(), avroRecord);

  String dbName = tableSpec.getTable().getDbName();
  String tableName = tableSpec.getTable().getTableName();
  if (!whitelistBlacklist.acceptTable(dbName, tableName)) {
    log.debug(String.format("Skip table %s.%s since it's not selected", dbName, tableName));
    return;
  }
  write(gmce, newSpecsMap, oldSpecsMap, tableSpec);
}
Aggregations