Use of org.apache.gobblin.metadata.OperationType in project incubator-gobblin by Apache:
the writeEnvelope method of class GobblinMCEWriter.
@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope) throws IOException {
  GenericRecord genericRecord = recordEnvelope.getRecord();
  CheckpointableWatermark watermark = recordEnvelope.getWatermark();
  Preconditions.checkNotNull(watermark);
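  // The watermark carries the GMCE offset for this record; it drives the per-table
  // low/high watermark bookkeeping further below, hence the null check above.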
  // Filter out events that were not emitted by an accepted cluster
  if (!acceptedClusters.contains(genericRecord.get("cluster"))) {
    return;
  }
  // Use the schema from the record itself to avoid issues when the schema has evolved
  GobblinMetadataChangeEvent gmce =
      (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
  String datasetName = gmce.getDatasetIdentifier().toString();
  // Remove the old hive spec cache after flush.
  // Here we assume that the new hive spec for a given path is always the same
  // (the ingestion flow registers to the same tables).
  oldSpecsMaps.remove(datasetName);
  // Mapping from the URI of an arrived file's path to its list of HiveSpec objects.
  // We assume that within one operation interval the table properties of a dataset do not
  // change, which lets us reduce the time spent computing HiveSpecs.
  ConcurrentHashMap<String, Collection<HiveSpec>> newSpecsMap = new ConcurrentHashMap<>();
  ConcurrentHashMap<String, Collection<HiveSpec>> oldSpecsMap = new ConcurrentHashMap<>();
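  // Compute HiveSpecs for the newly arrived files; the per-dataset Guava cache
  // (expire-after-access) avoids recomputing specs for recently seen paths.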
  if (gmce.getNewFiles() != null) {
    State registerState = setHiveRegProperties(state, gmce, true);
    computeSpecMap(Lists.newArrayList(Iterables.transform(gmce.getNewFiles(), DataFile::getFilePath)),
        newSpecsMap,
        newSpecsMaps.computeIfAbsent(datasetName, t -> CacheBuilder.newBuilder()
            .expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME,
                MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS)
            .build()),
        registerState, false);
  }
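  // Old files may arrive either as path prefixes or as an explicit file list;
  // per the if/else-if below, prefixes take precedence when both are present.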
  if (gmce.getOldFilePrefixes() != null) {
    State registerState = setHiveRegProperties(state, gmce, false);
    computeSpecMap(gmce.getOldFilePrefixes(), oldSpecsMap,
        oldSpecsMaps.computeIfAbsent(datasetName, t -> CacheBuilder.newBuilder()
            .expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME,
                MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS)
            .build()),
        registerState, true);
  } else if (gmce.getOldFiles() != null) {
    State registerState = setHiveRegProperties(state, gmce, false);
    computeSpecMap(gmce.getOldFiles(), oldSpecsMap,
        oldSpecsMaps.computeIfAbsent(datasetName, t -> CacheBuilder.newBuilder()
            .expireAfterAccess(state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME,
                MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME), TimeUnit.HOURS)
            .build()),
        registerState, false);
  }
  if (newSpecsMap.isEmpty() && oldSpecsMap.isEmpty()) {
    return;
  }
  // Sampling one entry among all "Path <--> List<HiveSpec>" pairs is good enough. Reasoning:
  // 0. The objective here is to execute metadata registration for all target table destinations of a dataset;
  // 1. a GMCE guarantees that all paths come from a single dataset (though not necessarily a single "partition" in Hive's layout);
  // 2. the HiveSpecs of paths from one dataset should target the same set of table destinations;
  // 3. therefore fetching one path's HiveSpec and iterating through it is enough to cover all table destinations.
  Collection<HiveSpec> specs =
      newSpecsMap.isEmpty() ? oldSpecsMap.values().iterator().next() : newSpecsMap.values().iterator().next();
  for (HiveSpec spec : specs) {
    String dbName = spec.getTable().getDbName();
    String tableName = spec.getTable().getTableName();
    String tableString = Joiner.on(TABLE_NAME_DELIMITER).join(dbName, tableName);
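    // Track the in-flight operation per table. The low watermark is set to one less than the
    // current offset, so the table's un-flushed range covers this GMCE. If the operation type
    // changes (other than to change_property), buffered metadata is flushed before the new
    // operation is recorded.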
    if (!tableOperationTypeMap.containsKey(tableString)) {
      tableOperationTypeMap.put(tableString, new TableStatus(gmce.getOperationType(),
          gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(),
          ((LongWatermark) watermark.getWatermark()).getValue() - 1,
          ((LongWatermark) watermark.getWatermark()).getValue()));
    } else if (tableOperationTypeMap.get(tableString).operationType != gmce.getOperationType()
        && gmce.getOperationType() != OperationType.change_property) {
      flush(dbName, tableName);
      tableOperationTypeMap.put(tableString, new TableStatus(gmce.getOperationType(),
          gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(),
          ((LongWatermark) watermark.getWatermark()).getValue() - 1,
          ((LongWatermark) watermark.getWatermark()).getValue()));
    }
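    // Always advance the table's high watermark to the current GMCE offset, even when the
    // operation type is unchanged.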
    tableOperationTypeMap.get(tableString).gmceHighWatermark = ((LongWatermark) watermark.getWatermark()).getValue();
    write(recordEnvelope, newSpecsMap, oldSpecsMap, spec);
  }
  this.recordCount.incrementAndGet();
}
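For reference, a minimal sketch of the TableStatus holder implied by the usage above. Only operationType and gmceHighWatermark appear in the snippet itself, so the remaining field names and types are assumptions inferred from the constructor arguments.

// Sketch only: field names other than operationType and gmceHighWatermark are assumed.
class TableStatus {
  OperationType operationType;  // operation currently being accumulated for this table
  String datasetName;           // native name of the originating dataset (assumed name)
  String gmceTopicPartition;    // source of the watermark, e.g. a topic partition (assumed name)
  long gmceLowWatermark;        // offset just before the first un-flushed GMCE (assumed name)
  long gmceHighWatermark;       // offset of the latest GMCE applied to this table

  TableStatus(OperationType operationType, String datasetName, String gmceTopicPartition,
      long gmceLowWatermark, long gmceHighWatermark) {
    this.operationType = operationType;
    this.datasetName = datasetName;
    this.gmceTopicPartition = gmceTopicPartition;
    this.gmceLowWatermark = gmceLowWatermark;
    this.gmceHighWatermark = gmceHighWatermark;
  }
}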