
Example 1 with OperationType

Use of org.apache.gobblin.metadata.OperationType in project incubator-gobblin by apache.

From the class GobblinMCEWriter, method writeEnvelope:

@Override
public void writeEnvelope(RecordEnvelope<GenericRecord> recordEnvelope) throws IOException {
    GenericRecord genericRecord = recordEnvelope.getRecord();
    CheckpointableWatermark watermark = recordEnvelope.getWatermark();
    Preconditions.checkNotNull(watermark);
    // Filter out events that were not emitted by one of the accepted clusters.
    if (!acceptedClusters.contains(genericRecord.get("cluster"))) {
        return;
    }
    // Use the schema from the record itself to avoid issues when the schema evolves.
    GobblinMetadataChangeEvent gmce = (GobblinMetadataChangeEvent) SpecificData.get().deepCopy(genericRecord.getSchema(), genericRecord);
    String datasetName = gmce.getDatasetIdentifier().toString();
    // Remove the cache of old hive specs after a flush.
    // We assume the new hive spec for a given path is always the same (the ingestion flow registers to the same tables).
    oldSpecsMaps.remove(datasetName);
    // Mapping from the URI of each arriving file's path to its list of HiveSpec objects.
    // To reduce the time spent computing HiveSpecs, we assume the table properties of a dataset do not change within one operation interval.
    ConcurrentHashMap<String, Collection<HiveSpec>> newSpecsMap = new ConcurrentHashMap<>();
    ConcurrentHashMap<String, Collection<HiveSpec>> oldSpecsMap = new ConcurrentHashMap<>();
    if (gmce.getNewFiles() != null) {
        State registerState = setHiveRegProperties(state, gmce, true);
        computeSpecMap(
            Lists.newArrayList(Iterables.transform(gmce.getNewFiles(), DataFile::getFilePath)),
            newSpecsMap,
            newSpecsMaps.computeIfAbsent(datasetName,
                t -> CacheBuilder.newBuilder()
                    .expireAfterAccess(
                        state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                        TimeUnit.HOURS)
                    .build()),
            registerState, false);
    }
    if (gmce.getOldFilePrefixes() != null) {
        State registerState = setHiveRegProperties(state, gmce, false);
        computeSpecMap(
            gmce.getOldFilePrefixes(),
            oldSpecsMap,
            oldSpecsMaps.computeIfAbsent(datasetName,
                t -> CacheBuilder.newBuilder()
                    .expireAfterAccess(
                        state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                        TimeUnit.HOURS)
                    .build()),
            registerState, true);
    } else if (gmce.getOldFiles() != null) {
        State registerState = setHiveRegProperties(state, gmce, false);
        computeSpecMap(
            gmce.getOldFiles(),
            oldSpecsMap,
            oldSpecsMaps.computeIfAbsent(datasetName,
                t -> CacheBuilder.newBuilder()
                    .expireAfterAccess(
                        state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, MetadataWriter.DEFAULT_CACHE_EXPIRING_TIME),
                        TimeUnit.HOURS)
                    .build()),
            registerState, false);
    }
    if (newSpecsMap.isEmpty() && oldSpecsMap.isEmpty()) {
        return;
    }
    // Sampling one entry among all "Path <--> List<HiveSpec>" pairs is sufficient. Reasoning:
    // 0. The objective here is to execute metadata registration for all target table destinations of a dataset;
    // 1. a GMCE guarantees that all paths come from a single dataset (though not necessarily a single "partition" in Hive's layout);
    // 2. the HiveSpecs of paths from one dataset should target the same set of table destinations;
    // 3. therefore, fetching one path's HiveSpec and iterating over it covers all table destinations.
    Collection<HiveSpec> specs = newSpecsMap.isEmpty() ? oldSpecsMap.values().iterator().next() : newSpecsMap.values().iterator().next();
    for (HiveSpec spec : specs) {
        String dbName = spec.getTable().getDbName();
        String tableName = spec.getTable().getTableName();
        String tableString = Joiner.on(TABLE_NAME_DELIMITER).join(dbName, tableName);
        if (!tableOperationTypeMap.containsKey(tableString)) {
            tableOperationTypeMap.put(tableString,
                new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(),
                    ((LongWatermark) watermark.getWatermark()).getValue() - 1, ((LongWatermark) watermark.getWatermark()).getValue()));
        } else if (tableOperationTypeMap.get(tableString).operationType != gmce.getOperationType()
            && gmce.getOperationType() != OperationType.change_property) {
            // The operation type changed for this table (and is not a property-only change),
            // so flush the buffered batch before starting a new one.
            flush(dbName, tableName);
            tableOperationTypeMap.put(tableString,
                new TableStatus(gmce.getOperationType(), gmce.getDatasetIdentifier().getNativeName(), watermark.getSource(),
                    ((LongWatermark) watermark.getWatermark()).getValue() - 1, ((LongWatermark) watermark.getWatermark()).getValue()));
        }
        tableOperationTypeMap.get(tableString).gmceHighWatermark = ((LongWatermark) watermark.getWatermark()).getValue();
        write(recordEnvelope, newSpecsMap, oldSpecsMap, spec);
    }
    this.recordCount.incrementAndGet();
}
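
The three computeSpecMap(...) calls above each construct the same per-dataset Guava cache inline. Below is a minimal, self-contained sketch of that pattern; the class and method names (SpecCacheExample, specCacheFor) and the 24-hour default are hypothetical stand-ins for the Gobblin state lookups, and the cached value type is simplified to Collection<String>.

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

public class SpecCacheExample {

    // Hypothetical stand-in for state.getPropAsInt(MetadataWriter.CACHE_EXPIRING_TIME, ...).
    private static final int CACHE_EXPIRY_HOURS = 24;

    // Mirrors newSpecsMaps/oldSpecsMaps: one cache per dataset, keyed by file path URI.
    private final Map<String, Cache<String, Collection<String>>> specsMaps = new ConcurrentHashMap<>();

    Cache<String, Collection<String>> specCacheFor(String datasetName) {
        // computeIfAbsent gives each dataset exactly one cache; entries expire
        // once they have not been accessed for the configured number of hours.
        return specsMaps.computeIfAbsent(datasetName,
            d -> CacheBuilder.newBuilder()
                .expireAfterAccess(CACHE_EXPIRY_HOURS, TimeUnit.HOURS)
                .build());
    }
}

Because each cache is keyed per dataset and evicted on access timeout, a dataset that stops receiving events eventually drops its cached HiveSpecs instead of holding them for the lifetime of the writer.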
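
The tableOperationTypeMap logic batches consecutive GMCEs per table and flushes whenever the OperationType changes (property-only changes excepted). Here is a minimal sketch of that batching rule, assuming a hypothetical Op enum and a simplified TableStatus; it illustrates the pattern, not the Gobblin implementation.

import java.util.HashMap;
import java.util.Map;

public class OperationTypeBatcher {

    // Hypothetical subset of org.apache.gobblin.metadata.OperationType values.
    enum Op { add_files, drop_files, rewrite_files, change_property }

    // Simplified stand-in for TableStatus: the pending operation type plus the
    // watermark range covered by the events buffered so far.
    static class TableStatus {
        Op operationType;
        long lowWatermark;
        long highWatermark;
        TableStatus(Op operationType, long lowWatermark, long highWatermark) {
            this.operationType = operationType;
            this.lowWatermark = lowWatermark;
            this.highWatermark = highWatermark;
        }
    }

    private final Map<String, TableStatus> tableOperationTypeMap = new HashMap<>();

    void accept(String table, Op op, long watermark) {
        TableStatus status = tableOperationTypeMap.get(table);
        if (status != null && status.operationType != op && op != Op.change_property) {
            // Mixing, say, add_files and drop_files in one batch would make the
            // pending registration ambiguous, so flush the old batch first.
            flush(table);
            status = null;
        }
        if (status == null) {
            tableOperationTypeMap.put(table, new TableStatus(op, watermark - 1, watermark));
        } else {
            status.highWatermark = watermark;
        }
    }

    void flush(String table) {
        // Real code would register the buffered metadata here; the sketch just clears state.
        tableOperationTypeMap.remove(table);
    }
}

Note how a new batch's low watermark is set to one less than the first event's watermark, matching the getValue() - 1 in the original method.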
Also used: FileSystem(org.apache.hadoop.fs.FileSystem) GobblinConstructorUtils(org.apache.gobblin.util.reflection.GobblinConstructorUtils) ClustersNames(org.apache.gobblin.util.ClustersNames) Map(java.util.Map) AvroSerdeUtils(org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils) Path(org.apache.hadoop.fs.Path) Splitter(com.google.common.base.Splitter) SpecificData(org.apache.avro.specific.SpecificData) DataFile(org.apache.gobblin.metadata.DataFile) Schema(org.apache.avro.Schema) Descriptor(org.apache.gobblin.dataset.Descriptor) Collection(java.util.Collection) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) Instrumented(org.apache.gobblin.instrumented.Instrumented) GobblinMetadataChangeEvent(org.apache.gobblin.metadata.GobblinMetadataChangeEvent) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) State(org.apache.gobblin.configuration.State) HiveRegistrationPolicyBase(org.apache.gobblin.hive.policy.HiveRegistrationPolicyBase) Set(java.util.Set) GobblinEventBuilder(org.apache.gobblin.metrics.event.GobblinEventBuilder) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) CacheBuilder(com.google.common.cache.CacheBuilder) Joiner(com.google.common.base.Joiner) Iterables(com.google.common.collect.Iterables) Setter(lombok.Setter) Getter(lombok.Getter) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) MetricContext(org.apache.gobblin.metrics.MetricContext) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) ArrayList(java.util.ArrayList) HiveRegistrationPolicy(org.apache.gobblin.hive.policy.HiveRegistrationPolicy) MetadataWriter(org.apache.gobblin.hive.writer.MetadataWriter) Lists(com.google.common.collect.Lists) Closer(com.google.common.io.Closer) DataWriterBuilder(org.apache.gobblin.writer.DataWriterBuilder) ParallelRunner(org.apache.gobblin.util.ParallelRunner) HadoopUtils(org.apache.gobblin.util.HadoopUtils) GenericRecord(org.apache.avro.generic.GenericRecord) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) DataWriter(org.apache.gobblin.writer.DataWriter) TimeUnit(java.util.concurrent.TimeUnit) EventSubmitter(org.apache.gobblin.metrics.event.EventSubmitter) AtomicLong(java.util.concurrent.atomic.AtomicLong) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) OperationType(org.apache.gobblin.metadata.OperationType) Preconditions(com.google.common.base.Preconditions) Cache(com.google.common.cache.Cache) AllArgsConstructor(lombok.AllArgsConstructor) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Aggregations

Joiner (com.google.common.base.Joiner)1 Preconditions (com.google.common.base.Preconditions)1 Splitter (com.google.common.base.Splitter)1 Cache (com.google.common.cache.Cache)1 CacheBuilder (com.google.common.cache.CacheBuilder)1 Iterables (com.google.common.collect.Iterables)1 Lists (com.google.common.collect.Lists)1 Closer (com.google.common.io.Closer)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 Callable (java.util.concurrent.Callable)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 AllArgsConstructor (lombok.AllArgsConstructor)1