Search in sources :

Example 1 with PartitionIdentifier

Use of org.apache.gobblin.writer.PartitionIdentifier in the project incubator-gobblin by Apache.

From the class BaseDataPublisher, the method getMergedMetadataForPartitionAndBranch.

/**
 * Returns the merged metadata for the given partition and branch, assuming all
 * intermediate metadata has already been fed into the corresponding
 * {@link MetadataMerger}.
 *
 * <p>If no merger has been registered for this {partition, branch} pair,
 * returns {@code null}; callers are expected to fall back to the
 * user-supplied metadata from job config in that case.
 *
 * @param partitionId partition key identifying the output partition (may be null for unpartitioned output)
 * @param branchId    fork branch index
 * @return the merged metadata string, or {@code null} when no merger exists
 *         or the merger produced no metadata
 */
private String getMergedMetadataForPartitionAndBranch(String partitionId, int branchId) {
    MetadataMerger<String> merger = metadataMergers.get(new PartitionIdentifier(partitionId, branchId));
    if (merger == null) {
        // No merger configured for this {partition, branch} - nothing to merge.
        return null;
    }
    String merged = merger.getMergedMetadata();
    if (merged == null) {
        // A registered merger should always produce something; null suggests a merger bug.
        LOG.warn("Metadata merger for branch {} returned null - bug in merger?", branchId);
    }
    return merged;
}
Also used : PartitionIdentifier(org.apache.gobblin.writer.PartitionIdentifier)

Example 2 with PartitionIdentifier

Use of org.apache.gobblin.writer.PartitionIdentifier in the project incubator-gobblin by Apache.

From the class BaseDataPublisher, the method mergeMetadataAndCollectPartitionNames.

/**
 * Feeds per-branch metadata and per-partition writer metrics into the metadata
 * mergers, and collects all partition paths seen across the given work units.
 *
 * <p>Metadata that we publish can come from several places:
 * <ul>
 *   <li>It can be passed in job config (DATA_PUBLISHER_METADATA_STR)</li>
 *   <li>It can be picked up from previous runs of a job (if the output partition
 *       already exists) -- both of the above are handled when we construct a new
 *       MetadataMerger</li>
 *   <li>The source/converters/writers associated with each branch of a job may add
 *       their own metadata (eg: this dataset is encrypted using AES256). This is
 *       returned by getIntermediateMetadataFromState() and fed into the
 *       MetadataMerger.</li>
 *   <li>FsWriterMetrics can be emitted and rolled up into metadata. These metrics
 *       are specific to a {partition, branch} combo as they mention per-output file
 *       metrics. This is also fed into metadata mergers.</li>
 * </ul>
 *
 * <p>Each writer should only be a part of one branch, but it may be responsible
 * for multiple partitions.
 *
 * @param partitionPaths output set, mutated in place: accumulates every partition
 *                       path found (a single {@code null} is added for work units
 *                       with no partitions)
 */
private void mergeMetadataAndCollectPartitionNames(Collection<? extends WorkUnitState> states, Set<String> partitionPaths) {
    for (WorkUnitState workUnitState : states) {
        // First extract the partition paths and metrics from the work unit. This is essentially
        // equivalent to grouping FsWriterMetrics by {partitionKey, branchId} and extracting
        // all partitionPaths into a set.
        Map<PartitionIdentifier, Set<FsWriterMetrics>> metricsByPartition = new HashMap<>();
        boolean partitionFound = false;
        for (Map.Entry<Object, Object> property : workUnitState.getProperties().entrySet()) {
            if (((String) property.getKey()).startsWith(ConfigurationKeys.WRITER_PARTITION_PATH_KEY)) {
                partitionPaths.add((String) property.getValue());
                partitionFound = true;
            } else if (((String) property.getKey()).startsWith(FsDataWriter.FS_WRITER_METRICS_KEY)) {
                try {
                    FsWriterMetrics parsedMetrics = FsWriterMetrics.fromJson((String) property.getValue());
                    partitionPaths.add(parsedMetrics.getPartitionInfo().getPartitionKey());
                    // Group metrics by their {partition, branch} identifier.
                    Set<FsWriterMetrics> metricsForPartition = metricsByPartition.computeIfAbsent(parsedMetrics.getPartitionInfo(), k -> new HashSet<>());
                    metricsForPartition.add(parsedMetrics);
                } catch (IOException e) {
                    // Pass the exception as the last SLF4J argument so the cause/stack trace
                    // is logged rather than silently dropped.
                    LOG.warn("Error parsing metrics from property {} - ignoring", (String) property.getValue(), e);
                }
            }
        }
        // no specific partitions - add null as a placeholder
        if (!partitionFound) {
            partitionPaths.add(null);
        }
        final String configBasedMetadata = getMetadataFromWorkUnitState(workUnitState);
        // Now update all metadata mergers with branch metadata + partition metrics
        for (int branchId = 0; branchId < numBranches; branchId++) {
            for (String partition : partitionPaths) {
                PartitionIdentifier partitionIdentifier = new PartitionIdentifier(partition, branchId);
                // Effectively-final copy for capture by the lambda below.
                final int branch = branchId;
                MetadataMerger<String> mdMerger = metadataMergers.computeIfAbsent(partitionIdentifier, k -> buildMetadataMergerForBranch(configBasedMetadata, branch, getMetadataOutputFileForBranch(workUnitState, branch)));
                if (shouldPublishWriterMetadataForBranch(branchId)) {
                    String md = getIntermediateMetadataFromState(workUnitState, branchId);
                    mdMerger.update(md);
                    // Fold in any file-level writer metrics recorded for this {partition, branch}.
                    Set<FsWriterMetrics> metricsForPartition = metricsByPartition.getOrDefault(partitionIdentifier, Collections.emptySet());
                    for (FsWriterMetrics metrics : metricsForPartition) {
                        mdMerger.update(metrics);
                    }
                }
            }
        }
    }
}
Also used : RetryerFactory(org.apache.gobblin.util.retry.RetryerFactory) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) FileStatus(org.apache.hadoop.fs.FileStatus) FsPermission(org.apache.hadoop.fs.permission.FsPermission) GobblinConstructorUtils(org.apache.gobblin.util.reflection.GobblinConstructorUtils) ConfigBuilder(org.apache.gobblin.config.ConfigBuilder) Optional(com.google.common.base.Optional) MetadataMerger(org.apache.gobblin.metadata.MetadataMerger) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) PathUtils(org.apache.gobblin.util.PathUtils) URI(java.net.URI) PartitionIdentifier(org.apache.gobblin.writer.PartitionIdentifier) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StaticStringMetadataMerger(org.apache.gobblin.metadata.types.StaticStringMetadataMerger) FileListUtils(org.apache.gobblin.util.FileListUtils) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) State(org.apache.gobblin.configuration.State) Set(java.util.Set) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) StandardCharsets(java.nio.charset.StandardCharsets) Sets(com.google.common.collect.Sets) Objects(java.util.Objects) LineageInfo(org.apache.gobblin.metrics.event.lineage.LineageInfo) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) ForkOperatorUtils(org.apache.gobblin.util.ForkOperatorUtils) DatasetConstants(org.apache.gobblin.dataset.DatasetConstants) HashMap(java.util.HashMap) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) Closer(com.google.common.io.Closer) ParallelRunner(org.apache.gobblin.util.ParallelRunner) ConfigFactory(com.typesafe.config.ConfigFactory) HadoopUtils(org.apache.gobblin.util.HadoopUtils) Logger(org.slf4j.Logger) 
Config(com.typesafe.config.Config) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) Maps(com.google.common.collect.Maps) FsDataWriter(org.apache.gobblin.writer.FsDataWriter) TimeUnit(java.util.concurrent.TimeUnit) ConfigRenderOptions(com.typesafe.config.ConfigRenderOptions) SourceState(org.apache.gobblin.configuration.SourceState) WriterUtils(org.apache.gobblin.util.WriterUtils) Collections(java.util.Collections) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) IOException(java.io.IOException) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) PartitionIdentifier(org.apache.gobblin.writer.PartitionIdentifier) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) HashSet(java.util.HashSet)

Aggregations

Optional (com.google.common.base.Optional)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 Lists (com.google.common.collect.Lists)1 Maps (com.google.common.collect.Maps)1 Sets (com.google.common.collect.Sets)1 Closer (com.google.common.io.Closer)1 Config (com.typesafe.config.Config)1 ConfigFactory (com.typesafe.config.ConfigFactory)1 ConfigRenderOptions (com.typesafe.config.ConfigRenderOptions)1 IOException (java.io.IOException)1 URI (java.net.URI)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 Objects (java.util.Objects)1