Search in sources :

Example 6 with FsWriterMetrics

Use of org.apache.gobblin.writer.FsWriterMetrics in the project incubator-gobblin by Apache.

From the class BaseDataPublisher, the method mergeMetadataAndCollectPartitionNames:

/*
   * Metadata that we publish can come from several places:
   *  - It can be passed in job config (DATA_PUBLISHER_METADATA_STR)
   *  - It can be picked up from previous runs of a job (if the output partition already exists)
   *  -- The above two are handled when we construct a new MetadataMerger
   *
   *  - The source/converters/writers associated with each branch of a job may add their own metadata
   *    (eg: this dataset is encrypted using AES256). This is returned by getIntermediateMetadataFromState()
   *    and fed into the MetadataMerger.
   *  - FsWriterMetrics can be emitted and rolled up into metadata. These metrics are specific to a {partition, branch}
   *    combo as they mention per-output file metrics. This is also fed into metadata mergers.
   *
   *  Each writer should only be a part of one branch, but it may be responsible for multiple partitions.
   */
private void mergeMetadataAndCollectPartitionNames(Collection<? extends WorkUnitState> states, Set<String> partitionPaths) {
    for (WorkUnitState workUnitState : states) {
        // First extract the partition paths and metrics from the work unit. This is essentially
        // equivalent to grouping FsWriterMetrics by {partitionKey, branchId} and extracting
        // all partitionPaths into a set.
        Map<PartitionIdentifier, Set<FsWriterMetrics>> metricsByPartition = new HashMap<>();
        boolean partitionFound = false;
        for (Map.Entry<Object, Object> property : workUnitState.getProperties().entrySet()) {
            // Work-unit properties are Object/Object; keys are known to be Strings. Cast once.
            String propertyKey = (String) property.getKey();
            if (propertyKey.startsWith(ConfigurationKeys.WRITER_PARTITION_PATH_KEY)) {
                partitionPaths.add((String) property.getValue());
                partitionFound = true;
            } else if (propertyKey.startsWith(FsDataWriter.FS_WRITER_METRICS_KEY)) {
                try {
                    FsWriterMetrics parsedMetrics = FsWriterMetrics.fromJson((String) property.getValue());
                    partitionPaths.add(parsedMetrics.getPartitionInfo().getPartitionKey());
                    // NOTE(review): this branch adds a partition path but does not set partitionFound,
                    // so the null placeholder below is still added for work units whose only
                    // partitions come from writer metrics — confirm whether that is intentional.
                    metricsByPartition
                        .computeIfAbsent(parsedMetrics.getPartitionInfo(), k -> new HashSet<>())
                        .add(parsedMetrics);
                } catch (IOException e) {
                    // Pass the exception as the final argument so SLF4J logs the stack trace;
                    // previously it was dropped, making parse failures undiagnosable.
                    LOG.warn("Error parsing metrics from property {} - ignoring", property.getValue(), e);
                }
            }
        }
        // no specific partitions - add null as a placeholder
        if (!partitionFound) {
            partitionPaths.add(null);
        }
        final String configBasedMetadata = getMetadataFromWorkUnitState(workUnitState);
        // Now update all metadata mergers with branch metadata + partition metrics
        for (int branchId = 0; branchId < numBranches; branchId++) {
            for (String partition : partitionPaths) {
                PartitionIdentifier partitionIdentifier = new PartitionIdentifier(partition, branchId);
                // Effectively-final copy of the loop variable for use inside the lambda below.
                final int branch = branchId;
                // Lazily create one merger per {partition, branch}; seeded with config-based metadata
                // and the branch's metadata output file.
                MetadataMerger<String> mdMerger = metadataMergers.computeIfAbsent(partitionIdentifier,
                    k -> buildMetadataMergerForBranch(configBasedMetadata, branch, getMetadataOutputFileForBranch(workUnitState, branch)));
                if (shouldPublishWriterMetadataForBranch(branchId)) {
                    String md = getIntermediateMetadataFromState(workUnitState, branchId);
                    mdMerger.update(md);
                    // Roll any per-file writer metrics for this {partition, branch} into the merger.
                    Set<FsWriterMetrics> metricsForPartition = metricsByPartition.getOrDefault(partitionIdentifier, Collections.emptySet());
                    for (FsWriterMetrics metrics : metricsForPartition) {
                        mdMerger.update(metrics);
                    }
                }
            }
        }
    }
}
Also used : RetryerFactory(org.apache.gobblin.util.retry.RetryerFactory) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) FileStatus(org.apache.hadoop.fs.FileStatus) FsPermission(org.apache.hadoop.fs.permission.FsPermission) GobblinConstructorUtils(org.apache.gobblin.util.reflection.GobblinConstructorUtils) ConfigBuilder(org.apache.gobblin.config.ConfigBuilder) Optional(com.google.common.base.Optional) MetadataMerger(org.apache.gobblin.metadata.MetadataMerger) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) PathUtils(org.apache.gobblin.util.PathUtils) URI(java.net.URI) PartitionIdentifier(org.apache.gobblin.writer.PartitionIdentifier) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StaticStringMetadataMerger(org.apache.gobblin.metadata.types.StaticStringMetadataMerger) FileListUtils(org.apache.gobblin.util.FileListUtils) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) State(org.apache.gobblin.configuration.State) Set(java.util.Set) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) StandardCharsets(java.nio.charset.StandardCharsets) Sets(com.google.common.collect.Sets) Objects(java.util.Objects) LineageInfo(org.apache.gobblin.metrics.event.lineage.LineageInfo) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) ForkOperatorUtils(org.apache.gobblin.util.ForkOperatorUtils) DatasetConstants(org.apache.gobblin.dataset.DatasetConstants) HashMap(java.util.HashMap) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) Closer(com.google.common.io.Closer) ParallelRunner(org.apache.gobblin.util.ParallelRunner) ConfigFactory(com.typesafe.config.ConfigFactory) HadoopUtils(org.apache.gobblin.util.HadoopUtils) Logger(org.slf4j.Logger) 
Config(com.typesafe.config.Config) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) Maps(com.google.common.collect.Maps) FsDataWriter(org.apache.gobblin.writer.FsDataWriter) TimeUnit(java.util.concurrent.TimeUnit) ConfigRenderOptions(com.typesafe.config.ConfigRenderOptions) SourceState(org.apache.gobblin.configuration.SourceState) WriterUtils(org.apache.gobblin.util.WriterUtils) Collections(java.util.Collections) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) IOException(java.io.IOException) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) PartitionIdentifier(org.apache.gobblin.writer.PartitionIdentifier) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) HashSet(java.util.HashSet)

Aggregations

State (org.apache.gobblin.configuration.State)6 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)6 FsWriterMetrics (org.apache.gobblin.writer.FsWriterMetrics)6 File (java.io.File)5 GlobalMetadata (org.apache.gobblin.metadata.types.GlobalMetadata)4 TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState)4 Test (org.testng.annotations.Test)4 ImmutableList (com.google.common.collect.ImmutableList)2 ConfigFactory (com.typesafe.config.ConfigFactory)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 StandardCharsets (java.nio.charset.StandardCharsets)2 Collections (java.util.Collections)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 IOUtils (org.apache.commons.io.IOUtils)2 ConfigurationKeys (org.apache.gobblin.configuration.ConfigurationKeys)2