Use of org.apache.gobblin.writer.PartitionIdentifier in project incubator-gobblin by apache: class BaseDataPublisher, method getMergedMetadataForPartitionAndBranch.
/*
 * Get the merged metadata for a given partition id and branch id. This method assumes
 * all intermediate metadata has already been passed to the MetadataMerger.
 *
 * If metadata mergers are not configured, the metadata from the job config that was
 * passed in by the user is returned instead.
 */
private String getMergedMetadataForPartitionAndBranch(String partitionId, int branchId) {
  String mergedMd = null;
  MetadataMerger<String> mergerForBranch = metadataMergers.get(new PartitionIdentifier(partitionId, branchId));
  if (mergerForBranch != null) {
    mergedMd = mergerForBranch.getMergedMetadata();
    if (mergedMd == null) {
      LOG.warn("Metadata merger for branch {} returned null - bug in merger?", branchId);
    }
  }
  return mergedMd;
}
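The lookup above constructs a fresh PartitionIdentifier and uses it as the key into the metadataMergers map, which only works if PartitionIdentifier provides value-based equals() and hashCode(). The minimal sketch below models that kind of {partition, branch} key; the class name PartitionBranchKey and its layout are hypothetical stand-ins, not Gobblin's actual implementation.

  import java.util.Objects;

  // Hypothetical stand-in for a {partition, branch} value key; the real
  // org.apache.gobblin.writer.PartitionIdentifier may differ. Value-based equality is
  // what lets metadataMergers.get(new PartitionIdentifier(partitionId, branchId)) find
  // the merger that was registered under an equal, but not identical, key instance.
  final class PartitionBranchKey {
    private final String partitionId; // may be null when a writer reported no partitions
    private final int branchId;

    PartitionBranchKey(String partitionId, int branchId) {
      this.partitionId = partitionId;
      this.branchId = branchId;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof PartitionBranchKey)) {
        return false;
      }
      PartitionBranchKey that = (PartitionBranchKey) o;
      return branchId == that.branchId && Objects.equals(partitionId, that.partitionId);
    }

    @Override
    public int hashCode() {
      return Objects.hash(partitionId, branchId);
    }
  }

With keys like this, two independently constructed identifiers for the same partition and branch resolve to the same map entry, which is exactly what getMergedMetadataForPartitionAndBranch relies on.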
Use of org.apache.gobblin.writer.PartitionIdentifier in project incubator-gobblin by apache: class BaseDataPublisher, method mergeMetadataAndCollectPartitionNames.
/*
 * Metadata that we publish can come from several places:
 *  - It can be passed in via job config (DATA_PUBLISHER_METADATA_STR)
 *  - It can be picked up from previous runs of a job (if the output partition already exists)
 *    -- The above two are handled when we construct a new MetadataMerger
 *
 *  - The source/converters/writers associated with each branch of a job may add their own metadata
 *    (e.g., this dataset is encrypted using AES256). This is returned by getIntermediateMetadataFromState()
 *    and fed into the MetadataMerger.
 *  - FsWriterMetrics can be emitted and rolled up into metadata. These metrics are specific to a
 *    {partition, branch} combination since they contain per-output-file metrics. They are also fed
 *    into the metadata mergers.
 *
 * Each writer should only be part of one branch, but it may be responsible for multiple partitions.
 */
private void mergeMetadataAndCollectPartitionNames(Collection<? extends WorkUnitState> states,
    Set<String> partitionPaths) {
  for (WorkUnitState workUnitState : states) {
    // First extract the partition paths and metrics from the work unit. This is essentially
    // equivalent to grouping FsWriterMetrics by {partitionKey, branchId} and extracting
    // all partitionPaths into a set.
    Map<PartitionIdentifier, Set<FsWriterMetrics>> metricsByPartition = new HashMap<>();
    boolean partitionFound = false;
    for (Map.Entry<Object, Object> property : workUnitState.getProperties().entrySet()) {
      if (((String) property.getKey()).startsWith(ConfigurationKeys.WRITER_PARTITION_PATH_KEY)) {
        partitionPaths.add((String) property.getValue());
        partitionFound = true;
      } else if (((String) property.getKey()).startsWith(FsDataWriter.FS_WRITER_METRICS_KEY)) {
        try {
          FsWriterMetrics parsedMetrics = FsWriterMetrics.fromJson((String) property.getValue());
          partitionPaths.add(parsedMetrics.getPartitionInfo().getPartitionKey());
          Set<FsWriterMetrics> metricsForPartition =
              metricsByPartition.computeIfAbsent(parsedMetrics.getPartitionInfo(), k -> new HashSet<>());
          metricsForPartition.add(parsedMetrics);
        } catch (IOException e) {
          LOG.warn("Error parsing metrics from property {} - ignoring", (String) property.getValue());
        }
      }
    }

    // no specific partitions - add null as a placeholder
    if (!partitionFound) {
      partitionPaths.add(null);
    }

    final String configBasedMetadata = getMetadataFromWorkUnitState(workUnitState);

    // Now update all metadata mergers with branch metadata + partition metrics
    for (int branchId = 0; branchId < numBranches; branchId++) {
      for (String partition : partitionPaths) {
        PartitionIdentifier partitionIdentifier = new PartitionIdentifier(partition, branchId);
        final int branch = branchId;
        MetadataMerger<String> mdMerger = metadataMergers.computeIfAbsent(partitionIdentifier,
            k -> buildMetadataMergerForBranch(configBasedMetadata, branch,
                getMetadataOutputFileForBranch(workUnitState, branch)));
        if (shouldPublishWriterMetadataForBranch(branchId)) {
          String md = getIntermediateMetadataFromState(workUnitState, branchId);
          mdMerger.update(md);
          Set<FsWriterMetrics> metricsForPartition =
              metricsByPartition.getOrDefault(partitionIdentifier, Collections.emptySet());
          for (FsWriterMetrics metrics : metricsForPartition) {
            mdMerger.update(metrics);
          }
        }
      }
    }
  }
}
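Taken together, the two methods form a two-phase flow: mergeMetadataAndCollectPartitionNames feeds intermediate metadata and FsWriterMetrics into one merger per {partition, branch}, and getMergedMetadataForPartitionAndBranch later reads the merged result back out per key. The standalone sketch below models that flow with plain strings in place of Gobblin's MetadataMerger and FsWriterMetrics; all names here are illustrative, not part of the Gobblin API.

  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.HashMap;
  import java.util.HashSet;
  import java.util.List;
  import java.util.Map;
  import java.util.Set;

  public class MetadataMergeSketch {
    public static void main(String[] args) {
      int numBranches = 2;
      Set<String> partitionPaths = new HashSet<>(Arrays.asList("2024-01-01", "2024-01-02"));

      // One "merger" per {partition, branch}; a list of metadata fragments stands in
      // for Gobblin's MetadataMerger<String>.
      Map<String, List<String>> mergers = new HashMap<>();

      // Phase 1: while iterating work unit states, feed intermediate metadata into the
      // merger for each {partition, branch}, creating mergers lazily as in the code above.
      for (String partition : partitionPaths) {
        for (int branchId = 0; branchId < numBranches; branchId++) {
          String key = partition + "|branch-" + branchId;
          mergers.computeIfAbsent(key, k -> new ArrayList<>())
              .add("writer-metadata-for-" + key);
        }
      }

      // Phase 2: at publish time, look up the merged result per key, mirroring
      // getMergedMetadataForPartitionAndBranch().
      for (Map.Entry<String, List<String>> entry : mergers.entrySet()) {
        String mergedMd = String.join(",", entry.getValue());
        System.out.println(entry.getKey() + " -> " + mergedMd);
      }
    }
  }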