Use of co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext in project cdap by caskdata.
The example below is the commitJob method of the class DynamicPartitioningOutputCommitter.
@Override
public void commitJob(JobContext context) throws IOException {
  Configuration configuration = context.getConfiguration();
  MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
  BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

  String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
  outputDataset = taskContext.getDataset(outputDatasetName);
  Partitioning partitioning = outputDataset.getPartitioning();

  Set<PartitionKey> partitionsToAdd = new HashSet<>();
  relativePaths = new HashSet<>();
  // Go over all files in the temporary directory and keep track of partitions to add for them
  FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
  for (FileStatus committedTaskPath : allCommittedTaskPaths) {
    FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
    RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
    while (fileIter.hasNext()) {
      Path path = fileIter.next().getPath();
      String relativePath = getRelative(committedTaskPath.getPath(), path);

      int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
      if (lastPathSepIdx == -1) {
        // this shouldn't happen because each relative path should consist of at least one partition key and
        // the output file name
        LOG.warn("Skipping path '{}'. Its relative path '{}' has fewer than two parts", path, relativePath);
        continue;
      }
      // relativePath = "../key1/key2/part-m-00000"
      // relativeDir = "../key1/key2"
      // fileName = "part-m-00000"
      String relativeDir = relativePath.substring(0, lastPathSepIdx);
      String fileName = relativePath.substring(lastPathSepIdx + 1);

      Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
      if (fs.exists(finalDir)) {
        throw new FileAlreadyExistsException("Final output path " + finalDir + " already exists");
      }
      PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
      partitionsToAdd.add(partitionKey);
      relativePaths.add(relativeDir);
    }
  }

  // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
  // the original outputDir.
  Path finalOutput = FileOutputFormat.getOutputPath(context);
  FileSystem fs = finalOutput.getFileSystem(configuration);
  for (FileStatus stat : getAllCommittedTaskPaths(context)) {
    mergePaths(fs, stat, finalOutput);
  }

  // compute the metadata to be written to every output partition
  Map<String, String> metadata =
    ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
                                             PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

  // create all the necessary partitions
  for (PartitionKey partitionKey : partitionsToAdd) {
    PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
    partitionOutput.setMetadata(metadata);
    partitionOutput.addPartition();
  }

  // close the TaskContext, which flushes dataset operations
  try {
    taskContext.flushOperations();
  } catch (Exception e) {
    Throwables.propagateIfPossible(e, IOException.class);
    throw new IOException(e);
  }

  // delete the job-specific _temporary folder and create a _done file in the output folder
  cleanupJob(context);

  // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
  if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
    for (String relativePath : relativePaths) {
      Path pathToMark = new Path(finalOutput, relativePath);
      Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
      fs.createNewFile(markerPath);
    }
  }
}
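The method leans on two private helpers that this page does not show: getRelative, which expresses a committed file's path relative to its task directory, and getPartitionKey, which maps a relative directory such as "2017-01-01/7" onto a PartitionKey using the dataset's Partitioning. The bodies below are a minimal sketch of that idea written against the public CDAP types; they are illustrative assumptions, not the actual cdap implementation.

// Illustrative sketch only; the real helpers are private to DynamicPartitioningOutputCommitter
// and may differ in detail.

// Express 'path' relative to 'base', e.g. base=".../task_xyz" and
// path=".../task_xyz/key1/part-m-00000" yields "key1/part-m-00000".
private String getRelative(Path base, Path path) {
  return base.toUri().relativize(path.toUri()).getPath();
}

// Zip the segments of a relative directory ("value1/value2/...") with the partitioning's
// fields, which Partitioning keeps in declaration order.
private PartitionKey getPartitionKey(Partitioning partitioning, String relativeDir) {
  String[] segments = relativeDir.split(Path.SEPARATOR);
  List<String> fieldNames = new ArrayList<>(partitioning.getFields().keySet());
  if (segments.length != fieldNames.size()) {
    throw new IllegalArgumentException(String.format(
      "Relative path '%s' does not have one segment per partition field %s", relativeDir, fieldNames));
  }
  PartitionKey.Builder builder = PartitionKey.builder();
  for (int i = 0; i < segments.length; i++) {
    String name = fieldNames.get(i);
    switch (partitioning.getFields().get(name)) {
      case INT:
        builder.addIntField(name, Integer.parseInt(segments[i]));
        break;
      case LONG:
        builder.addLongField(name, Long.parseLong(segments[i]));
        break;
      default: // STRING
        builder.addStringField(name, segments[i]);
    }
  }
  return builder.build();
}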
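mergePaths(fs, stat, finalOutput) then moves everything under each committed task directory into the job's real output directory (Hadoop's FileOutputCommitter contains a private helper of the same name). A simplified, hand-rolled version of the same recursive move, assuming nothing beyond the ordinary FileSystem API, could look like this:

// Sketch of a recursive merge: move 'from' (a file or directory) into 'to'.
private void mergePaths(FileSystem fs, FileStatus from, Path to) throws IOException {
  if (from.isFile()) {
    // A file replaces whatever already sits at the destination.
    if (fs.exists(to) && !fs.delete(to, true)) {
      throw new IOException("Failed to delete " + to);
    }
    if (!fs.rename(from.getPath(), to)) {
      throw new IOException("Failed to rename " + from.getPath() + " to " + to);
    }
  } else if (from.isDirectory()) {
    if (!fs.exists(to)) {
      // Destination does not exist yet: a single rename moves the whole subtree.
      if (!fs.rename(from.getPath(), to)) {
        throw new IOException("Failed to rename " + from.getPath() + " to " + to);
      }
    } else {
      // Destination exists: merge the children one by one.
      for (FileStatus child : fs.listStatus(from.getPath())) {
        mergePaths(fs, child, new Path(to, child.getPath().getName()));
      }
    }
  }
}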
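The partition metadata written by the loop above is recovered from the job configuration under PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX. On the submitting side that metadata is typically supplied through PartitionedFileSetArguments in the dataset's output arguments; a sketch, with made-up key/value pairs:

Map<String, String> outputArguments = new HashMap<>();
Map<String, String> partitionMetadata = new HashMap<>();
partitionMetadata.put("source", "nightly-run");  // hypothetical metadata entries
partitionMetadata.put("format", "avro");
// The committer reads these entries back out of the configuration and applies them
// to every new partition via PartitionOutput.setMetadata().
PartitionedFileSetArguments.setOutputPartitionMetadata(outputArguments, partitionMetadata);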