Search in sources :

Example 1 with BasicMapReduceTaskContext

use of co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext in project cdap by caskdata.

The method commitJob of the class DynamicPartitioningOutputCommitter.

/**
 * Commits the output of a job that wrote to dynamically determined partitions.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>Look up the output dataset named by
 *       {@code Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET} in the job configuration.</li>
 *   <li>Walk every file under every committed task path and derive, from the file's path relative
 *       to the task path, the partition directory and the partition key it belongs to. Files whose
 *       relative path has no directory component are skipped with a warning.</li>
 *   <li>Fail if any of the resulting final partition directories already exists.</li>
 *   <li>Merge every committed task path into the final output path.</li>
 *   <li>Register each partition with the output dataset, attaching the metadata configured under
 *       {@code PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX}, and flush the
 *       task context's dataset operations.</li>
 *   <li>Run {@code cleanupJob} and, unless disabled through
 *       {@code SUCCESSFUL_JOB_OUTPUT_DIR_MARKER}, create a {@code SUCCEEDED_FILE_NAME} marker file
 *       in each partition directory.</li>
 * </ol>
 *
 * <p>As a side effect this method (re)assigns the {@code outputDataset} and {@code relativePaths}
 * fields. A directory is only recorded in {@code relativePaths} after it has been verified not to
 * exist in the final output location.
 *
 * @param context the context of the job being committed
 * @throws IOException if a filesystem operation fails, if a final partition directory already
 *     exists, or if flushing the dataset operations fails
 */
@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);
    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();
    // The output path is read from the job configuration and does not change during the commit, so resolve it once
    // instead of once per file.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        Path committedPath = committedTaskPath.getPath();
        FileSystem taskFs = committedPath.getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = taskFs.listFiles(committedPath, true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedPath, path);
            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path, relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // (the file name, "part-m-00000", is not needed here)
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            if (relativePaths.contains(relativeDir)) {
                // This partition directory was already validated and recorded for an earlier file; nothing is written
                // to the final output before the merge below, so repeating the existence check and the key parsing
                // for every file of the same partition would only cost extra filesystem round trips.
                continue;
            }
            Path finalDir = new Path(finalOutput, relativeDir);
            if (taskFs.exists(finalDir)) {
                throw new FileAlreadyExistsException("Final output path " + finalDir + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            // Record the directory only after the existence check passed, so that relativePaths never contains a
            // directory that pre-dates this job.
            relativePaths.add(relativeDir);
        }
    }
    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    FileSystem fs = finalOutput.getFileSystem(configuration);
    // Reuse the listing obtained above: nothing has modified the committed task paths since it was taken.
    for (FileStatus stat : allCommittedTaskPaths) {
        mergePaths(fs, stat, finalOutput);
    }
    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(), PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);
    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }
    // flush the dataset operations of the TaskContext (this does not close the context)
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        // rethrow IOExceptions and unchecked exceptions as they are; wrap everything else
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }
    // delete the job-specific _temporary folder and create a _done file in the o/p folder
    cleanupJob(context);
    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}
Also used : BasicMapReduceTaskContext(co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext) Path(org.apache.hadoop.fs.Path) MapReduceClassLoader(co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) FileAlreadyExistsException(org.apache.hadoop.mapred.FileAlreadyExistsException) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Configuration(org.apache.hadoop.conf.Configuration) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) IOException(java.io.IOException) FileAlreadyExistsException(org.apache.hadoop.mapred.FileAlreadyExistsException) Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) HashSet(java.util.HashSet)

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)1 PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput)1 Partitioning (co.cask.cdap.api.dataset.lib.Partitioning)1 BasicMapReduceTaskContext (co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext)1 MapReduceClassLoader (co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader)1 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus)1 Path (org.apache.hadoop.fs.Path)1 FileAlreadyExistsException (org.apache.hadoop.mapred.FileAlreadyExistsException)1