Search in sources:

Example 1 with MapReduceClassLoader

Use of co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader in project cdap by caskdata.

From the class DynamicPartitioningOutputCommitter, the method commitJob:

@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);
    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    outputDataset = taskContext.getDataset(outputDatasetName);
    DynamicPartitioner.PartitionWriteOption partitionWriteOption = DynamicPartitioner.PartitionWriteOption.valueOf(configuration.get(PartitionedFileSetArguments.DYNAMIC_PARTITIONER_WRITE_OPTION));
    Partitioning partitioning = outputDataset.getPartitioning();
    partitionsToAdd = new HashMap<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);
            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. Its relative path '{}' has fewer than two parts", path, relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            if (partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE) {
                if (fs.exists(finalDir)) {
                    throw new FileAlreadyExistsException("Final output path already exists: " + finalDir);
                }
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.put(relativeDir, partitionKey);
        }
    }
    // need to remove any existing partitions, before moving temporary content to final output
    if (partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_OVERWRITE) {
        for (Map.Entry<String, PartitionKey> entry : partitionsToAdd.entrySet()) {
            if (outputDataset.getPartition(entry.getValue()) != null) {
                // this allows reinstating the existing files if there's a rollback.
                // alternative is to simply remove the files within the partition's location
                // upside to that is easily avoiding explore operations. one downside is that metadata is not removed then
                outputDataset.dropPartition(entry.getValue());
            }
        }
    }
    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileContext fc = FileContext.getFileContext(configuration);
    // the finalOutput path doesn't have scheme or authority (but 'from' does)
    finalOutput = fc.makeQualified(finalOutput);
    for (FileStatus from : getAllCommittedTaskPaths(context)) {
        mergePaths(fc, from, finalOutput);
    }
    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(), PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);
    boolean allowAppend = partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_APPEND;
    // create all the necessary partitions
    for (Map.Entry<String, PartitionKey> entry : partitionsToAdd.entrySet()) {
        outputDataset.addPartition(entry.getValue(), entry.getKey(), metadata, true, allowAppend);
    }
    // delete the job-specific _temporary folder
    cleanupJob(context);
    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : partitionsToAdd.keySet()) {
            Path pathToMark = new Path(finalOutput, relativePath);
            createOrUpdate(fc, new Path(pathToMark, SUCCEEDED_FILE_NAME));
            // also create a _SUCCESS-<RunId>, if allowing append
            if (allowAppend) {
                createOrUpdate(fc, new Path(pathToMark, SUCCEEDED_FILE_NAME + "-" + taskContext.getProgramRunId().getRun()));
            }
        }
    }
}
Also used: BasicMapReduceTaskContext(co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext) Path(org.apache.hadoop.fs.Path) MapReduceClassLoader(co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) FileAlreadyExistsException(org.apache.hadoop.mapred.FileAlreadyExistsException) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Configuration(org.apache.hadoop.conf.Configuration) Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) DynamicPartitioner(co.cask.cdap.api.dataset.lib.DynamicPartitioner) HashMap(java.util.HashMap) Map(java.util.Map) FileContext(org.apache.hadoop.fs.FileContext)
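
The commitJob method above calls two private helpers that are not shown in this snippet: getRelative(...), which strips the committed-task base directory from an output file path, and getPartitionKey(...), which maps a relative directory such as "key1/key2" onto the dataset's Partitioning. The sketch below is an illustrative reconstruction of those helpers, not the project's actual implementation; it assumes the path segments of the relative directory appear in the same order as the fields declared in the Partitioning, and it reuses the Path, Map, Partitioning and PartitionKey types listed above.

// Hypothetical reconstruction of getRelative: compute the path of 'path' relative to 'base',
// e.g. base = ".../attempt_x", path = ".../attempt_x/key1/key2/part-m-00000"
// yields "key1/key2/part-m-00000".
private String getRelative(Path base, Path path) {
    return base.toUri().relativize(path.toUri()).getPath();
}

// Hypothetical reconstruction of getPartitionKey: split the relative directory into segments
// and pair each segment with a partitioning field, assuming both are in the same order.
private PartitionKey getPartitionKey(Partitioning partitioning, String relativeDir) {
    String[] segments = relativeDir.split(Path.SEPARATOR);
    if (segments.length != partitioning.getFields().size()) {
        throw new IllegalArgumentException("Relative dir '" + relativeDir + "' does not match partitioning fields " + partitioning.getFields().keySet());
    }
    PartitionKey.Builder builder = PartitionKey.builder();
    int i = 0;
    for (Map.Entry<String, Partitioning.FieldType> field : partitioning.getFields().entrySet()) {
        String value = segments[i++];
        switch (field.getValue()) {
            case INT:
                builder.addIntField(field.getKey(), Integer.parseInt(value));
                break;
            case LONG:
                builder.addLongField(field.getKey(), Long.parseLong(value));
                break;
            default:
                // STRING fields are carried through unchanged
                builder.addStringField(field.getKey(), value);
        }
    }
    return builder.build();
}

Under these assumptions, a relative directory "2017-01-01/7" for a partitioning with a STRING field "date" followed by an INT field "hour" would yield a PartitionKey with date = "2017-01-01" and hour = 7.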

Example 2 with MapReduceClassLoader

Use of co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader in project cdap by caskdata.

From the class MapReduceContainerLauncher, the method launch:

/**
 * Launches the given main class. The main class will be loaded through the {@link MapReduceClassLoader}.
 *
 * @param mainClassName the main class to launch
 * @param args          arguments for the main class
 */
@SuppressWarnings("unused")
public static void launch(String mainClassName, String[] args) throws Exception {
    Thread.setDefaultUncaughtExceptionHandler(new UncaughtExceptionHandler());
    ClassLoader systemClassLoader = ClassLoader.getSystemClassLoader();
    List<URL> urls = ClassLoaders.getClassLoaderURLs(systemClassLoader, new ArrayList<URL>());
    // Remove the URL that contains the given main classname to avoid infinite recursion.
    // This is needed because we generate a class with the same main classname in order to intercept the main()
    // method call from the container launch script.
    URL resource = systemClassLoader.getResource(mainClassName.replace('.', '/') + ".class");
    if (resource == null) {
        throw new IllegalStateException("Failed to find resource for main class " + mainClassName);
    }
    if (!urls.remove(ClassLoaders.getClassPathURL(mainClassName, resource))) {
        throw new IllegalStateException("Failed to remove main class resource " + resource);
    }
    // Create a MainClassLoader for dataset rewrite
    URL[] classLoaderUrls = urls.toArray(new URL[urls.size()]);
    ClassLoader mainClassLoader = new MainClassLoader(classLoaderUrls, systemClassLoader.getParent());
    // Install the JUL to SLF4J Bridge
    try {
        mainClassLoader.loadClass(SLF4JBridgeHandler.class.getName()).getDeclaredMethod("install").invoke(null);
    } catch (Exception e) {
        // Log the error and continue
        LOG.warn("Failed to invoke SLF4JBridgeHandler.install() required for jul-to-slf4j bridge", e);
    }
    ClassLoaders.setContextClassLoader(mainClassLoader);
    // Creates the MapReduceClassLoader. It has to be loaded from the MainClassLoader.
    try {
        final ClassLoader classLoader = (ClassLoader) mainClassLoader.loadClass(MapReduceClassLoader.class.getName()).newInstance();
        Runtime.getRuntime().addShutdownHook(new Thread() {

            @Override
            public void run() {
                if (classLoader instanceof AutoCloseable) {
                    try {
                        ((AutoCloseable) classLoader).close();
                    } catch (Exception e) {
                        System.err.println("Failed to close ClassLoader " + classLoader);
                        e.printStackTrace();
                    }
                }
            }
        });
        Thread.currentThread().setContextClassLoader(classLoader);
        // Setup logging and stdout/stderr redirect
        // Invoke MapReduceClassLoader.getTaskContextProvider()
        classLoader.getClass().getDeclaredMethod("getTaskContextProvider").invoke(classLoader);
        // Invoke StandardOutErrorRedirector.redirectToLogger()
        classLoader.loadClass("co.cask.cdap.common.logging.StandardOutErrorRedirector").getDeclaredMethod("redirectToLogger", String.class).invoke(null, mainClassName);
        Class<?> mainClass = classLoader.loadClass(mainClassName);
        Method mainMethod = mainClass.getMethod("main", String[].class);
        mainMethod.setAccessible(true);
        LOG.info("Launch main class {}.main({})", mainClassName, Arrays.toString(args));
        mainMethod.invoke(null, new Object[] { args });
        LOG.info("Main method returned {}", mainClassName);
    } catch (Throwable t) {
        // LOG the exception since this exception will be propagated back to JVM
        // and kill the main thread (hence the JVM process).
        // If we don't log it here as ERROR, it will be logged by UncaughtExceptionHandler as DEBUG level
        LOG.error("Exception raised when calling {}.main(String[]) method", mainClassName, t);
        throw t;
    }
}
Also used: MapReduceClassLoader(co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) Method(java.lang.reflect.Method) URL(java.net.URL) SLF4JBridgeHandler(org.slf4j.bridge.SLF4JBridgeHandler) MainClassLoader(co.cask.cdap.common.app.MainClassLoader) UncaughtExceptionHandler(co.cask.cdap.common.logging.common.UncaughtExceptionHandler)
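
The comment near the top of launch() explains that CDAP generates a class with the same name as the container's real main class, so that the container launch script ends up calling this launcher instead of Hadoop's own entry point. Below is a minimal sketch of what such a generated stub could look like; the chosen class name (Hadoop's task entry point YarnChild), the hand-written form, and the launcher's package are assumptions for illustration only, since the real stub is generated rather than written by hand.

// Hypothetical stand-in for the generated interceptor class: it shares the fully qualified
// name of the real container main class and simply forwards to MapReduceContainerLauncher,
// which rebuilds the classpath, creates the MapReduceClassLoader, and invokes the real
// main class through it.
package org.apache.hadoop.mapred;

// Package of MapReduceContainerLauncher assumed for illustration; it is not shown in this
// search result, and only the launch(String, String[]) signature used above is relied upon.
import co.cask.cdap.internal.app.runtime.batch.distributed.MapReduceContainerLauncher;

public final class YarnChild {

    private YarnChild() {
        // no instances; this class exists only to intercept main()
    }

    public static void main(String[] args) throws Exception {
        // Delegate to the CDAP launcher, naming the class this stub stands in for so that
        // launch() can find and remove the stub's URL from the classpath before loading
        // the real class through the MapReduceClassLoader.
        MapReduceContainerLauncher.launch("org.apache.hadoop.mapred.YarnChild", args);
    }
}

This also explains why launch() removes the URL containing the main class from the classpath before building the MainClassLoader: without that step, loading mainClassName would resolve back to the stub itself and recurse, as the comment in the method points out.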

Aggregations

MapReduceClassLoader (co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) 2
DynamicPartitioner (co.cask.cdap.api.dataset.lib.DynamicPartitioner) 1
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) 1
Partitioning (co.cask.cdap.api.dataset.lib.Partitioning) 1
MainClassLoader (co.cask.cdap.common.app.MainClassLoader) 1
UncaughtExceptionHandler (co.cask.cdap.common.logging.common.UncaughtExceptionHandler) 1
BasicMapReduceTaskContext (co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext) 1
Method (java.lang.reflect.Method) 1
URL (java.net.URL) 1
HashMap (java.util.HashMap) 1
Map (java.util.Map) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
FileContext (org.apache.hadoop.fs.FileContext) 1
FileStatus (org.apache.hadoop.fs.FileStatus) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1
LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus) 1
Path (org.apache.hadoop.fs.Path) 1
FileAlreadyExistsException (org.apache.hadoop.mapred.FileAlreadyExistsException) 1
SLF4JBridgeHandler (org.slf4j.bridge.SLF4JBridgeHandler) 1