Example 1 with ProvidedOutput

Use of co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput in project cdap by caskdata.

From the class MapReduceRuntimeService, method destroy.

/**
   * Calls the destroy method of {@link ProgramLifecycle}.
   */
private void destroy(final boolean succeeded, final String failureInfo) throws Exception {
    // if any exception happens during output committing, we want the MapReduce to fail.
    // for that to happen it is not sufficient to set the status to failed, we have to throw an exception,
    // otherwise the shutdown completes successfully and the completed() callback is called.
    // thus: remember the exception and throw it at the end.
    final AtomicReference<Exception> failureCause = new AtomicReference<>();
    // TODO (CDAP-1952): this should be done in the output committer, to make the M/R fail if addPartition fails
    try {
        context.execute(new TxRunnable() {

            @Override
            public void run(DatasetContext ctxt) throws Exception {
                ClassLoader oldClassLoader = ClassLoaders.setContextClassLoader(job.getConfiguration().getClassLoader());
                try {
                    for (Map.Entry<String, ProvidedOutput> output : context.getOutputs().entrySet()) {
                        commitOutput(succeeded, output.getKey(), output.getValue().getOutputFormatProvider(), failureCause);
                        if (succeeded && failureCause.get() != null) {
                            // mapreduce was successful but this output committer failed: call onFailure() for all committers
                            for (ProvidedOutput toFail : context.getOutputs().values()) {
                                commitOutput(false, toFail.getAlias(), toFail.getOutputFormatProvider(), failureCause);
                            }
                            break;
                        }
                    }
                    // if there was a failure, we must throw an exception to fail the transaction
                    // this will roll back all the outputs and also make sure that postCommit() is not called
                    // throwing the failure cause: it will be wrapped in a TxFailure and handled in the outer catch()
                    Exception cause = failureCause.get();
                    if (cause != null) {
                        failureCause.set(null);
                        throw cause;
                    }
                } finally {
                    ClassLoaders.setContextClassLoader(oldClassLoader);
                }
            }
        });
    } catch (TransactionFailureException e) {
        LOG.error("Transaction failure when committing dataset outputs", e);
        if (failureCause.get() != null) {
            failureCause.get().addSuppressed(e);
        } else {
            failureCause.set(e);
        }
    }
    final boolean success = succeeded && failureCause.get() == null;
    context.setState(getProgramState(success, failureInfo));
    final TransactionControl txControl = mapReduce instanceof ProgramLifecycle ? Transactions.getTransactionControl(TransactionControl.IMPLICIT, MapReduce.class, mapReduce, "destroy") : TransactionControl.IMPLICIT;
    try {
        if (TransactionControl.IMPLICIT == txControl) {
            context.execute(new TxRunnable() {

                @Override
                public void run(DatasetContext context) throws Exception {
                    doDestroy(success);
                }
            });
        } else {
            doDestroy(success);
        }
    } catch (Throwable e) {
        if (e instanceof TransactionFailureException && e.getCause() != null && !(e instanceof TransactionConflictException)) {
            e = e.getCause();
        }
        LOG.warn("Error executing the destroy method of the MapReduce program {}", context.getProgram().getName(), e);
    }
    // this is needed to make the run fail if there was an exception. See comment at beginning of this method
    if (failureCause.get() != null) {
        throw failureCause.get();
    }
}
Also used:
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.annotation.TransactionControl;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduce;
import co.cask.cdap.common.lang.CombineClassLoader;
import co.cask.cdap.common.lang.WeakReferenceDelegatorClassLoader;
import co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput;
import com.google.inject.ProvisionException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.jar.JarEntry;
import org.apache.tephra.TransactionConflictException;
import org.apache.tephra.TransactionFailureException;
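The commitOutput(...) helper called above is not shown on this page. A minimal sketch of what it plausibly does, under the assumption that it forwards the outcome to any DatasetOutputCommitter and records failures in the passed-in reference instead of throwing. The signature is taken from the call sites above; the body is an assumption, not the actual CDAP implementation:

private void commitOutput(boolean succeeded, String outputName, OutputFormatProvider outputFormatProvider,
                          AtomicReference<Exception> failureCause) {
    // only dataset-backed outputs participate in commit callbacks
    if (outputFormatProvider instanceof DatasetOutputCommitter) {
        try {
            if (succeeded) {
                ((DatasetOutputCommitter) outputFormatProvider).onSuccess();
            } else {
                ((DatasetOutputCommitter) outputFormatProvider).onFailure();
            }
        } catch (Throwable t) {
            // remember the first failure; attach later ones as suppressed exceptions
            LOG.error("Error from {}() of output committer for output '{}'.",
                      succeeded ? "onSuccess" : "onFailure", outputName, t);
            Exception e = t instanceof Exception ? (Exception) t : new RuntimeException(t);
            if (failureCause.get() == null) {
                failureCause.set(e);
            } else {
                failureCause.get().addSuppressed(e);
            }
        }
    }
}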

Example 2 with ProvidedOutput

Use of co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput in project cdap by caskdata.

From the class BasicMapReduceContext, method addOutput.

@Override
public void addOutput(Output output) {
    if (output.getNamespace() != null && output.getNamespace().equals(NamespaceId.SYSTEM.getNamespace()) && !getProgram().getNamespaceId().equals(NamespaceId.SYSTEM.getNamespace())) {
        // trying to access system namespace from a program outside system namespace is not allowed
        throw new IllegalArgumentException(String.format("Accessing Output %s in system namespace " + "is not allowed from the namespace %s", output.getName(), getProgram().getNamespaceId()));
    }
    String alias = output.getAlias();
    if (this.outputs.containsKey(alias)) {
        throw new IllegalArgumentException("Output already configured: " + alias);
    }
    ProvidedOutput providedOutput;
    if (output instanceof Output.DatasetOutput) {
        providedOutput = Outputs.transform((Output.DatasetOutput) output, this);
    } else if (output instanceof Output.OutputFormatProviderOutput) {
        OutputFormatProvider outputFormatProvider = ((Output.OutputFormatProviderOutput) output).getOutputFormatProvider();
        if (outputFormatProvider instanceof DatasetOutputCommitter) {
            // disallow adding a DatasetOutputCommitter as an OutputFormatProviderOutput, because we would not
            // be able to call its methods in MainOutputCommitter. It needs to be a DatasetOutput.
            throw new IllegalArgumentException("Cannot add a DatasetOutputCommitter as an OutputFormatProviderOutput. " + "Add the output as a DatasetOutput.");
        }
        providedOutput = new ProvidedOutput(output, outputFormatProvider);
    } else {
        // shouldn't happen unless user defines their own Output class
        throw new IllegalArgumentException(String.format("Output %s has unknown output class %s", output.getName(), output.getClass().getCanonicalName()));
    }
    this.outputs.put(alias, providedOutput);
}
Also used:
import co.cask.cdap.api.data.batch.DatasetOutputCommitter;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput;
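For context, a hedged sketch of how a program reaches this method. Output.ofDataset and Output.of are the public factory methods on co.cask.cdap.api.data.batch.Output; the program name, dataset name, alias, and MyOutputFormatProvider are made up for illustration:

import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;

public class ReportGenerator extends AbstractMapReduce {

    @Override
    public void initialize() throws Exception {
        MapReduceContext context = getContext();
        // taken through the Output.DatasetOutput branch and Outputs.transform(...)
        context.addOutput(Output.ofDataset("reports"));
        // taken through the Output.OutputFormatProviderOutput branch; MyOutputFormatProvider
        // is a hypothetical OutputFormatProvider that is not a DatasetOutputCommitter
        context.addOutput(Output.of("rawFiles", new MyOutputFormatProvider()).alias("raw"));
    }
}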

Example 3 with ProvidedOutput

Use of co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput in project cdap by caskdata.

From the class MapReduceContextConfig, method setOutputs.

private void setOutputs(List<ProvidedOutput> providedOutputs) {
    // we only need to serialize the original Output objects, not the entire ProvidedOutput
    List<Output.DatasetOutput> datasetOutputs = new ArrayList<>();
    for (ProvidedOutput providedOutput : providedOutputs) {
        Output output = providedOutput.getOutput();
        if (output instanceof Output.DatasetOutput) {
            datasetOutputs.add((Output.DatasetOutput) output);
        }
    }
    hConf.set(HCONF_ATTR_OUTPUTS, GSON.toJson(datasetOutputs));
}
Also used:
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput;
import java.util.ArrayList;
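The matching read path is not shown on this page. A minimal sketch of what deserialization could look like, assuming the same GSON instance, hConf field, and HCONF_ATTR_OUTPUTS key as in the snippet above; the getter name is hypothetical:

import com.google.gson.reflect.TypeToken;
import java.lang.reflect.Type;
import java.util.List;

// hypothetical counterpart to setOutputs(...); "[]" serves as the default for a missing key
private List<Output.DatasetOutput> getDatasetOutputs() {
    Type listType = new TypeToken<List<Output.DatasetOutput>>() { }.getType();
    return GSON.fromJson(hConf.get(HCONF_ATTR_OUTPUTS, "[]"), listType);
}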

Example 4 with ProvidedOutput

Use of co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput in project cdap by caskdata.

From the class MapReduceRuntimeService, method setOutputsIfNeeded.

/**
 * Sets the configurations used for outputs.
 */
private void setOutputsIfNeeded(Job job) throws ClassNotFoundException {
    List<ProvidedOutput> outputsMap = context.getOutputs();
    fixOutputPermissions(job, outputsMap);
    LOG.debug("Using as output for MapReduce Job: {}", outputsMap);
    OutputFormatProvider rootOutputFormatProvider;
    if (outputsMap.isEmpty()) {
        // user is not going through our APIs to add output; propagate the job's output format
        rootOutputFormatProvider = new BasicOutputFormatProvider(job.getOutputFormatClass().getName(), Collections.<String, String>emptyMap());
    } else if (outputsMap.size() == 1) {
        // If only one output is configured through the context, then set it as the root OutputFormat
        rootOutputFormatProvider = outputsMap.get(0).getOutputFormatProvider();
    } else {
        // multiple output formats configured via the context. We should use a RecordWriter that doesn't support writing
        // as the root output format in this case to disallow writing directly on the context.
        // the OutputCommitter is effectively a no-op, as it runs as the RootOutputCommitter in MultipleOutputsCommitter
        rootOutputFormatProvider = new BasicOutputFormatProvider(UnsupportedOutputFormat.class.getName(), Collections.<String, String>emptyMap());
    }
    MultipleOutputsMainOutputWrapper.setRootOutputFormat(job, rootOutputFormatProvider.getOutputFormatClassName(), rootOutputFormatProvider.getOutputFormatConfiguration());
    job.setOutputFormatClass(MultipleOutputsMainOutputWrapper.class);
    for (ProvidedOutput output : outputsMap) {
        String outputName = output.getOutput().getAlias();
        String outputFormatClassName = output.getOutputFormatClassName();
        Map<String, String> outputConfig = output.getOutputFormatConfiguration();
        MultipleOutputs.addNamedOutput(job, outputName, outputFormatClassName, job.getOutputKeyClass(), job.getOutputValueClass(), outputConfig);
    }
}
Also used:
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.internal.app.runtime.batch.dataset.UnsupportedOutputFormat;
import co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput;
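BasicOutputFormatProvider is CDAP-internal, but the OutputFormatProvider contract it fulfills is small: name an OutputFormat class and supply its configuration. A minimal equivalent sketch, assuming only the two methods of the public co.cask.cdap.api.data.batch.OutputFormatProvider interface (the class name here is made up):

import co.cask.cdap.api.data.batch.OutputFormatProvider;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

public final class SimpleOutputFormatProvider implements OutputFormatProvider {

    private final String outputFormatClassName;
    private final Map<String, String> configuration;

    public SimpleOutputFormatProvider(String outputFormatClassName, Map<String, String> configuration) {
        this.outputFormatClassName = outputFormatClassName;
        // defensive copy, exposed read-only
        this.configuration = Collections.unmodifiableMap(new HashMap<>(configuration));
    }

    @Override
    public String getOutputFormatClassName() {
        return outputFormatClassName;
    }

    @Override
    public Map<String, String> getOutputFormatConfiguration() {
        return configuration;
    }
}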

Example 5 with ProvidedOutput

Use of co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput in project cdap by caskdata.

From the class MapReduceRuntimeService, method fixOutputPermissions.

private void fixOutputPermissions(Job job, Map<String, ProvidedOutput> outputs) {
    Configuration jobconf = job.getConfiguration();
    Set<String> outputsWithUmask = new HashSet<>();
    Set<String> outputUmasks = new HashSet<>();
    for (Map.Entry<String, ProvidedOutput> entry : outputs.entrySet()) {
        String umask = entry.getValue().getOutputFormatConfiguration().get(HADOOP_UMASK_PROPERTY);
        if (umask != null) {
            outputsWithUmask.add(entry.getKey());
            outputUmasks.add(umask);
        }
    }
    boolean allOutputsHaveUmask = outputsWithUmask.size() == outputs.size();
    boolean allOutputsAgree = outputUmasks.size() == 1;
    boolean jobConfHasUmask = isProgrammaticConfig(jobconf, HADOOP_UMASK_PROPERTY);
    String jobConfUmask = jobconf.get(HADOOP_UMASK_PROPERTY);
    boolean mustFixUmasks = false;
    if (jobConfHasUmask) {
        // case 1: job conf has a programmatic umask. It prevails.
        mustFixUmasks = !outputsWithUmask.isEmpty();
        if (mustFixUmasks) {
            LOG.info("Overriding permissions of outputs {} because a umask of {} was set programmatically in the job " + "configuration.", outputsWithUmask, jobConfUmask);
        }
    } else if (allOutputsHaveUmask && allOutputsAgree) {
        // case 2: no programmatic umask in job conf, all outputs want the same umask: set it in job conf
        String umaskToUse = outputUmasks.iterator().next();
        jobconf.set(HADOOP_UMASK_PROPERTY, umaskToUse);
        LOG.debug("Setting umask of {} in job configuration because all outputs {} agree on it.", umaskToUse, outputsWithUmask);
    } else {
        // case 3: some outputs configure a umask, but not all of them, or not all the same: use job conf default
        mustFixUmasks = !outputsWithUmask.isEmpty();
        if (mustFixUmasks) {
            LOG.warn("Overriding permissions of outputs {} because they configure different permissions. Falling back " + "to default umask of {} in job configuration.", outputsWithUmask, jobConfUmask);
        }
    }
    // fix all output configurations that have a umask by removing that property from their configs
    if (mustFixUmasks) {
        for (String outputName : outputsWithUmask) {
            ProvidedOutput output = outputs.get(outputName);
            Map<String, String> outputConfig = new HashMap<>(output.getOutputFormatConfiguration());
            outputConfig.remove(HADOOP_UMASK_PROPERTY);
            outputs.put(outputName, new ProvidedOutput(output.getAlias(), output.getOutputFormatProvider(), output.getOutputFormatClassName(), outputConfig));
        }
    }
}
Also used:
import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput;
import java.util.AbstractMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
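The helper isProgrammaticConfig(...) is not shown on this page. A plausible sketch of how such a check can be written against the Hadoop Configuration API: Configuration tracks the source of each property value, and values set in code via conf.set(...) are attributed to a "programmatically" source rather than to a *-site.xml resource. This body is an assumption about the helper, not the actual CDAP code:

import org.apache.hadoop.conf.Configuration;

private static boolean isProgrammaticConfig(Configuration conf, String name) {
    // the last entry of getPropertySources() is the most recent source of the value;
    // Hadoop versions have used both spellings of "program(m)atically"
    String[] sources = conf.getPropertySources(name);
    if (sources == null || sources.length == 0) {
        return false;
    }
    String last = sources[sources.length - 1];
    return "programatically".equals(last) || "programmatically".equals(last);
}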
