Search in sources:

Example 1 with SparkSpecification

use of co.cask.cdap.api.spark.SparkSpecification in project cdap by caskdata.

the class AbstractSparkSubmitter method submit.

/**
 * Submits the Spark job for the given runtime context on a dedicated single-thread executor and returns
 * a future representing that submission.
 *
 * <p>The returned future is completed with {@code result} when the submit call and the success callback
 * return normally, and is failed with the thrown {@link Throwable} otherwise. The future is always
 * completed, and the internal completion latch is always released, even when the pre-submit step or the
 * failure callback itself throws; otherwise the future's {@code cancelTask()} hook would wait forever.
 *
 * @param runtimeContext context of the Spark program being submitted
 * @param configs Spark configurations, forwarded to the submit argument builder
 * @param resources resources, forwarded to the submit argument builder
 * @param jobJar the job jar, forwarded to the submit argument builder
 * @param result value used to complete the returned future on success
 * @param <V> type of the result value
 * @return a future that completes when the submission finishes
 */
@Override
public final <V> ListenableFuture<V> submit(final SparkRuntimeContext runtimeContext, Map<String, String> configs, List<LocalizeResource> resources, File jobJar, final V result) {
    final SparkSpecification spec = runtimeContext.getSparkSpecification();
    final List<String> args = createSubmitArguments(spec, configs, resources, jobJar);
    // Spark submit is called from this executor
    // Use an executor to simplify logic that is needed to interrupt the running thread on stopping
    final ExecutorService executor = Executors.newSingleThreadExecutor(new ThreadFactory() {

        @Override
        public Thread newThread(Runnable r) {
            return new Thread(r, "spark-submitter-" + spec.getName() + "-" + runtimeContext.getRunId());
        }
    });
    // Latch for the Spark job completion
    final CountDownLatch completion = new CountDownLatch(1);
    final SparkJobFuture<V> resultFuture = new SparkJobFuture<V>(runtimeContext) {

        @Override
        protected void cancelTask() {
            // Try to shutdown the running spark job.
            triggerShutdown();
            // Wait for the Spark-Submit returns
            Uninterruptibles.awaitUninterruptibly(completion);
        }
    };
    // Submit the Spark job
    executor.submit(new Runnable() {

        @Override
        public void run() {
            // The outer finally guarantees the latch is released on every exit path, so that
            // cancelTask() can never block forever.
            try {
                List<String> extraArgs;
                try {
                    extraArgs = beforeSubmit();
                } catch (Throwable t) {
                    // A failure here would otherwise be swallowed by the executor and leave the future
                    // incomplete. onCompleted is not invoked, matching the previous behavior for this path.
                    resultFuture.setException(t);
                    return;
                }
                try {
                    String[] submitArgs = Iterables.toArray(Iterables.concat(args, extraArgs), String.class);
                    submit(runtimeContext, submitArgs);
                    onCompleted(true);
                    resultFuture.set(result);
                } catch (Throwable t) {
                    try {
                        onCompleted(false);
                    } finally {
                        // Always report the original failure, even if the failure callback throws.
                        resultFuture.setException(t);
                    }
                }
            } finally {
                completion.countDown();
            }
        }
    });
    // Shutdown the executor right after submit since the thread is only used for one submission.
    executor.shutdown();
    return resultFuture;
}
Also used : ThreadFactory(java.util.concurrent.ThreadFactory) CountDownLatch(java.util.concurrent.CountDownLatch) SparkSpecification(co.cask.cdap.api.spark.SparkSpecification) ExecutorService(java.util.concurrent.ExecutorService) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List)

Example 2 with SparkSpecification

use of co.cask.cdap.api.spark.SparkSpecification in project cdap by caskdata.

the class SparkProgramRunner method run.

/**
 * Validates the given program as a Spark program, builds its runtime context and runtime service,
 * starts the service and returns a controller for it.
 *
 * <p>Every closeable resource created along the way is tracked; if anything fails before the controller
 * is returned, the tracked resources are closed quietly and the failure is propagated.
 *
 * @param program the program to run; must be of type {@link ProgramType#SPARK}
 * @param options the program options, which must carry the host option
 * @return the controller of the started Spark program
 */
@Override
public ProgramController run(Program program, ProgramOptions options) {
    // Get the RunId first. It is used for the creation of the ClassLoader closing thread.
    Arguments arguments = options.getArguments();
    RunId runId = ProgramRunners.getRunId(options);
    Deque<Closeable> trackedResources = new LinkedList<>();
    try {
        // Validate the inputs before creating anything that needs closing
        ApplicationSpecification applicationSpec = program.getApplicationSpecification();
        Preconditions.checkNotNull(applicationSpec, "Missing application specification.");
        ProgramType programType = program.getType();
        Preconditions.checkNotNull(programType, "Missing processor type.");
        Preconditions.checkArgument(programType == ProgramType.SPARK, "Only Spark process type is supported.");
        SparkSpecification sparkSpec = applicationSpec.getSpark().get(program.getName());
        Preconditions.checkNotNull(sparkSpec, "Missing SparkSpecification for %s", program.getName());
        String hostname = options.getArguments().getOption(ProgramOptionConstants.HOST);
        Preconditions.checkArgument(hostname != null, "No hostname is provided");

        // A non-null WorkflowProgramInfo means the program was started by a Workflow
        WorkflowProgramInfo workflowProgramInfo = WorkflowProgramInfo.create(arguments);
        DatasetFramework dsFramework;
        if (workflowProgramInfo == null) {
            dsFramework = datasetFramework;
        } else {
            dsFramework = NameMappedDatasetFramework.createFromWorkflowProgramInfo(datasetFramework, workflowProgramInfo, applicationSpec);
        }
        // Setup dataset framework context, if required
        if (dsFramework instanceof ProgramContextAware) {
            ((ProgramContextAware) dsFramework).setContext(new BasicProgramContext(program.getId().run(runId)));
        }

        PluginInstantiator instantiator = createPluginInstantiator(options, program.getClassLoader());
        if (instantiator != null) {
            trackedResources.addFirst(instantiator);
        }
        SparkRuntimeContext context = new SparkRuntimeContext(new Configuration(hConf), program, options, cConf, hostname, txClient, dsFramework, discoveryServiceClient, metricsCollectionService, streamAdmin, workflowProgramInfo, instantiator, secureStore, secureStoreManager, authorizationEnforcer, authenticationContext, messagingService, serviceAnnouncer, pluginFinder, locationFactory);
        trackedResources.addFirst(context);

        Spark sparkInstance;
        try {
            sparkInstance = new InstantiatorFactory(false).get(TypeToken.of(program.<Spark>getMainClass())).create();
        } catch (Exception e) {
            LOG.error("Failed to instantiate Spark class for {}", sparkSpec.getClassName(), e);
            throw Throwables.propagate(e);
        }

        SparkSubmitter sparkSubmitter;
        if (SparkRuntimeContextConfig.isLocal(hConf)) {
            sparkSubmitter = new LocalSparkSubmitter();
        } else {
            String schedulerQueue = options.getArguments().getOption(Constants.AppFabric.APP_SCHEDULER_QUEUE);
            sparkSubmitter = new DistributedSparkSubmitter(hConf, locationFactory, hostname, context, schedulerQueue);
        }
        Service runtimeService = new SparkRuntimeService(cConf, sparkInstance, getPluginArchive(options), context, sparkSubmitter, locationFactory);
        runtimeService.addListener(createRuntimeServiceListener(trackedResources), Threads.SAME_THREAD_EXECUTOR);
        ProgramController programController = new SparkProgramController(runtimeService, context);
        LOG.debug("Starting Spark Job. Context: {}", context);

        // Start as the configured HDFS user only when neither local nor security-enabled
        boolean startAsHdfsUser = !SparkRuntimeContextConfig.isLocal(hConf) && !UserGroupInformation.isSecurityEnabled();
        if (startAsHdfsUser) {
            ProgramRunners.startAsUser(cConf.get(Constants.CFG_HDFS_USER), runtimeService);
        } else {
            runtimeService.start();
        }
        return programController;
    } catch (Throwable t) {
        closeAllQuietly(trackedResources);
        throw Throwables.propagate(t);
    }
}
Also used : ApplicationSpecification(co.cask.cdap.api.app.ApplicationSpecification) SparkSubmitter(co.cask.cdap.app.runtime.spark.submit.SparkSubmitter) DistributedSparkSubmitter(co.cask.cdap.app.runtime.spark.submit.DistributedSparkSubmitter) LocalSparkSubmitter(co.cask.cdap.app.runtime.spark.submit.LocalSparkSubmitter) CConfiguration(co.cask.cdap.common.conf.CConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Closeable(java.io.Closeable) DistributedSparkSubmitter(co.cask.cdap.app.runtime.spark.submit.DistributedSparkSubmitter) NameMappedDatasetFramework(co.cask.cdap.internal.app.runtime.workflow.NameMappedDatasetFramework) DatasetFramework(co.cask.cdap.data2.dataset2.DatasetFramework) InstantiatorFactory(co.cask.cdap.common.lang.InstantiatorFactory) SparkSpecification(co.cask.cdap.api.spark.SparkSpecification) ProgramType(co.cask.cdap.proto.ProgramType) RunId(org.apache.twill.api.RunId) ProgramController(co.cask.cdap.app.runtime.ProgramController) Arguments(co.cask.cdap.app.runtime.Arguments) MessagingService(co.cask.cdap.messaging.MessagingService) MetricsCollectionService(co.cask.cdap.api.metrics.MetricsCollectionService) Service(com.google.common.util.concurrent.Service) ProgramId(co.cask.cdap.proto.id.ProgramId) BasicProgramContext(co.cask.cdap.internal.app.runtime.BasicProgramContext) LinkedList(java.util.LinkedList) IOException(java.io.IOException) WorkflowProgramInfo(co.cask.cdap.internal.app.runtime.workflow.WorkflowProgramInfo) PluginInstantiator(co.cask.cdap.internal.app.runtime.plugin.PluginInstantiator) Spark(co.cask.cdap.api.spark.Spark) LocalSparkSubmitter(co.cask.cdap.app.runtime.spark.submit.LocalSparkSubmitter) ProgramContextAware(co.cask.cdap.data.ProgramContextAware)

Example 3 with SparkSpecification

use of co.cask.cdap.api.spark.SparkSpecification in project cdap by caskdata.

the class SparkRuntimeContext method getSparkSpecification.

/**
 * Looks up the {@link SparkSpecification} registered under the program's name in its application
 * specification.
 *
 * @param program the program whose Spark specification is wanted
 * @return the matching specification, never {@code null}
 * @throws IllegalStateException if the application specification has no entry for the program name
 */
private static SparkSpecification getSparkSpecification(Program program) {
    final SparkSpecification sparkSpec = program.getApplicationSpecification()
        .getSpark()
        .get(program.getName());
    // A missing spec is unexpected: without it the spark program would not have been started at all
    boolean specFound = sparkSpec != null;
    Preconditions.checkState(specFound, "SparkSpecification not found for %s", program.getId());
    return sparkSpec;
}
Also used : SparkSpecification(co.cask.cdap.api.spark.SparkSpecification)

Example 4 with SparkSpecification

use of co.cask.cdap.api.spark.SparkSpecification in project cdap by caskdata.

the class AbstractSparkSubmitter method createSubmitArguments.

/**
 * Builds the argument list passed to {@link SparkSubmit#main(String[])}.
 *
 * <p>Arguments are emitted in this order: master settings, the application name, all entries of
 * {@code configs} and of the submitter's own configuration as {@code --conf key=value} pairs, the
 * optional {@code --archives} and {@code --files} lists, then either {@code --py-files} (for a
 * {@code .py} job file) or {@code --class}, the job file itself, and finally, for non-Python jobs,
 * two identifying program arguments.
 *
 * @param runtimeContext the {@link SparkRuntimeContext} for the spark program
 * @param configs set of Spark configurations
 * @param resources list of resources that needs to be localized to Spark containers
 * @param jobFile the job file for Spark
 * @return a list of arguments
 */
private List<String> createSubmitArguments(SparkRuntimeContext runtimeContext, Map<String, String> configs, List<LocalizeResource> resources, URI jobFile) {
    SparkSpecification sparkSpec = runtimeContext.getSparkSpecification();
    ImmutableList.Builder<String> submitArgs = ImmutableList.builder();
    addMaster(configs, submitArgs);
    submitArgs.add("--conf", "spark.app.name=" + sparkSpec.getName());

    // Each configuration entry becomes its own "--conf key=value" pair
    BiConsumer<String, String> appendConf = (key, value) -> submitArgs.add("--conf", key + "=" + value);
    configs.forEach(appendConf);
    getSubmitConf().forEach(appendConf);

    // Split the resources into archives and plain files, each joined into a comma separated list
    Joiner commaJoiner = Joiner.on(',');
    String archiveList = commaJoiner.join(Iterables.transform(Iterables.filter(resources, ARCHIVE_FILTER), RESOURCE_TO_PATH));
    String fileList = commaJoiner.join(Iterables.transform(Iterables.filter(resources, Predicates.not(ARCHIVE_FILTER)), RESOURCE_TO_PATH));
    if (!archiveList.isEmpty()) {
        submitArgs.add("--archives", archiveList);
    }
    if (!fileList.isEmpty()) {
        submitArgs.add("--files", fileList);
    }

    boolean pythonJob = jobFile.getPath().endsWith(".py");
    if (!pythonJob) {
        submitArgs.add("--class", SparkMainWrapper.class.getName());
    } else {
        // For python, add extra py library files
        String extraPyFiles = configs.get("spark.submit.pyFiles");
        if (extraPyFiles != null) {
            submitArgs.add("--py-files", extraPyFiles);
        }
    }

    // A "file" URI is passed as a plain path; any other scheme is passed as the full URI string
    submitArgs.add("file".equals(jobFile.getScheme()) ? jobFile.getPath() : jobFile.toString());

    if (pythonJob) {
        return submitArgs.build();
    }
    // Add extra arguments for easily identifying the program from command line.
    // Arguments to user program is always coming from the runtime arguments.
    submitArgs.add("--cdap.spark.program=" + runtimeContext.getProgramRunId().toString());
    submitArgs.add("--cdap.user.main.class=" + sparkSpec.getMainClassName());
    return submitArgs.build();
}
Also used : Iterables(com.google.common.collect.Iterables) Arrays(java.util.Arrays) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) LoggerFactory(org.slf4j.LoggerFactory) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) SparkSpecification(co.cask.cdap.api.spark.SparkSpecification) SparkMainWrapper(co.cask.cdap.app.runtime.spark.SparkMainWrapper) Future(java.util.concurrent.Future) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) BiConsumer(java.util.function.BiConsumer) Predicates(com.google.common.base.Predicates) URI(java.net.URI) ThreadFactory(java.util.concurrent.ThreadFactory) ExecutorService(java.util.concurrent.ExecutorService) ClassLoaders(co.cask.cdap.common.lang.ClassLoaders) Function(com.google.common.base.Function) Uninterruptibles(com.google.common.util.concurrent.Uninterruptibles) Logger(org.slf4j.Logger) SparkSubmit(org.apache.spark.deploy.SparkSubmit) Executors(java.util.concurrent.Executors) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) SparkRuntimeContext(co.cask.cdap.app.runtime.spark.SparkRuntimeContext) Predicate(com.google.common.base.Predicate) LocalizeResource(co.cask.cdap.internal.app.runtime.distributed.LocalizeResource) Collections(java.util.Collections) AbstractFuture(com.google.common.util.concurrent.AbstractFuture) Joiner(com.google.common.base.Joiner) SparkSpecification(co.cask.cdap.api.spark.SparkSpecification) SparkMainWrapper(co.cask.cdap.app.runtime.spark.SparkMainWrapper) ImmutableList(com.google.common.collect.ImmutableList)

Example 5 with SparkSpecification

use of co.cask.cdap.api.spark.SparkSpecification in project cdap by caskdata.

the class DefaultSparkConfigurer method createSpecification.

/**
 * Creates the {@link SparkSpecification} from the state held by this configurer.
 *
 * <p>Before the specification is built, the Spark instance is visited with a property extractor and a
 * dataset extractor, which fill {@code properties} and a fresh dataset name set respectively.
 *
 * @return a new specification for the configured Spark program
 */
public SparkSpecification createSpecification() {
    Set<String> datasetNames = new HashSet<>();
    // Grab all @Property and @Dataset fields
    PropertyFieldExtractor propertyExtractor = new PropertyFieldExtractor(properties);
    DataSetFieldExtractor datasetExtractor = new DataSetFieldExtractor(datasetNames);
    Reflections.visit(spark, spark.getClass(), propertyExtractor, datasetExtractor);

    String sparkClassName = spark.getClass().getName();
    return new SparkSpecification(sparkClassName, name, description, mainClassName, datasetNames, properties, clientResources, driverResources, executorResources, getHandlers());
}
Also used : SparkSpecification(co.cask.cdap.api.spark.SparkSpecification) PropertyFieldExtractor(co.cask.cdap.internal.specification.PropertyFieldExtractor) DataSetFieldExtractor(co.cask.cdap.internal.specification.DataSetFieldExtractor) HashSet(java.util.HashSet)

Aggregations

SparkSpecification (co.cask.cdap.api.spark.SparkSpecification): 12; ApplicationSpecification (co.cask.cdap.api.app.ApplicationSpecification): 4; ImmutableList (com.google.common.collect.ImmutableList): 3; List (java.util.List): 3; CountDownLatch (java.util.concurrent.CountDownLatch): 3; ExecutorService (java.util.concurrent.ExecutorService): 3; ThreadFactory (java.util.concurrent.ThreadFactory): 3; Resources (co.cask.cdap.api.Resources): 2; FlowSpecification (co.cask.cdap.api.flow.FlowSpecification): 2; MapReduceSpecification (co.cask.cdap.api.mapreduce.MapReduceSpecification): 2; ServiceSpecification (co.cask.cdap.api.service.ServiceSpecification): 2; Spark (co.cask.cdap.api.spark.Spark): 2; LocalizeResource (co.cask.cdap.internal.app.runtime.distributed.LocalizeResource): 2; PluginInstantiator (co.cask.cdap.internal.app.runtime.plugin.PluginInstantiator): 2; ProgramType (co.cask.cdap.proto.ProgramType): 2; ProgramId (co.cask.cdap.proto.id.ProgramId): 2; JsonObject (com.google.gson.JsonObject): 2; ArtifactId (co.cask.cdap.api.artifact.ArtifactId): 1; StreamSpecification (co.cask.cdap.api.data.stream.StreamSpecification): 1; FlowletConnection (co.cask.cdap.api.flow.FlowletConnection): 1