
Example 16 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class SparkStreamingPipelineRunner method handleJoin.

@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
    String stageName = stageSpec.getName();
    BatchJoiner<?, ?, ?> joiner;
    if (plugin instanceof BatchAutoJoiner) {
        BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
        Map<String, Schema> inputSchemas = new HashMap<>();
        for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
            StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
            inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
        }
        FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
        AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
        failureCollector.getOrThrowException();
        JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
        if (joinDefinition == null) {
            throw new IllegalStateException(String.format("Joiner stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
        }
        joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
    } else if (plugin instanceof BatchJoiner) {
        joiner = (BatchJoiner) plugin;
    } else {
        // Should never happen unless there is a bug in the code; deployment validation should have failed first.
        throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
    }
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
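    // Joins repartition their inputs, so record this stage as a shuffle point; the shufflers set later feeds cache decisions (see addEmitted below).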
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
Also used: BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext), LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector), HashMap(java.util.HashMap), Schema(io.cdap.cdap.api.data.schema.Schema), BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner), BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner), DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext), AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext), JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition), StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec), FailureCollector(io.cdap.cdap.etl.api.FailureCollector), JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)
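
The instanceof branching above is an adapter dispatch: a BatchAutoJoiner only declares a JoinDefinition, so handleJoin wraps it in a JoinerBridge to make it look like a regular BatchJoiner before the shared join path runs. A minimal sketch of the same pattern, using hypothetical stand-in types rather than the CDAP classes:

interface Joiner { String join(); }                    // imperative runtime interface
interface AutoJoiner { String define(); }              // declarative variant

// Hypothetical adapter: makes an AutoJoiner usable wherever a Joiner is expected.
final class JoinerAdapter implements Joiner {
    private final AutoJoiner auto;
    JoinerAdapter(AutoJoiner auto) { this.auto = auto; }
    @Override public String join() { return "executing " + auto.define(); }
}

static Joiner asJoiner(Object plugin) {
    if (plugin instanceof AutoJoiner) {
        return new JoinerAdapter((AutoJoiner) plugin); // wrap the declarative plugin
    } else if (plugin instanceof Joiner) {
        return (Joiner) plugin;                        // already the runtime type
    }
    // Mirrors the should-have-failed-during-deployment branch above.
    throw new IllegalStateException("Unknown joiner type " + plugin.getClass().getName());
}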

Example 17 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class ValidationUtils method validate.

/**
 * Validates a plugin based on the {@link StageValidationRequest}.
 *
 * @param namespace            the namespace in which the stage is validated
 * @param validationRequest    {@link StageValidationRequest} with plugin properties
 * @param pluginConfigurer     {@link PluginConfigurer} for using the plugin
 * @param macroFn              {@link Function} for evaluating macros
 * @param featureFlagsProvider {@link FeatureFlagsProvider} for checking feature flags
 * @return {@link StageValidationResponse} carrying either the configured stage spec or the validation failures
 */
public static StageValidationResponse validate(String namespace, StageValidationRequest validationRequest, PluginConfigurer pluginConfigurer, Function<Map<String, String>, Map<String, String>> macroFn, FeatureFlagsProvider featureFlagsProvider) {
    ETLStage stageConfig = validationRequest.getStage();
    ValidatingConfigurer validatingConfigurer = new ValidatingConfigurer(pluginConfigurer, featureFlagsProvider);
    // Batch or Streaming doesn't matter for a single stage.
    PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> pipelineSpecGenerator = new BatchPipelineSpecGenerator(namespace, validatingConfigurer, null, Collections.emptySet(), Collections.emptySet(), Engine.SPARK, featureFlagsProvider);
    DefaultStageConfigurer stageConfigurer = new DefaultStageConfigurer(stageConfig.getName());
    for (StageSchema stageSchema : validationRequest.getInputSchemas()) {
        stageConfigurer.addInputSchema(stageSchema.getStage(), stageSchema.getSchema());
        stageConfigurer.addInputStage(stageSchema.getStage());
    }
    DefaultPipelineConfigurer pipelineConfigurer = new DefaultPipelineConfigurer(validatingConfigurer, stageConfig.getName(), Engine.SPARK, stageConfigurer, featureFlagsProvider);
    // evaluate macros
    Map<String, String> evaluatedProperties = macroFn.apply(stageConfig.getPlugin().getProperties());
    ETLPlugin originalConfig = stageConfig.getPlugin();
    ETLPlugin evaluatedConfig = new ETLPlugin(originalConfig.getName(), originalConfig.getType(), evaluatedProperties, originalConfig.getArtifactConfig());
    try {
        StageSpec spec = pipelineSpecGenerator.configureStage(stageConfig.getName(), evaluatedConfig, pipelineConfigurer).build();
        return new StageValidationResponse(spec);
    } catch (ValidationException e) {
        return new StageValidationResponse(e.getFailures());
    }
}
Also used: ValidationException(io.cdap.cdap.etl.api.validation.ValidationException), BatchPipelineSpecGenerator(io.cdap.cdap.etl.batch.BatchPipelineSpecGenerator), ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin), DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer), ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig), BatchPipelineSpec(io.cdap.cdap.etl.batch.BatchPipelineSpec), StageSchema(io.cdap.cdap.etl.proto.v2.validation.StageSchema), ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage), ValidatingConfigurer(io.cdap.cdap.etl.validation.ValidatingConfigurer), StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec), DefaultPipelineConfigurer(io.cdap.cdap.etl.common.DefaultPipelineConfigurer), StageValidationResponse(io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse)
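
Because macro evaluation is injected as a Function, a caller with no macros to resolve can simply pass the stage properties through unchanged. A minimal caller sketch, assuming the CDAP types shown above are on the classpath (validateWithoutMacros is a hypothetical helper name, not a CDAP method):

import java.util.Map;
import java.util.function.Function;

static StageValidationResponse validateWithoutMacros(String namespace, StageValidationRequest request, PluginConfigurer pluginConfigurer, FeatureFlagsProvider featureFlagsProvider) {
    // Identity function: validate the raw properties without macro evaluation.
    Function<Map<String, String>, Map<String, String>> identity = props -> props;
    return ValidationUtils.validate(namespace, request, pluginConfigurer, identity, featureFlagsProvider);
}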

Example 18 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class BatchSparkPipelineDriver method run.

@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    boolean isSuccessful = true;
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
        boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
        boolean shouldDisablePushdown = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.DISABLE_ELT_PUSHDOWN, Boolean.FALSE.toString()));
        boolean isPreviewEnabled = phaseSpec.isPreviewEnabled(sec);
        // Initialize SQL engine instance if needed.
        if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null && !shouldDisablePushdown) {
            String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
            // Instantiate SQL engine and prepare run.
            try {
                MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
                Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
                sqlEngineAdapter = new BatchSQLEngineAdapter(phaseSpec.getSQLEngineStageSpec().getPlugin().getName(), (SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
                sqlEngineAdapter.prepareRun();
            } catch (InstantiationException ie) {
                LOG.error("Could not create plugin instance for SQLEngine class", ie);
            } finally {
                if (sqlEngineAdapter == null) {
                    LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
                }
            }
        }
        runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
    } catch (Throwable t) {
        // Mark this execution as not successful.
        isSuccessful = false;
        // Rethrow
        throw t;
    } finally {
        updateWorkflowToken(sec.getWorkflowToken(), collectors);
        // Close the SQL engine adapter if needed.
        if (sqlEngineAdapter != null) {
            sqlEngineAdapter.onRunFinish(isSuccessful);
            sqlEngineAdapter.close();
        }
    }
}
Also used: Path(java.nio.file.Path), MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator), DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator), HashMap(java.util.HashMap), SingleConnectorFactory(io.cdap.cdap.etl.batch.connector.SingleConnectorFactory), SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector), StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector), SQLEngine(io.cdap.cdap.etl.api.engine.sql.SQLEngine), StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec), BufferedReader(java.io.BufferedReader), BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec), BasicArguments(io.cdap.cdap.etl.common.BasicArguments), PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator), PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
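
The three Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(...)) calls above repeat one idiom: read an optional runtime argument and fall back to a default when it is absent. A minimal sketch of that idiom in plain Java (flag is a hypothetical helper, not a CDAP method):

import java.util.Map;

static boolean flag(Map<String, String> runtimeArgs, String key, boolean defaultValue) {
    // A missing key yields the default; a present value is parsed by parseBoolean (anything but "true" is false).
    return Boolean.parseBoolean(runtimeArgs.getOrDefault(key, Boolean.toString(defaultValue)));
}

// Equivalent to the driver's lookups, e.g.:
// boolean shouldConsolidateStages = flag(sec.getRuntimeArguments(), Constants.CONSOLIDATE_STAGES, true);
// boolean shouldDisablePushdown = flag(sec.getRuntimeArguments(), Constants.DISABLE_ELT_PUSHDOWN, false);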

Example 19 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class SQLEngineCollection method tryMultiStoreDirect.

@Override
public Set<String> tryMultiStoreDirect(PhaseSpec phaseSpec, Set<String> sinks) {
    // Set to store names of all consumed sinks.
    Set<String> directStoreSinks = new HashSet<>();
    // Create list to store all tasks.
    List<Future<String>> directStoreFutures = new ArrayList<>(sinks.size());
    // Try to run the direct store task on all sink stages.
    for (String sinkName : sinks) {
        StageSpec stageSpec = phaseSpec.getPhase().getStage(sinkName);
        // Check if we are able to write this output directly
        if (stageSpec != null) {
            // Create an async task that is used to wait for the direct store task to complete.
            Supplier<String> task = () -> {
                // If the direct store task succeeds, we return the sink name. Otherwise, return null.
                if (tryStoreDirect(stageSpec)) {
                    return sinkName;
                }
                return null;
            };
            // We submit these in parallel to prevent blocking for each store task to complete in sequence.
            directStoreFutures.add(adapter.submitTask(task));
        }
    }
    // Wait for all the direct store tasks for this group, if any.
    for (Future<String> supplier : directStoreFutures) {
        try {
            // Get sink name from supplier
            String sinkName = supplier.get();
            // If the sink name is not null, it means this stage was consumed successfully.
            if (sinkName != null) {
                directStoreSinks.add(sinkName);
            }
        } catch (InterruptedException e) {
            throw Throwables.propagate(e);
        } catch (ExecutionException e) {
            // We don't propagate this exception as the regular sink workflow can continue.
            LOG.warn("Execution exception when executing Direct store task. Sink will proceed with default output.", e);
        }
    }
    return directStoreSinks;
}
Also used: StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec), ArrayList(java.util.ArrayList), Future(java.util.concurrent.Future), ExecutionException(java.util.concurrent.ExecutionException), HashSet(java.util.HashSet)
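
The method above follows a submit-all-then-collect shape: every store attempt is queued first so the tasks overlap, then the futures are drained, and a per-task failure is logged rather than propagated so the remaining sinks can fall back to the regular write path. A self-contained sketch of the same concurrency pattern using a plain JDK ExecutorService (tryAll and attempt are illustrative names, not the CDAP adapter API):

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.function.Predicate;

static Set<String> tryAll(Set<String> names, Predicate<String> attempt) throws InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, names.size()));
    try {
        List<Future<String>> futures = new ArrayList<>(names.size());
        for (String name : names) {
            // Submit every attempt before waiting on any, so they run in parallel.
            futures.add(pool.submit(() -> attempt.test(name) ? name : null));
        }
        Set<String> succeeded = new HashSet<>();
        for (Future<String> future : futures) {
            try {
                String name = future.get();          // blocks until this attempt finishes
                if (name != null) {
                    succeeded.add(name);
                }
            } catch (ExecutionException e) {
                // Non-fatal, matching the warn-and-continue handling above.
            }
        }
        return succeeded;
    } finally {
        pool.shutdown();
    }
}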

Example 20 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class SparkPipelineRunner method addEmitted.

private EmittedRecords.Builder addEmitted(EmittedRecords.Builder builder, PipelinePhase pipelinePhase, StageSpec stageSpec, SparkCollection<RecordInfo<Object>> stageData, Dag dag, Set<String> branchers, Set<String> shufflers, boolean hasErrors, boolean hasAlerts) {
    builder.setRawData(stageData);
    if (shouldCache(dag, stageSpec.getName(), branchers, shufflers, stageData)) {
        stageData = stageData.cache();
    }
    if (hasErrors) {
        SparkCollection<ErrorRecord<Object>> errors = stageData.flatMap(stageSpec, new ErrorPassFilter<Object>());
        builder.setErrors(errors);
    }
    if (hasAlerts) {
        SparkCollection<Alert> alerts = stageData.flatMap(stageSpec, new AlertPassFilter());
        builder.setAlerts(alerts);
    }
    if (SplitterTransform.PLUGIN_TYPE.equals(stageSpec.getPluginType())) {
        // set collections for each port, implemented as a filter on the port.
        for (StageSpec.Port portSpec : stageSpec.getOutputPorts().values()) {
            String port = portSpec.getPort();
            SparkCollection<Object> portData = filterPortRecords(stageSpec, stageData, port);
            builder.addPort(port, portData);
        }
    } else {
        SparkCollection<Object> outputs = filterPortRecords(stageSpec, stageData, null);
        builder.setOutput(outputs);
    }
    return builder;
}
Also used: StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec), Alert(io.cdap.cdap.etl.api.Alert), AlertPassFilter(io.cdap.cdap.etl.spark.function.AlertPassFilter), Port(io.cdap.cdap.etl.proto.v2.spec.StageSpec.Port), ErrorRecord(io.cdap.cdap.etl.api.ErrorRecord)
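
For splitter stages, the routing above runs one filter per declared port because SparkCollections are distributed and lazily evaluated. On a local, in-memory collection the same split can be done in a single pass by grouping. A sketch under that assumption, using Java 16+ record syntax (Emitted is a hypothetical stand-in for a record tagged with its output port, not CDAP's RecordInfo):

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Hypothetical port-tagged record.
record Emitted(String port, Object value) {}

static Map<String, List<Object>> splitByPort(List<Emitted> records) {
    // Group payloads by port name in one pass; replaces the repeated per-port filters.
    return records.stream().collect(Collectors.groupingBy(Emitted::port, Collectors.mapping(Emitted::value, Collectors.toList())));
}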

Aggregations

StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 74 uses
HashMap (java.util.HashMap): 42 uses
PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 30 uses
HashSet (java.util.HashSet): 24 uses
Map (java.util.Map): 24 uses
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 20 uses
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 20 uses
Connection (io.cdap.cdap.etl.proto.Connection): 18 uses
Schema (io.cdap.cdap.api.data.schema.Schema): 16 uses
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 16 uses
ArrayList (java.util.ArrayList): 16 uses
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 14 uses
PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec): 14 uses
Test (org.junit.Test): 14 uses
PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 12 uses
PluginContext (io.cdap.cdap.api.plugin.PluginContext): 10 uses
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 10 uses
List (java.util.List): 10 uses
WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken): 8 uses
BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner): 8 uses