Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class SparkStreamingPipelineRunner, method handleJoin:
@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase,
                                             PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec,
                                             FunctionCache.Factory functionCacheFactory,
                                             Object plugin,
                                             Integer numPartitions,
                                             StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  BatchJoiner<?, ?, ?> joiner;
  if (plugin instanceof BatchAutoJoiner) {
    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    failureCollector.getOrThrowException();
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    if (joinDefinition == null) {
      throw new IllegalStateException(String.format(
        "Joiner stage '%s' did not specify a join definition. "
          + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }
    joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
  } else if (plugin instanceof BatchJoiner) {
    joiner = (BatchJoiner) plugin;
  } else {
    // should never happen unless there is a bug in the code. should have failed during deployment
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                  stageName, plugin.getClass().getName()));
  }
  BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
  joiner.initialize(joinerRuntimeContext);
  shufflers.add(stageName);
  return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
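Here handleJoin first normalizes whichever joiner flavor the plugin exposes into a single BatchJoiner: auto joiners are asked for a JoinDefinition and wrapped in a JoinerBridge, while plain BatchJoiner plugins are used as-is. Below is a minimal, self-contained sketch of that dispatch-and-bridge pattern; the Joiner, AutoJoiner, and AutoJoinerBridge types are hypothetical stand-ins for illustration, not CDAP's real interfaces.

public class JoinerDispatchSketch {
  // Hypothetical stand-ins for BatchJoiner and BatchAutoJoiner.
  interface Joiner { String join(String left, String right); }
  interface AutoJoiner { String defineCondition(); }

  // Hypothetical stand-in for JoinerBridge: adapts an AutoJoiner onto the lower-level Joiner interface.
  static final class AutoJoinerBridge implements Joiner {
    private final AutoJoiner autoJoiner;
    AutoJoinerBridge(AutoJoiner autoJoiner) { this.autoJoiner = autoJoiner; }
    @Override
    public String join(String left, String right) {
      return left + " JOIN " + right + " ON " + autoJoiner.defineCondition();
    }
  }

  // Mirrors the instanceof dispatch in handleJoin: bridge auto joiners, use plain joiners directly,
  // and treat anything else as a bug that should have failed during deployment.
  static Joiner resolve(String stageName, Object plugin) {
    if (plugin instanceof AutoJoiner) {
      AutoJoiner autoJoiner = (AutoJoiner) plugin;
      String condition = autoJoiner.defineCondition();
      if (condition == null) {
        throw new IllegalStateException(
          String.format("Joiner stage '%s' did not specify a join definition.", stageName));
      }
      return new AutoJoinerBridge(autoJoiner);
    } else if (plugin instanceof Joiner) {
      return (Joiner) plugin;
    }
    throw new IllegalStateException(
      String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
  }

  public static void main(String[] args) {
    AutoJoiner auto = () -> "users.id = purchases.user_id";
    // Prints: users JOIN purchases ON users.id = purchases.user_id
    System.out.println(resolve("joinStage", auto).join("users", "purchases"));
  }
}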
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class ValidationUtils, method validate:
/**
 * Validates a plugin based on the {@link StageValidationRequest}.
 *
 * @param namespace the namespace in which validation is performed
 * @param validationRequest {@link StageValidationRequest} with plugin properties
 * @param pluginConfigurer {@link PluginConfigurer} for using the plugin
 * @param macroFn {@link Function} for evaluating macros in the plugin properties
 * @param featureFlagsProvider {@link FeatureFlagsProvider} for checking feature flags
 * @return {@link StageValidationResponse} containing the configured stage spec or the validation failures
 */
public static StageValidationResponse validate(String namespace, StageValidationRequest validationRequest,
                                               PluginConfigurer pluginConfigurer,
                                               Function<Map<String, String>, Map<String, String>> macroFn,
                                               FeatureFlagsProvider featureFlagsProvider) {
  ETLStage stageConfig = validationRequest.getStage();
  ValidatingConfigurer validatingConfigurer = new ValidatingConfigurer(pluginConfigurer, featureFlagsProvider);
  // Batch or Streaming doesn't matter for a single stage.
  PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> pipelineSpecGenerator =
    new BatchPipelineSpecGenerator(namespace, validatingConfigurer, null, Collections.emptySet(),
                                   Collections.emptySet(), Engine.SPARK, featureFlagsProvider);
  DefaultStageConfigurer stageConfigurer = new DefaultStageConfigurer(stageConfig.getName());
  for (StageSchema stageSchema : validationRequest.getInputSchemas()) {
    stageConfigurer.addInputSchema(stageSchema.getStage(), stageSchema.getSchema());
    stageConfigurer.addInputStage(stageSchema.getStage());
  }
  DefaultPipelineConfigurer pipelineConfigurer =
    new DefaultPipelineConfigurer(validatingConfigurer, stageConfig.getName(), Engine.SPARK,
                                  stageConfigurer, featureFlagsProvider);
  // evaluate macros
  Map<String, String> evaluatedProperties = macroFn.apply(stageConfig.getPlugin().getProperties());
  ETLPlugin originalConfig = stageConfig.getPlugin();
  ETLPlugin evaluatedConfig = new ETLPlugin(originalConfig.getName(), originalConfig.getType(),
                                            evaluatedProperties, originalConfig.getArtifactConfig());
  try {
    StageSpec spec = pipelineSpecGenerator.configureStage(stageConfig.getName(), evaluatedConfig,
                                                          pipelineConfigurer).build();
    return new StageValidationResponse(spec);
  } catch (ValidationException e) {
    return new StageValidationResponse(e.getFailures());
  }
}
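The macroFn argument receives the stage's raw plugin properties and returns a copy with macros evaluated; the stage is then configured against the evaluated properties. As a rough illustration of what such a function might do, the sketch below performs simple ${...} substitution from a fixed map of runtime arguments; it is a hypothetical stand-in, not CDAP's actual macro evaluator.

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MacroFnSketch {
  private static final Pattern MACRO = Pattern.compile("\\$\\{([^}]+)}");

  // Builds a property-evaluating function that replaces ${name} tokens with values
  // from the supplied runtime arguments, leaving unknown macros untouched.
  static Function<Map<String, String>, Map<String, String>> macroFn(Map<String, String> runtimeArgs) {
    return properties -> {
      Map<String, String> evaluated = new HashMap<>();
      for (Map.Entry<String, String> entry : properties.entrySet()) {
        Matcher m = MACRO.matcher(entry.getValue());
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
          String replacement = runtimeArgs.getOrDefault(m.group(1), m.group(0));
          m.appendReplacement(sb, Matcher.quoteReplacement(replacement));
        }
        m.appendTail(sb);
        evaluated.put(entry.getKey(), sb.toString());
      }
      return evaluated;
    };
  }

  public static void main(String[] args) {
    Map<String, String> props = Map.of("path", "/data/${run.date}/input");
    Map<String, String> runtime = Map.of("run.date", "2024-01-01");
    System.out.println(macroFn(runtime).apply(props)); // {path=/data/2024-01-01/input}
  }
}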
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class BatchSparkPipelineDriver, method run:
@Override
public void run(DatasetContext context) throws Exception {
  BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID),
                                           BatchPhaseSpec.class);
  Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
  try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
    String object = reader.readLine();
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
    sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
    sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
    stagePartitions = sourceSinkInfo.getStagePartitions();
  }
  datasetContext = context;
  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                  phaseSpec.isStageLoggingEnabled(),
                                                                  phaseSpec.isProcessTimingEnabled());
  Map<String, StageStatisticsCollector> collectors = new HashMap<>();
  if (phaseSpec.pipelineContainsCondition()) {
    Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
    while (iterator.hasNext()) {
      StageSpec spec = iterator.next();
      collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
    }
  }
  boolean isSuccessful = true;
  try {
    PipelinePluginInstantiator pluginInstantiator =
      new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
    boolean shouldConsolidateStages = Boolean.parseBoolean(
      sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
    boolean shouldCacheFunctions = Boolean.parseBoolean(
      sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
    boolean shouldDisablePushdown = Boolean.parseBoolean(
      sec.getRuntimeArguments().getOrDefault(Constants.DISABLE_ELT_PUSHDOWN, Boolean.FALSE.toString()));
    boolean isPreviewEnabled = phaseSpec.isPreviewEnabled(sec);
    // Initialize SQL engine instance if needed.
    if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null && !shouldDisablePushdown) {
      String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
      // Instantiate SQL engine and prepare run.
      try {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec),
                                                                  sec.getLogicalStartTime(), sec.getSecureStore(),
                                                                  sec.getServiceDiscoverer(), sec.getNamespace());
        Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
        sqlEngineAdapter = new BatchSQLEngineAdapter(phaseSpec.getSQLEngineStageSpec().getPlugin().getName(),
                                                     (SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
        sqlEngineAdapter.prepareRun();
      } catch (InstantiationException ie) {
        LOG.error("Could not create plugin instance for SQLEngine class", ie);
      } finally {
        if (sqlEngineAdapter == null) {
          LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
        }
      }
    }
    runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors,
                sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
  } catch (Throwable t) {
    // Mark this execution as not successful.
    isSuccessful = false;
    // Rethrow
    throw t;
  } finally {
    updateWorkflowToken(sec.getWorkflowToken(), collectors);
    // Close the SQL engine adapter if needed.
    if (sqlEngineAdapter != null) {
      sqlEngineAdapter.onRunFinish(isSuccessful);
      sqlEngineAdapter.close();
    }
  }
}
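The run method keeps a local isSuccessful flag so the SQL engine adapter can be told about the outcome in the finally block even when the pipeline throws. The sketch below isolates that lifecycle pattern; the Adapter interface is a hypothetical stand-in for BatchSQLEngineAdapter, not the real class.

public class RunLifecycleSketch {
  // Hypothetical stand-in for BatchSQLEngineAdapter.
  interface Adapter extends AutoCloseable {
    void onRunFinish(boolean succeeded);
    @Override
    void close();
  }

  static void run(Runnable pipeline, Adapter adapter) {
    boolean isSuccessful = true;
    try {
      pipeline.run();
    } catch (Throwable t) {
      // Mark the run as failed, then rethrow so the caller still sees the original error.
      isSuccessful = false;
      throw t;
    } finally {
      // Always report the outcome and release the adapter, even on failure.
      if (adapter != null) {
        adapter.onRunFinish(isSuccessful);
        adapter.close();
      }
    }
  }

  public static void main(String[] args) {
    Adapter adapter = new Adapter() {
      @Override public void onRunFinish(boolean succeeded) {
        System.out.println("run finished, success=" + succeeded);
      }
      @Override public void close() {
        System.out.println("adapter closed");
      }
    };
    run(() -> System.out.println("pipeline ran"), adapter);
  }
}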
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class SQLEngineCollection, method tryMultiStoreDirect:
@Override
public Set<String> tryMultiStoreDirect(PhaseSpec phaseSpec, Set<String> sinks) {
  // Set to store names of all consumed sinks.
  Set<String> directStoreSinks = new HashSet<>();
  // Create list to store all tasks.
  List<Future<String>> directStoreFutures = new ArrayList<>(sinks.size());
  // Try to run the direct store task on all sink stages.
  for (String sinkName : sinks) {
    StageSpec stageSpec = phaseSpec.getPhase().getStage(sinkName);
    // Check if we are able to write this output directly
    if (stageSpec != null) {
      // Create an async task that is used to wait for the direct store task to complete.
      Supplier<String> task = () -> {
        // If the direct store task succeeds, we return the sink name. Otherwise, return null.
        if (tryStoreDirect(stageSpec)) {
          return sinkName;
        }
        return null;
      };
      // We submit these in parallel to prevent blocking for each store task to complete in sequence.
      directStoreFutures.add(adapter.submitTask(task));
    }
  }
  // Wait for all the direct store tasks for this group, if any.
  for (Future<String> supplier : directStoreFutures) {
    try {
      // Get sink name from supplier
      String sinkName = supplier.get();
      // If the sink name is not null, it means this stage was consumed successfully.
      if (sinkName != null) {
        directStoreSinks.add(sinkName);
      }
    } catch (InterruptedException e) {
      throw Throwables.propagate(e);
    } catch (ExecutionException e) {
      // We don't propagate this exception as the regular sink workflow can continue.
      LOG.warn("Execution exception when executing Direct store task. Sink will proceed with default output.", e);
    }
  }
  return directStoreSinks;
}
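The method fans out one direct-store attempt per sink and only then blocks on the futures, so slow stores overlap rather than run back to back. A self-contained sketch of the same submit-then-collect pattern, using a plain ExecutorService and a hypothetical tryStoreDirect predicate in place of the SQL engine adapter, might look like this:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.function.Predicate;

public class MultiStoreSketch {
  // Submits a direct-store attempt per sink, then collects the names of the sinks that succeeded.
  static Set<String> tryStoreAll(Set<String> sinks, Predicate<String> tryStoreDirect, ExecutorService executor) {
    List<Future<String>> futures = new ArrayList<>(sinks.size());
    for (String sinkName : sinks) {
      // Each task returns the sink name on success, or null if the sink must fall back.
      futures.add(executor.submit(() -> tryStoreDirect.test(sinkName) ? sinkName : null));
    }
    Set<String> stored = new HashSet<>();
    for (Future<String> future : futures) {
      try {
        String sinkName = future.get();
        if (sinkName != null) {
          stored.add(sinkName);
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
      } catch (ExecutionException e) {
        // A failed direct store is not fatal; the regular sink path can still handle the output.
        System.err.println("Direct store failed, falling back: " + e.getCause());
      }
    }
    return stored;
  }

  public static void main(String[] args) {
    ExecutorService executor = Executors.newFixedThreadPool(4);
    Set<String> sinks = Set.of("bqSink", "gcsSink", "fileSink");
    Set<String> stored = tryStoreAll(sinks, name -> name.startsWith("bq"), executor);
    System.out.println("Stored directly: " + stored);
    executor.shutdown();
  }
}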
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class SparkPipelineRunner, method addEmitted:
private EmittedRecords.Builder addEmitted(EmittedRecords.Builder builder, PipelinePhase pipelinePhase,
                                          StageSpec stageSpec, SparkCollection<RecordInfo<Object>> stageData,
                                          Dag dag, Set<String> branchers, Set<String> shufflers,
                                          boolean hasErrors, boolean hasAlerts) {
  builder.setRawData(stageData);
  if (shouldCache(dag, stageSpec.getName(), branchers, shufflers, stageData)) {
    stageData = stageData.cache();
  }
  if (hasErrors) {
    SparkCollection<ErrorRecord<Object>> errors = stageData.flatMap(stageSpec, new ErrorPassFilter<Object>());
    builder.setErrors(errors);
  }
  if (hasAlerts) {
    SparkCollection<Alert> alerts = stageData.flatMap(stageSpec, new AlertPassFilter());
    builder.setAlerts(alerts);
  }
  if (SplitterTransform.PLUGIN_TYPE.equals(stageSpec.getPluginType())) {
    // set collections for each port, implemented as a filter on the port.
    for (StageSpec.Port portSpec : stageSpec.getOutputPorts().values()) {
      String port = portSpec.getPort();
      SparkCollection<Object> portData = filterPortRecords(stageSpec, stageData, port);
      builder.addPort(port, portData);
    }
  } else {
    SparkCollection<Object> outputs = filterPortRecords(stageSpec, stageData, null);
    builder.setOutput(outputs);
  }
  return builder;
}
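For splitter stages, addEmitted turns a single stream of emitted records into one collection per output port by filtering on the port each record was emitted to. The sketch below shows the same grouping idea on plain Java collections; the Emitted record is a hypothetical stand-in for RecordInfo, not the CDAP type.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class PortFilterSketch {
  // Hypothetical stand-in for RecordInfo: a value tagged with the port it was emitted to
  // (null port means the default output).
  record Emitted(String port, Object value) {}

  // Groups emitted records by port, mirroring how addEmitted builds one filtered
  // collection per output port of a splitter stage.
  static Map<String, List<Object>> byPort(List<Emitted> emitted) {
    return emitted.stream()
      .collect(Collectors.groupingBy(
        e -> e.port() == null ? "<default>" : e.port(),
        Collectors.mapping(Emitted::value, Collectors.toList())));
  }

  public static void main(String[] args) {
    List<Emitted> emitted = List.of(
      new Emitted("valid", "alice"),
      new Emitted("invalid", "b0b"),
      new Emitted("valid", "carol"));
    System.out.println(byPort(emitted)); // e.g. {valid=[alice, carol], invalid=[b0b]}
  }
}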