Search in sources :

Example 1 with ETLSpark

use of co.cask.cdap.etl.spark.batch.ETLSpark in project cdap by caskdata.

The method addProgram of the class SmartWorkflow.

/**
 * Registers the program that executes the named phase and attaches it to the workflow being built.
 *
 * <p>The phase is looked up in the plan; if the plan has no such phase the method returns without
 * doing anything. Otherwise a program name is generated from a running counter, a local dataset is
 * set up for every connector stage of the phase that does not have one yet, and a program is added
 * depending on the plugin types found in the phase: a custom action, an external Spark program,
 * an {@code ETLSpark} program (when {@code useSpark} is set) or an {@code ETLMapReduce} program.
 *
 * @param phaseName name of the phase in the pipeline plan
 * @param programAdder receives the action / Spark / MapReduce node for the phase
 */
private void addProgram(String phaseName, WorkflowProgramAdder programAdder) {
    PipelinePhase phase = plan.getPhase(phaseName);
    if (phase == null) {
        // artificially added by the control dag flattening process, so there is nothing to add
        return;
    }

    // the phase name might contain characters that are invalid in a program name,
    // so the program is named after a running counter instead
    String programName = "phase-" + phaseNum++;

    // Walk the connector stages once: create the local dataset for any connector seen for the
    // first time, and record the connector -> dataset mapping that applies to this phase.
    Map<String, String> phaseConnectorDatasets = new HashMap<>();
    for (StageInfo connector : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
        String connectorName = connector.getName();
        String datasetName = connectorDatasets.get(connectorName);
        if (datasetName == null) {
            datasetName = "conn-" + connectorNum++;
            connectorDatasets.put(connectorName, datasetName);
            // add the local dataset
            new ConnectorSource(datasetName, null).configure(getConfigurer());
        }
        phaseConnectorDatasets.put(connectorName, datasetName);
    }

    BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(
        programName,
        phase,
        spec.getResources(),
        spec.getDriverResources(),
        spec.getClientResources(),
        spec.isStageLoggingEnabled(),
        spec.isProcessTimingEnabled(),
        phaseConnectorDatasets,
        spec.getNumOfRecordsPreview(),
        spec.getProperties());

    Set<String> typesInPhase = batchPhaseSpec.getPhase().getPluginTypes();

    if (typesInPhase.contains(Action.PLUGIN_TYPE)) {
        // actions will be all by themselves in a phase
        programAdder.addAction(new PipelineAction(batchPhaseSpec));
        return;
    }

    if (typesInPhase.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
        // spark programs will be all by themselves in a phase, so the first stage is the only one
        String sparkStageName =
            phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
        StageSpec sparkStageSpec = stageSpecs.get(sparkStageName);
        applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, sparkStageSpec));
        programAdder.addSpark(programName);
        return;
    }

    if (useSpark) {
        applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
        programAdder.addSpark(programName);
        return;
    }

    applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec));
    programAdder.addMapReduce(programName);
}
Also used : ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) StageInfo(co.cask.cdap.etl.planner.StageInfo) PipelineAction(co.cask.cdap.etl.batch.customaction.PipelineAction) ConnectorSource(co.cask.cdap.etl.batch.connector.ConnectorSource) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec)

Example 2 with ETLSpark

use of co.cask.cdap.etl.spark.batch.ETLSpark in project cdap by caskdata.

The method configure of the class ETLBatchApplication.

/**
 * Configures the batch ETL application from its (converted) configuration.
 *
 * <p>Steps, in order: convert the old-style config, set the description, generate the pipeline
 * spec, check that exactly one stage is a batch source, plan the pipeline and check that the plan
 * has exactly one phase, register either an {@code ETLMapReduce} or an {@code ETLSpark} program
 * for that phase depending on the configured engine, and finally add the {@code ETLWorkflow}
 * together with a time schedule built from the configured schedule.
 *
 * @throws IllegalArgumentException if the pipeline does not have exactly one source stage, if
 *     planning yields anything other than one phase, or if the engine is neither MAPREDUCE nor
 *     SPARK
 */
@Override
public void configure() {
    ETLBatchConfig config = getConfig().convertOldConfig();
    setDescription(DEFAULT_DESCRIPTION);

    PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator =
        new BatchPipelineSpecGenerator(
            getConfigurer(),
            ImmutableSet.of(BatchSource.PLUGIN_TYPE),
            ImmutableSet.of(BatchSink.PLUGIN_TYPE),
            TimePartitionedFileSet.class,
            FileSetProperties.builder()
                .setInputFormat(AvroKeyInputFormat.class)
                .setOutputFormat(AvroKeyOutputFormat.class)
                .setEnableExploreOnCreate(true)
                .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
                .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
                .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
                .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
                .build(),
            config.getEngine());
    BatchPipelineSpec spec = specGenerator.generateSpec(config);

    // count the stages whose plugin is a batch source; exactly one is required
    int numSources = 0;
    for (StageSpec stage : spec.getStages()) {
        String pluginType = stage.getPlugin().getType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            numSources++;
        }
    }
    if (numSources != 1) {
        throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
    }

    PipelinePlanner planner = new PipelinePlanner(
        SUPPORTED_PLUGIN_TYPES,
        ImmutableSet.<String>of(),
        ImmutableSet.<String>of(),
        ImmutableSet.<String>of());
    PipelinePlan plan = planner.plan(spec);
    if (plan.getPhases().size() != 1) {
        // should never happen if there is only one source
        throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
    }
    PipelinePhase pipeline = plan.getPhases().values().iterator().next();

    // register the single program for the phase; each case keeps its own phase spec in scope
    switch (config.getEngine()) {
        case MAPREDUCE: {
            BatchPhaseSpec mapReducePhaseSpec = new BatchPhaseSpec(
                ETLMapReduce.NAME,
                pipeline,
                config.getResources(),
                config.getDriverResources(),
                config.getClientResources(),
                config.isStageLoggingEnabled(),
                config.isProcessTimingEnabled(),
                new HashMap<String, String>(),
                config.getNumOfRecordsPreview(),
                config.getProperties());
            addMapReduce(new ETLMapReduce(mapReducePhaseSpec));
            break;
        }
        case SPARK: {
            BatchPhaseSpec sparkPhaseSpec = new BatchPhaseSpec(
                ETLSpark.class.getSimpleName(),
                pipeline,
                config.getResources(),
                config.getDriverResources(),
                config.getClientResources(),
                config.isStageLoggingEnabled(),
                config.isProcessTimingEnabled(),
                new HashMap<String, String>(),
                config.getNumOfRecordsPreview(),
                config.getProperties());
            addSpark(new ETLSpark(sparkPhaseSpec));
            break;
        }
        default:
            throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.", config.getEngine(), Joiner.on(',').join(Engine.values())));
    }

    addWorkflow(new ETLWorkflow(spec, config.getEngine()));
    scheduleWorkflow(
        Schedules.builder(SCHEDULE_NAME)
            .setDescription("ETL Batch schedule")
            .createTimeSchedule(config.getSchedule()),
        ETLWorkflow.NAME);
}
Also used : PipelinePlan(co.cask.cdap.etl.planner.PipelinePlan) ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) HashMap(java.util.HashMap) AvroKeyOutputFormat(org.apache.avro.mapreduce.AvroKeyOutputFormat) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)

Aggregations

ETLMapReduce (co.cask.cdap.etl.batch.mapreduce.ETLMapReduce): 2, PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 2, ETLSpark (co.cask.cdap.etl.spark.batch.ETLSpark): 2, StageSpec (co.cask.cdap.etl.spec.StageSpec): 2, HashMap (java.util.HashMap): 2, BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 1, ConnectorSource (co.cask.cdap.etl.batch.connector.ConnectorSource): 1, PipelineAction (co.cask.cdap.etl.batch.customaction.PipelineAction): 1, PipelinePlan (co.cask.cdap.etl.planner.PipelinePlan): 1, PipelinePlanner (co.cask.cdap.etl.planner.PipelinePlanner): 1, StageInfo (co.cask.cdap.etl.planner.StageInfo): 1, ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 1, LinkedHashMap (java.util.LinkedHashMap): 1, AvroKeyOutputFormat (org.apache.avro.mapreduce.AvroKeyOutputFormat): 1