
Example 36 with ETLBatchConfig

use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

the class PipelineTest method testTextFileSinkAndDeletePostAction.

@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
    // create the pipeline config
    String inputName = "sinkTestInput";
    String outputName = "sinkTestOutput";
    String outputDirName = "users";
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
    sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
    sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
    ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
    // mapreduce writes multiple files to the output directory. Along with the actual output,
    // there are various .crc files that do not contain any of the output content.
    actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
    actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
    ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addPostAction(postAction)
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some data to the input fileset
    Schema inputSchema = Schema.recordOf("test", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
    Map<String, String> users = new HashMap<>();
    users.put("samuel", "wallet");
    users.put("dwayne", "rock");
    users.put("christopher", "cowbell");
    List<StructuredRecord> inputRecords = new ArrayList<>();
    for (Map.Entry<String, String> userEntry : users.entrySet()) {
        String name = userEntry.getKey();
        String item = userEntry.getValue();
        inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
    }
    DataSetManager<Table> inputManager = getDataset(inputName);
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${dir} macro will be substituted with "users" for our pipeline run
    runtimeArgs.put("dir", outputDirName);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<FileSet> outputManager = getDataset(outputName);
    FileSet output = outputManager.get();
    Location outputDir = output.getBaseLocation().append(outputDirName);
    Map<String, String> actual = new HashMap<>();
    for (Location outputFile : outputDir.list()) {
        if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
            Assert.fail("Post action did not delete file " + outputFile.getName());
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\\|");
                actual.put(parts[0], parts[1]);
            }
        }
    }
    Assert.assertEquals(users, actual);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) FileSet(co.cask.cdap.api.dataset.lib.FileSet) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) BufferedReader(java.io.BufferedReader) ApplicationId(co.cask.cdap.proto.id.ApplicationId) HashMap(java.util.HashMap) Map(java.util.Map) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
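
Distilled from the test above, a minimal sketch of the post-action wiring on its own; the fileset names and directory value are placeholders, and only calls already shown in the example are used.

ETLStage source = new ETLStage("source", MockSource.getPlugin("myInput"));
ETLStage sink = new ETLStage("sink",
    new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE,
                  ImmutableMap.of(TextFileSetSink.Conf.FILESET_NAME, "myOutput",
                                  TextFileSetSink.Conf.OUTPUT_DIR, "users"), null));
// the post-action runs after each pipeline run and deletes MapReduce bookkeeping files
ETLStage cleanup = new ETLStage("cleanup",
    new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE,
                  ImmutableMap.of(FilesetDeletePostAction.Conf.FILESET_NAME, "myOutput",
                                  FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS",
                                  FilesetDeletePostAction.Conf.DIRECTORY, "users"), null));
ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addPostAction(cleanup)
    .addConnection(source.getName(), sink.getName())
    .build();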

Example 37 with ETLBatchConfig

use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

the class ETLBatchApplication method configure.

@Override
public void configure() {
    ETLBatchConfig config = getConfig().convertOldConfig();
    setDescription(DEFAULT_DESCRIPTION);
    PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator = new BatchPipelineSpecGenerator(
        getConfigurer(),
        ImmutableSet.of(BatchSource.PLUGIN_TYPE),
        ImmutableSet.of(BatchSink.PLUGIN_TYPE),
        TimePartitionedFileSet.class,
        FileSetProperties.builder()
            .setInputFormat(AvroKeyInputFormat.class)
            .setOutputFormat(AvroKeyOutputFormat.class)
            .setEnableExploreOnCreate(true)
            .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
            .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
            .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
            .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
            .build(),
        config.getEngine());
    BatchPipelineSpec spec = specGenerator.generateSpec(config);
    int sourceCount = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            sourceCount++;
        }
    }
    if (sourceCount != 1) {
        throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
    }
    PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of(), ImmutableSet.<String>of());
    PipelinePlan plan = planner.plan(spec);
    if (plan.getPhases().size() != 1) {
        // should never happen if there is only one source
        throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
    }
    PipelinePhase pipeline = plan.getPhases().values().iterator().next();
    switch(config.getEngine()) {
        case MAPREDUCE:
            BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(
                ETLMapReduce.NAME, pipeline, config.getResources(), config.getDriverResources(),
                config.getClientResources(), config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
                new HashMap<String, String>(), config.getNumOfRecordsPreview(), config.getProperties());
            addMapReduce(new ETLMapReduce(batchPhaseSpec));
            break;
        case SPARK:
            batchPhaseSpec = new BatchPhaseSpec(
                ETLSpark.class.getSimpleName(), pipeline, config.getResources(), config.getDriverResources(),
                config.getClientResources(), config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
                new HashMap<String, String>(), config.getNumOfRecordsPreview(), config.getProperties());
            addSpark(new ETLSpark(batchPhaseSpec));
            break;
        default:
            throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.", config.getEngine(), Joiner.on(',').join(Engine.values())));
    }
    addWorkflow(new ETLWorkflow(spec, config.getEngine()));
    scheduleWorkflow(Schedules.builder(SCHEDULE_NAME).setDescription("ETL Batch schedule").createTimeSchedule(config.getSchedule()), ETLWorkflow.NAME);
}
Also used : PipelinePlan(co.cask.cdap.etl.planner.PipelinePlan) ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) HashMap(java.util.HashMap) AvroKeyOutputFormat(org.apache.avro.mapreduce.AvroKeyOutputFormat) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)
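
Because configure() branches on config.getEngine(), the execution engine is fixed in the pipeline config itself. A minimal sketch of selecting Spark through the builder, reusing the mock plugins from the surrounding tests; the dataset names are placeholders.

// With Engine.SPARK, configure() adds an ETLSpark program for the single pipeline phase;
// with Engine.MAPREDUCE it adds an ETLMapReduce program instead.
ETLBatchConfig sparkConfig = ETLBatchConfig.builder("* * * * *")
    .setEngine(Engine.SPARK)
    .addStage(new ETLStage("source", MockSource.getPlugin("engineTestInput")))
    .addStage(new ETLStage("sink", MockSink.getPlugin("engineTestOutput")))
    .addConnection("source", "sink")
    .build();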

Example 38 with ETLBatchConfig

use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

the class DataPipelineTest method testExternalSparkProgramPipelines.

@Test
public void testExternalSparkProgramPipelines() throws Exception {
    File testDir = TMP_FOLDER.newFolder("sparkProgramTest");
    File input = new File(testDir, "poem.txt");
    try (PrintWriter writer = new PrintWriter(input.getAbsolutePath())) {
        writer.println("this");
        writer.println("is");
        writer.println("a");
        writer.println("poem");
        writer.println("it");
        writer.println("is");
        writer.println("a");
        writer.println("bad");
        writer.println("poem");
    }
    File wordCountOutput = new File(testDir, "poem_counts");
    File filterOutput = new File(testDir, "poem_filtered");
    String args = String.format("%s %s", input.getAbsolutePath(), wordCountOutput.getAbsolutePath());
    Map<String, String> wordCountProperties = ImmutableMap.of("program.args", args);
    Map<String, String> filterProperties = ImmutableMap.of("inputPath", input.getAbsolutePath(), "outputPath", filterOutput.getAbsolutePath(), "filterStr", "bad");
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("wordcount", new ETLPlugin(WORDCOUNT_PLUGIN, SPARK_TYPE, wordCountProperties, null)))
        .addStage(new ETLStage("filter", new ETLPlugin(FILTER_PLUGIN, SPARK_TYPE, filterProperties, null)))
        .addConnection("wordcount", "filter")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("sparkProgramTest");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    manager.start();
    manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);
    // check wordcount output
    /*
        this is a poem
        it is a bad poem
     */
    Map<String, Integer> expected = new HashMap<>();
    expected.put("this", 1);
    expected.put("is", 2);
    expected.put("a", 2);
    expected.put("poem", 2);
    expected.put("it", 1);
    expected.put("bad", 1);
    Map<String, Integer> counts = new HashMap<>();
    File[] files = wordCountOutput.listFiles();
    Assert.assertNotNull("No output files for wordcount found.", files);
    for (File file : files) {
        String fileName = file.getName();
        if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
            continue;
        }
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split(" ");
                counts.put(fields[0], Integer.parseInt(fields[1]));
            }
        }
    }
    Assert.assertEquals(expected, counts);
    // check filter output
    files = filterOutput.listFiles();
    Assert.assertNotNull("No output files for filter program found.", files);
    List<String> expectedLines = ImmutableList.of("this", "is", "a", "poem", "it", "is", "a", "poem");
    List<String> actualLines = new ArrayList<>();
    for (File file : files) {
        String fileName = file.getName();
        if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
            continue;
        }
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                actualLines.add(line);
            }
        }
    }
    Assert.assertEquals(expectedLines, actualLines);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) HashMap(java.util.HashMap) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) ApplicationId(co.cask.cdap.proto.id.ApplicationId) File(java.io.File) PrintWriter(java.io.PrintWriter) Test(org.junit.Test)

Example 39 with ETLBatchConfig

use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

the class DataPipelineTest method testNoMacroMapReduce.

/**
   * Tests that if no macro is provided to the dataset name property, datasets will be created at config time.
   */
@Test
public void testNoMacroMapReduce() throws Exception {
    /*
     * Trivial MapReduce pipeline from batch source to batch sink.
     *
     * source --------- sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "configTimeMockSourceDataset")))
        .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "configTimeMockSinkDataset")))
        .addConnection("source", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    // set runtime arguments for macro substitution
    Map<String, String> runtimeArguments = ImmutableMap.of("runtime", "mockRuntime", "sink", "SinkDataset", "source", "Source", "runtimeSource", "mockRuntimeSourceDataset");
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    // make sure the datasets were created at configure time
    Assert.assertNotNull(getDataset("configTimeMockSourceDataset").get());
    Assert.assertNotNull(getDataset("configTimeMockSinkDataset").get());
    workflowManager.setRuntimeArgs(runtimeArguments);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(co.cask.cdap.test.ApplicationManager) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) WorkflowManager(co.cask.cdap.test.WorkflowManager) ApplicationId(co.cask.cdap.proto.id.ApplicationId) AppRequest(co.cask.cdap.proto.artifact.AppRequest) Test(org.junit.Test)
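
The test above pins both dataset names at configure time. For contrast, a hypothetical macro variant is sketched below, assuming the second plugin argument is the dataset name property and accepts ${...} macros that the runtime arguments above would resolve (for example ${runtimeSource} to mockRuntimeSourceDataset); with macros present, the datasets would only be created when the run starts.

// Hypothetical sketch: macros in the dataset name defer dataset creation to runtime,
// when ${runtimeSource} and ${runtime}${sink} are resolved from the runtime arguments.
ETLBatchConfig macroConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "${runtimeSource}")))
    .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "${runtime}${sink}")))
    .addConnection("source", "sink")
    .build();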

Example 40 with ETLBatchConfig

use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

the class DataPipelineTest method testSequentialAggregators.

private void testSequentialAggregators(Engine engine) throws Exception {
    String sourceName = "linearAggInput-" + engine.name();
    String sinkName = "linearAggOutput-" + engine.name();
    /*
     * source --> filter1 --> aggregator1 --> aggregator2 --> filter2 --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .setEngine(engine)
        .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "bob")))
        .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "jane")))
        .addStage(new ETLStage("aggregator1", IdentityAggregator.getPlugin()))
        .addStage(new ETLStage("aggregator2", IdentityAggregator.getPlugin()))
        .addConnection("source", "filter1")
        .addConnection("filter1", "aggregator1")
        .addConnection("aggregator1", "aggregator2")
        .addConnection("aggregator2", "filter2")
        .addConnection("filter2", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("LinearAggApp-" + engine);
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
    // write one record to each source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // check output
    DataSetManager<Table> sinkManager = getDataset(sinkName);
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(3, appId, "source.records.out");
    validateMetric(3, appId, "filter1.records.in");
    validateMetric(2, appId, "filter1.records.out");
    validateMetric(2, appId, "aggregator1.records.in");
    validateMetric(2, appId, "aggregator1.records.out");
    validateMetric(2, appId, "aggregator2.records.in");
    validateMetric(2, appId, "aggregator2.records.out");
    validateMetric(2, appId, "filter2.records.in");
    validateMetric(1, appId, "filter2.records.out");
    validateMetric(1, appId, "sink.records.out");
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId)
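
testSequentialAggregators takes the engine as a parameter, so a test class would typically invoke it once per engine; a minimal sketch of such wrappers (the @Test method names are assumptions).

@Test
public void testSequentialAggregatorsMR() throws Exception {
    testSequentialAggregators(Engine.MAPREDUCE);
}

@Test
public void testSequentialAggregatorsSpark() throws Exception {
    testSequentialAggregators(Engine.SPARK);
}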

Aggregations

ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 47
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 46
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 32
ApplicationManager (co.cask.cdap.test.ApplicationManager): 30
WorkflowManager (co.cask.cdap.test.WorkflowManager): 30
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 27
Test (org.junit.Test): 27
Table (co.cask.cdap.api.dataset.table.Table): 26
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 25
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 24
Schema (co.cask.cdap.api.data.schema.Schema): 22
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 9
ArrayList (java.util.ArrayList): 7
HashMap (java.util.HashMap): 7
HashSet (java.util.HashSet): 6
BatchPipelineSpec (co.cask.cdap.etl.batch.BatchPipelineSpec): 4
Resources (co.cask.cdap.api.Resources): 2
FileSet (co.cask.cdap.api.dataset.lib.FileSet): 2
PreviewManager (co.cask.cdap.app.preview.PreviewManager): 2
PreviewRunner (co.cask.cdap.app.preview.PreviewRunner): 2