Example 26 with ETLPlugin

use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project hydrator-plugins by cdapio.

From the class NormalizeTest, method deployApplication:

private ApplicationManager deployApplication(Map<String, String> sourceProperties, String inputDatasetName, String outputDatasetName, String applicationName) throws Exception {
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputDatasetName));
    ETLStage transform = new ETLStage("normalize", new ETLPlugin("Normalize", Transform.PLUGIN_TYPE, sourceProperties, null));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(transform)
        .addStage(sink)
        .addConnection(source.getName(), transform.getName())
        .addConnection(transform.getName(), sink.getName())
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(BATCH_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(applicationName);
    return deployApplication(appId, appRequest);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest)
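
For context, a call to this helper might look like the sketch below. The Normalize property keys (fieldMapping, fieldNormalizing, outputSchema) and the values shown are assumptions based on the Normalize plugin's documented configuration, not taken from this page.

@Test
public void testNormalizeDeploy() throws Exception {
    // Hypothetical property values; the real NormalizeTest builds these from its own constants.
    Map<String, String> properties = ImmutableMap.of(
        "fieldMapping", "CustomerId:ID,PurchaseDate:Date",
        "fieldNormalizing", "ItemId:AttributeType:AttributeValue,ItemCost:AttributeType:AttributeValue",
        "outputSchema", OUTPUT_SCHEMA.toString()); // OUTPUT_SCHEMA is an assumed schema constant
    ApplicationManager appManager =
        deployApplication(properties, "normalizeInput", "normalizeOutput", "NormalizeApp");
    // From here the test would write records to "normalizeInput" through MockSource,
    // run the workflow, and verify "normalizeOutput" through MockSink.
}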

Example 27 with ETLPlugin

use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project hydrator-plugins by cdapio.

From the class SparkPluginTest, method testHttpStreamingSource:

@Test
public void testHttpStreamingSource() throws Exception {
    Assert.assertEquals(200, resetFeeds());
    final String content = "samuel jackson\ndwayne johnson\nchristopher walken";
    Assert.assertEquals(200, writeFeed("people", content));
    Map<String, String> properties = ImmutableMap.of(
        "referenceName", "peopleFeed",
        "url", httpBase + "/feeds/people",
        "interval", "1");
    DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", new ETLPlugin("HTTPPoller", StreamingSource.PLUGIN_TYPE, properties, null)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("httpOutput")))
        .addConnection("source", "sink")
        .setBatchInterval("1s")
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("HTTPSourceApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);
    final DataSetManager<Table> outputManager = getDataset("httpOutput");
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Set<String> contents = new HashSet<>();
        for (StructuredRecord record : MockSink.readOutput(outputManager)) {
            contents.add(record.get("body"));
        }
        return contents.size() == 1 && contents.contains(content);
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)
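
The resetFeeds and writeFeed helpers are elided on this page; a minimal sketch of what a feed writer might look like follows, using only java.net.HttpURLConnection. The endpoint shape and HTTP method are assumptions, not the actual SparkPluginTest implementation.

// Assumed imports: java.io.IOException, java.io.OutputStream, java.net.HttpURLConnection,
// java.net.URL, java.nio.charset.StandardCharsets
private int writeFeed(String feedId, String content) throws IOException {
    URL url = new URL(httpBase + "/feeds/" + feedId);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("PUT");
    conn.setDoOutput(true);
    try (OutputStream os = conn.getOutputStream()) {
        os.write(content.getBytes(StandardCharsets.UTF_8));
    }
    int responseCode = conn.getResponseCode(); // the test asserts this is 200
    conn.disconnect();
    return responseCode;
}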

Example 28 with ETLPlugin

use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project hydrator-plugins by cdapio.

From the class SparkPluginTest, method testFileSource:

@Test
public void testFileSource() throws Exception {
    Schema schema = Schema.recordOf(
        "user",
        Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
    File folder = tmpFolder.newFolder("fileSourceTest");
    File input1 = new File(folder, "input1.txt");
    File input2 = new File(folder, "input2.csv");
    File ignore1 = new File(folder, "input1.txt.done");
    File ignore2 = new File(folder, "input1");
    CharStreams.write("1,samuel,jackson\n2,dwayne,johnson", Files.newWriterSupplier(input1, Charsets.UTF_8));
    CharStreams.write("3,christopher,walken", Files.newWriterSupplier(input2, Charsets.UTF_8));
    CharStreams.write("0,nicolas,cage", Files.newWriterSupplier(ignore1, Charsets.UTF_8));
    CharStreams.write("0,orlando,bloom", Files.newWriterSupplier(ignore2, Charsets.UTF_8));
    Map<String, String> properties = ImmutableMap.<String, String>builder()
        .put("path", folder.getAbsolutePath())
        .put("format", "csv")
        .put("schema", schema.toString())
        .put("referenceName", "fileSourceTestInput")
        .put("ignoreThreshold", "300")
        .put("extensions", "txt,csv")
        .build();
    DataStreamsConfig pipelineCfg = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", new ETLPlugin("File", StreamingSource.PLUGIN_TYPE, properties, null)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("fileOutput")))
        .addConnection("source", "sink")
        .setBatchInterval("1s")
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineCfg);
    ApplicationId appId = NamespaceId.DEFAULT.app("FileSourceApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 1, TimeUnit.MINUTES);
    Map<Long, String> expected = ImmutableMap.of(1L, "samuel jackson", 2L, "dwayne johnson", 3L, "christopher walken");
    final DataSetManager<Table> outputManager = getDataset("fileOutput");
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Map<Long, String> actual = new HashMap<>();
        for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
            actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
        }
        return expected.equals(actual);
    }, 4, TimeUnit.MINUTES);
    // now write a new file to make sure new files are picked up.
    File input3 = new File(folder, "input3.txt");
    CharStreams.write("4,terry,crews\n5,rocky,balboa", Files.newWriterSupplier(input3, Charsets.UTF_8));
    Map<Long, String> expected2 = ImmutableMap.of(4L, "terry crews", 5L, "rocky balboa");
    Table outputTable = outputManager.get();
    Scanner scanner = outputTable.scan(null, null);
    Row row;
    while ((row = scanner.next()) != null) {
        outputTable.delete(row.getRow());
    }
    outputManager.flush();
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Map<Long, String> actual = new HashMap<>();
        for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
            actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
        }
        return expected2.equals(actual);
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
}
Also used : Scanner(io.cdap.cdap.api.dataset.table.Scanner) ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Row(io.cdap.cdap.api.dataset.table.Row) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) File(java.io.File) Test(org.junit.Test)
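
The test relies on the source's "extensions" property to skip input1.txt.done and the extension-less input1. Below is a minimal sketch of that filtering rule, as implied by the property value "txt,csv"; it mirrors the observed behavior, not the plugin's actual code.

// Assumed imports: java.io.File, java.util.Set, java.util.function.Predicate,
// com.google.common.collect.ImmutableSet
Set<String> allowedExtensions = ImmutableSet.of("txt", "csv");
Predicate<File> accepted = f -> {
    String name = f.getName();
    int dot = name.lastIndexOf('.');
    return dot >= 0 && allowedExtensions.contains(name.substring(dot + 1));
};
// accepted.test(new File("input1.txt"))      -> true  (read by the source)
// accepted.test(new File("input1.txt.done")) -> false ("done" is not an allowed extension)
// accepted.test(new File("input1"))          -> false (no extension at all)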

Example 29 with ETLPlugin

use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.

From the class PipelineSpecGenerator, method configureStage:

/**
 * Configures a stage and returns the spec for it.
 *
 * @param stage the user provided configuration for the stage
 * @param validatedPipeline the validated pipeline config
 * @param pluginConfigurer configurer used to configure the stage
 * @return the spec for the stage
 * @throws ValidationException if the plugin threw an exception during configuration
 */
protected ConfiguredStage configureStage(ETLStage stage, ValidatedPipeline validatedPipeline, DefaultPipelineConfigurer pluginConfigurer) throws ValidationException {
    String stageName = stage.getName();
    ETLPlugin stagePlugin = stage.getPlugin();
    StageSpec.Builder specBuilder = configureStage(stageName, stagePlugin, pluginConfigurer);
    DefaultStageConfigurer stageConfigurer = pluginConfigurer.getStageConfigurer();
    String pluginType = stage.getPlugin().getType();
    if (pluginType.equals(SplitterTransform.PLUGIN_TYPE)) {
        Map<String, Schema> outputPortSchemas = stageConfigurer.getOutputPortSchemas();
        for (Map.Entry<String, String> outputEntry : validatedPipeline.getOutputPorts(stageName).entrySet()) {
            String outputStage = outputEntry.getKey();
            String outputPort = outputEntry.getValue();
            if (outputPort == null) {
                throw new IllegalArgumentException(String.format("Connection from Splitter '%s' to '%s' must specify a port.", stageName, outputStage));
            }
            specBuilder.addOutput(outputStage, outputPort, outputPortSchemas.get(outputPort));
        }
    } else {
        Schema outputSchema = stageConfigurer.getOutputSchema();
        if (Condition.PLUGIN_TYPE.equals(pluginType)) {
            // Condition stages pass records through unchanged, so every non-null input
            // schema must be the same; that common schema becomes the output schema.
            outputSchema = null;
            for (Schema schema : stageConfigurer.getInputSchemas().values()) {
                if (schema != null) {
                    // todo: fix this cleanly and fully
                    if (outputSchema != null && !Schemas.equalsIgnoringRecordName(outputSchema, schema)) {
                        throw new IllegalArgumentException("Cannot have different input schemas going into stage " + stageName);
                    }
                    outputSchema = schema;
                }
            }
        }
        for (String outputStage : validatedPipeline.getOutputs(stageName)) {
            specBuilder.addOutput(outputStage, null, outputSchema);
        }
    }
    StageSpec stageSpec = specBuilder
        .setProcessTimingEnabled(validatedPipeline.isProcessTimingEnabled())
        .setStageLoggingEnabled(validatedPipeline.isStageLoggingEnabled())
        .setMaxPreviewRecords(validatedPipeline.getMaxPreviewRecords())
        .build();
    return new ConfiguredStage(stageSpec, pluginConfigurer.getPipelineProperties());
}
Also used : StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer) Map(java.util.Map) HashMap(java.util.HashMap)
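
The splitter branch above requires every outgoing connection to name a port. A hedged sketch of a pipeline config that would satisfy that check follows; the plugin name, port labels, and "field" property are illustrative, and the three-argument addConnection(from, to, port) overload is assumed to be the builder method this code path expects.

ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source", MockSource.getPlugin("splitInput")))
    // "NullFieldSplitter" and its "null"/"non-null" ports are assumed names for illustration.
    .addStage(new ETLStage("splitter", new ETLPlugin("NullFieldSplitter", SplitterTransform.PLUGIN_TYPE,
                                                     ImmutableMap.of("field", "email"), null)))
    .addStage(new ETLStage("nullSink", MockSink.getPlugin("nullOutput")))
    .addStage(new ETLStage("nonNullSink", MockSink.getPlugin("nonNullOutput")))
    .addConnection("source", "splitter")
    .addConnection("splitter", "nullSink", "null")        // a port is named on each splitter connection;
    .addConnection("splitter", "nonNullSink", "non-null") // otherwise configureStage throws
    .build();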

Example 30 with ETLPlugin

use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.

From the class PipelineTest, method testTextFileSinkAndDeletePostAction:

@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
    // create the pipeline config
    String inputName = "sinkTestInput";
    String outputName = "sinkTestOutput";
    String outputDirName = "users";
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
    sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
    sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
    ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
    // MapReduce writes multiple files to the output directory. Along with the
    // actual output there are .crc checksum files and a _SUCCESS marker that
    // contain none of the output content; the post action below deletes both.
    actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
    actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
    ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addPostAction(postAction)
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some data to the input fileset
    Schema inputSchema = Schema.recordOf(
        "test",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
    Map<String, String> users = new HashMap<>();
    users.put("samuel", "wallet");
    users.put("dwayne", "rock");
    users.put("christopher", "cowbell");
    List<StructuredRecord> inputRecords = new ArrayList<>();
    for (Map.Entry<String, String> userEntry : users.entrySet()) {
        String name = userEntry.getKey();
        String item = userEntry.getValue();
        inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
    }
    DataSetManager<Table> inputManager = getDataset(inputName);
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${dir} macro will be substituted with "users" for our pipeline run
    runtimeArgs.put("dir", outputDirName);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<FileSet> outputManager = getDataset(outputName);
    FileSet output = outputManager.get();
    Location outputDir = output.getBaseLocation().append(outputDirName);
    Map<String, String> actual = new HashMap<>();
    for (Location outputFile : outputDir.list()) {
        if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
            Assert.fail("Post action did not delete file " + outputFile.getName());
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\\|");
                actual.put(parts[0], parts[1]);
            }
        }
    }
    Assert.assertEquals(users, actual);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Table(io.cdap.cdap.api.dataset.table.Table) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) BufferedReader(java.io.BufferedReader) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashMap(java.util.HashMap) Map(java.util.Map) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
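
As a quick sanity check on the cleanup regex used above (plain java.util.regex, nothing CDAP-specific is assumed):

Pattern deleteRegex = Pattern.compile(".*\\.crc|_SUCCESS");
Assert.assertTrue(deleteRegex.matcher("part-m-00000.crc").matches());  // checksum files are deleted
Assert.assertTrue(deleteRegex.matcher("_SUCCESS").matches());          // the success marker is deleted
Assert.assertFalse(deleteRegex.matcher("part-m-00000").matches());     // real output files are kept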

Aggregations

ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 154
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 144
Test (org.junit.Test): 136
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 102
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 101
Table (io.cdap.cdap.api.dataset.table.Table): 79
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 77
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 70
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 59
HashMap (java.util.HashMap): 57
Schema (io.cdap.cdap.api.data.schema.Schema): 55
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 53
HashSet (java.util.HashSet): 37
ImmutableMap (com.google.common.collect.ImmutableMap): 36
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 35
ArrayList (java.util.ArrayList): 34
StageValidationResponse (io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse): 20
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 18
StageValidationRequest (io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest): 18
File (java.io.File): 17