
Example 11 with ETLConfig

Use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From the class DataPipelineTest, method testMacrosSparkPipeline.

/**
   * Tests that macros provided as runtime arguments are substituted in a Spark pipeline.
   */
@Test
public void testMacrosSparkPipeline() throws Exception {
    /*
     * Trivial Spark pipeline from batch source to batch sink.
     *
     * source --------- sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .setEngine(Engine.SPARK)
        .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("sparkinput", "${runtime${source}}")))
        .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("sparkoutput", "${runtime}${sink}")))
        .addConnection("source", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    // set runtime arguments for macro substitution
    Map<String, String> runtimeArguments = ImmutableMap.of(
        "runtime", "mockRuntime",
        "sink", "SparkSinkDataset",
        "source", "Source",
        "runtimeSource", "mockRuntimeSparkSourceDataset");
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.setRuntimeArgs(runtimeArguments);
    // make sure the datasets don't exist beforehand
    Assert.assertNull(getDataset("mockRuntimeSparkSourceDataset").get());
    Assert.assertNull(getDataset("mockRuntimeSparkSinkDataset").get());
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // now the datasets should exist
    Assert.assertNotNull(getDataset("mockRuntimeSparkSourceDataset").get());
    Assert.assertNotNull(getDataset("mockRuntimeSparkSinkDataset").get());
}
Also used: ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig), ApplicationManager (co.cask.cdap.test.ApplicationManager), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), WorkflowManager (co.cask.cdap.test.WorkflowManager), ApplicationId (co.cask.cdap.proto.id.ApplicationId), AppRequest (co.cask.cdap.proto.artifact.AppRequest), Test (org.junit.Test)
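The nested macro in this test is worth unpacking: substitution proceeds innermost-first, so "${runtime${source}}" first resolves ${source} to "Source", yielding ${runtimeSource}, which then resolves to "mockRuntimeSparkSourceDataset". A minimal standalone sketch of that expansion order (plain Java; this is an illustration, not CDAP's actual MacroEvaluator, and expand is a hypothetical helper):

import java.util.Map;

public class MacroExpansionSketch {
    /** Expands ${...} macros innermost-first, mirroring the order the test relies on. */
    static String expand(String value, Map<String, String> args) {
        // the last "${" in the string always opens an innermost macro
        int start = value.lastIndexOf("${");
        if (start < 0) {
            return value;
        }
        int end = value.indexOf('}', start);
        String key = value.substring(start + 2, end);
        String expanded = value.substring(0, start) + args.get(key) + value.substring(end + 1);
        return expand(expanded, args);
    }

    public static void main(String[] args) {
        Map<String, String> runtimeArgs = Map.of(
            "runtime", "mockRuntime",
            "sink", "SparkSinkDataset",
            "source", "Source",
            "runtimeSource", "mockRuntimeSparkSourceDataset");
        // prints mockRuntimeSparkSourceDataset
        System.out.println(expand("${runtime${source}}", runtimeArgs));
        // prints mockRuntimeSparkSinkDataset
        System.out.println(expand("${runtime}${sink}", runtimeArgs));
    }
}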

Example 12 with ETLConfig

Use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From the class DataPipelineTest, method testSecureStorePipeline.

/**
   * Tests the secure store macro function in a pipeline by creating datasets from the secure store data.
   */
private void testSecureStorePipeline(Engine engine, String prefix) throws Exception {
    /*
     * Trivial pipeline from batch source to batch sink.
     *
     * source --------- sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("input", "${secure(" + prefix + "source)}")))
        .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("output", "${secure(" + prefix + "sink)}")))
        .addConnection("source", "sink")
        .setEngine(engine)
        .build();
    // place dataset names into secure storage
    getSecureStoreManager().putSecureData("default", prefix + "source", prefix + "MockSecureSourceDataset",
        "secure source dataset name", new HashMap<String, String>());
    getSecureStoreManager().putSecureData("default", prefix + "sink", prefix + "MockSecureSinkDataset",
        "secure dataset name", new HashMap<String, String>());
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("App-" + engine);
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    // make sure the datasets don't exist beforehand
    Assert.assertNull(getDataset(prefix + "MockSecureSourceDataset").get());
    Assert.assertNull(getDataset(prefix + "MockSecureSinkDataset").get());
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // now the datasets should exist
    Assert.assertNotNull(getDataset(prefix + "MockSecureSourceDataset").get());
    Assert.assertNotNull(getDataset(prefix + "MockSecureSinkDataset").get());
}
Also used: ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig), ApplicationManager (co.cask.cdap.test.ApplicationManager), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), WorkflowManager (co.cask.cdap.test.WorkflowManager), ApplicationId (co.cask.cdap.proto.id.ApplicationId), AppRequest (co.cask.cdap.proto.artifact.AppRequest)
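The ${secure(key)} macro defers the dataset name to the secure store entry written by putSecureData, so the pipeline config never contains the name itself. A rough illustration of the lookup, with a Map standing in for the secure store (resolve is a hypothetical helper, not CDAP's implementation, and the "mr" prefix is an invented stand-in for the prefix argument):

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SecureMacroSketch {
    private static final Pattern SECURE = Pattern.compile("\\$\\{secure\\(([^)]+)\\)\\}");

    /** Replaces each ${secure(key)} with the value stored under that key. */
    static String resolve(String value, Map<String, String> secureStore) {
        Matcher m = SECURE.matcher(value);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(sb, Matcher.quoteReplacement(secureStore.get(m.group(1))));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    public static void main(String[] args) {
        // mirrors putSecureData("default", prefix + "source", prefix + "MockSecureSourceDataset", ...)
        Map<String, String> store = new HashMap<>();
        store.put("mrsource", "mrMockSecureSourceDataset");
        // prints mrMockSecureSourceDataset
        System.out.println(resolve("${secure(mrsource)}", store));
    }
}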

Example 13 with ETLConfig

Use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From the class DataPipelineTest, method testSinglePhase.

@Test
public void testSinglePhase() throws Exception {
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    /*
     * source --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MockSource.getPlugin("singleInput", schema)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("singleOutput")))
        .addConnection("source", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SinglePhaseApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    // write records to source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("singleInput"));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // check sink
    DataSetManager<Table> sinkManager = getDataset("singleOutput");
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(2, appId, "source.records.out");
    validateMetric(2, appId, "sink.records.in");
}
Also used: ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig), ApplicationManager (co.cask.cdap.test.ApplicationManager), KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable), Table (co.cask.cdap.api.dataset.table.Table), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), Schema (co.cask.cdap.api.data.schema.Schema), WorkflowManager (co.cask.cdap.test.WorkflowManager), ApplicationId (co.cask.cdap.proto.id.ApplicationId), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), AppRequest (co.cask.cdap.proto.artifact.AppRequest), Test (org.junit.Test)
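This linear source-to-sink shape recurs throughout the suite, so the builder calls could plausibly be factored into a helper. A hedged sketch (buildLinearConfig is a hypothetical name, not part of DataPipelineTest; it uses only builder methods already shown in the examples above):

// Hypothetical helper; composes the same linear topology used by testSinglePhase.
private ETLBatchConfig buildLinearConfig(Engine engine, String inputName, String outputName, Schema schema) {
    return ETLBatchConfig.builder("* * * * *")
        .setEngine(engine)
        .addStage(new ETLStage("source", MockSource.getPlugin(inputName, schema)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(outputName)))
        .addConnection("source", "sink")
        .build();
}

With such a helper, the config above would be buildLinearConfig(engine, "singleInput", "singleOutput", schema), except that testSinglePhase leaves the engine at its default rather than calling setEngine.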

Example 14 with ETLConfig

Use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From the class DataPipelineTest, method testExternalDatasetTracking.

private void testExternalDatasetTracking(Engine engine, boolean backwardsCompatible) throws Exception {
    String suffix = engine.name() + (backwardsCompatible ? "-bc" : "");
    // Define input/output datasets
    String expectedExternalDatasetInput = "fileInput-" + suffix;
    String expectedExternalDatasetOutput = "fileOutput-" + suffix;
    // Define input/output directories
    File inputDir = TMP_FOLDER.newFolder("input-" + suffix);
    String inputFile = "input-file1.txt";
    File outputDir = TMP_FOLDER.newFolder("output-" + suffix);
    File outputSubDir1 = new File(outputDir, "subdir1");
    File outputSubDir2 = new File(outputDir, "subdir2");
    if (!backwardsCompatible) {
        // Assert that there are no external datasets
        Assert.assertNull(getDataset(NamespaceId.DEFAULT.dataset(expectedExternalDatasetInput)).get());
        Assert.assertNull(getDataset(NamespaceId.DEFAULT.dataset(expectedExternalDatasetOutput)).get());
    }
    ETLBatchConfig.Builder builder = ETLBatchConfig.builder("* * * * *");
    ETLBatchConfig etlConfig = builder
        .setEngine(engine)
        .addStage(new ETLStage("source", MockExternalSource.getPlugin(expectedExternalDatasetInput, inputDir.getAbsolutePath())))
        .addStage(new ETLStage("sink1", MockExternalSink.getPlugin(
            backwardsCompatible ? null : expectedExternalDatasetOutput, "dir1", outputSubDir1.getAbsolutePath())))
        .addStage(new ETLStage("sink2", MockExternalSink.getPlugin(
            backwardsCompatible ? null : expectedExternalDatasetOutput, "dir2", outputSubDir2.getAbsolutePath())))
        .addConnection("source", "sink1")
        .addConnection("source", "sink2")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ExternalDatasetApp-" + suffix);
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
    ImmutableList<StructuredRecord> allInput = ImmutableList.of(recordSamuel, recordBob, recordJane);
    // Create input files
    MockExternalSource.writeInput(new File(inputDir, inputFile).getAbsolutePath(), allInput);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    List<RunRecord> history = workflowManager.getHistory();
    // there should be only one completed run
    Assert.assertEquals(1, history.size());
    Assert.assertEquals(ProgramRunStatus.COMPLETED, history.get(0).getStatus());
    // Assert output
    Assert.assertEquals(allInput, MockExternalSink.readOutput(outputSubDir1.getAbsolutePath()));
    Assert.assertEquals(allInput, MockExternalSink.readOutput(outputSubDir2.getAbsolutePath()));
    if (!backwardsCompatible) {
        // Assert that external datasets got created
        Assert.assertNotNull(getDataset(NamespaceId.DEFAULT.dataset(expectedExternalDatasetInput)).get());
        Assert.assertNotNull(getDataset(NamespaceId.DEFAULT.dataset(expectedExternalDatasetOutput)).get());
    }
}
Also used: ApplicationManager (co.cask.cdap.test.ApplicationManager), Schema (co.cask.cdap.api.data.schema.Schema), WorkflowManager (co.cask.cdap.test.WorkflowManager), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), AppRequest (co.cask.cdap.proto.artifact.AppRequest), ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig), RunRecord (co.cask.cdap.proto.RunRecord), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), ApplicationId (co.cask.cdap.proto.id.ApplicationId), File (java.io.File)
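Since testExternalDatasetTracking is private and parameterized by engine and compatibility mode, the suite presumably drives it from @Test wrappers along these lines (hypothetical names; the actual wrappers in DataPipelineTest may differ, though Engine.MAPREDUCE and Engine.SPARK are the two engine values):

@Test
public void testExternalDatasetTrackingMapReduce() throws Exception {
    testExternalDatasetTracking(Engine.MAPREDUCE, false);
}

@Test
public void testBackwardsCompatibleExternalDatasetTrackingSpark() throws Exception {
    // backwardsCompatible = true passes a null dataset name to the sinks,
    // so no external dataset gets registered and the final assertions are skipped
    testExternalDatasetTracking(Engine.SPARK, true);
}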

Example 15 with ETLConfig

Use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From the class DataPipelineTest, method testServiceUrl.

public void testServiceUrl(Engine engine) throws Exception {
    // Deploy the ServiceApp application
    ApplicationManager appManager = deployApplication(ServiceApp.class);
    // Start Greeting service and use it
    ServiceManager serviceManager = appManager.getServiceManager(ServiceApp.Name.SERVICE_NAME).start();
    // Wait for the service to start up
    serviceManager.waitForStatus(true);
    URL url = new URL(serviceManager.getServiceURL(), "name");
    HttpRequest httpRequest = HttpRequest.post(url).withBody("bob").build();
    HttpResponse httpResponse = HttpRequests.execute(httpRequest);
    Assert.assertEquals(HttpURLConnection.HTTP_OK, httpResponse.getResponseCode());
    url = new URL(serviceManager.getServiceURL(), "name/bob");
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
    String response;
    try {
        response = new String(ByteStreams.toByteArray(connection.getInputStream()), Charsets.UTF_8);
    } finally {
        connection.disconnect();
    }
    Assert.assertEquals("bob", response);
    String sourceName = "ServiceUrlInput-" + engine.name();
    String sinkName = "ServiceUrlOutput-" + engine.name();
    /*
     * source --> filter --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .setEngine(engine)
        .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
        .addStage(new ETLStage("filter", FilterTransform.getPlugin("name")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addConnection("source", "filter")
        .addConnection("filter", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ServiceUrl-" + engine);
    appManager = deployApplication(appId.toId(), appRequest);
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
    // write the records to the source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // check output
    DataSetManager<Table> sinkManager = getDataset(sinkName);
    Set<StructuredRecord> expected = ImmutableSet.of(recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    serviceManager.stop();
    serviceManager.waitForRun(ProgramRunStatus.KILLED, 180, TimeUnit.SECONDS);
}
Also used: HttpRequest (co.cask.common.http.HttpRequest), ApplicationManager (co.cask.cdap.test.ApplicationManager), KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable), Table (co.cask.cdap.api.dataset.table.Table), Schema (co.cask.cdap.api.data.schema.Schema), WorkflowManager (co.cask.cdap.test.WorkflowManager), HttpResponse (co.cask.common.http.HttpResponse), URL (java.net.URL), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), AppRequest (co.cask.cdap.proto.artifact.AppRequest), ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig), HttpURLConnection (java.net.HttpURLConnection), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), ServiceManager (co.cask.cdap.test.ServiceManager), ApplicationId (co.cask.cdap.proto.id.ApplicationId)
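testServiceUrl is likewise engine-parameterized (note its missing @Test annotation), so per-engine entry points in this spirit would exercise both execution paths (hypothetical wrapper names; the real suite may wire this differently):

@Test
public void testServiceUrlMapReduce() throws Exception {
    testServiceUrl(Engine.MAPREDUCE);
}

@Test
public void testServiceUrlSpark() throws Exception {
    testServiceUrl(Engine.SPARK);
}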

Aggregations

ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 50
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 37
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 35
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 35
ApplicationManager (co.cask.cdap.test.ApplicationManager): 32
Test (org.junit.Test): 31
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 27
Schema (co.cask.cdap.api.data.schema.Schema): 26
WorkflowManager (co.cask.cdap.test.WorkflowManager): 24
Table (co.cask.cdap.api.dataset.table.Table): 23
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 19
ArrayList (java.util.ArrayList): 9
HashSet (java.util.HashSet): 6
DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig): 5
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 5
File (java.io.File): 5
TimeoutException (java.util.concurrent.TimeoutException): 5
ETLRealtimeConfig (co.cask.cdap.etl.proto.v2.ETLRealtimeConfig): 4
WorkerManager (co.cask.cdap.test.WorkerManager): 4
PreviewManager (co.cask.cdap.app.preview.PreviewManager): 3