Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
From the class DataStreamsTest, the method testWindower:
@Test
public void testWindower() throws Exception {
/*
* source --> window(width=30,interval=1) --> aggregator --> filter --> sink
*/
Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(
  StructuredRecord.builder(schema).set("x", "abc").build(),
  StructuredRecord.builder(schema).set("x", "abc").build(),
  StructuredRecord.builder(schema).set("x", "abc").build());
String sinkName = "windowOut";
// source sleeps 1 second between outputs
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
  .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
  .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
  .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("source", "window")
  .addConnection("window", "agg")
  .addConnection("agg", "filter")
  .addConnection("filter", "sink")
  .setBatchInterval("1s")
  .setCheckpointDir(checkpointDir)
  .build();
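// deploy the pipeline as a CDAP application and start the streaming Spark program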
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
// the sink should contain at least one record with count of 3, and no records with more than 3.
// less than 3 if the window doesn't contain all 3 records yet, but there should eventually be a window
// that contains all 3.
final DataSetManager<Table> outputManager = getDataset(sinkName);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
outputManager.flush();
boolean sawThree = false;
for (StructuredRecord record : MockSink.readOutput(outputManager)) {
long count = record.get("ct");
if (count == 3L) {
sawThree = true;
}
Assert.assertTrue(count <= 3L);
}
return sawThree;
}
}, 2, TimeUnit.MINUTES);
sparkManager.stop();
}
Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
From the class DataStreamsTest, the method testAutoJoin:
@Test
public void testAutoJoin() throws Exception {
/*
* customers ----------|
* |
* |---> join ---> sink
* |
* transactions -------|
*/
Schema inputSchema1 = Schema.recordOf("customer",
  Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("transaction",
  Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
Schema outSchema = Schema.recordOf("customers.transactions",
  Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("transactions_customer_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord tx1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
StructuredRecord tx2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
StructuredRecord tx3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("customer_id", "4").set("item_id", "33").build();
List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
List<StructuredRecord> input2 = ImmutableList.of(tx1, tx2, tx3);
String outputName = UUID.randomUUID().toString();
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1)))
  .addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2)))
  .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(
    Arrays.asList("customers", "transactions"),
    Collections.singletonList("customer_id"),
    Collections.singletonList("transactions"),
    Collections.emptyList(), Collections.emptyList(), true)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(outputName)))
  .addConnection("customers", "join")
  .addConnection("transactions", "join")
  .addConnection("join", "sink")
  .setBatchInterval("5s")
  .setCheckpointDir(checkpointDir)
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("AutoJoinerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
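// only "transactions" is a required side of the join, so tx3 (customer_id 4, with no matching customer)
// is expected in the output with its customer fields left null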
StructuredRecord join1 = StructuredRecord.builder(outSchema)
  .set("customers_customer_id", "1").set("customers_customer_name", "samuel")
  .set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
StructuredRecord join2 = StructuredRecord.builder(outSchema)
  .set("customers_customer_id", "3").set("customers_customer_name", "jane")
  .set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
StructuredRecord join3 = StructuredRecord.builder(outSchema)
  .set("transactions_t_id", "3").set("transactions_customer_id", "4").set("transactions_item_id", "33").build();
Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
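// wait until the sink contains exactly the expected set of joined records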
DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, () -> {
outputManager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
return expected.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
From the class DataStreamsTest, the method testStageConsolidation:
@Test
public void testStageConsolidation() throws Exception {
/*
|non-null port --> sink1
items1 --> null splitter --|
|null port--> sink2
^
|--> filter out id == 0 ---------|
items2 --| |
| |--> error collector --> sink3
|
| |--> filter out id == 1 --> sink4
|--> aggregator --|
|--> filter out id == 2 --> sink5
*/
Schema schema = Schema.recordOf("item", Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))));
StructuredRecord itemNull = StructuredRecord.builder(schema).build();
StructuredRecord item0 = StructuredRecord.builder(schema).set("id", 0).build();
StructuredRecord item1 = StructuredRecord.builder(schema).set("id", 1).build();
StructuredRecord item2 = StructuredRecord.builder(schema).set("id", 2).build();
StructuredRecord item3 = StructuredRecord.builder(schema).set("id", 3).build();
List<StructuredRecord> input1Records = Arrays.asList(itemNull, item3);
List<StructuredRecord> input2Records = Arrays.asList(item0, item1, item2);
File outputDir = TMP_FOLDER.newFolder();
String output1 = new File(outputDir, "output1").getAbsolutePath();
String output2 = new File(outputDir, "output2").getAbsolutePath();
String output3 = new File(outputDir, "output3").getAbsolutePath();
String output4 = new File(outputDir, "output4").getAbsolutePath();
String output5 = new File(outputDir, "output5").getAbsolutePath();
DataStreamsConfig config = DataStreamsConfig.builder()
  .addStage(new ETLStage("items1", MockSource.getPlugin(schema, input1Records)))
  .addStage(new ETLStage("items2", MockSource.getPlugin(schema, input2Records)))
  .addStage(new ETLStage("nullSplitter", NullFieldSplitterTransform.getPlugin("id")))
  .addStage(new ETLStage("filter0", IntValueFilterTransform.getPlugin("id", 0)))
  .addStage(new ETLStage("collector", FlattenErrorTransform.getPlugin()))
  .addStage(new ETLStage("filter1", IntValueFilterTransform.getPlugin("id", 1)))
  .addStage(new ETLStage("filter2", IntValueFilterTransform.getPlugin("id", 2)))
  .addStage(new ETLStage("identityAggregator", IdentityAggregator.getPlugin()))
  .addStage(new ETLStage("sink1", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "s1", output1)))
  .addStage(new ETLStage("sink2", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "s2", output2)))
  .addStage(new ETLStage("sink3", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "s3", output3)))
  .addStage(new ETLStage("sink4", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "s4", output4)))
  .addStage(new ETLStage("sink5", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "s5", output5)))
  .addConnection("items1", "nullSplitter")
  .addConnection("nullSplitter", "sink1", "non-null")
  .addConnection("nullSplitter", "sink2", "null")
  .addConnection("items2", "filter0")
  .addConnection("items2", "identityAggregator")
  .addConnection("filter0", "sink2")
  .addConnection("filter0", "collector")
  .addConnection("identityAggregator", "filter1")
  .addConnection("identityAggregator", "filter2")
  .addConnection("collector", "sink3")
  .addConnection("filter1", "sink4")
  .addConnection("filter2", "sink5")
  .setProperties(Collections.singletonMap(io.cdap.cdap.etl.common.Constants.SPARK_PIPELINE_AUTOCACHE_ENABLE_FLAG, "false"))
  .build();
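// Spark auto-caching is disabled via the pipeline property above; stage consolidation itself
// is enabled through the CONSOLIDATE_STAGES runtime argument passed when the program is started below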
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("StageConsolidationTest");
ApplicationManager appManager = deployApplication(appId, appRequest);
// run pipeline
Map<String, String> args = Collections.singletonMap(io.cdap.cdap.etl.common.Constants.CONSOLIDATE_STAGES, "true");
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.RUNNING, 5, TimeUnit.MINUTES);
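// the error collector flattens errors emitted by filter0 into records carrying the error message,
// error code, and originating stage name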
Schema errorSchema = Schema.recordOf("erroritem",
  Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
  Schema.Field.of("errMsg", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("errCode", Schema.nullableOf(Schema.of(Schema.Type.INT))),
  Schema.Field.of("errStage", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord expectedError = StructuredRecord.builder(errorSchema)
  .set("id", 0)
  .set("errMsg", IntValueFilterTransform.ERROR_MESSAGE)
  .set("errCode", IntValueFilterTransform.ERROR_CODE)
  .set("errStage", "filter0")
  .build();
Set<StructuredRecord> sink1Expected = Collections.singleton(item3);
Set<StructuredRecord> sink2Expected = new HashSet<>(Arrays.asList(itemNull, item1, item2));
Set<StructuredRecord> sink3Expected = Collections.singleton(expectedError);
Set<StructuredRecord> sink4Expected = new HashSet<>(Arrays.asList(item0, item2));
Set<StructuredRecord> sink5Expected = new HashSet<>(Arrays.asList(item0, item1));
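// wait until all five external sinks contain exactly their expected record sets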
Tasks.waitFor(true, () ->
  sink1Expected.equals(new HashSet<>(MockExternalSink.readOutput(output1, schema)))
    && sink2Expected.equals(new HashSet<>(MockExternalSink.readOutput(output2, schema)))
    && sink3Expected.equals(new HashSet<>(MockExternalSink.readOutput(output3, errorSchema)))
    && sink4Expected.equals(new HashSet<>(MockExternalSink.readOutput(output4, schema)))
    && sink5Expected.equals(new HashSet<>(MockExternalSink.readOutput(output5, schema))),
  3, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(1, TimeUnit.MINUTES);
// check output
validateMetric(appId, "nullSplitter.records.in", 2);
validateMetric(appId, "nullSplitter.records.out.null", 1);
validateMetric(appId, "nullSplitter.records.out.non-null", 1);
validateMetric(appId, "filter0.records.out", 2);
validateMetric(appId, "filter0.records.error", 1);
validateMetric(appId, "identityAggregator.records.in", 3);
validateMetric(appId, "identityAggregator.records.out", 3);
}
Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project hydrator-plugins by cdapio.
From the class SparkPluginTest, the method testHttpStreamingSource:
@Test
public void testHttpStreamingSource() throws Exception {
Assert.assertEquals(200, resetFeeds());
final String content = "samuel jackson\ndwayne johnson\nchristopher walken";
Assert.assertEquals(200, writeFeed("people", content));
Map<String, String> properties = ImmutableMap.of(
  "referenceName", "peopleFeed",
  "url", httpBase + "/feeds/people",
  "interval", "1");
DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", new ETLPlugin("HTTPPoller", StreamingSource.PLUGIN_TYPE, properties, null)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("httpOutput")))
  .addConnection("source", "sink")
  .setBatchInterval("1s")
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("HTTPSourceApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForStatus(true, 10, 1);
final DataSetManager<Table> outputManager = getDataset("httpOutput");
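// each poll should produce a single record whose 'body' field holds the entire feed content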
Tasks.waitFor(true, () -> {
outputManager.flush();
Set<String> contents = new HashSet<>();
for (StructuredRecord record : MockSink.readOutput(outputManager)) {
contents.add(record.get("body"));
}
return contents.size() == 1 && contents.contains(content);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
}
Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project hydrator-plugins by cdapio.
From the class SparkPluginTest, the method testFileSource:
@Test
public void testFileSource() throws Exception {
Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("first", Schema.of(Schema.Type.STRING)), Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
File folder = tmpFolder.newFolder("fileSourceTest");
File input1 = new File(folder, "input1.txt");
File input2 = new File(folder, "input2.csv");
File ignore1 = new File(folder, "input1.txt.done");
File ignore2 = new File(folder, "input1");
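// only files with the configured extensions (txt, csv) should be read;
// the .done file and the extension-less file should be ignored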
CharStreams.write("1,samuel,jackson\n2,dwayne,johnson", Files.newWriterSupplier(input1, Charsets.UTF_8));
CharStreams.write("3,christopher,walken", Files.newWriterSupplier(input2, Charsets.UTF_8));
CharStreams.write("0,nicolas,cage", Files.newWriterSupplier(ignore1, Charsets.UTF_8));
CharStreams.write("0,orlando,bloom", Files.newWriterSupplier(ignore2, Charsets.UTF_8));
Map<String, String> properties = ImmutableMap.<String, String>builder()
  .put("path", folder.getAbsolutePath())
  .put("format", "csv")
  .put("schema", schema.toString())
  .put("referenceName", "fileSourceTestInput")
  .put("ignoreThreshold", "300")
  .put("extensions", "txt,csv")
  .build();
DataStreamsConfig pipelineCfg = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", new ETLPlugin("File", StreamingSource.PLUGIN_TYPE, properties, null)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("fileOutput")))
  .addConnection("source", "sink")
  .setBatchInterval("1s")
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineCfg);
ApplicationId appId = NamespaceId.DEFAULT.app("FileSourceApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 1, TimeUnit.MINUTES);
Map<Long, String> expected = ImmutableMap.of(1L, "samuel jackson", 2L, "dwayne johnson", 3L, "christopher walken");
final DataSetManager<Table> outputManager = getDataset("fileOutput");
Tasks.waitFor(true, () -> {
outputManager.flush();
Map<Long, String> actual = new HashMap<>();
for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
}
return expected.equals(actual);
}, 4, TimeUnit.MINUTES);
// now write a new file to make sure new files are picked up.
File input3 = new File(folder, "input3.txt");
CharStreams.write("4,terry,crews\n5,rocky,balboa", Files.newWriterSupplier(input3, Charsets.UTF_8));
Map<Long, String> expected2 = ImmutableMap.of(4L, "terry crews", 5L, "rocky balboa");
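// clear out the records from the first batch so only output from the new file is verified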
Table outputTable = outputManager.get();
Scanner scanner = outputTable.scan(null, null);
Row row;
while ((row = scanner.next()) != null) {
outputTable.delete(row.getRow());
}
outputManager.flush();
Tasks.waitFor(true, () -> {
outputManager.flush();
Map<Long, String> actual = new HashMap<>();
for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
}
return expected2.equals(actual);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
}
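For reference, a minimal sketch of the pattern these tests share: build a DataStreamsConfig, wrap it in an AppRequest, deploy it, and start the streaming Spark program. It assumes the same test fixtures used above (APP_ARTIFACT, MockSource, MockSink, deployApplication); the dataset name "sketchOutput" and the app name "MinimalStreamingApp" are hypothetical, and a real pipeline would substitute production plugins.
// a single-field schema and one input record, mirroring the mock sources above
Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("x", "abc").build());
// build the pipeline: one source and one sink, connected, with a 1-second batch interval
DataStreamsConfig config = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("sketchOutput")))   // hypothetical dataset name
  .addConnection("source", "sink")
  .setBatchInterval("1s")
  .build();
// wrap the config in an AppRequest, deploy it, and start the streaming Spark program
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("MinimalStreamingApp");   // hypothetical app name
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
// ... verify sink contents with MockSink.readOutput(...), then stop the run
sparkManager.stop();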