use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
From the class DataPipelineTest, the method testSinglePhaseWithSparkCompute:
private void testSinglePhaseWithSparkCompute() throws Exception {
/*
* source --> sparkcompute --> sink
*/
String classifiedTextsTable = "classifiedTextTable";
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY_SOURCE, SpamMessage.SCHEMA)))
  .addStage(new ETLStage("sparkcompute",
                         new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE,
                                       ImmutableMap.of("fileSetName", "modelFileSet",
                                                       "path", "output",
                                                       "fieldToClassify", SpamMessage.TEXT_FIELD,
                                                       "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD),
                                       null)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable)))
  .addConnection("source", "sparkcompute")
  .addConnection("sparkcompute", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SparkComputeApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
// write some messages to be classified
List<StructuredRecord> messagesToWrite = new ArrayList<>();
messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY_SOURCE));
MockSource.writeInput(inputManager, messagesToWrite);
// manually trigger the pipeline
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);
Set<SpamMessage> results = new HashSet<>();
for (StructuredRecord structuredRecord : structuredRecords) {
results.add(SpamMessage.fromStructuredRecord(structuredRecord));
}
Set<SpamMessage> expected = new HashSet<>();
expected.add(new SpamMessage("how are you doing today", 0.0));
// only 'free money money' should be predicted as spam
expected.add(new SpamMessage("free money money", 1.0));
expected.add(new SpamMessage("what are you doing today", 0.0));
expected.add(new SpamMessage("genuine report", 0.0));
Assert.assertEquals(expected, results);
validateMetric(4, appId, "source.records.out");
validateMetric(4, appId, "sparkcompute.records.in");
validateMetric(4, appId, "sink.records.in");
}
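The validateMetric calls above rely on a helper defined elsewhere in DataPipelineTest. A minimal sketch of what such a helper might look like, assuming the test base exposes getMetricsManager(), that Constants refers to io.cdap.cdap.common.conf.Constants, and that stage metrics are recorded under the user scope with namespace/app/workflow tags (all assumptions, not the verbatim cdap implementation):
private void validateMetric(long expected, ApplicationId appId, String metric) throws Exception {
  // tags identifying the workflow run of this pipeline; the tag constants are assumed here
  Map<String, String> tags = ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, appId.getNamespace(),
                                             Constants.Metrics.Tag.APP, appId.getEntityName(),
                                             Constants.Metrics.Tag.WORKFLOW, SmartWorkflow.NAME);
  // wait up to 20 seconds for the user-scoped metric to reach the expected count (the "user." prefix is an assumption)
  getMetricsManager().waitForTotalMetricCount(tags, "user." + metric, expected, 20, TimeUnit.SECONDS);
}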
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
From the class DataPipelineTest, the method runPipelineForMetadata:
private void runPipelineForMetadata(MetadataAdmin metadataAdmin, Set<MetadataOperation> operations) throws Exception {
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
/*
* source --> sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin("singleInput", schema, operations)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("singleOutput")))
  .addConnection("source", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("MetadataTestApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
// wait for the system metadata for the app and the dataset to show up - the pipeline validates them
Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, appId.toMetadataEntity()).isEmpty(), 10, TimeUnit.SECONDS);
Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, NamespaceId.DEFAULT.dataset("singleInput").toMetadataEntity()).isEmpty(), 10, TimeUnit.SECONDS);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
int numRuns = workflowManager.getHistory().size();
workflowManager.start();
workflowManager.waitForRuns(ProgramRunStatus.COMPLETED, numRuns + 1, 5, TimeUnit.MINUTES);
}
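After the run completes, a caller of runPipelineForMetadata could verify that the metadata written by the source's MetadataOperations reached the USER scope. A hedged follow-up sketch, reusing only calls already shown above; the idea that the operations target the input dataset is an assumption:
// illustrative follow-up check, not part of the original test:
// poll until USER-scope properties written by the pipeline appear on the input dataset
MetadataEntity inputEntity = NamespaceId.DEFAULT.dataset("singleInput").toMetadataEntity();
Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.USER, inputEntity).isEmpty(),
              10, TimeUnit.SECONDS);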
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
From the class DataStreamsTest, the method testTransformComputeWithMacros:
@Test
public void testTransformComputeWithMacros() throws Exception {
Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = new ArrayList<>();
StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
input.add(samuelRecord);
input.add(jacksonRecord);
input.add(dwayneRecord);
input.add(johnsonRecord);
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
  .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
  .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
  .addConnection("source", "sleep")
  .addConnection("sleep", "filter1")
  .addConnection("filter1", "filter2")
  .addConnection("filter2", "sink")
  .setBatchInterval("1s")
  .build();
ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
Set<StructuredRecord> expected = new HashSet<>();
expected.add(samuelRecord);
expected.add(jacksonRecord);
testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "sleep.records.in", 4);
validateMetric(appId, "sleep.records.out", 4);
validateMetric(appId, "filter1.records.in", 4);
validateMetric(appId, "filter1.records.out", 3);
validateMetric(appId, "filter2.records.in", 3);
validateMetric(appId, "filter2.records.out", 2);
validateMetric(appId, "sink.records.in", 2);
Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
expected.clear();
expected.add(dwayneRecord);
expected.add(johnsonRecord);
testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
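testTransformComputeRun is a helper in the same test class. A rough sketch of what it might do, assuming the ${field}, ${val1}, ${val2}, and ${output} macros are bound through runtime arguments when the Spark program is started; the exact waiting and cleanup logic in the real test may differ:
private void testTransformComputeRun(ApplicationManager appManager, Set<StructuredRecord> expected,
                                     String val1, String val2, String outputName) throws Exception {
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  // bind the macros used in the pipeline config via runtime arguments
  sparkManager.start(ImmutableMap.of("field", "name", "val1", val1, "val2", val2, "output", outputName));
  sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  // poll the output dataset until exactly the expected records have been written
  DataSetManager<Table> outputManager = getDataset(outputName);
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
  }, 4, TimeUnit.MINUTES);
  sparkManager.stop();
}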
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
From the class DataStreamsTest, the method testLineageWithMacros:
@Test
public void testLineageWithMacros() throws Exception {
Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(), StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
String srcName = "lineageSource";
String sinkName1 = "lineageOutput1";
String sinkName2 = "lineageOutput2";
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
  .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
  .addConnection("source", "identity")
  .addConnection("identity", "sink")
  .setCheckpointDir(checkpointDir)
  .setBatchInterval("1s")
  .build();
ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
LineageAdmin lineageAdmin = getLineageAdmin();
// wait for the lineage to get populated
Tasks.waitFor(true, () -> {
Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
}, 10, TimeUnit.SECONDS);
Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
Assert.assertEquals(expectedLineage, lineage.getRelations());
DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
Assert.assertTrue(summary.getIncoming().isEmpty());
Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
Assert.assertEquals(1, outgoing.size());
Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName1), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
Assert.assertEquals(expectedRelations, outgoing);
// sleep for 1 second before starting the second run, because dataset lineage is stored with second granularity
TimeUnit.SECONDS.sleep(1);
long startTimeMillis = System.currentTimeMillis();
runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
// wait for the lineage to get populated
Tasks.waitFor(true, () -> {
Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
long end = System.currentTimeMillis();
DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
}, 10, TimeUnit.SECONDS);
lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
Assert.assertEquals(expectedLineage, lineage.getRelations());
summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
Assert.assertTrue(summary.getIncoming().isEmpty());
outgoing = summary.getOutgoing();
Assert.assertEquals(1, outgoing.size());
expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName2), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
Assert.assertEquals(expectedRelations, outgoing);
}
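testLineageWithMacro is assumed to start a run with the ${output} macro bound to the given sink name, wait for the records to arrive, stop the program, and return the RunId of that run. A minimal sketch under those assumptions (how the RunId is recovered from the run history is itself an assumption):
private RunId testLineageWithMacro(ApplicationManager appManager, Set<StructuredRecord> expected,
                                   String outputName) throws Exception {
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start(ImmutableMap.of("output", outputName));
  sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  // since the program is stopped between runs, the RUNNING history is assumed to hold only the run just launched
  RunId runId = RunIds.fromString(sparkManager.getHistory(ProgramRunStatus.RUNNING).get(0).getPid());
  // wait until all input records have passed through the identity transform into the sink
  DataSetManager<Table> outputManager = getDataset(outputName);
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
  }, 4, TimeUnit.MINUTES);
  sparkManager.stop();
  return runId;
}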
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
From the class DataStreamsTest, the method testWindower:
@Test
public void testWindower() throws Exception {
/*
* source --> window(width=30,interval=1) --> aggregator --> filter --> sink
*/
Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("x", "abc").build(), StructuredRecord.builder(schema).set("x", "abc").build(), StructuredRecord.builder(schema).set("x", "abc").build());
String sinkName = "windowOut";
// source sleeps 1 second between outputs
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
  .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
  .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
  .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("source", "window")
  .addConnection("window", "agg")
  .addConnection("agg", "filter")
  .addConnection("filter", "sink")
  .setBatchInterval("1s")
  .setCheckpointDir(checkpointDir)
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
// the sink should contain at least one record with a count of 3, and no record with a count greater than 3.
// a window may hold fewer than 3 records while it does not yet span all of them, but eventually there
// should be a window that contains all 3.
final DataSetManager<Table> outputManager = getDataset(sinkName);
Tasks.waitFor(true, new Callable<Boolean>() {
  @Override
  public Boolean call() throws Exception {
    outputManager.flush();
    boolean sawThree = false;
    for (StructuredRecord record : MockSink.readOutput(outputManager)) {
      long count = record.get("ct");
      if (count == 3L) {
        sawThree = true;
      }
      Assert.assertTrue(count <= 3L);
    }
    return sawThree;
  }
}, 2, TimeUnit.MINUTES);
sparkManager.stop();
}
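The checkpointDir used by setCheckpointDir(...) in the streaming configs above is assumed to be a class-level field of DataStreamsTest. One way such a field might be provisioned (the JUnit TemporaryFolder approach is an assumption, not necessarily the project's setup):
// illustrative setup for the checkpointDir field referenced above
@ClassRule
public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder();

private static String checkpointDir;

@BeforeClass
public static void setupCheckpointDir() throws Exception {
  checkpointDir = TMP_FOLDER.newFolder("checkpoints").toURI().toString();
}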