use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompilerTest method testCompileCombinedDatasetFlow.
@Test(dependsOnMethods = "testCompileMultiDatasetFlow")
public void testCompileCombinedDatasetFlow() throws Exception {
FlowSpec spec = createFlowSpec("flow/flow4.conf", "HDFS-1", "HDFS-3", true, false);
Dag<JobExecutionPlan> dag = specCompiler.compileFlow(spec);
// Should be 2 jobs, each containing 3 datasets
Assert.assertEquals(dag.getNodes().size(), 2);
Assert.assertEquals(dag.getEndNodes().size(), 1);
Assert.assertEquals(dag.getStartNodes().size(), 1);
String copyJobName = Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join("testFlowGroup", "testFlowName", "Distcp", "HDFS-1", "HDFS-3", "hdfsToHdfs");
Config jobConfig = dag.getStartNodes().get(0).getValue().getJobSpec().getConfig();
String jobName = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
Assert.assertTrue(jobName.startsWith(copyJobName));
Assert.assertTrue(jobConfig.getString(ConfigurableGlobDatasetFinder.DATASET_FINDER_PATTERN_KEY).endsWith("{dataset0,dataset1,dataset2}"));
String retentionJobName = Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join("testFlowGroup", "testFlowName", "SnapshotRetention", "HDFS-3", "HDFS-3", "hdfsRetention");
Config jobConfig2 = dag.getEndNodes().get(0).getValue().getJobSpec().getConfig();
String jobName2 = jobConfig2.getString(ConfigurationKeys.JOB_NAME_KEY);
Assert.assertTrue(jobName2.startsWith(retentionJobName));
Assert.assertTrue(jobConfig2.getString(ConfigurableGlobDatasetFinder.DATASET_FINDER_PATTERN_KEY).endsWith("{dataset0,dataset1,dataset2}"));
}
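All of these snippets call a createFlowSpec helper that is defined elsewhere in MultiHopFlowCompilerTest and not shown on this page. Below is a minimal sketch of what such a helper could look like; the parameter names, the config keys, and the FlowSpec builder calls are assumptions inferred from how the helper is invoked above, not the test's actual implementation.
// Hypothetical sketch only: the key names and boolean parameter names are assumptions.
// Relies on com.typesafe.config.{ConfigFactory, ConfigValueFactory} and org.apache.gobblin.runtime.api.FlowSpec.
private FlowSpec createFlowSpec(String flowConfigResource, String source, String destination,
    boolean applyRetention, boolean applyRetentionOnInput) {
  // Load the flow config from test resources and overlay the source/destination nodes
  // plus the two retention flags controlled by the boolean arguments.
  Config flowConfig = ConfigFactory.parseResources(flowConfigResource)
      .withValue("gobblin.flow.sourceIdentifier", ConfigValueFactory.fromAnyRef(source))
      .withValue("gobblin.flow.destinationIdentifier", ConfigValueFactory.fromAnyRef(destination))
      .withValue("flow.applyRetention", ConfigValueFactory.fromAnyRef(applyRetention))
      .withValue("flow.applyRetentionOnInput", ConfigValueFactory.fromAnyRef(applyRetentionOnInput));
  return FlowSpec.builder()
      .withConfig(flowConfig)
      .withDescription("test flow spec")
      .withVersion("1")
      .build();
}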
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompilerTest method testMissingSourceNodeError.
@Test(dependsOnMethods = "testUnresolvedFlow")
public void testMissingSourceNodeError() throws Exception {
FlowSpec spec = createFlowSpec("flow/flow5.conf", "HDFS-NULL", "HDFS-3", false, false);
Dag<JobExecutionPlan> dag = specCompiler.compileFlow(spec);
Assert.assertNull(dag);
Assert.assertEquals(spec.getCompilationErrors().size(), 1);
spec.getCompilationErrors().stream().anyMatch(s -> s.errorMessage.contains("Flowgraph does not have a node with id"));
}
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompilerTest method testUnresolvedFlow.
@Test(dependsOnMethods = "testCompileCombinedDatasetFlow")
public void testUnresolvedFlow() throws Exception {
FlowSpec spec = createFlowSpec("flow/flow5.conf", "HDFS-1", "HDFS-3", false, false);
Dag<JobExecutionPlan> dag = specCompiler.compileFlow(spec);
Assert.assertNull(dag);
Assert.assertEquals(spec.getCompilationErrors().stream().map(c -> c.errorMessage).collect(Collectors.toSet()).size(), 1);
Assert.assertTrue(spec.getCompilationErrors().stream().anyMatch(s -> s.errorMessage.contains(AzkabanProjectConfig.USER_TO_PROXY)));
}
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompilerTest method testCompileFlowAfterFirstEdgeDeletion.
@Test(dependsOnMethods = "testCompileFlowWithRetention")
public void testCompileFlowAfterFirstEdgeDeletion() throws URISyntaxException, IOException {
// Delete the self edge on HDFS-1 that performs convert-to-json-and-encrypt.
this.flowGraph.deleteFlowEdge("HDFS-1_HDFS-1_hdfsConvertToJsonAndEncrypt");
FlowSpec spec = createFlowSpec("flow/flow1.conf", "LocalFS-1", "ADLS-1", false, false);
Dag<JobExecutionPlan> jobDag = this.specCompiler.compileFlow(spec);
Assert.assertEquals(jobDag.getNodes().size(), 4);
Assert.assertEquals(jobDag.getStartNodes().size(), 1);
Assert.assertEquals(jobDag.getEndNodes().size(), 1);
// Get the 1st hop - Distcp from "LocalFS-1" to "HDFS-2"
DagNode<JobExecutionPlan> startNode = jobDag.getStartNodes().get(0);
JobExecutionPlan jobExecutionPlan = startNode.getValue();
JobSpec jobSpec = jobExecutionPlan.getJobSpec();
// Ensure the resolved job config for the first hop has the correct substitutions.
Config jobConfig = jobSpec.getConfig();
String flowGroup = "testFlowGroup";
String flowName = "testFlowName";
String expectedJobName1 = Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join(flowGroup, flowName, "Distcp", "LocalFS-1", "HDFS-2", "localToHdfs");
String jobName1 = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
Assert.assertTrue(jobName1.startsWith(expectedJobName1));
String from = jobConfig.getString("from");
String to = jobConfig.getString("to");
Assert.assertEquals(from, "/data/out/testTeam/testDataset");
Assert.assertEquals(to, "/data/out/testTeam/testDataset");
String sourceFsUri = jobConfig.getString("fs.uri");
Assert.assertEquals(sourceFsUri, "file:///");
Assert.assertEquals(jobConfig.getString("source.filebased.fs.uri"), sourceFsUri);
Assert.assertEquals(jobConfig.getString("state.store.fs.uri"), sourceFsUri);
String targetFsUri = jobConfig.getString("target.filebased.fs.uri");
Assert.assertEquals(targetFsUri, "hdfs://hadoopnn02.grid.linkedin.com:8888/");
Assert.assertEquals(jobConfig.getString("writer.fs.uri"), targetFsUri);
Assert.assertEquals(new Path(jobConfig.getString("gobblin.dataset.pattern")), new Path(from));
Assert.assertEquals(jobConfig.getString("data.publisher.final.dir"), to);
Assert.assertEquals(jobConfig.getString("type"), "java");
Assert.assertEquals(jobConfig.getString("job.class"), "org.apache.gobblin.runtime.local.LocalJobLauncher");
Assert.assertEquals(jobConfig.getString("launcher.type"), "LOCAL");
// Ensure the spec executor has the correct configurations
SpecExecutor specExecutor = jobExecutionPlan.getSpecExecutor();
Assert.assertEquals(specExecutor.getUri().toString(), "fs:///");
Assert.assertEquals(specExecutor.getClass().getCanonicalName(), "org.apache.gobblin.runtime.spec_executorInstance.InMemorySpecExecutor");
// Get the 2nd hop - "HDFS-2 to HDFS-2 : convert avro to json and encrypt"
Assert.assertEquals(jobDag.getChildren(startNode).size(), 1);
DagNode<JobExecutionPlan> secondHopNode = jobDag.getChildren(startNode).get(0);
jobExecutionPlan = secondHopNode.getValue();
jobConfig = jobExecutionPlan.getJobSpec().getConfig();
String expectedJobName2 = Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join(flowGroup, flowName, "ConvertToJsonAndEncrypt", "HDFS-2", "HDFS-2", "hdfsConvertToJsonAndEncrypt");
String jobName2 = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
Assert.assertTrue(jobName2.startsWith(expectedJobName2));
Assert.assertEquals(jobConfig.getString(ConfigurationKeys.JOB_DEPENDENCIES), jobName1);
from = jobConfig.getString("from");
to = jobConfig.getString("to");
Assert.assertEquals(from, "/data/out/testTeam/testDataset");
Assert.assertEquals(to, "/data/encrypted/testTeam/testDataset");
Assert.assertEquals(jobConfig.getString("source.filebased.data.directory"), from);
Assert.assertEquals(jobConfig.getString("data.publisher.final.dir"), to);
specExecutor = jobExecutionPlan.getSpecExecutor();
Assert.assertEquals(specExecutor.getUri().toString(), "https://azkaban02.gobblin.net:8443");
Assert.assertEquals(specExecutor.getClass().getCanonicalName(), "org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest.TestAzkabanSpecExecutor");
// Get the 3rd hop - "Distcp HDFS-2 to HDFS-4"
Assert.assertEquals(jobDag.getChildren(secondHopNode).size(), 1);
DagNode<JobExecutionPlan> thirdHopNode = jobDag.getChildren(secondHopNode).get(0);
jobExecutionPlan = thirdHopNode.getValue();
jobConfig = jobExecutionPlan.getJobSpec().getConfig();
String expectedJobName3 = Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join(flowGroup, flowName, "Distcp", "HDFS-2", "HDFS-4", "hdfsToHdfs");
String jobName3 = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
Assert.assertTrue(jobName3.startsWith(expectedJobName3));
Assert.assertEquals(jobConfig.getString(ConfigurationKeys.JOB_DEPENDENCIES), jobName2);
from = jobConfig.getString("from");
to = jobConfig.getString("to");
Assert.assertEquals(from, "/data/encrypted/testTeam/testDataset");
Assert.assertEquals(to, "/data/encrypted/testTeam/testDataset");
Assert.assertEquals(jobConfig.getString("source.filebased.fs.uri"), "hdfs://hadoopnn02.grid.linkedin.com:8888/");
Assert.assertEquals(jobConfig.getString("target.filebased.fs.uri"), "hdfs://hadoopnn04.grid.linkedin.com:8888/");
Assert.assertEquals(jobConfig.getString("type"), "hadoopJava");
Assert.assertEquals(jobConfig.getString("job.class"), "org.apache.gobblin.azkaban.AzkabanJobLauncher");
Assert.assertEquals(jobConfig.getString("launcher.type"), "MAPREDUCE");
// Ensure the spec executor has the correct configurations
specExecutor = jobExecutionPlan.getSpecExecutor();
Assert.assertEquals(specExecutor.getUri().toString(), "https://azkaban02.gobblin.net:8443");
Assert.assertEquals(specExecutor.getClass().getCanonicalName(), "org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest.TestAzkabanSpecExecutor");
// Get the 4th hop - "Distcp from HDFS-4 to ADLS-1"
Assert.assertEquals(jobDag.getChildren(thirdHopNode).size(), 1);
DagNode<JobExecutionPlan> fourthHopNode = jobDag.getChildren(thirdHopNode).get(0);
jobExecutionPlan = fourthHopNode.getValue();
jobConfig = jobExecutionPlan.getJobSpec().getConfig();
String expectedJobName4 = Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join(flowGroup, flowName, "DistcpToADL", "HDFS-4", "ADLS-1", "hdfsToAdl");
String jobName4 = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
Assert.assertTrue(jobName4.startsWith(expectedJobName4));
Assert.assertEquals(jobConfig.getString(ConfigurationKeys.JOB_DEPENDENCIES), jobName3);
from = jobConfig.getString("from");
to = jobConfig.getString("to");
Assert.assertEquals(from, "/data/encrypted/testTeam/testDataset");
Assert.assertEquals(to, "/data/encrypted/testTeam/testDataset");
Assert.assertEquals(jobConfig.getString("source.filebased.fs.uri"), "hdfs://hadoopnn04.grid.linkedin.com:8888/");
Assert.assertEquals(jobConfig.getString("target.filebased.fs.uri"), "adl://azuredatalakestore.net/");
Assert.assertEquals(jobConfig.getString("type"), "hadoopJava");
Assert.assertEquals(jobConfig.getString("job.class"), "org.apache.gobblin.azkaban.AzkabanJobLauncher");
Assert.assertEquals(jobConfig.getString("launcher.type"), "MAPREDUCE");
Assert.assertEquals(jobConfig.getString("dfs.adls.oauth2.client.id"), "1234");
Assert.assertEquals(jobConfig.getString("writer.encrypted.dfs.adls.oauth2.credential"), "credential");
Assert.assertEquals(jobConfig.getString("encrypt.key.loc"), "/user/testUser/master.password");
// Ensure the spec executor has the correct configurations
specExecutor = jobExecutionPlan.getSpecExecutor();
Assert.assertEquals(specExecutor.getUri().toString(), "https://azkaban04.gobblin.net:8443");
Assert.assertEquals(specExecutor.getClass().getCanonicalName(), "org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest.TestAzkabanSpecExecutor");
// Ensure the fourth hop is the last
Assert.assertEquals(jobDag.getEndNodes().get(0), fourthHopNode);
}
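The hop-by-hop checks above use only a handful of Dag accessors (getStartNodes, getChildren, getValue). For a flow that compiles to a single linear path, a small helper like the following (not part of the original test) can collect the resolved job names in hop order, which keeps assertions on long flows compact:
// Walks a linear Dag from its single start node and returns the resolved job names in hop order.
private static List<String> jobNamesInOrder(Dag<JobExecutionPlan> dag) {
  List<String> names = new ArrayList<>();
  DagNode<JobExecutionPlan> node = dag.getStartNodes().get(0);
  while (node != null) {
    names.add(node.getValue().getJobSpec().getConfig().getString(ConfigurationKeys.JOB_NAME_KEY));
    List<DagNode<JobExecutionPlan>> children = dag.getChildren(node);
    node = children.isEmpty() ? null : children.get(0);
  }
  return names;
}
// Example usage: Assert.assertEquals(jobNamesInOrder(jobDag).size(), 4);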
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompilerTest method testCompileFlowAfterSecondEdgeDeletion.
@Test(dependsOnMethods = "testCompileFlowAfterFirstEdgeDeletion")
public void testCompileFlowAfterSecondEdgeDeletion() throws URISyntaxException, IOException {
// Delete the self edge on HDFS-2 that performs convert-to-json-and-encrypt.
this.flowGraph.deleteFlowEdge("HDFS-2_HDFS-2_hdfsConvertToJsonAndEncrypt");
FlowSpec spec = createFlowSpec("flow/flow1.conf", "LocalFS-1", "ADLS-1", false, false);
Dag<JobExecutionPlan> jobDag = this.specCompiler.compileFlow(spec);
// Ensure no path to destination.
Assert.assertNull(jobDag);
}
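The edge ids passed to deleteFlowEdge in these two tests ("HDFS-1_HDFS-1_hdfsConvertToJsonAndEncrypt" and "HDFS-2_HDFS-2_hdfsConvertToJsonAndEncrypt") appear to follow a sourceNode_destinationNode_edgeName pattern. A tiny, purely illustrative helper for building such ids, assuming that convention holds:
// Hypothetical helper: the "source_destination_edgeName" convention is inferred from the
// deleteFlowEdge calls above, not from the FlowGraph API itself.
private static String flowEdgeId(String sourceNode, String destinationNode, String edgeName) {
  return Joiner.on("_").join(sourceNode, destinationNode, edgeName);
}
// flowEdgeId("HDFS-2", "HDFS-2", "hdfsConvertToJsonAndEncrypt") -> "HDFS-2_HDFS-2_hdfsConvertToJsonAndEncrypt"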