use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class FlowGraphPath method updateJobDependencies.
/**
 * A method to modify the {@link ConfigurationKeys#JOB_DEPENDENCIES} specified in a {@link JobTemplate} to those
 * which are usable in a {@link JobSpec}.
 * The {@link ConfigurationKeys#JOB_DEPENDENCIES} specified in a JobTemplate use the JobTemplate names
 * (i.e. the file names of the templates without the extension). However, the same {@link FlowTemplate} may be used
 * across multiple {@link FlowEdge}s. To ensure that we capture dependencies between jobs correctly as Dags from
 * successive hops are merged, we translate the {@link JobTemplate} name specified in the dependencies config to
 * the {@link ConfigurationKeys#JOB_NAME_KEY} from the corresponding {@link JobSpec}, which is guaranteed to be globally unique.
 * For example, consider a {@link JobTemplate} with URI job1.job which has "job.dependencies=job2,job3" (where job2.job and job3.job are
 * URIs of other {@link JobTemplate}s). Also, let the job.name config for the three jobs (after each {@link JobSpec} is compiled) be as follows:
 * "job.name=flowgrp1_flowName1_jobName1_1111", "job.name=flowgrp1_flowName1_jobName2_1121", and "job.name=flowgrp1_flowName1_jobName3_1131". Then,
 * for job1, this method will set "job.dependencies=flowgrp1_flowName1_jobName2_1121,flowgrp1_flowName1_jobName3_1131".
 * @param jobExecutionPlans a list of {@link JobExecutionPlan}s
 * @param templateToJobNameMap a map from {@link JobTemplate} names to the job.name in the corresponding {@link JobSpec}
 */
private void updateJobDependencies(List<JobExecutionPlan> jobExecutionPlans, Map<String, String> templateToJobNameMap) {
  for (JobExecutionPlan jobExecutionPlan : jobExecutionPlans) {
    JobSpec jobSpec = jobExecutionPlan.getJobSpec();
    if (jobSpec.getConfig().hasPath(ConfigurationKeys.JOB_DEPENDENCIES)) {
      List<String> jobDependencies = ConfigUtils.getStringList(jobSpec.getConfig(), ConfigurationKeys.JOB_DEPENDENCIES);
      List<String> updatedDependenciesList = new ArrayList<>(jobDependencies.size());
      for (String dependency : jobDependencies) {
        if (!templateToJobNameMap.containsKey(dependency)) {
          // We should never hit this condition; this is a safety check.
          throw new RuntimeException("TemplateToJobNameMap does not contain dependency " + dependency);
        }
        updatedDependenciesList.add(templateToJobNameMap.get(dependency));
      }
      String updatedDependencies = Joiner.on(",").join(updatedDependenciesList);
      jobSpec.setConfig(jobSpec.getConfig().withValue(ConfigurationKeys.JOB_DEPENDENCIES, ConfigValueFactory.fromAnyRef(updatedDependencies)));
    }
  }
}
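To make the javadoc's worked example concrete, here is a minimal sketch of the mapping this method consumes. The template names and job.name values are the hypothetical ones from the javadoc, not artifacts of a real flow:

// Hypothetical mapping built during compilation, keyed by JobTemplate file name
// (without the .job extension) and valued by the globally unique job.name.
Map<String, String> templateToJobNameMap = new HashMap<>();
templateToJobNameMap.put("job2", "flowgrp1_flowName1_jobName2_1121");
templateToJobNameMap.put("job3", "flowgrp1_flowName1_jobName3_1131");
// A JobSpec compiled from job1.job initially carries "job.dependencies=job2,job3";
// after updateJobDependencies runs, its config reads
// "job.dependencies=flowgrp1_flowName1_jobName2_1121,flowgrp1_flowName1_jobName3_1131".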
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class IdentityFlowToJobSpecCompiler method compileFlow.
@Override
public Dag<JobExecutionPlan> compileFlow(Spec spec) {
  Preconditions.checkNotNull(spec);
  Preconditions.checkArgument(spec instanceof FlowSpec, "IdentityFlowToJobSpecCompiler only converts FlowSpec to JobSpec");
  long startTime = System.nanoTime();
  FlowSpec flowSpec = (FlowSpec) spec;
  String source = flowSpec.getConfig().getString(ServiceConfigKeys.FLOW_SOURCE_IDENTIFIER_KEY);
  String destination = flowSpec.getConfig().getString(ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY);
  log.info(String.format("Compiling flow for source: %s and destination: %s", source, destination));
  JobSpec jobSpec = jobSpecGenerator(flowSpec);
  Instrumented.markMeter(this.flowCompilationSuccessFulMeter);
  Instrumented.updateTimer(this.flowCompilationTimer, System.nanoTime() - startTime, TimeUnit.NANOSECONDS);
  List<JobExecutionPlan> jobExecutionPlans;
  try {
    jobExecutionPlans = getJobExecutionPlans(source, destination, jobSpec);
  } catch (InterruptedException | ExecutionException e) {
    Instrumented.markMeter(this.flowCompilationFailedMeter);
    throw new RuntimeException("Cannot determine topology capabilities", e);
  }
  return new JobExecutionPlanDagFactory().createDag(jobExecutionPlans);
}
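A hedged usage sketch of this compiler: the node names "node1" and "node2", the `compiler` instance, and the prior registration of a matching TopologySpec are all assumptions, and a real FlowSpec would also carry flow.name and flow.group config for jobSpecGenerator.

// Sketch only: assumes the compiler's topologySpecMap already contains a
// TopologySpec whose SpecExecutor reports a node1 -> node2 capability.
Config flowConfig = ConfigFactory.empty()
    .withValue(ServiceConfigKeys.FLOW_SOURCE_IDENTIFIER_KEY, ConfigValueFactory.fromAnyRef("node1"))
    .withValue(ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY, ConfigValueFactory.fromAnyRef("node2"));
FlowSpec flowSpec = FlowSpec.builder().withConfig(flowConfig).build();
Dag<JobExecutionPlan> dag = compiler.compileFlow(flowSpec);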
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class IdentityFlowToJobSpecCompiler method getJobExecutionPlans.
private List<JobExecutionPlan> getJobExecutionPlans(String source, String destination, JobSpec jobSpec) throws ExecutionException, InterruptedException {
  List<JobExecutionPlan> jobExecutionPlans = new ArrayList<>();
  for (TopologySpec topologySpec : topologySpecMap.values()) {
    Map<ServiceNode, ServiceNode> capabilities = topologySpec.getSpecExecutor().getCapabilities().get();
    for (Map.Entry<ServiceNode, ServiceNode> capability : capabilities.entrySet()) {
      log.info(String.format("Evaluating current JobSpec: %s against TopologySpec: %s with capability of source: %s and destination: %s", jobSpec.getUri(), topologySpec.getUri(), capability.getKey(), capability.getValue()));
      if (source.equals(capability.getKey().getNodeName()) && destination.equals(capability.getValue().getNodeName())) {
        JobExecutionPlan jobExecutionPlan = new JobExecutionPlan(jobSpec, topologySpec.getSpecExecutor());
        log.info(String.format("Current JobSpec: %s is executable on TopologySpec: %s. Added TopologySpec as candidate.", jobSpec.getUri(), topologySpec.getUri()));
        log.info("Since we found a candidate executor, we will not try to compute more. (Intended limitation for IdentityFlowToJobSpecCompiler)");
        jobExecutionPlans.add(jobExecutionPlan);
        return jobExecutionPlans;
      }
    }
  }
  return jobExecutionPlans;
}
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompiler method compileFlow.
/**
 * Compiles a {@link FlowSpec} into a DAG of jobs by finding a path in the {@link FlowGraph} from the
 * flow's source node to each of its destination nodes.
 * @param spec an instance of {@link FlowSpec}.
 * @return A DAG of {@link JobExecutionPlan}s, which encapsulates the compiled {@link org.apache.gobblin.runtime.api.JobSpec}s
 * together with the {@link SpecExecutor} where the job can be executed.
 */
@Override
public Dag<JobExecutionPlan> compileFlow(Spec spec) {
  Preconditions.checkNotNull(spec);
  Preconditions.checkArgument(spec instanceof FlowSpec, "MultiHopFlowCompiler only accepts FlowSpecs");
  long startTime = System.nanoTime();
  FlowSpec flowSpec = (FlowSpec) spec;
  String source = ConfigUtils.getString(flowSpec.getConfig(), ServiceConfigKeys.FLOW_SOURCE_IDENTIFIER_KEY, "");
  String destination = ConfigUtils.getString(flowSpec.getConfig(), ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY, "");
  DataNode sourceNode = this.flowGraph.getNode(source);
  if (sourceNode == null) {
    flowSpec.addCompilationError(source, destination, String.format("Flowgraph does not have a node with id %s", source));
    return null;
  }
  List<String> destNodeIds = ConfigUtils.getStringList(flowSpec.getConfig(), ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY);
  List<DataNode> destNodes = destNodeIds.stream().map(this.flowGraph::getNode).collect(Collectors.toList());
  if (destNodes.contains(null)) {
    flowSpec.addCompilationError(source, destination, String.format("Flowgraph does not have a node with id %s", destNodeIds.get(destNodes.indexOf(null))));
    return null;
  }
  log.info(String.format("Compiling flow for source: %s and destination: %s", source, destination));
  List<FlowSpec> flowSpecs = splitFlowSpec(flowSpec);
  Dag<JobExecutionPlan> jobExecutionPlanDag = new Dag<>(new ArrayList<>());
  try {
    this.rwLock.readLock().lock();
    for (FlowSpec datasetFlowSpec : flowSpecs) {
      for (DataNode destNode : destNodes) {
        long authStartTime = System.nanoTime();
        try {
          boolean authorized = this.dataMovementAuthorizer.isMovementAuthorized(flowSpec, sourceNode, destNode);
          Instrumented.updateTimer(dataAuthorizationTimer, System.nanoTime() - authStartTime, TimeUnit.NANOSECONDS);
          if (!authorized) {
            String message = String.format("Data movement is not authorized for flow: %s, source: %s, destination: %s", flowSpec.getUri().toString(), source, destination);
            log.error(message);
            datasetFlowSpec.addCompilationError(source, destination, message);
            return null;
          }
        } catch (Exception e) {
          Instrumented.markMeter(flowCompilationFailedMeter);
          datasetFlowSpec.addCompilationError(source, destination, Throwables.getStackTraceAsString(e));
          return null;
        }
      }
      // Compute the path from source to destination.
      FlowGraphPath flowGraphPath = flowGraph.findPath(datasetFlowSpec);
      if (flowGraphPath != null) {
        // Convert the path into a Dag of JobExecutionPlans.
        jobExecutionPlanDag = jobExecutionPlanDag.merge(flowGraphPath.asDag(this.config));
      }
    }
    if (jobExecutionPlanDag.isEmpty()) {
      Instrumented.markMeter(flowCompilationFailedMeter);
      String message = String.format("No path found from source: %s to destination: %s", source, destination);
      log.info(message);
      if (flowSpec.getCompilationErrors().stream().noneMatch(compilationError -> compilationError.errorPriority == 0)) {
        flowSpec.addCompilationError(source, destination, message);
      }
      return null;
    }
  } catch (PathFinder.PathFinderException | SpecNotFoundException | JobTemplate.TemplateException | URISyntaxException | ReflectiveOperationException e) {
    Instrumented.markMeter(flowCompilationFailedMeter);
    String message = String.format("Exception encountered while compiling flow for source: %s and destination: %s, %s", source, destination, Throwables.getStackTraceAsString(e));
    log.error(message, e);
    flowSpec.addCompilationError(source, destination, message);
    return null;
  } finally {
    this.rwLock.readLock().unlock();
  }
  Instrumented.markMeter(flowCompilationSuccessFulMeter);
  Instrumented.updateTimer(flowCompilationTimer, System.nanoTime() - startTime, TimeUnit.NANOSECONDS);
  return jobExecutionPlanDag;
}
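Because the destination identifier is re-read as a string list, a comma-separated value fans a single flow out to multiple destinations; this multicast case is exercised by the test below. A minimal sketch reusing that test's node names; the flow-graph setup behind `compiler` is assumed, as is the comma-splitting behavior of ConfigUtils.getStringList:

// Sketch only: a multicast flow config. ConfigUtils.getStringList is expected to
// split the comma-separated destination identifier into ["HDFS-3", "HDFS-4"].
Config multicastConfig = ConfigFactory.empty()
    .withValue(ServiceConfigKeys.FLOW_SOURCE_IDENTIFIER_KEY, ConfigValueFactory.fromAnyRef("LocalFS-1"))
    .withValue(ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY, ConfigValueFactory.fromAnyRef("HDFS-3,HDFS-4"));
Dag<JobExecutionPlan> dag = compiler.compileFlow(FlowSpec.builder().withConfig(multicastConfig).build());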
use of org.apache.gobblin.service.modules.spec.JobExecutionPlan in project incubator-gobblin by apache.
the class MultiHopFlowCompilerTest method testMulticastPath.
@Test(dependsOnMethods = "testCompileFlowSingleHop")
public void testMulticastPath() throws IOException, URISyntaxException {
  FlowSpec spec = createFlowSpec("flow/flow2.conf", "LocalFS-1", "HDFS-3,HDFS-4", false, false);
  Dag<JobExecutionPlan> jobDag = this.specCompiler.compileFlow(spec);
  Assert.assertEquals(jobDag.getNodes().size(), 4);
  Assert.assertEquals(jobDag.getEndNodes().size(), 2);
  Assert.assertEquals(jobDag.getStartNodes().size(), 2);
  // First hop must be from LocalFS to HDFS-1 and HDFS-2
  Set<String> jobNames = new HashSet<>();
  jobNames.add(Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join("testFlowGroup", "testFlowName", "Distcp", "LocalFS-1", "HDFS-1", "localToHdfs"));
  jobNames.add(Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join("testFlowGroup", "testFlowName", "Distcp", "LocalFS-1", "HDFS-2", "localToHdfs"));
  for (DagNode<JobExecutionPlan> dagNode : jobDag.getStartNodes()) {
    Config jobConfig = dagNode.getValue().getJobSpec().getConfig();
    String jobName = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
    Assert.assertTrue(jobNames.stream().anyMatch(jobName::startsWith));
  }
  // Second hop must be from HDFS-1/HDFS-2 to HDFS-3/HDFS-4 respectively.
  jobNames = new HashSet<>();
  jobNames.add(Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join("testFlowGroup", "testFlowName", "Distcp", "HDFS-1", "HDFS-3", "hdfsToHdfs"));
  jobNames.add(Joiner.on(JobExecutionPlan.Factory.JOB_NAME_COMPONENT_SEPARATION_CHAR).join("testFlowGroup", "testFlowName", "Distcp", "HDFS-2", "HDFS-4", "hdfsToHdfs"));
  for (DagNode<JobExecutionPlan> dagNode : jobDag.getStartNodes()) {
    List<DagNode<JobExecutionPlan>> nextNodes = jobDag.getChildren(dagNode);
    Assert.assertEquals(nextNodes.size(), 1);
    Config jobConfig = nextNodes.get(0).getValue().getJobSpec().getConfig();
    String jobName = jobConfig.getString(ConfigurationKeys.JOB_NAME_KEY);
    Assert.assertTrue(jobNames.stream().anyMatch(jobName::startsWith));
  }
}