use of org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult in project titan by thinkaurelius.
the class FulgoraGraphComputer method submit.
/**
 * Validates this computer's configuration and starts the computation asynchronously.
 *
 * <p>On the calling thread the method rejects a second submission, rejects a computer that has neither a
 * vertex program nor map-reducers, resolves the persist / result-graph modes and rejects combinations that
 * {@code features()} does not support.
 *
 * <p>The returned future then runs, in order:
 * <ol>
 *   <li>the vertex program (if any), one edge-scan job per iteration, until
 *       {@code vertexProgram.terminate(memory)} returns {@code true};</li>
 *   <li>the map stage of every registered map-reducer in a single scan job, followed by the reduce stage
 *       for those that declare one, adding each result to the memory;</li>
 *   <li>the write-back of mutated vertex properties, depending on the persist and result-graph modes.</li>
 * </ol>
 *
 * @return a future that yields the result graph together with the completed memory
 * @throws TitanException (inside the future) when a scan job, the reduce phase or the persistence step fails
 */
@Override
public Future<ComputerResult> submit() {
    if (executed)
        throw Exceptions.computerHasAlreadyBeenSubmittedAVertexProgram();
    else
        executed = true;
    // it is not possible to execute a computer if it has neither a vertex program nor map-reducers
    if (null == vertexProgram && mapReduces.isEmpty())
        throw GraphComputer.Exceptions.computerHasNoVertexProgramNorMapReducers();
    // it is possible to run map-reducers without a vertex program
    if (null != vertexProgram) {
        GraphComputerHelper.validateProgramOnComputer(this, vertexProgram);
        this.mapReduces.addAll(this.vertexProgram.getMapReducers());
    }
    // if the user didn't set desired persistence/resultgraph, then get from vertex program or else, no persistence
    this.persistMode = GraphComputerHelper.getPersistState(Optional.ofNullable(this.vertexProgram), Optional.ofNullable(this.persistMode));
    this.resultGraphMode = GraphComputerHelper.getResultGraphState(Optional.ofNullable(this.vertexProgram), Optional.ofNullable(this.resultGraphMode));
    // determine the legality of the persistence and result graph options
    if (!this.features().supportsResultGraphPersistCombination(this.resultGraphMode, this.persistMode))
        throw GraphComputer.Exceptions.resultGraphPersistCombinationNotSupported(this.resultGraphMode, this.persistMode);
    memory = new FulgoraMemory(vertexProgram, mapReduces);
    return CompletableFuture.<ComputerResult>supplyAsync(() -> {
        final long time = System.currentTimeMillis();
        if (null != vertexProgram) {
            // ##### Execute vertex program
            vertexMemory = new FulgoraVertexMemory(expectedNumVertices, graph.getIDManager(), vertexProgram);
            // execute the vertex program
            vertexProgram.setup(memory);
            memory.completeSubRound();
            // the loop has no bound of its own; it only ends when the program reports termination
            for (int iteration = 1; ; iteration++) {
                vertexMemory.nextIteration(vertexProgram.getMessageScopes(memory));
                jobId = name + "#" + iteration;
                VertexProgramScanJob.Executor job = VertexProgramScanJob.getVertexProgramScanJob(graph, memory, vertexMemory, vertexProgram);
                StandardScanner.Builder scanBuilder = graph.getBackend().buildEdgeScanJob();
                scanBuilder.setJobId(jobId);
                scanBuilder.setNumProcessingThreads(numThreads);
                scanBuilder.setWorkBlockSize(readBatchSize);
                scanBuilder.setJob(job);
                PartitionedVertexProgramExecutor pvpe = new PartitionedVertexProgramExecutor(graph, memory, vertexMemory, vertexProgram);
                try {
                    //Iterates over all vertices and computes the vertex program on all non-partitioned vertices. For partitioned ones, the data is aggregated
                    ScanMetrics jobResult = scanBuilder.execute().get();
                    long failures = jobResult.get(ScanMetrics.Metric.FAILURE);
                    if (failures > 0) {
                        throw new TitanException("Failed to process [" + failures + "] vertices in vertex program iteration [" + iteration + "]. Computer is aborting.");
                    }
                    //Runs the vertex program on all aggregated, partitioned vertices.
                    pvpe.run(numThreads, jobResult);
                    failures = jobResult.getCustom(PartitionedVertexProgramExecutor.PARTITION_VERTEX_POSTFAIL);
                    if (failures > 0) {
                        throw new TitanException("Failed to process [" + failures + "] partitioned vertices in vertex program iteration [" + iteration + "]. Computer is aborting.");
                    }
                } catch (Exception e) {
                    // do not swallow an interrupt: restore the flag before wrapping the exception
                    if (e instanceof InterruptedException) {
                        Thread.currentThread().interrupt();
                    }
                    throw new TitanException(e);
                }
                vertexMemory.completeIteration();
                memory.completeSubRound();
                try {
                    if (this.vertexProgram.terminate(this.memory)) {
                        break;
                    }
                } finally {
                    // runs on both paths: after the break and before the next iteration
                    memory.incrIteration();
                    memory.completeSubRound();
                }
            }
        }
        // ##### Execute mapreduce jobs
        // Collect map jobs
        Map<MapReduce, FulgoraMapEmitter> mapJobs = new HashMap<>(mapReduces.size());
        for (MapReduce mapReduce : mapReduces) {
            if (mapReduce.doStage(MapReduce.Stage.MAP)) {
                FulgoraMapEmitter mapEmitter = new FulgoraMapEmitter<>(mapReduce.doStage(MapReduce.Stage.REDUCE));
                mapJobs.put(mapReduce, mapEmitter);
            }
        }
        // Execute map jobs
        jobId = name + "#map";
        VertexMapJob.Executor job = VertexMapJob.getVertexMapJob(graph, vertexMemory, mapJobs);
        StandardScanner.Builder scanBuilder = graph.getBackend().buildEdgeScanJob();
        scanBuilder.setJobId(jobId);
        scanBuilder.setNumProcessingThreads(numThreads);
        scanBuilder.setWorkBlockSize(readBatchSize);
        scanBuilder.setJob(job);
        try {
            ScanMetrics jobResult = scanBuilder.execute().get();
            long failures = jobResult.get(ScanMetrics.Metric.FAILURE);
            if (failures > 0) {
                throw new TitanException("Failed to process [" + failures + "] vertices in map phase. Computer is aborting.");
            }
            failures = jobResult.getCustom(VertexMapJob.MAP_JOB_FAILURE);
            if (failures > 0) {
                throw new TitanException("Failed to process [" + failures + "] individual map jobs. Computer is aborting.");
            }
        } catch (Exception e) {
            // do not swallow an interrupt: restore the flag before wrapping the exception
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            throw new TitanException(e);
        }
        // Execute reduce phase and add to memory
        for (Map.Entry<MapReduce, FulgoraMapEmitter> mapJob : mapJobs.entrySet()) {
            FulgoraMapEmitter<?, ?> mapEmitter = mapJob.getValue();
            MapReduce mapReduce = mapJob.getKey();
            // sort results if a map output sort is defined
            mapEmitter.complete(mapReduce);
            if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
                final FulgoraReduceEmitter<?, ?> reduceEmitter = new FulgoraReduceEmitter<>();
                // leaving the try block closes the pool before the reduce output is read below
                try (WorkerPool workers = new WorkerPool(numThreads)) {
                    workers.submit(() -> mapReduce.workerStart(MapReduce.Stage.REDUCE));
                    for (final Map.Entry queueEntry : mapEmitter.reduceMap.entrySet()) {
                        workers.submit(() -> mapReduce.reduce(queueEntry.getKey(), ((Iterable) queueEntry.getValue()).iterator(), reduceEmitter));
                    }
                    workers.submit(() -> mapReduce.workerEnd(MapReduce.Stage.REDUCE));
                } catch (Exception e) {
                    // do not swallow an interrupt: restore the flag before wrapping the exception
                    if (e instanceof InterruptedException) {
                        Thread.currentThread().interrupt();
                    }
                    throw new TitanException("Exception while executing reduce phase", e);
                }
                // mapEmitter.reduceMap.entrySet().parallelStream().forEach(entry -> mapReduce.reduce(entry.getKey(), entry.getValue().iterator(), reduceEmitter));
                // sort results if a reduce output sort is defined
                reduceEmitter.complete(mapReduce);
                mapReduce.addResultToMemory(this.memory, reduceEmitter.reduceQueue.iterator());
            } else {
                mapReduce.addResultToMemory(this.memory, mapEmitter.mapQueue.iterator());
            }
        }
        // #### Write mutated properties back into graph
        Graph resultgraph = graph;
        if (persistMode == Persist.NOTHING && resultGraphMode == ResultGraph.NEW) {
            resultgraph = EmptyGraph.instance();
        } else if (persistMode != Persist.NOTHING && vertexProgram != null && !vertexProgram.getElementComputeKeys().isEmpty()) {
            //First, create property keys in graph if they don't already exist
            TitanManagement mgmt = graph.openManagement();
            try {
                for (String key : vertexProgram.getElementComputeKeys()) {
                    // only the warning is conditional; the key is always fetched or created
                    if (!mgmt.containsPropertyKey(key))
                        log.warn("Property key [{}] is not part of the schema and will be created. It is advised to initialize all keys.", key);
                    mgmt.getOrCreatePropertyKey(key);
                }
                mgmt.commit();
            } finally {
                // still open here means commit() was not reached or did not complete
                if (mgmt != null && mgmt.isOpen())
                    mgmt.rollback();
            }
            //TODO: Filter based on VertexProgram
            Map<Long, Map<String, Object>> mutatedProperties = Maps.transformValues(vertexMemory.getMutableVertexProperties(), new Function<Map<String, Object>, Map<String, Object>>() {
                @Nullable
                @Override
                public Map<String, Object> apply(@Nullable Map<String, Object> o) {
                    return Maps.filterKeys(o, s -> !NON_PERSISTING_KEYS.contains(s));
                }
            });
            if (resultGraphMode == ResultGraph.ORIGINAL) {
                AtomicInteger failures = new AtomicInteger(0);
                try (WorkerPool workers = new WorkerPool(numThreads)) {
                    // the compute-key set is non-empty here (checked above), so the division is safe
                    List<Map.Entry<Long, Map<String, Object>>> subset = new ArrayList<>(writeBatchSize / vertexProgram.getElementComputeKeys().size());
                    int currentSize = 0;
                    for (Map.Entry<Long, Map<String, Object>> entry : mutatedProperties.entrySet()) {
                        subset.add(entry);
                        currentSize += entry.getValue().size();
                        // hand a batch to a writer once it holds at least writeBatchSize property values
                        if (currentSize >= writeBatchSize) {
                            workers.submit(new VertexPropertyWriter(subset, failures));
                            subset = new ArrayList<>(subset.size());
                            currentSize = 0;
                        }
                    }
                    if (!subset.isEmpty())
                        workers.submit(new VertexPropertyWriter(subset, failures));
                } catch (Exception e) {
                    // do not swallow an interrupt: restore the flag before wrapping the exception
                    if (e instanceof InterruptedException) {
                        Thread.currentThread().interrupt();
                    }
                    throw new TitanException("Exception while attempting to persist result into graph", e);
                }
                if (failures.get() > 0)
                    throw new TitanException("Could not persist program results to graph. Check log for details.");
            } else if (resultGraphMode == ResultGraph.NEW) {
                // the properties are set inside a new transaction, which is returned as the result graph
                resultgraph = graph.newTransaction();
                for (Map.Entry<Long, Map<String, Object>> vprop : mutatedProperties.entrySet()) {
                    Vertex v = resultgraph.vertices(vprop.getKey()).next();
                    for (Map.Entry<String, Object> prop : vprop.getValue().entrySet()) {
                        v.property(VertexProperty.Cardinality.single, prop.getKey(), prop.getValue());
                    }
                }
            }
        }
        // update runtime and return the newly computed graph
        this.memory.setRuntime(System.currentTimeMillis() - time);
        this.memory.complete();
        return new DefaultComputerResult(resultgraph, this.memory);
    });
}
use of org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult in project janusgraph by JanusGraph.
the class FulgoraGraphComputer method submit.
/**
 * Validates this computer's configuration and starts the computation asynchronously.
 *
 * <p>On the calling thread the method rejects a second submission, rejects a computer that has neither a
 * vertex program nor map-reducers, resolves the persist / result-graph modes, rejects combinations that
 * {@code features()} does not support and rejects a thread count above {@code features().getMaxWorkers()}.
 *
 * <p>The returned future then runs, in order:
 * <ol>
 *   <li>the vertex program (if any), one edge-scan job per iteration, until
 *       {@code vertexProgram.terminate(memory)} returns {@code true};</li>
 *   <li>the map stage of every registered map-reducer in a single scan job, followed by the reduce stage
 *       for those that declare one, adding each result to the memory;</li>
 *   <li>the write-back of mutated vertex properties, depending on the persist and result-graph modes.</li>
 * </ol>
 *
 * @return a future that yields the result graph together with the completed memory
 * @throws JanusGraphException (inside the future) when a scan job, the reduce phase or the persistence
 *         step fails
 */
@Override
public Future<ComputerResult> submit() {
    if (executed)
        throw Exceptions.computerHasAlreadyBeenSubmittedAVertexProgram();
    else
        executed = true;
    // it is not possible to execute a computer if it has neither a vertex program nor map-reducers
    if (null == vertexProgram && mapReduces.isEmpty())
        throw GraphComputer.Exceptions.computerHasNoVertexProgramNorMapReducers();
    // it is possible to run map-reducers without a vertex program
    if (null != vertexProgram) {
        GraphComputerHelper.validateProgramOnComputer(this, vertexProgram);
        this.mapReduces.addAll(this.vertexProgram.getMapReducers());
    }
    // if the user didn't set desired persistence/resultgraph, then get from vertex program or else, no persistence
    this.persistMode = GraphComputerHelper.getPersistState(Optional.ofNullable(this.vertexProgram), Optional.ofNullable(this.persistMode));
    this.resultGraphMode = GraphComputerHelper.getResultGraphState(Optional.ofNullable(this.vertexProgram), Optional.ofNullable(this.resultGraphMode));
    // determine the legality of the persistence and result graph options
    if (!this.features().supportsResultGraphPersistCombination(this.resultGraphMode, this.persistMode))
        throw GraphComputer.Exceptions.resultGraphPersistCombinationNotSupported(this.resultGraphMode, this.persistMode);
    // ensure requested workers are not larger than supported workers
    if (this.numThreads > this.features().getMaxWorkers())
        throw GraphComputer.Exceptions.computerRequiresMoreWorkersThanSupported(this.numThreads, this.features().getMaxWorkers());
    memory = new FulgoraMemory(vertexProgram, mapReduces);
    return CompletableFuture.supplyAsync(() -> {
        final long time = System.currentTimeMillis();
        if (null != vertexProgram) {
            // ##### Execute vertex program
            vertexMemory = new FulgoraVertexMemory(expectedNumVertices, graph.getIDManager(), vertexProgram);
            // execute the vertex program
            vertexProgram.setup(memory);
            // one scan job instance is shared by all iterations and closed when the loop is left
            try (VertexProgramScanJob.Executor job = VertexProgramScanJob.getVertexProgramScanJob(graph, memory, vertexMemory, vertexProgram)) {
                // the loop has no bound of its own; it only ends when the program reports termination
                for (int iteration = 1; ; iteration++) {
                    memory.completeSubRound();
                    vertexMemory.nextIteration(vertexProgram.getMessageScopes(memory));
                    jobId = name + "#" + iteration;
                    StandardScanner.Builder scanBuilder = graph.getBackend().buildEdgeScanJob();
                    scanBuilder.setJobId(jobId);
                    scanBuilder.setNumProcessingThreads(numThreads);
                    scanBuilder.setWorkBlockSize(readBatchSize);
                    scanBuilder.setJob(job);
                    PartitionedVertexProgramExecutor programExecutor = new PartitionedVertexProgramExecutor(graph, memory, vertexMemory, vertexProgram);
                    try {
                        // Iterates over all vertices and computes the vertex program on all non-partitioned vertices. For partitioned ones, the data is aggregated
                        ScanMetrics jobResult = scanBuilder.execute().get();
                        long failures = jobResult.get(ScanMetrics.Metric.FAILURE);
                        if (failures > 0) {
                            throw new JanusGraphException("Failed to process [" + failures + "] vertices in vertex program iteration [" + iteration + "]. Computer is aborting.");
                        }
                        // Runs the vertex program on all aggregated, partitioned vertices.
                        programExecutor.run(numThreads, jobResult);
                        failures = jobResult.getCustom(PartitionedVertexProgramExecutor.PARTITION_VERTEX_POSTFAIL);
                        if (failures > 0) {
                            throw new JanusGraphException("Failed to process [" + failures + "] partitioned vertices in vertex program iteration [" + iteration + "]. Computer is aborting.");
                        }
                    } catch (Exception e) {
                        // do not swallow an interrupt: restore the flag before wrapping the exception
                        if (e instanceof InterruptedException) {
                            Thread.currentThread().interrupt();
                        }
                        throw new JanusGraphException(e);
                    }
                    vertexMemory.completeIteration();
                    memory.completeSubRound();
                    try {
                        if (this.vertexProgram.terminate(this.memory)) {
                            break;
                        }
                    } finally {
                        // runs on both paths: after the break and before the next iteration
                        memory.incrIteration();
                    }
                }
            }
        }
        // ##### Execute map-reduce jobs
        // Collect map jobs
        Map<MapReduce, FulgoraMapEmitter> mapJobs = new HashMap<>(mapReduces.size());
        for (MapReduce mapReduce : mapReduces) {
            if (mapReduce.doStage(MapReduce.Stage.MAP)) {
                FulgoraMapEmitter mapEmitter = new FulgoraMapEmitter<>(mapReduce.doStage(MapReduce.Stage.REDUCE));
                mapJobs.put(mapReduce, mapEmitter);
            }
        }
        // Execute map jobs
        jobId = name + "#map";
        try (VertexMapJob.Executor job = VertexMapJob.getVertexMapJob(graph, vertexMemory, mapJobs)) {
            StandardScanner.Builder scanBuilder = graph.getBackend().buildEdgeScanJob();
            scanBuilder.setJobId(jobId);
            scanBuilder.setNumProcessingThreads(numThreads);
            scanBuilder.setWorkBlockSize(readBatchSize);
            scanBuilder.setJob(job);
            try {
                ScanMetrics jobResult = scanBuilder.execute().get();
                long failures = jobResult.get(ScanMetrics.Metric.FAILURE);
                if (failures > 0) {
                    throw new JanusGraphException("Failed to process [" + failures + "] vertices in map phase. Computer is aborting.");
                }
                failures = jobResult.getCustom(VertexMapJob.MAP_JOB_FAILURE);
                if (failures > 0) {
                    throw new JanusGraphException("Failed to process [" + failures + "] individual map jobs. Computer is aborting.");
                }
            } catch (Exception e) {
                // do not swallow an interrupt: restore the flag before wrapping the exception
                if (e instanceof InterruptedException) {
                    Thread.currentThread().interrupt();
                }
                throw new JanusGraphException(e);
            }
            // Execute reduce phase and add to memory
            for (Map.Entry<MapReduce, FulgoraMapEmitter> mapJob : mapJobs.entrySet()) {
                FulgoraMapEmitter<?, ?> mapEmitter = mapJob.getValue();
                MapReduce mapReduce = mapJob.getKey();
                // sort results if a map output sort is defined
                mapEmitter.complete(mapReduce);
                if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
                    final FulgoraReduceEmitter<?, ?> reduceEmitter = new FulgoraReduceEmitter<>();
                    // leaving the try block closes the pool before the reduce output is read below
                    try (WorkerPool workers = new WorkerPool(numThreads)) {
                        workers.submit(() -> mapReduce.workerStart(MapReduce.Stage.REDUCE));
                        for (final Map.Entry queueEntry : mapEmitter.reduceMap.entrySet()) {
                            // defensive stop on a null entry
                            if (null == queueEntry)
                                break;
                            workers.submit(() -> mapReduce.reduce(queueEntry.getKey(), ((Iterable) queueEntry.getValue()).iterator(), reduceEmitter));
                        }
                        workers.submit(() -> mapReduce.workerEnd(MapReduce.Stage.REDUCE));
                    } catch (Exception e) {
                        // do not swallow an interrupt: restore the flag before wrapping the exception
                        if (e instanceof InterruptedException) {
                            Thread.currentThread().interrupt();
                        }
                        throw new JanusGraphException("Exception while executing reduce phase", e);
                    }
                    // mapEmitter.reduceMap.entrySet().parallelStream().forEach(entry -> mapReduce.reduce(entry.getKey(), entry.getValue().iterator(), reduceEmitter));
                    // sort results if a reduce output sort is defined
                    reduceEmitter.complete(mapReduce);
                    mapReduce.addResultToMemory(this.memory, reduceEmitter.reduceQueue.iterator());
                } else {
                    mapReduce.addResultToMemory(this.memory, mapEmitter.mapQueue.iterator());
                }
            }
        }
        memory.attachReferenceElements(graph);
        // #### Write mutated properties back into graph
        Graph resultgraph = graph;
        if (persistMode == Persist.NOTHING && resultGraphMode == ResultGraph.NEW) {
            resultgraph = EmptyGraph.instance();
        } else if (persistMode != Persist.NOTHING && vertexProgram != null && !vertexProgram.getVertexComputeKeys().isEmpty()) {
            // First, create property keys in graph if they don't already exist
            JanusGraphManagement management = graph.openManagement();
            try {
                for (VertexComputeKey key : vertexProgram.getVertexComputeKeys()) {
                    // only the warning is conditional; the key is always fetched or created
                    if (!management.containsPropertyKey(key.getKey()))
                        log.warn("Property key [{}] is not part of the schema and will be created. It is advised to initialize all keys.", key.getKey());
                    management.getOrCreatePropertyKey(key.getKey());
                }
                management.commit();
            } finally {
                // still open here means commit() was not reached or did not complete
                if (management != null && management.isOpen())
                    management.rollback();
            }
            // TODO: Filter based on VertexProgram
            Map<Long, Map<String, Object>> mutatedProperties = Maps.transformValues(vertexMemory.getMutableVertexProperties(), new Function<Map<String, Object>, Map<String, Object>>() {
                @Nullable
                @Override
                public Map<String, Object> apply(final Map<String, Object> o) {
                    return Maps.filterKeys(o, s -> !VertexProgramHelper.isTransientVertexComputeKey(s, vertexProgram.getVertexComputeKeys()));
                }
            });
            if (resultGraphMode == ResultGraph.ORIGINAL) {
                AtomicInteger failures = new AtomicInteger(0);
                try (WorkerPool workers = new WorkerPool(numThreads)) {
                    // the compute-key set is non-empty here (checked above), so the division is safe
                    List<Map.Entry<Long, Map<String, Object>>> subset = new ArrayList<>(writeBatchSize / vertexProgram.getVertexComputeKeys().size());
                    int currentSize = 0;
                    for (Map.Entry<Long, Map<String, Object>> entry : mutatedProperties.entrySet()) {
                        subset.add(entry);
                        currentSize += entry.getValue().size();
                        // hand a batch to a writer once it holds at least writeBatchSize property values
                        if (currentSize >= writeBatchSize) {
                            workers.submit(new VertexPropertyWriter(subset, failures));
                            subset = new ArrayList<>(subset.size());
                            currentSize = 0;
                        }
                    }
                    if (!subset.isEmpty())
                        workers.submit(new VertexPropertyWriter(subset, failures));
                } catch (Exception e) {
                    // do not swallow an interrupt: restore the flag before wrapping the exception
                    if (e instanceof InterruptedException) {
                        Thread.currentThread().interrupt();
                    }
                    throw new JanusGraphException("Exception while attempting to persist result into graph", e);
                }
                if (failures.get() > 0)
                    throw new JanusGraphException("Could not persist program results to graph. Check log for details.");
            } else if (resultGraphMode == ResultGraph.NEW) {
                // the properties are set inside a new transaction, which is returned as the result graph
                resultgraph = graph.newTransaction();
                for (Map.Entry<Long, Map<String, Object>> vertexProperty : mutatedProperties.entrySet()) {
                    Vertex v = resultgraph.vertices(vertexProperty.getKey()).next();
                    for (Map.Entry<String, Object> prop : vertexProperty.getValue().entrySet()) {
                        // a List value is written element by element with list cardinality
                        if (prop.getValue() instanceof List) {
                            ((List) prop.getValue()).forEach(value -> v.property(VertexProperty.Cardinality.list, prop.getKey(), value));
                        } else {
                            v.property(VertexProperty.Cardinality.single, prop.getKey(), prop.getValue());
                        }
                    }
                }
            }
        }
        // update runtime and return the newly computed graph
        this.memory.setRuntime(System.currentTimeMillis() - time);
        this.memory.complete();
        return new DefaultComputerResult(resultgraph, this.memory);
    });
}
use of org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult in project janusgraph by JanusGraph.
the class FulgoraGraphComputer method submitAsync.
/**
 * Runs the whole computation in sequence: the vertex program, then the collected map jobs, then the
 * write-back of mutated properties.
 *
 * @return the graph produced by the write-back step together with the completed memory
 */
private ComputerResult submitAsync() {
    final long startedAt = System.currentTimeMillis();
    executeVertexProgram();
    // map jobs are collected only after the vertex program has run
    executeMapJobs(collectMapJobs());
    final Graph computedGraph = writeMutatedPropertiesBackIntoGraph();
    // record the elapsed wall-clock time, then complete the memory before handing it out
    memory.setRuntime(System.currentTimeMillis() - startedAt);
    memory.complete();
    return new DefaultComputerResult(computedGraph, memory);
}
use of org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult in project grakn by graknlabs.
the class GraknSparkComputer method submitWithExecutor.
/**
 * Submits the configured vertex program and/or map-reducers to {@code computerService} as one Spark job
 * group, then shuts the service down so no further tasks are accepted.
 *
 * <p>The submitted task builds the Hadoop/Spark configuration, loads (and optionally filters, partitions and
 * persists) the graph RDD, runs the vertex program until it terminates, executes every map-reducer, writes
 * the outputs and finally cleans up the persisted RDDs and, when nothing is to be persisted, the output
 * location.
 *
 * @return a future yielding the output graph and an immutable view of the final memory
 * @throws IllegalStateException (inside the future) when the input location cannot be resolved or an
 *         input/output RDD class cannot be instantiated
 * @throws RuntimeException (inside the future) wrapping any exception raised while the job executes
 */
@SuppressWarnings("PMD.UnusedFormalParameter")
private Future<ComputerResult> submitWithExecutor() {
    // a random group id identifies the Spark jobs and also isolates the output location below
    jobGroupId = Integer.toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE));
    String jobDescription = this.vertexProgram == null ? this.mapReducers.toString() : this.vertexProgram + "+" + this.mapReducers;
    // Use different output locations
    this.sparkConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + jobGroupId);
    updateConfigKeys(sparkConfiguration);
    final Future<ComputerResult> result = computerService.submit(() -> {
        final long startTime = System.currentTimeMillis();
        // apache and hadoop configurations that are used throughout the graph computer computation
        final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration(this.sparkConfiguration);
        if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
            graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER, GryoSerializer.class.getCanonicalName());
        }
        graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, this.persist.equals(GraphComputer.Persist.EDGES));
        final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);
        final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
        // classify the configured reader/writer classes as file-based or Spark-persisted
        final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean skipPartitioner = graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
        final boolean skipPersist = graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);
        if (inputFromHDFS) {
            String inputLocation = Constants.getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION), fileSystemStorage).orElse(null);
            if (null != inputLocation) {
                try {
                    // store the path reported by the file system in both configurations
                    graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath().toString());
                    hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath().toString());
                } catch (final IOException e) {
                    throw new IllegalStateException(e.getMessage(), e);
                }
            }
        }
        final InputRDD inputRDD;
        final OutputRDD outputRDD;
        final boolean filtered;
        try {
            // use the configured reader/writer when it is an InputRDD/OutputRDD, else the format-based default
            inputRDD = InputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class)) ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputRDD.class, InputRDD.class).newInstance() : InputFormatRDD.class.newInstance();
            outputRDD = OutputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class)) ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputRDD.class, OutputRDD.class).newInstance() : OutputFormatRDD.class.newInstance();
            // if the input class can filter on load, then set the filters
            if (inputRDD instanceof InputFormatRDD && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
                GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration, this.graphFilter);
                filtered = false;
            } else if (inputRDD instanceof GraphFilterAware) {
                ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
                filtered = false;
            } else
                // the reader cannot filter on load, so the filter (if any) is applied to the loaded RDD below
                filtered = this.graphFilter.hasFilter();
        } catch (final InstantiationException | IllegalAccessException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
        // create the spark context from the graph computer configuration
        final JavaSparkContext sparkContext = new JavaSparkContext(Spark.create(hadoopConfiguration));
        final Storage sparkContextStorage = SparkContextStorage.open();
        sparkContext.setJobGroup(jobGroupId, jobDescription);
        GraknSparkMemory memory = null;
        // delete output location
        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
        if (null != outputLocation) {
            if (outputToHDFS && fileSystemStorage.exists(outputLocation)) {
                fileSystemStorage.rm(outputLocation);
            }
            if (outputToSpark && sparkContextStorage.exists(outputLocation)) {
                sparkContextStorage.rm(outputLocation);
            }
        }
        // the Spark application name will always be set by SparkContextStorage,
        // thus, log the name (at debug level) to make it easier to debug
        logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "[" + this.mapReducers + "]");
        // add the project jars to the cluster
        this.loadJars(hadoopConfiguration, sparkContext);
        updateLocalConfiguration(sparkContext, hadoopConfiguration);
        // create a message-passing friendly rdd from the input rdd
        boolean partitioned = false;
        JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD.readGraphRDD(graphComputerConfiguration, sparkContext);
        // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting
        if (filtered) {
            this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter);
            loadedGraphRDD = GraknSparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter);
        }
        // if the loaded graphRDD already has a partitioner keep it, else partition it with HashPartitioner
        // (unless partitioning has been explicitly skipped)
        if (loadedGraphRDD.partitioner().isPresent()) {
            this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: " + loadedGraphRDD.partitioner().get());
        } else {
            if (!skipPartitioner) {
                final Partitioner partitioner = new HashPartitioner(this.workersSet ? this.workers : loadedGraphRDD.partitions().size());
                this.logger.debug("Partitioning the loaded graphRDD: " + partitioner);
                loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner);
                partitioned = true;
                assert loadedGraphRDD.partitioner().isPresent();
            } else {
                // no easy way to test this with a test case
                assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent();
                this.logger.debug("Partitioning has been skipped for the loaded graphRDD via " + Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            }
        }
        // if a worker count was set, make the partition count match it; when the counts already
        // agree, no coalesce/repartition takes place
        if (this.workersSet) {
            // ensures that the loaded graphRDD does not have more partitions than workers
            if (loadedGraphRDD.partitions().size() > this.workers) {
                loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
            } else {
                // ensures that the loaded graphRDD does not have less partitions than workers
                if (loadedGraphRDD.partitions().size() < this.workers) {
                    loadedGraphRDD = loadedGraphRDD.repartition(this.workers);
                }
            }
        }
        // persist the loaded graphRDD at the configured storage level, or else MEMORY_ONLY, unless
        // caching is skipped or the RDD came from Spark and was neither partitioned nor filtered
        if (!skipPersist && (!inputFromSpark || partitioned || filtered)) {
            loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
        }
        // final graph with view
        // (for persisting and/or mapReducing -- may be null and thus, possible to save space/time)
        JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
        try {
            // ////////////////////////////// vertex program
            if (null != this.vertexProgram) {
                memory = new GraknSparkMemory(this.vertexProgram, this.mapReducers, sparkContext);
                // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics
                if (graphComputerConfiguration.containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) {
                    try {
                        final GraknSparkVertexProgramInterceptor<VertexProgram> interceptor = (GraknSparkVertexProgramInterceptor) Class.forName(graphComputerConfiguration.getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)).newInstance();
                        computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory);
                    } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                        // keep the cause so the original reflection failure stays in the stack trace
                        throw new IllegalStateException(e.getMessage(), e);
                    }
                } else {
                    // standard GraphComputer semantics
                    // get a configuration that will be propagated to all workers
                    final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                    this.vertexProgram.storeState(vertexProgramConfiguration);
                    // set up the vertex program and wire up configurations
                    this.vertexProgram.setup(memory);
                    JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;
                    memory.broadcastMemory(sparkContext);
                    // execute the vertex program
                    while (true) {
                        // Thread.interrupted() clears the flag; cancel the Spark jobs and abort
                        if (Thread.interrupted()) {
                            sparkContext.cancelAllJobs();
                            throw new TraversalInterruptedException();
                        }
                        memory.setInExecute(true);
                        viewIncomingRDD = GraknSparkExecutor.executeVertexProgramIteration(loadedGraphRDD, viewIncomingRDD, memory, graphComputerConfiguration, vertexProgramConfiguration);
                        memory.setInExecute(false);
                        if (this.vertexProgram.terminate(memory)) {
                            break;
                        } else {
                            memory.incrIteration();
                            memory.broadcastMemory(sparkContext);
                        }
                    }
                    // if the graph will be written out or map-reduced, then generate a view+graph
                    if ((null != outputRDD && !this.persist.equals(Persist.NOTHING)) || !this.mapReducers.isEmpty()) {
                        computedGraphRDD = GraknSparkExecutor.prepareFinalGraphRDD(loadedGraphRDD, viewIncomingRDD, this.vertexProgram.getVertexComputeKeys());
                        assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD;
                    } else {
                        // ensure that the computedGraphRDD was not created
                        assert null == computedGraphRDD;
                    }
                }
                // ///////////////
                // drop all transient memory keys
                memory.complete();
                // write the computed graph to the respective output (rdd or output format)
                if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) {
                    // the logic holds that a computeGraphRDD must be created at this point
                    assert null != computedGraphRDD;
                    outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD);
                }
            }
            final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD;
            if (!computedGraphCreated) {
                // without a separately computed graph, the map-reducers run on the loaded graph
                computedGraphRDD = loadedGraphRDD;
            }
            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);
            // //////////////////////////// map-reducers
            if (!this.mapReducers.isEmpty()) {
                // create a mapReduceRDD for executing the map reduce jobs on
                JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD;
                if (computedGraphCreated && !outputToSpark) {
                    // drop all the edges of the graph as they are not used in mapReduce processing
                    mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> {
                        vertexWritable.get().dropEdges(Direction.BOTH);
                        return vertexWritable;
                    });
                    // if there is only one MapReduce to execute, don't bother wasting the clock cycles.
                    if (this.mapReducers.size() > 1) {
                        mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
                    }
                }
                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(graphComputerConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = GraknSparkExecutor.executeMap(mapReduceRDD, mapReduce, newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE) ? GraknSparkExecutor.executeCombine(mapRDD, newApacheConfiguration) : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE) ? GraknSparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration) : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    if (null != outputRDD) {
                        mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD(graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                    }
                }
                // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD
                if (computedGraphCreated && !outputToSpark) {
                    assert loadedGraphRDD != computedGraphRDD;
                    assert mapReduceRDD != computedGraphRDD;
                    mapReduceRDD.unpersist();
                } else {
                    assert mapReduceRDD == computedGraphRDD;
                }
            }
            // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD
            if (!inputFromSpark || partitioned || filtered) {
                loadedGraphRDD.unpersist();
            }
            // unpersist a separately computed graph unless it is being persisted to Spark as output
            if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) && computedGraphCreated) {
                computedGraphRDD.unpersist();
            }
            // delete any file system or rdd data if persist nothing
            if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (outputToHDFS) {
                    fileSystemStorage.rm(outputLocation);
                }
                if (outputToSpark) {
                    sparkContextStorage.rm(outputLocation);
                }
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            // clear properties that should not be propagated in an OLAP chain
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration, this.resultGraph, this.persist), finalMemory.asImmutable());
        } catch (Exception e) {
            // So it throws the same exception as tinker does
            throw new RuntimeException(e);
        }
    });
    // the task above is already submitted; shutdown() only stops the service accepting new tasks
    computerService.shutdown();
    return result;
}
Aggregations