Examples with SparkContextStorage - org.apache.tinkerpop.gremlin.spark.structure.io.SparkContextStorage

Example 1 with SparkContextStorage

Use of org.apache.tinkerpop.gremlin.spark.structure.io.SparkContextStorage in the project grakn by graknlabs.

The method submitWithExecutor of the class GraknSparkComputer.

/**
 * Submits the configured vertex program and/or map-reduce jobs to {@code computerService}
 * and returns a future for the result.
 *
 * <p>Before submission a random job group id is generated, appended to the configured
 * output location, and the configuration keys are refreshed. The submitted task then:
 * <ol>
 *   <li>builds the graph-computer and Hadoop configurations;</li>
 *   <li>instantiates the input/output RDD handlers and applies the graph filter;</li>
 *   <li>creates the Spark context and removes any pre-existing output;</li>
 *   <li>loads, filters, partitions and persists the graph RDD;</li>
 *   <li>runs the vertex program (directly or through a configured interceptor);</li>
 *   <li>runs every registered map-reduce job;</li>
 *   <li>unpersists intermediate RDDs and assembles the {@link ComputerResult}.</li>
 * </ol>
 *
 * <p>{@code computerService.shutdown()} is invoked right after the task is submitted.
 * NOTE(review): presumably {@code computerService} is an {@code ExecutorService}, in which
 * case it accepts no further tasks after this call -- confirm against the field declaration.
 *
 * @return a future that completes with the computation result
 * @throws IllegalStateException (inside the task) if the input location cannot be resolved or
 *         an input/output RDD or interceptor class cannot be instantiated; failures raised
 *         inside the execution phase are wrapped in a {@link RuntimeException}
 */
@SuppressWarnings("PMD.UnusedFormalParameter")
private Future<ComputerResult> submitWithExecutor() {
    jobGroupId = Integer.toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE));
    String jobDescription = this.vertexProgram == null ? this.mapReducers.toString() : this.vertexProgram + "+" + this.mapReducers;
    // Use different output locations: each submission writes below <output>/<jobGroupId>
    this.sparkConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + jobGroupId);
    updateConfigKeys(sparkConfiguration);
    final Future<ComputerResult> result = computerService.submit(() -> {
        final long startTime = System.currentTimeMillis();
        // apache and hadoop configurations that are used throughout the graph computer computation
        final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration(this.sparkConfiguration);
        if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
            graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER, GryoSerializer.class.getCanonicalName());
        }
        graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, this.persist.equals(GraphComputer.Persist.EDGES));
        final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);
        final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
        // classify the configured reader/writer classes: file based (HDFS) versus persisted Spark RDD
        final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean skipPartitioner = graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
        final boolean skipPersist = graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);
        if (inputFromHDFS) {
            String inputLocation = Constants.getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION), fileSystemStorage).orElse(null);
            if (null != inputLocation) {
                try {
                    // resolve the input path once and publish it to both configurations
                    final String resolvedInputLocation = FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath().toString();
                    graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, resolvedInputLocation);
                    hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, resolvedInputLocation);
                } catch (final IOException e) {
                    throw new IllegalStateException(e.getMessage(), e);
                }
            }
        }
        final InputRDD inputRDD;
        final OutputRDD outputRDD;
        // true when the graph filter could not be pushed down to the reader and has to be applied after loading
        final boolean filtered;
        try {
            // use the configured reader/writer when it is an InputRDD/OutputRDD, otherwise wrap the Hadoop format
            inputRDD = InputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class)) ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputRDD.class, InputRDD.class).newInstance() : InputFormatRDD.class.newInstance();
            outputRDD = OutputRDD.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class)) ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputRDD.class, OutputRDD.class).newInstance() : OutputFormatRDD.class.newInstance();
            // if the input class can filter on load, then set the filters
            if (inputRDD instanceof InputFormatRDD && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
                GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration, this.graphFilter);
                filtered = false;
            } else if (inputRDD instanceof GraphFilterAware) {
                ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
                filtered = false;
            } else
                filtered = this.graphFilter.hasFilter();
        } catch (final InstantiationException | IllegalAccessException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
        // create the spark context from the graph computer configuration
        final JavaSparkContext sparkContext = new JavaSparkContext(Spark.create(hadoopConfiguration));
        final Storage sparkContextStorage = SparkContextStorage.open();
        // tag every Spark job started by this task with the job group id and description
        sparkContext.setJobGroup(jobGroupId, jobDescription);
        GraknSparkMemory memory = null;
        // delete output location
        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
        if (null != outputLocation) {
            if (outputToHDFS && fileSystemStorage.exists(outputLocation)) {
                fileSystemStorage.rm(outputLocation);
            }
            if (outputToSpark && sparkContextStorage.exists(outputLocation)) {
                sparkContextStorage.rm(outputLocation);
            }
        }
        // log (at debug level) the job name built from the vertex program and map reducers
        // to make the submission easier to trace
        logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "[" + this.mapReducers + "]");
        // add the project jars to the cluster
        this.loadJars(hadoopConfiguration, sparkContext);
        updateLocalConfiguration(sparkContext, hadoopConfiguration);
        // create a message-passing friendly rdd from the input rdd
        boolean partitioned = false;
        JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD.readGraphRDD(graphComputerConfiguration, sparkContext);
        // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting
        if (filtered) {
            this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter);
            loadedGraphRDD = GraknSparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter);
        }
        // if the loaded graphRDD already has a partitioner keep it, else partition it with HashPartitioner
        // (unless partitioning is explicitly skipped via configuration)
        if (loadedGraphRDD.partitioner().isPresent()) {
            this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: " + loadedGraphRDD.partitioner().get());
        } else {
            if (!skipPartitioner) {
                final Partitioner partitioner = new HashPartitioner(this.workersSet ? this.workers : loadedGraphRDD.partitions().size());
                this.logger.debug("Partitioning the loaded graphRDD: " + partitioner);
                loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner);
                partitioned = true;
                assert loadedGraphRDD.partitioner().isPresent();
            } else {
                // no easy way to test this with a test case
                assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent();
                this.logger.debug("Partitioning has been skipped for the loaded graphRDD via " + Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            }
        }
        // align the partition count with the requested workers; when the partition count
        // already equals the worker count neither the coalesce nor the repartition takes place
        if (this.workersSet) {
            // ensures that the loaded graphRDD does not have more partitions than workers
            if (loadedGraphRDD.partitions().size() > this.workers) {
                loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
            } else {
                // ensures that the loaded graphRDD does not have less partitions than workers
                if (loadedGraphRDD.partitions().size() < this.workers) {
                    loadedGraphRDD = loadedGraphRDD.repartition(this.workers);
                }
            }
        }
        // persist the loaded graph with the configured storage level, or else use the default MEMORY_ONLY;
        // an untouched RDD that came straight from Spark is not persisted again
        if (!skipPersist && (!inputFromSpark || partitioned || filtered)) {
            loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
        }
        // final graph with view
        // (for persisting and/or mapReducing -- may be null and thus, possible to save space/time)
        JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
        try {
            // ---- vertex program execution ----
            if (null != this.vertexProgram) {
                memory = new GraknSparkMemory(this.vertexProgram, this.mapReducers, sparkContext);
                // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics
                if (graphComputerConfiguration.containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) {
                    try {
                        final GraknSparkVertexProgramInterceptor<VertexProgram> interceptor = (GraknSparkVertexProgramInterceptor) Class.forName(graphComputerConfiguration.getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)).newInstance();
                        computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory);
                    } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                        // keep the original exception as the cause so the failing class remains diagnosable
                        throw new IllegalStateException(e.getMessage(), e);
                    }
                } else {
                    // standard GraphComputer semantics
                    // get a configuration that will be propagated to all workers
                    final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                    this.vertexProgram.storeState(vertexProgramConfiguration);
                    // set up the vertex program and wire up configurations
                    this.vertexProgram.setup(memory);
                    JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;
                    memory.broadcastMemory(sparkContext);
                    // execute the vertex program
                    while (true) {
                        // abort promptly if the submitting thread was interrupted
                        if (Thread.interrupted()) {
                            sparkContext.cancelAllJobs();
                            throw new TraversalInterruptedException();
                        }
                        memory.setInExecute(true);
                        viewIncomingRDD = GraknSparkExecutor.executeVertexProgramIteration(loadedGraphRDD, viewIncomingRDD, memory, graphComputerConfiguration, vertexProgramConfiguration);
                        memory.setInExecute(false);
                        if (this.vertexProgram.terminate(memory)) {
                            break;
                        } else {
                            memory.incrIteration();
                            memory.broadcastMemory(sparkContext);
                        }
                    }
                    // if the graph will be used further (persisted or map-reduced), then generate a view+graph
                    if ((null != outputRDD && !this.persist.equals(Persist.NOTHING)) || !this.mapReducers.isEmpty()) {
                        computedGraphRDD = GraknSparkExecutor.prepareFinalGraphRDD(loadedGraphRDD, viewIncomingRDD, this.vertexProgram.getVertexComputeKeys());
                        assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD;
                    } else {
                        // ensure that the computedGraphRDD was not created
                        assert null == computedGraphRDD;
                    }
                }
                // ---- end of vertex program execution ----
                // drop all transient memory keys
                memory.complete();
                // write the computed graph to the respective output (rdd or output format)
                if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) {
                    // the logic holds that a computeGraphRDD must be created at this point
                    assert null != computedGraphRDD;
                    outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD);
                }
            }
            // when no distinct computed graph exists, fall back to the loaded graph for map-reduce
            final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD;
            if (!computedGraphCreated) {
                computedGraphRDD = loadedGraphRDD;
            }
            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);
            // ---- map-reduce execution ----
            if (!this.mapReducers.isEmpty()) {
                // create a mapReduceRDD for executing the map reduce jobs on
                JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD;
                if (computedGraphCreated && !outputToSpark) {
                    // drop all the edges of the graph as they are not used in mapReduce processing
                    mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> {
                        vertexWritable.get().dropEdges(Direction.BOTH);
                        return vertexWritable;
                    });
                    // if there is only one MapReduce to execute, don't bother wasting the clock cycles.
                    if (this.mapReducers.size() > 1) {
                        mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
                    }
                }
                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(graphComputerConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = GraknSparkExecutor.executeMap(mapReduceRDD, mapReduce, newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE) ? GraknSparkExecutor.executeCombine(mapRDD, newApacheConfiguration) : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE) ? GraknSparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration) : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    if (null != outputRDD) {
                        mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD(graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                    }
                }
                // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD
                if (computedGraphCreated && !outputToSpark) {
                    assert loadedGraphRDD != computedGraphRDD;
                    assert mapReduceRDD != computedGraphRDD;
                    mapReduceRDD.unpersist();
                } else {
                    assert mapReduceRDD == computedGraphRDD;
                }
            }
            // unpersist the loaded graph unless it is the untouched RDD that came from Spark:
            // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD
            if (!inputFromSpark || partitioned || filtered) {
                loadedGraphRDD.unpersist();
            }
            // unpersist the computed graph when it is not kept as a Spark output; if the computed graph
            // is just the loadedGraphRDD (nothing new was created), then don't unpersist it here
            if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) && computedGraphCreated) {
                computedGraphRDD.unpersist();
            }
            // delete any file system or rdd data if persist nothing
            if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (outputToHDFS) {
                    fileSystemStorage.rm(outputLocation);
                }
                if (outputToSpark) {
                    sparkContextStorage.rm(outputLocation);
                }
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            // clear properties that should not be propagated in an OLAP chain
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration, this.resultGraph, this.persist), finalMemory.asImmutable());
        } catch (Exception e) {
            // So it throws the same exception as tinker does
            throw new RuntimeException(e);
        }
    });
    // the single task has been queued; stop the executor from accepting anything else
    computerService.shutdown();
    return result;
}

Also used : InputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.InputRDD) PersistedInputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.PersistedInputRDD) TraversalInterruptedException(org.apache.tinkerpop.gremlin.process.traversal.util.TraversalInterruptedException) GryoSerializer(org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoSerializer) FileSystem(org.apache.hadoop.fs.FileSystem) GraphFilterAware(org.apache.tinkerpop.gremlin.hadoop.structure.io.GraphFilterAware) GraphComputer(org.apache.tinkerpop.gremlin.process.computer.GraphComputer) LoggerFactory(org.slf4j.LoggerFactory) SparkContextStorage(org.apache.tinkerpop.gremlin.spark.structure.io.SparkContextStorage) Future(java.util.concurrent.Future) Partitioner(org.apache.spark.Partitioner) StorageLevel(org.apache.spark.storage.StorageLevel) Constants(org.apache.tinkerpop.gremlin.hadoop.Constants) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) ThreadFactory(java.util.concurrent.ThreadFactory) DefaultComputerResult(org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult) InputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.InputRDD) HadoopConfiguration(org.apache.tinkerpop.gremlin.hadoop.structure.HadoopConfiguration) OutputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.OutputRDD) HashPartitioner(org.apache.spark.HashPartitioner) Set(java.util.Set) BasicThreadFactory(org.apache.commons.lang3.concurrent.BasicThreadFactory) Executors(java.util.concurrent.Executors) SparkSingleIterationStrategy(org.apache.tinkerpop.gremlin.spark.process.computer.traversal.strategy.optimization.SparkSingleIterationStrategy) Memory(org.apache.tinkerpop.gremlin.process.computer.Memory) OutputFormatRDD(org.apache.tinkerpop.gremlin.spark.structure.io.OutputFormatRDD) InputFormatRDD(org.apache.tinkerpop.gremlin.spark.structure.io.InputFormatRDD) MapMemory(org.apache.tinkerpop.gremlin.process.computer.util.MapMemory) 
FileConfiguration(org.apache.commons.configuration.FileConfiguration) TraversalStrategies(org.apache.tinkerpop.gremlin.process.traversal.TraversalStrategies) TraversalInterruptedException(org.apache.tinkerpop.gremlin.process.traversal.util.TraversalInterruptedException) ConfigurationUtils(org.apache.commons.configuration.ConfigurationUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ComputerSubmissionHelper(org.apache.tinkerpop.gremlin.hadoop.process.computer.util.ComputerSubmissionHelper) VertexProgram(org.apache.tinkerpop.gremlin.process.computer.VertexProgram) HashSet(java.util.HashSet) VertexWritable(org.apache.tinkerpop.gremlin.hadoop.structure.io.VertexWritable) ComputerResult(org.apache.tinkerpop.gremlin.process.computer.ComputerResult) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) PropertiesConfiguration(org.apache.commons.configuration.PropertiesConfiguration) AbstractHadoopGraphComputer(org.apache.tinkerpop.gremlin.hadoop.process.computer.AbstractHadoopGraphComputer) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) ExecutorService(java.util.concurrent.ExecutorService) FileSystemStorage(org.apache.tinkerpop.gremlin.hadoop.structure.io.FileSystemStorage) ConfUtil(org.apache.tinkerpop.gremlin.hadoop.structure.util.ConfUtil) ViewIncomingPayload(org.apache.tinkerpop.gremlin.spark.process.computer.payload.ViewIncomingPayload) Logger(org.slf4j.Logger) SparkLauncher(org.apache.spark.launcher.SparkLauncher) InputFormat(org.apache.hadoop.mapreduce.InputFormat) InputOutputHelper(org.apache.tinkerpop.gremlin.spark.structure.io.InputOutputHelper) Spark(org.apache.tinkerpop.gremlin.spark.structure.Spark) IOException(java.io.IOException) SparkInterceptorStrategy(org.apache.tinkerpop.gremlin.spark.process.computer.traversal.strategy.optimization.SparkInterceptorStrategy) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) File(java.io.File) 
PersistedOutputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) Direction(org.apache.tinkerpop.gremlin.structure.Direction) HadoopGraph(org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph) PersistedInputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.PersistedInputRDD) Storage(org.apache.tinkerpop.gremlin.structure.io.Storage) MapReduce(org.apache.tinkerpop.gremlin.process.computer.MapReduce) VertexWritable(org.apache.tinkerpop.gremlin.hadoop.structure.io.VertexWritable) Configuration(org.apache.hadoop.conf.Configuration) HadoopConfiguration(org.apache.tinkerpop.gremlin.hadoop.structure.HadoopConfiguration) FileConfiguration(org.apache.commons.configuration.FileConfiguration) PropertiesConfiguration(org.apache.commons.configuration.PropertiesConfiguration) Memory(org.apache.tinkerpop.gremlin.process.computer.Memory) MapMemory(org.apache.tinkerpop.gremlin.process.computer.util.MapMemory) ViewIncomingPayload(org.apache.tinkerpop.gremlin.spark.process.computer.payload.ViewIncomingPayload) InputFormatRDD(org.apache.tinkerpop.gremlin.spark.structure.io.InputFormatRDD) MapReduce(org.apache.tinkerpop.gremlin.process.computer.MapReduce) GraphFilterAware(org.apache.tinkerpop.gremlin.hadoop.structure.io.GraphFilterAware) MapMemory(org.apache.tinkerpop.gremlin.process.computer.util.MapMemory) DefaultComputerResult(org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult) ComputerResult(org.apache.tinkerpop.gremlin.process.computer.ComputerResult) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Partitioner(org.apache.spark.Partitioner) HashPartitioner(org.apache.spark.HashPartitioner) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) VertexProgram(org.apache.tinkerpop.gremlin.process.computer.VertexProgram) 
TraversalInterruptedException(org.apache.tinkerpop.gremlin.process.traversal.util.TraversalInterruptedException) IOException(java.io.IOException) OutputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.OutputRDD) PersistedOutputRDD(org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD) SparkContextStorage(org.apache.tinkerpop.gremlin.spark.structure.io.SparkContextStorage) FileSystemStorage(org.apache.tinkerpop.gremlin.hadoop.structure.io.FileSystemStorage) Storage(org.apache.tinkerpop.gremlin.structure.io.Storage) DefaultComputerResult(org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult) HashPartitioner(org.apache.spark.HashPartitioner) GryoSerializer(org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoSerializer) HadoopConfiguration(org.apache.tinkerpop.gremlin.hadoop.structure.HadoopConfiguration)

Aggregations

File (java.io.File)1 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Set (java.util.Set)1 ExecutorService (java.util.concurrent.ExecutorService)1 Executors (java.util.concurrent.Executors)1 Future (java.util.concurrent.Future)1 ThreadFactory (java.util.concurrent.ThreadFactory)1 ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom)1 ConfigurationUtils (org.apache.commons.configuration.ConfigurationUtils)1 FileConfiguration (org.apache.commons.configuration.FileConfiguration)1 PropertiesConfiguration (org.apache.commons.configuration.PropertiesConfiguration)1 BasicThreadFactory (org.apache.commons.lang3.concurrent.BasicThreadFactory)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 InputFormat (org.apache.hadoop.mapreduce.InputFormat)1 FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat)1 FileOutputFormat (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat)1 HashPartitioner (org.apache.spark.HashPartitioner)1