Search in sources :

Example 46 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.

In the class TestCompareParameterAveragingSparkVsSingleMachine, the method testAverageEveryStepGraphCNN:

@Test
public void testAverageEveryStepGraphCNN() {
    // Averaging after every minibatch with the SGD updater + SGD optimizer should be
    // mathematically equivalent to plain single-machine training, provided every worker
    // receives exactly the same number of examples. That equality breaks down when
    // RDD.randomSplit is used (the .fit(JavaRDD<DataSet>) path on data that needs splitting),
    // since e.g. 39 examples over 4 executors cannot be divided evenly.
    // Note: this equivalence holds for the SGD updater only.
    final int examplesPerWorker = 10;
    final int workerCount = 4;
    for (boolean keepUpdater : new boolean[] { true, false }) {
        JavaSparkContext sc = getContext(workerCount);
        try {
            // Local baseline: train on three minibatches, one per seed.
            int[] dataSeeds = { 1, 2, 3 };
            ComputationGraph localNet = new ComputationGraph(getGraphConfCNN(12345, Updater.SGD));
            localNet.init();
            INDArray localParamsBefore = localNet.params().dup();
            for (int seed : dataSeeds) {
                DataSet ds = getOneDataSetCNN(examplesPerWorker * workerCount, seed);
                if (!keepUpdater)
                    localNet.setUpdater(null);
                localNet.fit(ds);
            }
            INDArray localParamsAfter = localNet.params().dup();

            // Spark run: a single executor, trained on the same three minibatches.
            TrainingMaster tm = getTrainingMaster(1, examplesPerWorker, keepUpdater);
            SparkComputationGraph sparkNet = new SparkComputationGraph(sc, getGraphConfCNN(12345, Updater.SGD), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray sparkParamsBefore = sparkNet.getNetwork().params().dup();
            for (int seed : dataSeeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamplesCNN(examplesPerWorker * workerCount, seed);
                sparkNet.fit(sc.parallelize(examples));
            }
            System.out.println(sparkNet.getSparkTrainingStats().statsAsString());
            INDArray sparkParamsAfter = sparkNet.getNetwork().params().dup();

            System.out.println("Initial (Local) params:  " + Arrays.toString(localParamsBefore.data().asFloat()));
            System.out.println("Initial (Spark) params:  " + Arrays.toString(sparkParamsBefore.data().asFloat()));
            System.out.println("Final (Local) params:    " + Arrays.toString(localParamsAfter.data().asFloat()));
            System.out.println("Final (Spark) params:    " + Arrays.toString(sparkParamsAfter.data().asFloat()));

            // Same config seed => identical starting parameters; averaging every step with
            // SGD => identical final parameters (within float tolerance).
            assertArrayEquals(localParamsBefore.data().asFloat(), sparkParamsBefore.data().asFloat(), 1e-8f);
            assertArrayEquals(localParamsAfter.data().asFloat(), sparkParamsAfter.data().asFloat(), 1e-6f);

            double sparkScore = sparkNet.getScore();
            assertTrue(sparkScore > 0.0);
            assertEquals(localNet.score(), sparkScore, 1e-3);
        } finally {
            sc.stop();
        }
    }
}
Also used : SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) INDArray(org.nd4j.linalg.api.ndarray.INDArray) DataSet(org.nd4j.linalg.dataset.DataSet) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ComputationGraph(org.deeplearning4j.nn.graph.ComputationGraph) SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) TrainingMaster(org.deeplearning4j.spark.api.TrainingMaster) Test(org.junit.Test)

Example 47 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.

In the class TestCompareParameterAveragingSparkVsSingleMachine, the method testAverageEveryStep:

@Test
public void testAverageEveryStep() {
    // Averaging after every minibatch with the SGD updater + SGD optimizer should be
    // mathematically equivalent to plain single-machine training, provided every worker
    // receives exactly the same number of examples. That equality breaks down when
    // RDD.randomSplit is used (the .fit(JavaRDD<DataSet>) path on data that needs splitting),
    // since e.g. 39 examples over 4 executors cannot be divided evenly.
    // Note: this equivalence holds for the SGD updater only.
    final int examplesPerWorker = 10;
    final int workerCount = 4;
    for (boolean keepUpdater : new boolean[] { true, false }) {
        JavaSparkContext sc = getContext(workerCount);
        try {
            // Local baseline: train on three minibatches, one per seed.
            int[] dataSeeds = { 1, 2, 3 };
            MultiLayerNetwork localNet = new MultiLayerNetwork(getConf(12345, Updater.SGD));
            localNet.init();
            INDArray localParamsBefore = localNet.params().dup();
            for (int seed : dataSeeds) {
                DataSet ds = getOneDataSet(examplesPerWorker * workerCount, seed);
                if (!keepUpdater)
                    localNet.setUpdater(null);
                localNet.fit(ds);
            }
            INDArray localParamsAfter = localNet.params().dup();

            // Spark run: a single executor, averaging after every minibatch, using the
            // Export-based RDD training approach so splitting is deterministic.
            ParameterAveragingTrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(1)
                            .averagingFrequency(1).batchSizePerWorker(examplesPerWorker)
                            .saveUpdater(keepUpdater).workerPrefetchNumBatches(0)
                            .rddTrainingApproach(RDDTrainingApproach.Export).build();
            SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, getConf(12345, Updater.SGD), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray sparkParamsBefore = sparkNet.getNetwork().params().dup();
            for (int seed : dataSeeds) {
                List<DataSet> examples = getOneDataSetAsIndividalExamples(examplesPerWorker * workerCount, seed);
                sparkNet.fit(sc.parallelize(examples));
            }
            System.out.println(sparkNet.getSparkTrainingStats().statsAsString());
            INDArray sparkParamsAfter = sparkNet.getNetwork().params().dup();

            System.out.println("Initial (Local) params:       " + Arrays.toString(localParamsBefore.data().asFloat()));
            System.out.println("Initial (Spark) params:       " + Arrays.toString(sparkParamsBefore.data().asFloat()));
            System.out.println("Final (Local) params: " + Arrays.toString(localParamsAfter.data().asFloat()));
            System.out.println("Final (Spark) params: " + Arrays.toString(sparkParamsAfter.data().asFloat()));

            // Same config seed => identical starting parameters; training must have changed
            // them; averaging every step with SGD => identical final parameters.
            assertEquals(localParamsBefore, sparkParamsBefore);
            assertNotEquals(localParamsBefore, localParamsAfter);
            assertEquals(localParamsAfter, sparkParamsAfter);

            double sparkScore = sparkNet.getScore();
            assertTrue(sparkScore > 0.0);
            assertEquals(localNet.score(), sparkScore, 1e-3);
        } finally {
            sc.stop();
        }
    }
}
Also used : DataSet(org.nd4j.linalg.dataset.DataSet) INDArray(org.nd4j.linalg.api.ndarray.INDArray) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) Test(org.junit.Test)

Example 48 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project deeplearning4j by deeplearning4j.

In the class TestTrainingStatsCollection, the method testStatsCollection:

@Test
public void testStatsCollection() throws Exception {
    int nWorkers = 4;
    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[" + nWorkers + "]");
    sparkConf.setAppName("Test");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    try {
        // Minimal 2-layer network; 1 iteration per fit call so the expected stat counts below hold.
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1).list().layer(0, new DenseLayer.Builder().nIn(10).nOut(10).build()).layer(1, new OutputLayer.Builder().nIn(10).nOut(10).build()).pretrain(false).backprop(true).build();
        int miniBatchSizePerWorker = 10;
        int averagingFrequency = 5;
        int numberOfAveragings = 3;
        // Exactly enough examples for every worker to see averagingFrequency minibatches per averaging.
        int totalExamples = nWorkers * miniBatchSizePerWorker * averagingFrequency * numberOfAveragings;
        Nd4j.getRandom().setSeed(12345);
        List<DataSet> list = new ArrayList<>();
        for (int i = 0; i < totalExamples; i++) {
            INDArray f = Nd4j.rand(1, 10);
            INDArray l = Nd4j.rand(1, 10);
            DataSet ds = new DataSet(f, l);
            list.add(ds);
        }
        JavaRDD<DataSet> rdd = sc.parallelize(list);
        //FIX: RDDs are immutable - repartition(...) is a transformation that returns a NEW RDD.
        // The previous code called rdd.repartition(4) and discarded the result, so the
        // repartitioning silently never happened. Also use nWorkers instead of the literal 4.
        rdd = rdd.repartition(nWorkers);
        ParameterAveragingTrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(nWorkers, 1).averagingFrequency(averagingFrequency).batchSizePerWorker(miniBatchSizePerWorker).saveUpdater(true).workerPrefetchNumBatches(0).repartionData(Repartition.Always).build();
        SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf, tm);
        sparkNet.setCollectTrainingStats(true);
        sparkNet.fit(rdd);
        // Collect the expected stat keys via reflection on the private "columnNames" field
        // of each stats class (no public accessor exists for it).
        List<String> expectedStatNames = new ArrayList<>();
        //FIX: use Class<?>[] (not raw Class[]) to avoid a raw-type/unchecked warning.
        Class<?>[] classes = new Class<?>[] { CommonSparkTrainingStats.class, ParameterAveragingTrainingMasterStats.class, ParameterAveragingTrainingWorkerStats.class };
        String[] fieldNames = new String[] { "columnNames", "columnNames", "columnNames" };
        for (int i = 0; i < classes.length; i++) {
            Field field = classes[i].getDeclaredField(fieldNames[i]);
            field.setAccessible(true);
            // Static field: field.get(null)
            Object f = field.get(null);
            // Safe by construction: columnNames is declared as a Collection<String> in each stats class.
            @SuppressWarnings("unchecked")
            Collection<String> c = (Collection<String>) f;
            expectedStatNames.addAll(c);
        }
        System.out.println(expectedStatNames);
        SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
        Set<String> actualKeySet = stats.getKeySet();
        assertEquals(expectedStatNames.size(), actualKeySet.size());
        for (String s : stats.getKeySet()) {
            assertTrue(expectedStatNames.contains(s));
            assertNotNull(stats.getValue(s));
        }
        String statsAsString = stats.statsAsString();
        System.out.println(statsAsString);
        //One line per stat
        assertEquals(actualKeySet.size(), statsAsString.split("\n").length);
        //Go through nested stats
        //First: master stats
        assertTrue(stats instanceof ParameterAveragingTrainingMasterStats);
        ParameterAveragingTrainingMasterStats masterStats = (ParameterAveragingTrainingMasterStats) stats;
        // Export happens once per fit(JavaRDD) call.
        List<EventStats> exportTimeStats = masterStats.getParameterAveragingMasterExportTimesMs();
        assertEquals(1, exportTimeStats.size());
        assertDurationGreaterZero(exportTimeStats);
        assertNonNullFields(exportTimeStats);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(exportTimeStats, 1, 1, 1);
        List<EventStats> countRddTime = masterStats.getParameterAveragingMasterCountRddSizeTimesMs();
        //occurs once per fit
        assertEquals(1, countRddTime.size());
        assertDurationGreaterEqZero(countRddTime);
        assertNonNullFields(countRddTime);
        //should occur only in master once
        assertExpectedNumberMachineIdsJvmIdsThreadIds(countRddTime, 1, 1, 1);
        // Broadcast of parameters occurs once per averaging round.
        List<EventStats> broadcastCreateTime = masterStats.getParameterAveragingMasterBroadcastCreateTimesMs();
        assertEquals(numberOfAveragings, broadcastCreateTime.size());
        assertDurationGreaterEqZero(broadcastCreateTime);
        assertNonNullFields(broadcastCreateTime);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(broadcastCreateTime, 1, 1, 1);
        List<EventStats> fitTimes = masterStats.getParameterAveragingMasterFitTimesMs();
        //i.e., number of times fit(JavaRDD<DataSet>) was called
        assertEquals(1, fitTimes.size());
        assertDurationGreaterZero(fitTimes);
        assertNonNullFields(fitTimes);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(fitTimes, 1, 1, 1);
        List<EventStats> splitTimes = masterStats.getParameterAveragingMasterSplitTimesMs();
        //Splitting of the data set is executed once only (i.e., one fit(JavaRDD<DataSet>) call)
        assertEquals(1, splitTimes.size());
        assertDurationGreaterEqZero(splitTimes);
        assertNonNullFields(splitTimes);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(splitTimes, 1, 1, 1);
        // Aggregation, params/updater processing and repartitioning each occur once per averaging round.
        List<EventStats> aggregateTimesMs = masterStats.getParamaterAveragingMasterAggregateTimesMs();
        assertEquals(numberOfAveragings, aggregateTimesMs.size());
        assertDurationGreaterEqZero(aggregateTimesMs);
        assertNonNullFields(aggregateTimesMs);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(aggregateTimesMs, 1, 1, 1);
        List<EventStats> processParamsTimesMs = masterStats.getParameterAveragingMasterProcessParamsUpdaterTimesMs();
        assertEquals(numberOfAveragings, processParamsTimesMs.size());
        assertDurationGreaterEqZero(processParamsTimesMs);
        assertNonNullFields(processParamsTimesMs);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(processParamsTimesMs, 1, 1, 1);
        List<EventStats> repartitionTimesMs = masterStats.getParameterAveragingMasterRepartitionTimesMs();
        assertEquals(numberOfAveragings, repartitionTimesMs.size());
        assertDurationGreaterEqZero(repartitionTimesMs);
        assertNonNullFields(repartitionTimesMs);
        //only 1 thread for master
        assertExpectedNumberMachineIdsJvmIdsThreadIds(repartitionTimesMs, 1, 1, 1);
        //Second: Common spark training stats
        SparkTrainingStats commonStats = masterStats.getNestedTrainingStats();
        assertNotNull(commonStats);
        assertTrue(commonStats instanceof CommonSparkTrainingStats);
        CommonSparkTrainingStats cStats = (CommonSparkTrainingStats) commonStats;
        // Each worker runs one flat-map invocation per averaging round.
        List<EventStats> workerFlatMapTotalTimeMs = cStats.getWorkerFlatMapTotalTimeMs();
        assertEquals(numberOfAveragings * nWorkers, workerFlatMapTotalTimeMs.size());
        assertDurationGreaterZero(workerFlatMapTotalTimeMs);
        assertNonNullFields(workerFlatMapTotalTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(workerFlatMapTotalTimeMs, 1, 1, nWorkers);
        List<EventStats> workerFlatMapGetInitialModelTimeMs = cStats.getWorkerFlatMapGetInitialModelTimeMs();
        assertEquals(numberOfAveragings * nWorkers, workerFlatMapGetInitialModelTimeMs.size());
        assertDurationGreaterEqZero(workerFlatMapGetInitialModelTimeMs);
        assertNonNullFields(workerFlatMapGetInitialModelTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(workerFlatMapGetInitialModelTimeMs, 1, 1, nWorkers);
        List<EventStats> workerFlatMapDataSetGetTimesMs = cStats.getWorkerFlatMapDataSetGetTimesMs();
        int numMinibatchesProcessed = workerFlatMapDataSetGetTimesMs.size();
        //1 for every time we get a data set
        int expectedNumMinibatchesProcessed = numberOfAveragings * nWorkers * averagingFrequency;
        //Sometimes random split is just bad - some executors might miss out on getting the expected amount of data
        assertTrue(numMinibatchesProcessed >= expectedNumMinibatchesProcessed - 5);
        List<EventStats> workerFlatMapProcessMiniBatchTimesMs = cStats.getWorkerFlatMapProcessMiniBatchTimesMs();
        assertTrue(workerFlatMapProcessMiniBatchTimesMs.size() >= numberOfAveragings * nWorkers * averagingFrequency - 5);
        assertDurationGreaterEqZero(workerFlatMapProcessMiniBatchTimesMs);
        assertNonNullFields(workerFlatMapDataSetGetTimesMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(workerFlatMapDataSetGetTimesMs, 1, 1, nWorkers);
        //Third: ParameterAveragingTrainingWorker stats
        SparkTrainingStats paramAvgStats = cStats.getNestedTrainingStats();
        assertNotNull(paramAvgStats);
        assertTrue(paramAvgStats instanceof ParameterAveragingTrainingWorkerStats);
        ParameterAveragingTrainingWorkerStats pStats = (ParameterAveragingTrainingWorkerStats) paramAvgStats;
        List<EventStats> parameterAveragingWorkerBroadcastGetValueTimeMs = pStats.getParameterAveragingWorkerBroadcastGetValueTimeMs();
        assertEquals(numberOfAveragings * nWorkers, parameterAveragingWorkerBroadcastGetValueTimeMs.size());
        assertDurationGreaterEqZero(parameterAveragingWorkerBroadcastGetValueTimeMs);
        assertNonNullFields(parameterAveragingWorkerBroadcastGetValueTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(parameterAveragingWorkerBroadcastGetValueTimeMs, 1, 1, nWorkers);
        List<EventStats> parameterAveragingWorkerInitTimeMs = pStats.getParameterAveragingWorkerInitTimeMs();
        assertEquals(numberOfAveragings * nWorkers, parameterAveragingWorkerInitTimeMs.size());
        assertDurationGreaterEqZero(parameterAveragingWorkerInitTimeMs);
        assertNonNullFields(parameterAveragingWorkerInitTimeMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(parameterAveragingWorkerInitTimeMs, 1, 1, nWorkers);
        List<EventStats> parameterAveragingWorkerFitTimesMs = pStats.getParameterAveragingWorkerFitTimesMs();
        assertTrue(parameterAveragingWorkerFitTimesMs.size() >= numberOfAveragings * nWorkers * averagingFrequency - 5);
        assertDurationGreaterEqZero(parameterAveragingWorkerFitTimesMs);
        assertNonNullFields(parameterAveragingWorkerFitTimesMs);
        assertExpectedNumberMachineIdsJvmIdsThreadIds(parameterAveragingWorkerFitTimesMs, 1, 1, nWorkers);
        // Worker stats are the innermost level - nothing nested below them.
        assertNull(pStats.getNestedTrainingStats());
        //Finally: try exporting stats (file export, HTML file export, and HTML export to a stream)
        String tempDir = System.getProperty("java.io.tmpdir");
        String outDir = FilenameUtils.concat(tempDir, "dl4j_testTrainingStatsCollection");
        stats.exportStatFiles(outDir, sc.sc());
        String htmlPlotsPath = FilenameUtils.concat(outDir, "AnalysisPlots.html");
        StatsUtils.exportStatsAsHtml(stats, htmlPlotsPath, sc);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        StatsUtils.exportStatsAsHTML(stats, baos);
        baos.close();
        byte[] bytes = baos.toByteArray();
        String str = new String(bytes, "UTF-8");
        //FIX: previously 'str' was computed and silently discarded; assert the export
        // actually produced HTML content so a broken exporter fails the test.
        assertTrue(str.length() > 0);
    } finally {
        sc.stop();
    }
}
Also used : OutputLayer(org.deeplearning4j.nn.conf.layers.OutputLayer) ParameterAveragingTrainingMasterStats(org.deeplearning4j.spark.impl.paramavg.stats.ParameterAveragingTrainingMasterStats) DataSet(org.nd4j.linalg.dataset.DataSet) CommonSparkTrainingStats(org.deeplearning4j.spark.api.stats.CommonSparkTrainingStats) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) Field(java.lang.reflect.Field) EventStats(org.deeplearning4j.spark.stats.EventStats) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ParameterAveragingTrainingWorkerStats(org.deeplearning4j.spark.impl.paramavg.stats.ParameterAveragingTrainingWorkerStats) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ParameterAveragingTrainingMaster(org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster) INDArray(org.nd4j.linalg.api.ndarray.INDArray) CommonSparkTrainingStats(org.deeplearning4j.spark.api.stats.CommonSparkTrainingStats) SparkConf(org.apache.spark.SparkConf) Test(org.junit.Test)

Example 49 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project azure-tools-for-java by Microsoft.

In the class JavaSparkPi, the method main:

/**
 * Monte-Carlo estimate of Pi: sample points uniformly in the unit square and count
 * the fraction falling inside the unit circle; Pi ~= 4 * inside / total.
 *
 * @param args optional single argument: the number of partitions (slices); defaults to 2
 */
public static void main(String[] args) throws Exception {
    //use this line if you want to run your application in the cluster
    //SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi").setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    //FIX: stop the context in a finally block so it is released even if the job throws
    // (previously an exception anywhere below would leak the SparkContext).
    try {
        int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
        //100000 samples per partition
        int n = 100000 * slices;
        List<Integer> l = new ArrayList<>(n);
        for (int i = 0; i < n; i++) {
            l.add(i);
        }
        JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
        //Map each element to 1 if a random point lands inside the unit circle, else 0,
        // then sum the hits with reduce.
        int count = dataSet.map(new Function<Integer, Integer>() {

            @Override
            public Integer call(Integer integer) {
                double x = Math.random() * 2 - 1;
                double y = Math.random() * 2 - 1;
                return (x * x + y * y < 1) ? 1 : 0;
            }
        }).reduce(new Function2<Integer, Integer, Integer>() {

            @Override
            public Integer call(Integer integer, Integer integer2) {
                return integer + integer2;
            }
        });
        System.out.println("Pi is roughly " + 4.0 * count / n);
    } finally {
        jsc.stop();
    }
}
Also used : Function(org.apache.spark.api.java.function.Function) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)

Example 50 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project geode by apache.

In the class OQLJavaDemo, the method main:

/**
 * Demo: run a Geode OQL query through the Spark SQL connector and print the result.
 *
 * @param argv exactly one argument: the Geode locator(s), e.g. "192.168.1.47[10335]"
 */
public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: OQLJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("OQLJavaDemo");
    // "192.168.1.47[10335]"
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    //FIX: stop the context in a finally block so it is released even if the OQL query
    // or DataFrame display throws (previously a failure would leak the SparkContext).
    try {
        SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
        DataFrame df = javaFunctions(sqlContext).geodeOQL("select * from /str_str_region");
        System.out.println("======= DataFrame =======\n");
        df.show();
    } finally {
        sc.stop();
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DataFrame(org.apache.spark.sql.DataFrame) SparkConf(org.apache.spark.SparkConf) SQLContext(org.apache.spark.sql.SQLContext)

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)260 Test (org.testng.annotations.Test)65 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)64 SparkConf (org.apache.spark.SparkConf)49 Tuple2 (scala.Tuple2)48 ArrayList (java.util.ArrayList)45 Test (org.junit.Test)43 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)32 List (java.util.List)28 Configuration (org.apache.hadoop.conf.Configuration)24 JavaRDD (org.apache.spark.api.java.JavaRDD)24 File (java.io.File)23 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)20 Collectors (java.util.stream.Collectors)16 TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline)15 DataSet (org.nd4j.linalg.dataset.DataSet)15 IOException (java.io.IOException)14 SAMFileHeader (htsjdk.samtools.SAMFileHeader)12 HashSet (java.util.HashSet)12 RealMatrix (org.apache.commons.math3.linear.RealMatrix)12