Example 81 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.

From the class TestSparkMultiLayerParameterAveraging, method testFitViaStringPathsSize1.

@Test
public void testFitViaStringPathsSize1() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPathsSize1");
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();
    int dataSetObjSize = 1;
    int batchSizePerExecutor = 25;
    int numSplits = 10;
    int averagingFrequency = 3;
    int totalExamples = numExecutors() * batchSizePerExecutor * numSplits * averagingFrequency;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, totalExamples, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(28 * 28).nOut(50).activation(Activation.TANH).build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true)
            .build();
    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5)
                    .batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(averagingFrequency)
                    //repartionData (sic) is the method name as it appears in the DL4J builder API
                    .repartionData(Repartition.Always)
                    .build());
    sparkNet.setCollectTrainingStats(true);
    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    Thread.sleep(2000);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    //Expect numSplits repartition events, and numSplits * numExecutors() * averagingFrequency worker fit events:
    System.out.println(stats.statsAsString());
    assertEquals(numSplits, stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size());
    List<EventStats> list = stats.getValue("ParameterAveragingWorkerFitTimesMs");
    assertEquals(numSplits * numExecutors() * averagingFrequency, list.size());
    for (EventStats es : list) {
        ExampleCountEventStats e = (ExampleCountEventStats) es;
        assertTrue(batchSizePerExecutor * averagingFrequency - 10 >= e.getTotalExampleCount());
    }
    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}
Also used : ExampleCountEventStats(org.deeplearning4j.spark.stats.ExampleCountEventStats) Configuration(org.apache.hadoop.conf.Configuration) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) EventStats(org.deeplearning4j.spark.stats.EventStats) FileSystem(org.apache.hadoop.fs.FileSystem) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) Path(java.nio.file.Path) MnistDataSetIterator(org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) INDArray(org.nd4j.linalg.api.ndarray.INDArray) File(java.io.File) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)
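The test above boils down to one workflow: serialize each DataSet to its own file, list the files as path strings, and train with fitPaths. A minimal sketch of that workflow, with the test scaffolding stripped out; it assumes an existing JavaSparkContext sc, a configured SparkDl4jMultiLayer sparkNet, and a DataSetIterator iter, and uses the SparkUtils.listPaths helper shown in Example 84 below:

// Sketch only: sc, sparkNet, and iter are assumed to exist already.
File exportDir = Files.createTempDirectory("dl4j-export").toFile();
exportDir.deleteOnExit();
int idx = 0;
while (iter.hasNext()) {
    // One serialized DataSet per file; workers deserialize them on demand during fitPaths
    iter.next().save(new File(exportDir, (idx++) + ".bin"));
}
JavaRDD<String> paths = SparkUtils.listPaths(sc, exportDir.toURI().toString());
sparkNet.fitPaths(paths);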

Example 82 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.

From the class TestSparkMultiLayerParameterAveraging, method testFitViaStringPathsCompGraph.

@Test
public void testFitViaStringPathsCompGraph() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPathsCG");
    Path tempDir2 = Files.createTempDirectory("DL4J-testFitViaStringPathsCG-MDS");
    File tempDirF = tempDir.toFile();
    File tempDirF2 = tempDir2.toFile();
    tempDirF.deleteOnExit();
    tempDirF2.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        File nextFile2 = new File(tempDirF2, i + ".bin");
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        ds.save(nextFile);
        mds.save(nextFile2);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    System.out.println("Saved to: " + tempDirF2.getAbsolutePath());
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .iterations(1)
            .graphBuilder()
            .addInputs("in")
            .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(28 * 28).nOut(50).activation(Activation.TANH).build(), "in")
            .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build(), "0")
            .setOutputs("1")
            .pretrain(false).backprop(true)
            .build();
    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    //Note: the second call below overrides the first, so the effective prefetch value is 0
                    .workerPrefetchNumBatches(5)
                    .workerPrefetchNumBatches(0)
                    .batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(1)
                    .repartionData(Repartition.Always)
                    .build());
    sparkNet.setCollectTrainingStats(true);
    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
    //Same thing, but for MultiDataSet objects:
    config = new Configuration();
    hdfs = FileSystem.get(tempDir2.toUri(), config);
    fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);
    paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    paramsBefore = sparkNet.getNetwork().params().dup();
    pathRdd = sc.parallelize(paths);
    sparkNet.fitPathsMultiDataSet(pathRdd);
    paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}
Also used : OutputLayer(org.deeplearning4j.nn.conf.layers.OutputLayer) SparkComputationGraph(org.deeplearning4j.spark.impl.graph.SparkComputationGraph) Configuration(org.apache.hadoop.conf.Configuration) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) FileSystem(org.apache.hadoop.fs.FileSystem) Path(java.nio.file.Path) MnistDataSetIterator(org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) INDArray(org.nd4j.linalg.api.ndarray.INDArray) File(java.io.File) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 83 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.

From the class TestSparkMultiLayerParameterAveraging, method testFitViaStringPaths.

@Test
public void testFitViaStringPaths() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPaths");
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(28 * 28).nOut(50).activation(Activation.TANH).build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true)
            .build();
    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5)
                    .batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(1)
                    .repartionData(Repartition.Always)
                    .build());
    sparkNet.setCollectTrainingStats(true);
    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ComputationGraphConfiguration(org.deeplearning4j.nn.conf.ComputationGraphConfiguration) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) SparkTrainingStats(org.deeplearning4j.spark.api.stats.SparkTrainingStats) FileSystem(org.apache.hadoop.fs.FileSystem) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) Path(java.nio.file.Path) MnistDataSetIterator(org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) INDArray(org.nd4j.linalg.api.ndarray.INDArray) File(java.io.File) IrisDataSetIterator(org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 84 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.

From the class SparkUtils, method listPaths.

/**
     * Lists the files in the given directory (path), returned as a {@code JavaRDD<String>}
     *
     * @param sc      Spark context
     * @param path    Path to list files in
     * @return        Paths in the directory
     * @throws IOException If an error occurs getting the directory contents
     */
public static JavaRDD<String> listPaths(JavaSparkContext sc, String path) throws IOException {
    List<String> paths = new ArrayList<>();
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), false);
    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }
    return sc.parallelize(paths);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) Path(org.apache.hadoop.fs.Path) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus)
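For reference, a hypothetical invocation of this helper; the HDFS URI below is a made-up placeholder, and a live JavaSparkContext sc is assumed:

// Hypothetical usage; replace the URI with a real directory.
JavaRDD<String> paths = SparkUtils.listPaths(sc, "hdfs://namenode:8020/data/datasets");
// Each element is a fully qualified file path, ready to hand to fitPaths(...)
paths.collect().forEach(System.out::println);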

Example 85 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project presto by prestodb.

From the class BackgroundHiveSplitLoader, method loadSplits.

private CompletableFuture<?> loadSplits() throws IOException {
    HiveFileIterator files = fileIterators.poll();
    if (files == null) {
        HivePartitionMetadata partition = partitions.poll();
        if (partition == null) {
            return COMPLETED_FUTURE;
        }
        loadPartition(partition);
        return COMPLETED_FUTURE;
    }
    while (files.hasNext() && !stopped) {
        LocatedFileStatus file = files.next();
        if (isDirectory(file)) {
            if (recursiveDirWalkerEnabled) {
                HiveFileIterator fileIterator = new HiveFileIterator(
                        file.getPath(), files.getFileSystem(), files.getDirectoryLister(),
                        files.getNamenodeStats(), files.getPartitionName(), files.getInputFormat(),
                        files.getSchema(), files.getPartitionKeys(), files.getEffectivePredicate(),
                        files.getColumnCoercions());
                fileIterators.add(fileIterator);
            }
        } else {
            boolean splittable = isSplittable(files.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            CompletableFuture<?> future = hiveSplitSource.addToQueue(createHiveSplitIterator(
                    files.getPartitionName(), file.getPath().toString(), file.getBlockLocations(),
                    0, file.getLen(), files.getSchema(), files.getPartitionKeys(), splittable,
                    session, OptionalInt.empty(), files.getEffectivePredicate(), files.getColumnCoercions()));
            if (!future.isDone()) {
                fileIterators.addFirst(files);
                return future;
            }
        }
    }
    // No need to put the iterator back, since it's either empty or we've stopped
    return COMPLETED_FUTURE;
}
Also used : HiveFileIterator(com.facebook.presto.hive.util.HiveFileIterator) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus)
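Presto walks directories by hand here because it needs to intercept each directory (for the recursive-walk option and for queueing splits incrementally). When that per-directory control is not needed, FileSystem.listFiles can do the recursion itself; a sketch, assuming a FileSystem fs and an org.apache.hadoop.fs.Path root are already in scope:

// listFiles(root, true) descends into subdirectories itself and yields only files,
// each LocatedFileStatus arriving with its block locations pre-fetched.
RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
while (it.hasNext()) {
    LocatedFileStatus f = it.next();
    System.out.println(f.getPath() + " (" + f.getLen() + " bytes, "
            + f.getBlockLocations().length + " blocks)");
}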

Aggregations

LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 145 usages
Path (org.apache.hadoop.fs.Path): 105
FileSystem (org.apache.hadoop.fs.FileSystem): 54
ArrayList (java.util.ArrayList): 48
FileStatus (org.apache.hadoop.fs.FileStatus): 34
Test (org.junit.Test): 33
IOException (java.io.IOException): 27
Configuration (org.apache.hadoop.conf.Configuration): 20
File (java.io.File): 13
HashSet (java.util.HashSet): 12
FileNotFoundException (java.io.FileNotFoundException): 11
BlockLocation (org.apache.hadoop.fs.BlockLocation): 10
RemoteIterator (org.apache.hadoop.fs.RemoteIterator): 8
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 7
StocatorPath (com.ibm.stocator.fs.common.StocatorPath): 6
HashMap (java.util.HashMap): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
Map (java.util.Map): 5
Matcher (java.util.regex.Matcher): 5
BufferedReader (java.io.BufferedReader): 4