Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j: class TestSparkMultiLayerParameterAveraging, method testFitViaStringPathsSize1.
@Test
public void testFitViaStringPathsSize1() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPathsSize1");
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();
    int dataSetObjSize = 1;
    int batchSizePerExecutor = 25;
    int numSplits = 10;
    int averagingFrequency = 3;
    int totalExamples = numExecutors() * batchSizePerExecutor * numSplits * averagingFrequency;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, totalExamples, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                    .updater(Updater.RMSPROP)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                    .iterations(1)
                    .list()
                    .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                                    .activation(Activation.TANH).build())
                    .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
                    .pretrain(false).backprop(true).build();
    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                    new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                    .workerPrefetchNumBatches(5)
                                    .batchSizePerWorker(batchSizePerExecutor)
                                    .averagingFrequency(averagingFrequency)
                                    .repartionData(Repartition.Always)
                                    .build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    Thread.sleep(2000);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    //Expect one repartition event per split, and numSplits * numExecutors * averagingFrequency worker fit events:
    System.out.println(stats.statsAsString());
    assertEquals(numSplits, stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size());
    List<EventStats> list = stats.getValue("ParameterAveragingWorkerFitTimesMs");
    assertEquals(numSplits * numExecutors() * averagingFrequency, list.size());
    for (EventStats es : list) {
        ExampleCountEventStats e = (ExampleCountEventStats) es;
        assertTrue(batchSizePerExecutor * averagingFrequency - 10 >= e.getTotalExampleCount());
    }
    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}
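With dataSetObjSize = 1, the loop above writes one serialized DataSet file per example, so the directory contains numExecutors() * batchSizePerExecutor * numSplits * averagingFrequency files. Assuming, purely for illustration, that numExecutors() returns 4 in the local test context, that is 4 * 25 * 10 * 3 = 3000 files; the stats assertions then expect numSplits = 10 repartition events and 10 * 4 * 3 = 120 worker fit events.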
Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j: class TestSparkMultiLayerParameterAveraging, method testFitViaStringPathsCompGraph.
@Test
public void testFitViaStringPathsCompGraph() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPathsCG");
    Path tempDir2 = Files.createTempDirectory("DL4J-testFitViaStringPathsCG-MDS");
    File tempDirF = tempDir.toFile();
    File tempDirF2 = tempDir2.toFile();
    tempDirF.deleteOnExit();
    tempDirF2.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        File nextFile2 = new File(tempDirF2, i + ".bin");
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        ds.save(nextFile);
        mds.save(nextFile2);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    System.out.println("Saved to: " + tempDirF2.getAbsolutePath());
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
                    .updater(Updater.RMSPROP)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                    .iterations(1)
                    .graphBuilder()
                    .addInputs("in")
                    .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                                    .activation(Activation.TANH).build(), "in")
                    .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build(), "0")
                    .setOutputs("1")
                    .pretrain(false).backprop(true).build();
    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
                    new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                    .workerPrefetchNumBatches(5)
                                    .workerPrefetchNumBatches(0)
                                    .batchSizePerWorker(batchSizePerExecutor)
                                    .averagingFrequency(1)
                                    .repartionData(Repartition.Always)
                                    .build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
    //Same thing, but for MultiDataSet objects:
    config = new Configuration();
    hdfs = FileSystem.get(tempDir2.toUri(), config);
    fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);
    paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    paramsBefore = sparkNet.getNetwork().params().dup();
    pathRdd = sc.parallelize(paths);
    sparkNet.fitPathsMultiDataSet(pathRdd);
    paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}
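Note that workerPrefetchNumBatches is called twice on the TrainingMaster builder in this test, first with 5 and then with 0; assuming the builder simply keeps the last value set, the effective worker prefetch here is 0 batches.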
Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j: class TestSparkMultiLayerParameterAveraging, method testFitViaStringPaths.
@Test
public void testFitViaStringPaths() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPaths");
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                    .updater(Updater.RMSPROP)
                    .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                    .iterations(1)
                    .list()
                    .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                                    .activation(Activation.TANH).build())
                    .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
                    .pretrain(false).backprop(true).build();
    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                    new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                    .workerPrefetchNumBatches(5)
                                    .batchSizePerWorker(batchSizePerExecutor)
                                    .averagingFrequency(1)
                                    .repartionData(Repartition.Always)
                                    .build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());

    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}
Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j: class SparkUtils, method listPaths.
/**
 * List of the files in the given directory (path), as a {@code JavaRDD<String>}
 *
 * @param sc   Spark context
 * @param path Path to list files in
 * @return Paths in the directory
 * @throws IOException If error occurs getting directory contents
 */
public static JavaRDD<String> listPaths(JavaSparkContext sc, String path) throws IOException {
    List<String> paths = new ArrayList<>();
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), false);
    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }
    return sc.parallelize(paths);
}
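A rough usage sketch of this helper, assuming a JavaSparkContext sc, a SparkDl4jMultiLayer sparkNet, and a java.nio.file.Path dataDir pointing at a directory of serialized DataSet files as produced in the tests above (all three names are illustrative):

// Hypothetical usage: replaces the manual FileSystem listing loop from the tests above
JavaRDD<String> pathRdd = SparkUtils.listPaths(sc, dataDir.toUri().toString());
sparkNet.fitPaths(pathRdd);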
Use of org.apache.hadoop.fs.LocatedFileStatus in project presto by prestodb: class BackgroundHiveSplitLoader, method loadSplits.
private CompletableFuture<?> loadSplits() throws IOException {
    HiveFileIterator files = fileIterators.poll();
    if (files == null) {
        HivePartitionMetadata partition = partitions.poll();
        if (partition == null) {
            return COMPLETED_FUTURE;
        }
        loadPartition(partition);
        return COMPLETED_FUTURE;
    }
    while (files.hasNext() && !stopped) {
        LocatedFileStatus file = files.next();
        if (isDirectory(file)) {
            if (recursiveDirWalkerEnabled) {
                HiveFileIterator fileIterator = new HiveFileIterator(file.getPath(), files.getFileSystem(),
                        files.getDirectoryLister(), files.getNamenodeStats(), files.getPartitionName(),
                        files.getInputFormat(), files.getSchema(), files.getPartitionKeys(),
                        files.getEffectivePredicate(), files.getColumnCoercions());
                fileIterators.add(fileIterator);
            }
        }
        else {
            boolean splittable = isSplittable(files.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            CompletableFuture<?> future = hiveSplitSource.addToQueue(createHiveSplitIterator(
                    files.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(),
                    files.getSchema(), files.getPartitionKeys(), splittable, session, OptionalInt.empty(),
                    files.getEffectivePredicate(), files.getColumnCoercions()));
            if (!future.isDone()) {
                fileIterators.addFirst(files);
                return future;
            }
        }
    }
    // No need to put the iterator back, since it's either empty or we've stopped
    return COMPLETED_FUTURE;
}
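Both projects rely on the same underlying pattern: FileSystem.listFiles returns a RemoteIterator<LocatedFileStatus>, and each LocatedFileStatus already carries the file's path, length, and block locations, so no separate getFileBlockLocations call is needed per file. Below is a minimal self-contained sketch of that pattern; the class name and directory URI are illustrative placeholders, not code from either project.

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class LocatedFileListingExample {

    // Lists all regular files under the given directory URI (e.g. "file:/tmp/data" or an hdfs:// URI).
    // The class name and method are illustrative; only the Hadoop API calls follow the snippets above.
    public static List<String> listFilePaths(String dirUri, boolean recursive) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dirUri), conf);
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path(dirUri), recursive);
        List<String> paths = new ArrayList<>();
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            // listFiles returns only regular files; block locations are already attached to each status
            paths.add(status.getPath().toString() + " (" + status.getLen() + " bytes)");
        }
        return paths;
    }
}

Passing true for the recursive flag descends into subdirectories automatically, which is roughly the behavior the Presto code above implements by hand when recursiveDirWalkerEnabled is set.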