Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.
Class TestSparkMultiLayerParameterAveraging, method testFitViaStringPathsCompGraph.
@Test
public void testFitViaStringPathsCompGraph() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPathsCG");
    Path tempDir2 = Files.createTempDirectory("DL4J-testFitViaStringPathsCG-MDS");
    File tempDirF = tempDir.toFile();
    File tempDirF2 = tempDir2.toFile();
    tempDirF.deleteOnExit();
    tempDirF2.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        File nextFile2 = new File(tempDirF2, i + ".bin");
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        ds.save(nextFile);
        mds.save(nextFile2);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    System.out.println("Saved to: " + tempDirF2.getAbsolutePath());
    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .iterations(1)
            .graphBuilder()
            .addInputs("in")
            .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(28 * 28).nOut(50).activation(Activation.TANH).build(), "in")
            .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build(), "0")
            .setOutputs("1")
            .pretrain(false).backprop(true)
            .build();
    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5)
                    .workerPrefetchNumBatches(0)
                    .batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(1)
                    .repartionData(Repartition.Always)
                    .build());
    sparkNet.setCollectTrainingStats(true);
    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
    //Same thing, but for MultiDataSet objects:
    config = new Configuration();
    hdfs = FileSystem.get(tempDir2.toUri(), config);
    fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);
    paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    paramsBefore = sparkNet.getNetwork().params().dup();
    pathRdd = sc.parallelize(paths);
    sparkNet.fitPathsMultiDataSet(pathRdd);
    paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}
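
The test above depends on the workers being able to deserialize each saved file from its path. A minimal sketch of that round-trip (not part of the original test; the file name "0.bin" and the reuse of tempDirF are only illustrative):

// Illustrative only: reload one of the DataSet files saved above to confirm the binary
// format round-trips. DataSet.load(File) is the counterpart of the ds.save(nextFile) call.
DataSet reloaded = new DataSet();
reloaded.load(new File(tempDirF, "0.bin"));
System.out.println("Reloaded examples: " + reloaded.numExamples());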
Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.
Class TestSparkMultiLayerParameterAveraging, method testFitViaStringPaths.
@Test
public void testFitViaStringPaths() throws Exception {
    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPaths");
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();
    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }
    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
            .iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(28 * 28).nOut(50).activation(Activation.TANH).build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true)
            .build();
    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5)
                    .batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(1)
                    .repartionData(Repartition.Always)
                    .build());
    sparkNet.setCollectTrainingStats(true);
    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }
    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);
    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}
Use of org.apache.hadoop.fs.LocatedFileStatus in project deeplearning4j by deeplearning4j.
Class SparkUtils, method listPaths.
/**
* List of the files in the given directory (path), as a {@code JavaRDD<String>}
*
* @param sc Spark context
* @param path Path to list files in
* @return Paths in the directory
* @throws IOException If an error occurs getting directory contents
*/
public static JavaRDD<String> listPaths(JavaSparkContext sc, String path) throws IOException {
    List<String> paths = new ArrayList<>();
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), false);
    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }
    return sc.parallelize(paths);
}
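
A hypothetical call site for this helper might look like the following; the HDFS URI and the sparkNet variable are placeholders, not part of the original utility:

// Hypothetical usage of SparkUtils.listPaths: enumerate serialized DataSet files under a
// directory and hand the resulting path RDD to a Spark training job (see the tests above).
JavaRDD<String> dataPaths = SparkUtils.listPaths(sc, "hdfs://namenode:8020/data/dl4j/datasets");
sparkNet.fitPaths(dataPaths);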
Use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.
Class HadoopConverterJob, method run.
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);
    // Map only. Number of map tasks determined by input format
    jobConf.setNumReduceTasks(0);
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));
    setJobName(jobConf, segments);
    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }
    final Job job = Job.getInstance(jobConf);
    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);
    JobHelper.setupClasspath(
            JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);
    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();
        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList.copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
            @Nullable
            @Override
            public DataSegment apply(final Path input) {
                try {
                    if (!fs.exists(input)) {
                        throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]", ConvertingOutputFormat.DATA_SUCCESS_KEY, ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                    }
                } catch (final IOException e) {
                    throw Throwables.propagate(e);
                }
                try (final InputStream stream = fs.open(input)) {
                    return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                } catch (final IOException e) {
                    throw Throwables.propagate(e);
                }
            }
        }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE("Tasks reported success but result length did not match! Expected %d found %d at path [%s]", segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
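
The LocatedFileStatus-specific part of this job is the recursive walk over the job directory. Isolated as a rough sketch (the marker and data file names below are placeholders for ConvertingOutputFormat.DATA_SUCCESS_KEY and DATA_FILE_KEY), it reduces to:

// Sketch of the recursive listing step in isolation: walk jobDir recursively, keep only
// regular files whose name matches the success marker, and record the sibling data file.
// "_DATA_SUCCESS" and "data" stand in for the real constants used above.
RemoteIterator<LocatedFileStatus> files = fs.listFiles(jobDir, true); // true = recurse into subdirectories
List<Path> dataFiles = new ArrayList<>();
while (files.hasNext()) {
    LocatedFileStatus status = files.next();
    if (status.isFile() && "_DATA_SUCCESS".equals(status.getPath().getName())) {
        dataFiles.add(new Path(status.getPath().getParent(), "data"));
    }
}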
Use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.
Class DatasourceInputFormatTest, method setUp.
@Before
public void setUp() throws Exception {
    segments = ImmutableList.of(
            WindowedDataSegment.of(new DataSegment("test1", Interval.parse("2000/3000"), "ver",
                    ImmutableMap.<String, Object>of("type", "local", "path", "/tmp/index1.zip"),
                    ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
                    NoneShardSpec.instance(), 9, 2)),
            WindowedDataSegment.of(new DataSegment("test2", Interval.parse("2050/3000"), "ver",
                    ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp/index2.zip"),
                    ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
                    NoneShardSpec.instance(), 9, 11)),
            WindowedDataSegment.of(new DataSegment("test3", Interval.parse("2030/3000"), "ver",
                    ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp/index3.zip"),
                    ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
                    NoneShardSpec.instance(), 9, 4)));
    Path path1 = new Path(JobHelper.getURIFromSegment(segments.get(0).getSegment()));
    Path path2 = new Path(JobHelper.getURIFromSegment(segments.get(1).getSegment()));
    Path path3 = new Path(JobHelper.getURIFromSegment(segments.get(2).getSegment()));
    // dummy locations for test
    locations = ImmutableList.of(
            new LocatedFileStatus(1000, false, 0, 0, 0, 0, null, null, null, null, path1,
                    new BlockLocation[] {
                            new BlockLocation(null, new String[] { "s1", "s2" }, 0, 600),
                            new BlockLocation(null, new String[] { "s2", "s3" }, 600, 400) }),
            new LocatedFileStatus(4000, false, 0, 0, 0, 0, null, null, null, null, path2,
                    new BlockLocation[] {
                            new BlockLocation(null, new String[] { "s1", "s2" }, 0, 1000),
                            new BlockLocation(null, new String[] { "s1", "s3" }, 1000, 1200),
                            new BlockLocation(null, new String[] { "s2", "s3" }, 2200, 1100),
                            new BlockLocation(null, new String[] { "s1", "s2" }, 3300, 700) }),
            new LocatedFileStatus(500, false, 0, 0, 0, 0, null, null, null, null, path3,
                    new BlockLocation[] {
                            new BlockLocation(null, new String[] { "s2", "s3" }, 0, 500) }));
    config = new JobConf();
    config.set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, new DefaultObjectMapper().writeValueAsString(segments));
    context = EasyMock.createMock(JobContext.class);
    EasyMock.expect(context.getConfiguration()).andReturn(config);
    EasyMock.replay(context);
}
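
For reference, a single fixture like those above can be built as in the sketch below; the path, sizes, and block layout are made up, and the constructor arguments are labelled by name, a detail the one-line fixtures obscure:

// Illustrative fixture: a dummy LocatedFileStatus for a 1000-byte file split into two blocks.
// Constructor arguments, in order: length, isDirectory, blockReplication, blockSize,
// modificationTime, accessTime, permission, owner, group, symlink, path, blockLocations.
Path dummyPath = new Path("hdfs://nn/tmp/index1.zip");
BlockLocation[] blocks = new BlockLocation[] {
        new BlockLocation(null, new String[] { "s1", "s2" }, 0, 600),  // bytes [0, 600) on hosts s1, s2
        new BlockLocation(null, new String[] { "s2", "s3" }, 600, 400) // bytes [600, 1000) on hosts s2, s3
};
LocatedFileStatus dummy = new LocatedFileStatus(1000, false, 0, 0, 0, 0, null, null, null, null, dummyPath, blocks);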