Use of org.nd4j.linalg.dataset.DataSet in project deeplearning4j by deeplearning4j.
Example from the class TestDataVecDataSetFunctions, method testDataVecDataSetFunction.
@Test
public void testDataVecDataSetFunction() throws Exception {
    JavaSparkContext sc = getContext();

    // Compare Spark-based record reading against the plain local path.
    File imageFile = new File("src/test/resources/imagetest/0/a.bmp");
    // Need this for Spark: labels can't be inferred without an init call
    List<String> labels = Arrays.asList("0", "1");
    String imagePath = imageFile.getPath();
    String parentDir = imagePath.substring(0, imagePath.length() - 7);
    String globPath = parentDir + "*";

    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(globPath);
    // 4 images expected under the glob
    assertEquals(4, origData.count());

    ImageRecordReader sparkReader = new ImageRecordReader(28, 28, 1, new ParentPathLabelGenerator());
    sparkReader.setLabels(labels);
    org.datavec.spark.functions.RecordReaderFunction rrf =
                    new org.datavec.spark.functions.RecordReaderFunction(sparkReader);
    JavaRDD<List<Writable>> writables = origData.map(rrf);
    JavaRDD<DataSet> sparkData = writables.map(new DataVecDataSetFunction(1, 2, false));
    List<DataSet> fromSpark = sparkData.collect();

    // Load normally (i.e., not via Spark) and check we get the same results, order aside
    InputSplit split = new FileSplit(new File(parentDir), new String[] { "bmp" }, true);
    ImageRecordReader localReader = new ImageRecordReader(28, 28, 1, new ParentPathLabelGenerator());
    localReader.initialize(split);
    RecordReaderDataSetIterator localIter = new RecordReaderDataSetIterator(localReader, 1, 1, 2);
    List<DataSet> fromLocal = new ArrayList<>(4);
    while (localIter.hasNext()) {
        fromLocal.add(localIter.next());
    }

    assertEquals(4, fromSpark.size());
    assertEquals(4, fromLocal.size());

    // Pairwise matching between Spark and local results, ignoring order
    boolean[] matched = new boolean[4];
    for (int i = 0; i < 4; i++) {
        DataSet sparkDs = fromSpark.get(i);
        int matchIdx = -1;
        for (int j = 0; j < 4; j++) {
            if (!sparkDs.equals(fromLocal.get(j))) {
                continue;
            }
            if (matchIdx != -1) {
                // Already found a match -> this Spark value equals two or more
                // local values (shouldn't happen)
                fail();
            }
            matchIdx = j;
            if (matched[matchIdx]) {
                // Another Spark value already matched this local one
                // -> suggests duplicates in the Spark list
                fail();
            }
            // Mark this local DataSet as claimed
            matched[matchIdx] = true;
        }
    }
    int matches = 0;
    for (boolean b : matched) {
        if (b) {
            matches++;
        }
    }
    // Expect exactly 4 pairwise matches between the Spark and local versions
    assertEquals(4, matches);
}
Use of org.nd4j.linalg.dataset.DataSet in project deeplearning4j by deeplearning4j.
Example from the class PathSparkDataSetIterator, method load.
/**
 * Load and deserialize a single serialized DataSet from the given path.
 * Synchronized so the lazily-created FileSystem handle is initialized once.
 */
protected synchronized DataSet load(String path) {
    // Lazily create the shared FileSystem handle on first use
    if (fileSystem == null) {
        try {
            fileSystem = FileSystem.get(new URI(path), new Configuration());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
    DataSet loaded = new DataSet();
    // try-with-resources guarantees the stream is closed even on failure
    try (FSDataInputStream stream = fileSystem.open(new Path(path), BUFFER_SIZE)) {
        loaded.load(stream);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    cursor++;
    return loaded;
}
Use of org.nd4j.linalg.dataset.DataSet in project deeplearning4j by deeplearning4j.
Example from the class ScoreFlatMapFunctionCGDataSetAdapter, method call.
@Override
public Iterable<Tuple2<Integer, Double>> call(Iterator<DataSet> dataSetIterator) throws Exception {
    // Empty partition: contribute (0 examples, 0.0 score)
    if (!dataSetIterator.hasNext()) {
        return Collections.singletonList(new Tuple2<>(0, 0.0));
    }
    // Does batching where appropriate
    DataSetIterator batchedIter = new IteratorDataSetIterator(dataSetIterator, minibatchSize);

    ComputationGraph net = new ComputationGraph(ComputationGraphConfiguration.fromJson(json));
    net.init();
    // .value() is shared by all executors on a single machine -> OK, as params
    // are not changed in the score function
    INDArray broadcastParams = params.value().unsafeDuplication();
    if (broadcastParams.length() != net.numParams(false)) {
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    }
    net.setParams(broadcastParams);

    List<Tuple2<Integer, Double>> scores = new ArrayList<>();
    while (batchedIter.hasNext()) {
        DataSet next = batchedIter.next();
        double score = net.score(next, false);
        int nExamples = next.getFeatureMatrix().size(0);
        // Weight each minibatch score by its example count
        scores.add(new Tuple2<>(nExamples, score * nExamples));
    }

    // Flush any queued ops before returning when using the grid executioner
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();
    }
    return scores;
}
Use of org.nd4j.linalg.dataset.DataSet in project deeplearning4j by deeplearning4j.
Example from the class SparkDl4jLayer, method fit.
/**
 * Fit the layer based on the specified org.deeplearning4j.spark context text file
 * @param path the path to the text file
 * @param labelIndex the index of the label
 * @param recordReader the record reader used to parse each line
 * @return the fit layer
 */
public Layer fit(String path, int labelIndex, RecordReader recordReader) {
    FeedForwardLayer ffLayer = (FeedForwardLayer) conf.getLayer();
    // Map each text line to a DataSet (features + label) via the record reader
    JavaRDD<DataSet> dataSets = sc.textFile(path)
                    .map(new RecordReaderFunction(recordReader, labelIndex, ffLayer.getNOut()));
    return fitDataSet(dataSets);
}
Use of org.nd4j.linalg.dataset.DataSet in project deeplearning4j by deeplearning4j.
Example from the class PathSparkDataSetIterator, method next.
@Override
public DataSet next() {
    DataSet result;
    if (preloadedDataSet == null) {
        result = load(iter.next());
    } else {
        // Consume the preloaded DataSet exactly once, then clear it
        result = preloadedDataSet;
        preloadedDataSet = null;
    }
    // Labels may be null for layerwise pretraining
    totalOutcomes = result.getLabels() == null ? 0 : result.getLabels().size(1);
    inputColumns = result.getFeatureMatrix().size(1);
    batch = result.numExamples();
    if (preprocessor != null) {
        preprocessor.preProcess(result);
    }
    return result;
}
Aggregations