Usage example of org.datavec.api.records.reader.impl.csv.CSVRecordReader in project deeplearning4j: class TestPreProcessedData, method testCsvPreprocessedDataGeneration.
@Test
public void testCsvPreprocessedDataGeneration() throws Exception {
// Render the full Iris dataset (150 examples, batch size 1) as CSV-style strings:
// 4 feature values followed by the integer class index (argmax of the one-hot label).
List<String> list = new ArrayList<>();
DataSetIterator iter = new IrisDataSetIterator(1, 150);
while (iter.hasNext()) {
DataSet ds = iter.next();
list.add(toString(ds.getFeatureMatrix(), Nd4j.argMax(ds.getLabels(), 1).getInt(0)));
}
JavaRDD<String> rdd = sc.parallelize(list);
// Partition count is needed later: file-count tolerance depends on how Spark split the data.
int partitions = rdd.partitions().size();
// Export destination under the system temp dir; remove any leftovers from a previous run.
URI tempDir = new File(System.getProperty("java.io.tmpdir")).toURI();
URI outputDir = new URI(tempDir.getPath() + "/dl4j_testPreprocessedData2");
File temp = new File(outputDir.getPath());
if (temp.exists())
FileUtils.deleteDirectory(temp);
int numBinFiles = 0;
try {
int batchSize = 5;
int labelIdx = 4;
int numPossibleLabels = 3;
// Export each partition's CSV lines as serialized DataSet batches (.bin files),
// parsing with CSVRecordReader and one-hot encoding column 4 into 3 classes.
rdd.foreachPartition(new StringToDataSetExportFunction(outputDir, new CSVRecordReader(0), batchSize, false, labelIdx, numPossibleLabels));
// Reload every exported .bin file and verify shapes and total example count.
File[] fileList = new File(outputDir.getPath()).listFiles();
int totalExamples = 0;
for (File f2 : fileList) {
if (!f2.getPath().endsWith(".bin"))
continue;
// System.out.println(f2.getPath());
numBinFiles++;
DataSet ds = new DataSet();
ds.load(f2);
assertEquals(4, ds.numInputs());
assertEquals(3, ds.numOutcomes());
totalExamples += ds.numExamples();
}
assertEquals(150, totalExamples);
//Expect 30, give or take due to partitioning randomness
// (each partition may emit one final partial batch, hence the +/- partitions tolerance)
assertTrue(Math.abs(150 / batchSize - numBinFiles) <= partitions);
//Test the PortableDataStreamDataSetIterator:
// Same files read back through Spark's binaryFiles API must yield identical totals.
JavaPairRDD<String, PortableDataStream> pds = sc.binaryFiles(outputDir.getPath());
List<PortableDataStream> pdsList = pds.values().collect();
DataSetIterator pdsIter = new PortableDataStreamDataSetIterator(pdsList);
int pdsCount = 0;
int totalExamples2 = 0;
while (pdsIter.hasNext()) {
DataSet ds = pdsIter.next();
pdsCount++;
totalExamples2 += ds.numExamples();
assertEquals(4, ds.numInputs());
assertEquals(3, ds.numOutcomes());
}
assertEquals(150, totalExamples2);
// Iterator must see exactly one DataSet per .bin file found on disk.
assertEquals(numBinFiles, pdsCount);
} finally {
// Always clean up the export directory, even on assertion failure.
FileUtils.deleteDirectory(temp);
}
}
Usage example of org.datavec.api.records.reader.impl.csv.CSVRecordReader in project deeplearning4j: class TestDataVecDataSetFunctions, method testDataVecDataSetFunctionMultiLabelRegression.
@Test
public void testDataVecDataSetFunctionMultiLabelRegression() throws Exception {
    JavaSparkContext sc = getContext();

    // Build 10 CSV rows; row i holds 6 comma-separated values: 10*i, 10*i+1, ..., 10*i+5.
    int numColumns = 6;
    List<String> csvLines = new ArrayList<>();
    for (int row = 0; row < 10; row++) {
        StringBuilder line = new StringBuilder();
        for (int col = 0; col < numColumns; col++) {
            if (col > 0) {
                line.append(",");
            }
            line.append(10 * row + col);
        }
        csvLines.add(line.toString());
    }

    // Parse with CSVRecordReader, then map to DataSets with a regression label span:
    // columns 0-2 become features, columns 3-5 become labels.
    JavaRDD<String> linesRdd = sc.parallelize(csvLines);
    JavaRDD<List<Writable>> parsed = linesRdd.map(new StringToWritablesFunction(new CSVRecordReader()));
    JavaRDD<DataSet> dataSetRdd = parsed.map(new DataVecDataSetFunction(3, 5, -1, true, null, null));

    List<DataSet> collected = dataSetRdd.collect();
    assertEquals(10, collected.size());

    // Verify each example's contents; RDD collection order is not guaranteed,
    // so recover the originating row index from the first feature value.
    boolean[] seen = new boolean[10];
    for (DataSet example : collected) {
        INDArray features = example.getFeatureMatrix();
        INDArray labels = example.getLabels();
        assertEquals(3, features.length());
        assertEquals(3, labels.length());

        int rowIdx = ((int) features.getDouble(0)) / 10;
        seen[rowIdx] = true;
        for (int col = 0; col < 3; col++) {
            assertEquals(10 * rowIdx + col, (int) features.getDouble(col));
            assertEquals(10 * rowIdx + col + 3, (int) labels.getDouble(col));
        }
    }

    // Every one of the 10 rows must have appeared exactly once.
    int seenTotal = 0;
    for (boolean wasSeen : seen) {
        if (wasSeen) {
            seenTotal++;
        }
    }
    assertEquals(10, seenTotal);
}
End of aggregated CSVRecordReader usage examples.