use of org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader in project deeplearning4j by deeplearning4j.
the class RecordReaderDataSetiteratorTest method testSequenceRecordReaderReset.
/**
 * Verifies that a SequenceRecordReaderDataSetIterator built from separate feature and label
 * readers can be reset repeatedly: after each {@code reset()} it must again yield exactly three
 * mini-batches with feature shape [1, 3, 4] and label shape [1, 4, 4].
 */
@Test
public void testSequenceRecordReaderReset() throws Exception {
    //need to manually extract
    for (int i = 0; i < 3; i++) {
        new ClassPathResource(String.format("csvsequence_%d.txt", i)).getTempFileFromArchive();
        new ClassPathResource(String.format("csvsequencelabels_%d.txt", i)).getTempFileFromArchive();
    }
    // Build the "%d" pattern from the path of file 0. Only the file-name component is rewritten:
    // replacing every '0' of the absolute path would also hit zeros in the temp directory and
    // produce a pattern with more than one "%d".
    // NOTE(review): assumes the extracted files keep their numbered resource names - the
    // NumberedFileInputSplit below relies on that.
    String featuresFile0 = new ClassPathResource("csvsequence_0.txt").getTempFileFromArchive().getAbsolutePath();
    int featuresNameStart = Math.max(featuresFile0.lastIndexOf('/'), featuresFile0.lastIndexOf('\\')) + 1;
    String featuresPath = featuresFile0.substring(0, featuresNameStart)
            + featuresFile0.substring(featuresNameStart).replace("0", "%d");
    String labelsFile0 = new ClassPathResource("csvsequencelabels_0.txt").getTempFileFromArchive().getAbsolutePath();
    int labelsNameStart = Math.max(labelsFile0.lastIndexOf('/'), labelsFile0.lastIndexOf('\\')) + 1;
    String labelsPath = labelsFile0.substring(0, labelsNameStart)
            + labelsFile0.substring(labelsNameStart).replace("0", "%d");

    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));

    SequenceRecordReaderDataSetIterator iter =
            new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false);
    assertEquals(3, iter.inputColumns());
    assertEquals(4, iter.totalOutcomes());

    int nResets = 5;
    for (int i = 0; i < nResets; i++) {
        // Reset first, so every pass (including the first) starts from a freshly reset iterator
        iter.reset();
        int count = 0;
        while (iter.hasNext()) {
            DataSet ds = iter.next();
            INDArray features = ds.getFeatureMatrix();
            INDArray labels = ds.getLabels();
            assertArrayEquals(new int[] { 1, 3, 4 }, features.shape());
            assertArrayEquals(new int[] { 1, 4, 4 }, labels.shape());
            count++;
        }
        //3 files -> 3 mini-batches per pass
        assertEquals(3, count);
    }
}
use of org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader in project deeplearning4j by deeplearning4j.
the class RecordReaderDataSetiteratorTest method testSequenceRecordReaderTwoReadersWithEmptyLabelSequenceThrows.
/**
 * Pairs a feature reader over "csvsequence_0.txt" with a label reader over "empty.txt" and
 * expects a ZeroLengthSequenceException to be raised while the iterator is built and asked for
 * its first DataSet.
 */
@Test(expected = ZeroLengthSequenceException.class)
public void testSequenceRecordReaderTwoReadersWithEmptyLabelSequenceThrows() throws Exception {
    SequenceRecordReader features = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader emptyLabels = new CSVSequenceRecordReader(1, ",");

    FileSplit featureSplit = new FileSplit(new ClassPathResource("csvsequence_0.txt").getTempFileFromArchive());
    features.initialize(featureSplit);

    FileSplit emptyLabelSplit = new FileSplit(new ClassPathResource("empty.txt").getTempFileFromArchive());
    emptyLabels.initialize(emptyLabelSplit);

    SequenceRecordReaderDataSetIterator iterator =
            new SequenceRecordReaderDataSetIterator(features, emptyLabels, 1, -1, true);
    iterator.next();
}
use of org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader in project deeplearning4j by deeplearning4j.
the class RecordReaderDataSetiteratorTest method testVariableLengthSequence.
/**
 * Checks SequenceRecordReaderDataSetIterator on sequences whose labels are shorter than their
 * features, for both ALIGN_START and ALIGN_END: array shapes, feature values, one-hot label
 * values (padded at the end resp. at the start) and the feature/label mask arrays.
 */
@Test
public void testVariableLengthSequence() throws Exception {
    //need to manually extract
    for (int i = 0; i < 3; i++) {
        new ClassPathResource(String.format("csvsequence_%d.txt", i)).getTempFileFromArchive();
        new ClassPathResource(String.format("csvsequencelabelsShort_%d.txt", i)).getTempFileFromArchive();
    }
    // Build the "%d" pattern from the path of file 0. Only the file-name component is rewritten:
    // replacing every '0' of the absolute path would also hit zeros in the temp directory and
    // produce a pattern with more than one "%d".
    // NOTE(review): assumes the extracted files keep their numbered resource names - the
    // NumberedFileInputSplit below relies on that.
    String featuresFile0 = new ClassPathResource("csvsequence_0.txt").getTempFileFromArchive().getAbsolutePath();
    int featuresNameStart = Math.max(featuresFile0.lastIndexOf('/'), featuresFile0.lastIndexOf('\\')) + 1;
    String featuresPath = featuresFile0.substring(0, featuresNameStart)
            + featuresFile0.substring(featuresNameStart).replace("0", "%d");
    String labelsFile0 = new ClassPathResource("csvsequencelabelsShort_0.txt").getTempFileFromArchive().getAbsolutePath();
    int labelsNameStart = Math.max(labelsFile0.lastIndexOf('/'), labelsFile0.lastIndexOf('\\')) + 1;
    String labelsPath = labelsFile0.substring(0, labelsNameStart)
            + labelsFile0.substring(labelsNameStart).replace("0", "%d");

    // Two independent reader pairs over the same files: one per alignment mode
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReader featureReader2 = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader2 = new CSVSequenceRecordReader(1, ",");
    featureReader2.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader2.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));

    SequenceRecordReaderDataSetIterator iterAlignStart = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_START);
    SequenceRecordReaderDataSetIterator iterAlignEnd = new SequenceRecordReaderDataSetIterator(featureReader2, labelReader2, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
    assertEquals(3, iterAlignStart.inputColumns());
    assertEquals(4, iterAlignStart.totalOutcomes());
    assertEquals(3, iterAlignEnd.inputColumns());
    assertEquals(4, iterAlignEnd.totalOutcomes());

    List<DataSet> dsListAlignStart = new ArrayList<>();
    while (iterAlignStart.hasNext()) {
        dsListAlignStart.add(iterAlignStart.next());
    }
    List<DataSet> dsListAlignEnd = new ArrayList<>();
    while (iterAlignEnd.hasNext()) {
        dsListAlignEnd.add(iterAlignEnd.next());
    }
    //3 files
    assertEquals(3, dsListAlignStart.size());
    //3 files
    assertEquals(3, dsListAlignEnd.size());

    for (int i = 0; i < 3; i++) {
        DataSet ds = dsListAlignStart.get(i);
        INDArray features = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();
        //1 example in mini-batch
        assertEquals(1, features.size(0));
        assertEquals(1, labels.size(0));
        //3 values per line/time step
        assertEquals(3, features.size(1));
        //1 value per line, but 4 possible values -> one-hot vector
        assertEquals(4, labels.size(1));
        //sequence length = 4
        assertEquals(4, features.size(2));
        assertEquals(4, labels.size(2));

        DataSet ds2 = dsListAlignEnd.get(i);
        features = ds2.getFeatureMatrix();
        labels = ds2.getLabels();
        //1 example in mini-batch
        assertEquals(1, features.size(0));
        assertEquals(1, labels.size(0));
        //3 values per line/time step
        assertEquals(3, features.size(1));
        //1 value per line, but 4 possible values -> one-hot vector
        assertEquals(4, labels.size(1));
        //sequence length = 4
        assertEquals(4, features.size(2));
        assertEquals(4, labels.size(2));
    }

    //Check features vs. expected:
    //Here: features are always longer than the labels, so the features are never padded
    // -> same features for align start and align end
    INDArray expF0 = Nd4j.create(1, 3, 4);
    expF0.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 0, 1, 2 }));
    expF0.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 10, 11, 12 }));
    expF0.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 20, 21, 22 }));
    expF0.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 30, 31, 32 }));
    assertEquals(expF0, dsListAlignStart.get(0).getFeatureMatrix());
    assertEquals(expF0, dsListAlignEnd.get(0).getFeatureMatrix());
    INDArray expF1 = Nd4j.create(1, 3, 4);
    expF1.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 100, 101, 102 }));
    expF1.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 110, 111, 112 }));
    expF1.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 120, 121, 122 }));
    expF1.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 130, 131, 132 }));
    assertEquals(expF1, dsListAlignStart.get(1).getFeatureMatrix());
    assertEquals(expF1, dsListAlignEnd.get(1).getFeatureMatrix());
    INDArray expF2 = Nd4j.create(1, 3, 4);
    expF2.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 200, 201, 202 }));
    expF2.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 210, 211, 212 }));
    expF2.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 220, 221, 222 }));
    expF2.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 230, 231, 232 }));
    assertEquals(expF2, dsListAlignStart.get(2).getFeatureMatrix());
    assertEquals(expF2, dsListAlignEnd.get(2).getFeatureMatrix());

    //Check features mask array:
    //1 example, 4 values: same for both start/end align here
    INDArray featuresMaskExpected = Nd4j.ones(1, 4);
    for (int i = 0; i < 3; i++) {
        INDArray featuresMaskStart = dsListAlignStart.get(i).getFeaturesMaskArray();
        INDArray featuresMaskEnd = dsListAlignEnd.get(i).getFeaturesMaskArray();
        assertEquals(featuresMaskExpected, featuresMaskStart);
        assertEquals(featuresMaskExpected, featuresMaskEnd);
    }

    //Check labels vs. expected:
    //First: aligning start (label time steps first, zero padding after them)
    INDArray expL0 = Nd4j.create(1, 4, 4);
    expL0.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 1, 0, 0, 0 }));
    expL0.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL0, dsListAlignStart.get(0).getLabels());
    INDArray expL1 = Nd4j.create(1, 4, 4);
    expL1.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL1, dsListAlignStart.get(1).getLabels());
    INDArray expL2 = Nd4j.create(1, 4, 4);
    expL2.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 0, 0, 0, 1 }));
    expL2.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 0, 0, 1, 0 }));
    expL2.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL2, dsListAlignStart.get(2).getLabels());

    //Second: align end (zero padding first, label time steps occupy the last positions)
    INDArray expL0end = Nd4j.create(1, 4, 4);
    expL0end.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 1, 0, 0, 0 }));
    expL0end.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL0end, dsListAlignEnd.get(0).getLabels());
    INDArray expL1end = Nd4j.create(1, 4, 4);
    expL1end.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL1end, dsListAlignEnd.get(1).getLabels());
    INDArray expL2end = Nd4j.create(1, 4, 4);
    expL2end.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 0, 0, 0, 1 }));
    expL2end.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 0, 0, 1, 0 }));
    expL2end.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL2end, dsListAlignEnd.get(2).getLabels());

    //Check labels mask array
    INDArray[] labelsMaskExpectedStart = new INDArray[] { Nd4j.create(new float[] { 1, 1, 0, 0 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 1, 0, 0, 0 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 1, 1, 1, 0 }, new int[] { 1, 4 }) };
    INDArray[] labelsMaskExpectedEnd = new INDArray[] { Nd4j.create(new float[] { 0, 0, 1, 1 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 0, 0, 0, 1 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 0, 1, 1, 1 }, new int[] { 1, 4 }) };
    for (int i = 0; i < 3; i++) {
        INDArray labelsMaskStart = dsListAlignStart.get(i).getLabelsMaskArray();
        INDArray labelsMaskEnd = dsListAlignEnd.get(i).getLabelsMaskArray();
        assertEquals(labelsMaskExpectedStart[i], labelsMaskStart);
        assertEquals(labelsMaskExpectedEnd[i], labelsMaskEnd);
    }
}
use of org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader in project deeplearning4j by deeplearning4j.
the class RecordReaderDataSetiteratorTest method testSequenceRecordReaderRegression.
/**
 * Regression-mode checks for SequenceRecordReaderDataSetIterator: first with two readers over the
 * same three files (labels must equal features), then with a single reader, including a
 * {@code reset()} of the iterator.
 */
@Test
public void testSequenceRecordReaderRegression() throws Exception {
    //need to manually extract
    for (int i = 0; i < 3; i++) {
        new ClassPathResource(String.format("csvsequence_%d.txt", i)).getTempFileFromArchive();
    }
    // Build the "%d" pattern from the path of file 0. Only the file-name component is rewritten:
    // replacing every '0' of the absolute path would also hit zeros in the temp directory and
    // produce a pattern with more than one "%d".
    // NOTE(review): assumes the extracted files keep their numbered resource names - the
    // NumberedFileInputSplit below relies on that.
    String featuresFile0 = new ClassPathResource("csvsequence_0.txt").getTempFileFromArchive().getAbsolutePath();
    int nameStart = Math.max(featuresFile0.lastIndexOf('/'), featuresFile0.lastIndexOf('\\')) + 1;
    String featuresPath = featuresFile0.substring(0, nameStart)
            + featuresFile0.substring(nameStart).replace("0", "%d");
    // Labels are read from the very same files as the features
    String labelsPath = featuresPath;

    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));

    SequenceRecordReaderDataSetIterator iter =
            new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 0, true);
    assertEquals(3, iter.inputColumns());
    assertEquals(3, iter.totalOutcomes());

    List<DataSet> dsList = new ArrayList<>();
    while (iter.hasNext()) {
        dsList.add(iter.next());
    }
    //3 files
    assertEquals(3, dsList.size());
    for (int i = 0; i < 3; i++) {
        DataSet ds = dsList.get(i);
        INDArray features = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();
        //1 examples, 3 values, 4 time steps
        assertArrayEquals(new int[] { 1, 3, 4 }, features.shape());
        assertArrayEquals(new int[] { 1, 3, 4 }, labels.shape());
        //Same source files for both readers -> identical arrays
        assertEquals(features, labels);
    }

    //Also test regression + reset from a single reader:
    featureReader.reset();
    iter = new SequenceRecordReaderDataSetIterator(featureReader, 1, 0, 2, true);
    int count = 0;
    while (iter.hasNext()) {
        DataSet ds = iter.next();
        //Of the 3 columns per time step, 2 remain as features and 1 becomes the label
        assertEquals(2, ds.getFeatureMatrix().size(1));
        assertEquals(1, ds.getLabels().size(1));
        count++;
    }
    assertEquals(3, count);

    //After a reset the iterator must serve all 3 sequences again
    iter.reset();
    count = 0;
    while (iter.hasNext()) {
        iter.next();
        count++;
    }
    assertEquals(3, count);
}
use of org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader in project deeplearning4j by deeplearning4j.
the class TestDataVecDataSetFunctions method testDataVecSequencePairDataSetFunctionVariableLength.
/**
 * Compares DataVecSequencePairDataSetFunction (executed through Spark) with a local
 * SequenceRecordReaderDataSetIterator on variable-length sequences (labels shorter than
 * features). The comparison is done twice: once with ALIGN_END and once with ALIGN_START. In both
 * cases the shapes are checked and every Spark DataSet must equal exactly one local DataSet
 * (order is ignored).
 */
@Test
public void testDataVecSequencePairDataSetFunctionVariableLength() throws Exception {
    //Same sort of test as testDataVecSequencePairDataSetFunction() but with variable length time series (labels shorter, align end)
    //Convert data to a SequenceFile:
    File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
    String pathFeatures = f.getAbsolutePath();
    //Strip the file name ("csvsequence_0.txt") to obtain the folder, then glob everything in it
    String folderFeatures = pathFeatures.substring(0, pathFeatures.length() - f.getName().length());
    pathFeatures = folderFeatures + "*";
    File f2 = new File("src/test/resources/csvsequencelabels/csvsequencelabelsShort_0.txt");
    String pathLabels = f2.getPath();
    //Strip the file name ("csvsequencelabelsShort_0.txt") to obtain the folder
    String folderLabels = pathLabels.substring(0, pathLabels.length() - f2.getName().length());
    pathLabels = folderLabels + "*";

    //Extract a number from the file name
    PathToKeyConverter pathConverter = new PathToKeyConverterNumber();
    JavaPairRDD<Text, BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, pathFeatures, pathLabels, pathConverter);
    Path p = Files.createTempDirectory("dl4j_testSeqPairFnVarLength");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

    //Load from sequence file:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);
    SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);
    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);

    //Map to DataSet:
    DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction(4, false, DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_END);
    JavaRDD<DataSet> data = writables.map(pairFn);
    List<DataSet> sparkData = data.collect();

    //Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
    String featuresPath = f.getPath().replaceAll("0", "%d");
    String labelsPath = f2.getPath().replaceAll("0", "%d");
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
    List<DataSet> localData = new ArrayList<>(3);
    while (iter.hasNext()) {
        localData.add(iter.next());
    }
    assertEquals(3, sparkData.size());
    assertEquals(3, localData.size());

    //1 example, 3 values, 4 time steps
    int[] fShapeExp = new int[] { 1, 3, 4 };
    //1 example, 4 values/classes, 4 time steps (after padding)
    int[] lShapeExp = new int[] { 1, 4, 4 };
    for (int i = 0; i < 3; i++) {
        //Check shapes etc. data sets order may differ for spark vs. local
        DataSet dsSpark = sparkData.get(i);
        DataSet dsLocal = localData.get(i);
        //Expect mask array for labels
        assertNotNull(dsSpark.getLabelsMaskArray());
        INDArray fSpark = dsSpark.getFeatureMatrix();
        INDArray fLocal = dsLocal.getFeatureMatrix();
        INDArray lSpark = dsSpark.getLabels();
        INDArray lLocal = dsLocal.getLabels();
        assertArrayEquals(fShapeExp, fSpark.shape());
        assertArrayEquals(fShapeExp, fLocal.shape());
        assertArrayEquals(lShapeExp, lSpark.shape());
        assertArrayEquals(lShapeExp, lLocal.shape());
    }

    //Check that results are the same (order not withstanding)
    boolean[] found = new boolean[3];
    for (int i = 0; i < 3; i++) {
        int foundIndex = -1;
        DataSet ds = sparkData.get(i);
        for (int j = 0; j < 3; j++) {
            if (ds.equals(localData.get(j))) {
                if (foundIndex != -1) {
                    //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                    fail();
                }
                foundIndex = j;
                if (found[foundIndex]) {
                    //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                    fail();
                }
                //mark this one as seen before
                found[foundIndex] = true;
            }
        }
    }
    int count = 0;
    for (boolean b : found) {
        if (b) {
            count++;
        }
    }
    //Expect all 3 and exactly 3 pairwise matches between spark and local versions
    assertEquals(3, count);

    //-------------------------------------------------
    //NOW: test same thing, but for align start...
    DataVecSequencePairDataSetFunction pairFnAlignStart = new DataVecSequencePairDataSetFunction(4, false, DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_START);
    JavaRDD<DataSet> rddDataAlignStart = writables.map(pairFnAlignStart);
    List<DataSet> sparkDataAlignStart = rddDataAlignStart.collect();
    //re-initialize to reset
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iterAlignStart = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_START);
    List<DataSet> localDataAlignStart = new ArrayList<>(3);
    while (iterAlignStart.hasNext()) {
        localDataAlignStart.add(iterAlignStart.next());
    }
    assertEquals(3, sparkDataAlignStart.size());
    assertEquals(3, localDataAlignStart.size());
    for (int i = 0; i < 3; i++) {
        //Check shapes etc. data sets order may differ for spark vs. local
        DataSet dsSpark = sparkDataAlignStart.get(i);
        DataSet dsLocal = localDataAlignStart.get(i);
        //Expect mask array for labels
        assertNotNull(dsSpark.getLabelsMaskArray());
        INDArray fSpark = dsSpark.getFeatureMatrix();
        INDArray fLocal = dsLocal.getFeatureMatrix();
        INDArray lSpark = dsSpark.getLabels();
        INDArray lLocal = dsLocal.getLabels();
        assertArrayEquals(fShapeExp, fSpark.shape());
        assertArrayEquals(fShapeExp, fLocal.shape());
        assertArrayEquals(lShapeExp, lSpark.shape());
        assertArrayEquals(lShapeExp, lLocal.shape());
    }

    //Check that results are the same (order not withstanding)
    //Must compare the ALIGN_START lists here; re-using the ALIGN_END lists would leave the
    //align-start output completely unverified
    found = new boolean[3];
    for (int i = 0; i < 3; i++) {
        int foundIndex = -1;
        DataSet ds = sparkDataAlignStart.get(i);
        for (int j = 0; j < 3; j++) {
            if (ds.equals(localDataAlignStart.get(j))) {
                if (foundIndex != -1) {
                    //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                    fail();
                }
                foundIndex = j;
                if (found[foundIndex]) {
                    //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                    fail();
                }
                //mark this one as seen before
                found[foundIndex] = true;
            }
        }
    }
    count = 0;
    for (boolean b : found) {
        if (b) {
            count++;
        }
    }
    //Expect all 3 and exactly 3 pairwise matches between spark and local versions
    assertEquals(3, count);
}
Aggregations