Search in sources:

Example 1 with NumberedFileInputSplit

Use of org.datavec.api.split.NumberedFileInputSplit in project deeplearning4j by deeplearning4j.

From class RecordReaderDataSetiteratorTest, method testSequenceRecordReaderReset.

@Test
public void testSequenceRecordReaderReset() throws Exception {
    //need to manually extract
    for (int i = 0; i < 3; i++) {
        new ClassPathResource(String.format("csvsequence_%d.txt", i)).getTempFileFromArchive();
        new ClassPathResource(String.format("csvsequencelabels_%d.txt", i)).getTempFileFromArchive();
    }
    ClassPathResource resource = new ClassPathResource("csvsequence_0.txt");
    String featuresPath = resource.getTempFileFromArchive().getAbsolutePath().replaceAll("0", "%d");
    resource = new ClassPathResource("csvsequencelabels_0.txt");
    String labelsPath = resource.getTempFileFromArchive().getAbsolutePath().replaceAll("0", "%d");
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false);
    assertEquals(3, iter.inputColumns());
    assertEquals(4, iter.totalOutcomes());
    int nResets = 5;
    for (int i = 0; i < nResets; i++) {
        iter.reset();
        int count = 0;
        while (iter.hasNext()) {
            DataSet ds = iter.next();
            INDArray features = ds.getFeatureMatrix();
            INDArray labels = ds.getLabels();
            assertArrayEquals(new int[] { 1, 3, 4 }, features.shape());
            assertArrayEquals(new int[] { 1, 4, 4 }, labels.shape());
            count++;
        }
        assertEquals(3, count);
    }
}
Also used:
CSVSequenceRecordReader (org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader)
SequenceRecordReader (org.datavec.api.records.reader.SequenceRecordReader)
CollectionSequenceRecordReader (org.datavec.api.records.reader.impl.collection.CollectionSequenceRecordReader)
INDArray (org.nd4j.linalg.api.ndarray.INDArray)
DataSet (org.nd4j.linalg.dataset.DataSet)
ClassPathResource (org.nd4j.linalg.io.ClassPathResource)
NumberedFileInputSplit (org.datavec.api.split.NumberedFileInputSplit)
Test (org.junit.Test)
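
For readers new to the API: NumberedFileInputSplit expands a printf-style "%d" pattern over an inclusive index range, which is why the test rewrites a concrete path like csvsequence_0.txt into a %d template. A minimal sketch, using hypothetical files under /tmp:

import org.datavec.api.split.NumberedFileInputSplit;

// Resolves to /tmp/csvsequence_0.txt, /tmp/csvsequence_1.txt and /tmp/csvsequence_2.txt;
// both range bounds are inclusive.
NumberedFileInputSplit split = new NumberedFileInputSplit("/tmp/csvsequence_%d.txt", 0, 2);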

Example 2 with NumberedFileInputSplit

Use of org.datavec.api.split.NumberedFileInputSplit in project deeplearning4j by deeplearning4j.

From class RecordReaderDataSetiteratorTest, method testVariableLengthSequence.

@Test
public void testVariableLengthSequence() throws Exception {
    //need to manually extract
    for (int i = 0; i < 3; i++) {
        new ClassPathResource(String.format("csvsequence_%d.txt", i)).getTempFileFromArchive();
        new ClassPathResource(String.format("csvsequencelabelsShort_%d.txt", i)).getTempFileFromArchive();
    }
    ClassPathResource resource = new ClassPathResource("csvsequence_0.txt");
    String featuresPath = resource.getTempFileFromArchive().getAbsolutePath().replaceAll("0", "%d");
    resource = new ClassPathResource("csvsequencelabelsShort_0.txt");
    String labelsPath = resource.getTempFileFromArchive().getAbsolutePath().replaceAll("0", "%d");
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReader featureReader2 = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader2 = new CSVSequenceRecordReader(1, ",");
    featureReader2.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader2.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iterAlignStart = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_START);
    SequenceRecordReaderDataSetIterator iterAlignEnd = new SequenceRecordReaderDataSetIterator(featureReader2, labelReader2, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
    assertEquals(3, iterAlignStart.inputColumns());
    assertEquals(4, iterAlignStart.totalOutcomes());
    assertEquals(3, iterAlignEnd.inputColumns());
    assertEquals(4, iterAlignEnd.totalOutcomes());
    List<DataSet> dsListAlignStart = new ArrayList<>();
    while (iterAlignStart.hasNext()) {
        dsListAlignStart.add(iterAlignStart.next());
    }
    List<DataSet> dsListAlignEnd = new ArrayList<>();
    while (iterAlignEnd.hasNext()) {
        dsListAlignEnd.add(iterAlignEnd.next());
    }
    //3 files
    assertEquals(3, dsListAlignStart.size());
    //3 files
    assertEquals(3, dsListAlignEnd.size());
    for (int i = 0; i < 3; i++) {
        DataSet ds = dsListAlignStart.get(i);
        INDArray features = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();
        //1 example in mini-batch
        assertEquals(1, features.size(0));
        assertEquals(1, labels.size(0));
        //3 values per line/time step
        assertEquals(3, features.size(1));
        //1 value per line, but 4 possible values -> one-hot vector
        assertEquals(4, labels.size(1));
        //sequence length = 4
        assertEquals(4, features.size(2));
        assertEquals(4, labels.size(2));
        DataSet ds2 = dsListAlignEnd.get(i);
        features = ds2.getFeatureMatrix();
        labels = ds2.getLabels();
        //1 example in mini-batch
        assertEquals(1, features.size(0));
        assertEquals(1, labels.size(0));
        //3 values per line/time step
        assertEquals(3, features.size(1));
        //1 value per line, but 4 possible values -> one-hot vector
        assertEquals(4, labels.size(1));
        //sequence length = 4
        assertEquals(4, features.size(2));
        assertEquals(4, labels.size(2));
    }
    //Check features vs. expected:
    //Here: labels are always shorter than features -> same features for align start and align end
    INDArray expF0 = Nd4j.create(1, 3, 4);
    expF0.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 0, 1, 2 }));
    expF0.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 10, 11, 12 }));
    expF0.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 20, 21, 22 }));
    expF0.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 30, 31, 32 }));
    assertEquals(expF0, dsListAlignStart.get(0).getFeatureMatrix());
    assertEquals(expF0, dsListAlignEnd.get(0).getFeatureMatrix());
    INDArray expF1 = Nd4j.create(1, 3, 4);
    expF1.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 100, 101, 102 }));
    expF1.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 110, 111, 112 }));
    expF1.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 120, 121, 122 }));
    expF1.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 130, 131, 132 }));
    assertEquals(expF1, dsListAlignStart.get(1).getFeatureMatrix());
    assertEquals(expF1, dsListAlignEnd.get(1).getFeatureMatrix());
    INDArray expF2 = Nd4j.create(1, 3, 4);
    expF2.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 200, 201, 202 }));
    expF2.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 210, 211, 212 }));
    expF2.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 220, 221, 222 }));
    expF2.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 230, 231, 232 }));
    assertEquals(expF2, dsListAlignStart.get(2).getFeatureMatrix());
    assertEquals(expF2, dsListAlignEnd.get(2).getFeatureMatrix());
    //Check features mask array:
    //1 example, 4 time steps: features mask is the same for both start and end alignment here
    INDArray featuresMaskExpected = Nd4j.ones(1, 4);
    for (int i = 0; i < 3; i++) {
        INDArray featuresMaskStart = dsListAlignStart.get(i).getFeaturesMaskArray();
        INDArray featuresMaskEnd = dsListAlignEnd.get(i).getFeaturesMaskArray();
        assertEquals(featuresMaskExpected, featuresMaskStart);
        assertEquals(featuresMaskExpected, featuresMaskEnd);
    }
    //Check labels vs. expected:
    //First: aligning start
    INDArray expL0 = Nd4j.create(1, 4, 4);
    expL0.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 1, 0, 0, 0 }));
    expL0.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL0, dsListAlignStart.get(0).getLabels());
    INDArray expL1 = Nd4j.create(1, 4, 4);
    expL1.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL1, dsListAlignStart.get(1).getLabels());
    INDArray expL2 = Nd4j.create(1, 4, 4);
    expL2.tensorAlongDimension(0, 1).assign(Nd4j.create(new double[] { 0, 0, 0, 1 }));
    expL2.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 0, 0, 1, 0 }));
    expL2.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL2, dsListAlignStart.get(2).getLabels());
    //Second: align end
    INDArray expL0end = Nd4j.create(1, 4, 4);
    expL0end.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 1, 0, 0, 0 }));
    expL0end.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL0end, dsListAlignEnd.get(0).getLabels());
    INDArray expL1end = Nd4j.create(1, 4, 4);
    expL1end.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL1end, dsListAlignEnd.get(1).getLabels());
    INDArray expL2end = Nd4j.create(1, 4, 4);
    expL2end.tensorAlongDimension(1, 1).assign(Nd4j.create(new double[] { 0, 0, 0, 1 }));
    expL2end.tensorAlongDimension(2, 1).assign(Nd4j.create(new double[] { 0, 0, 1, 0 }));
    expL2end.tensorAlongDimension(3, 1).assign(Nd4j.create(new double[] { 0, 1, 0, 0 }));
    assertEquals(expL2end, dsListAlignEnd.get(2).getLabels());
    //Check labels mask array
    INDArray[] labelsMaskExpectedStart = new INDArray[] { Nd4j.create(new float[] { 1, 1, 0, 0 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 1, 0, 0, 0 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 1, 1, 1, 0 }, new int[] { 1, 4 }) };
    INDArray[] labelsMaskExpectedEnd = new INDArray[] { Nd4j.create(new float[] { 0, 0, 1, 1 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 0, 0, 0, 1 }, new int[] { 1, 4 }), Nd4j.create(new float[] { 0, 1, 1, 1 }, new int[] { 1, 4 }) };
    for (int i = 0; i < 3; i++) {
        INDArray labelsMaskStart = dsListAlignStart.get(i).getLabelsMaskArray();
        INDArray labelsMaskEnd = dsListAlignEnd.get(i).getLabelsMaskArray();
        assertEquals(labelsMaskExpectedStart[i], labelsMaskStart);
        assertEquals(labelsMaskExpectedEnd[i], labelsMaskEnd);
    }
}
Also used:
CSVSequenceRecordReader (org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader)
SequenceRecordReader (org.datavec.api.records.reader.SequenceRecordReader)
CollectionSequenceRecordReader (org.datavec.api.records.reader.impl.collection.CollectionSequenceRecordReader)
INDArray (org.nd4j.linalg.api.ndarray.INDArray)
DataSet (org.nd4j.linalg.dataset.DataSet)
ClassPathResource (org.nd4j.linalg.io.ClassPathResource)
NumberedFileInputSplit (org.datavec.api.split.NumberedFileInputSplit)
Test (org.junit.Test)
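
The alignment modes only decide where the shorter sequence is padded: ALIGN_START places label values at the first time steps and masks out the tail, ALIGN_END places them at the last time steps and masks out the head, exactly as the expected mask arrays above show. A condensed sketch, assuming readers initialized as in the test:

// Same data, two alignment choices; only the position of the label padding differs.
// For a length-2 label sequence in a length-4 feature sequence the label masks are:
//   ALIGN_START -> [1, 1, 0, 0] (zeros padded at the end)
//   ALIGN_END   -> [0, 0, 1, 1] (zeros padded at the start)
SequenceRecordReaderDataSetIterator it = new SequenceRecordReaderDataSetIterator(
        featureReader, labelReader, 1, 4, false,
        SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
INDArray labelsMask = it.next().getLabelsMaskArray();   // shape [miniBatch, timeSteps]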

Example 3 with NumberedFileInputSplit

Use of org.datavec.api.split.NumberedFileInputSplit in project deeplearning4j by deeplearning4j.

From class RecordReaderDataSetiteratorTest, method testSequenceRecordReaderRegression.

@Test
public void testSequenceRecordReaderRegression() throws Exception {
    //need to manually extract
    for (int i = 0; i < 3; i++) {
        new ClassPathResource(String.format("csvsequence_%d.txt", i)).getTempFileFromArchive();
    }
    ClassPathResource resource = new ClassPathResource("csvsequence_0.txt");
    String featuresPath = resource.getTempFileFromArchive().getAbsolutePath().replaceAll("0", "%d");
    resource = new ClassPathResource("csvsequence_0.txt");
    String labelsPath = resource.getTempFileFromArchive().getAbsolutePath().replaceAll("0", "%d");
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 0, true);
    assertEquals(3, iter.inputColumns());
    assertEquals(3, iter.totalOutcomes());
    List<DataSet> dsList = new ArrayList<>();
    while (iter.hasNext()) {
        dsList.add(iter.next());
    }
    //3 files
    assertEquals(3, dsList.size());
    for (int i = 0; i < 3; i++) {
        DataSet ds = dsList.get(i);
        INDArray features = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();
        //1 example, 3 values, 4 time steps
        assertArrayEquals(new int[] { 1, 3, 4 }, features.shape());
        assertArrayEquals(new int[] { 1, 3, 4 }, labels.shape());
        assertEquals(features, labels);
    }
    //Also test regression + reset from a single reader:
    featureReader.reset();
    iter = new SequenceRecordReaderDataSetIterator(featureReader, 1, 0, 2, true);
    int count = 0;
    while (iter.hasNext()) {
        DataSet ds = iter.next();
        assertEquals(2, ds.getFeatureMatrix().size(1));
        assertEquals(1, ds.getLabels().size(1));
        count++;
    }
    assertEquals(3, count);
    iter.reset();
    count = 0;
    while (iter.hasNext()) {
        iter.next();
        count++;
    }
    assertEquals(3, count);
}
Also used:
CSVSequenceRecordReader (org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader)
SequenceRecordReader (org.datavec.api.records.reader.SequenceRecordReader)
CollectionSequenceRecordReader (org.datavec.api.records.reader.impl.collection.CollectionSequenceRecordReader)
INDArray (org.nd4j.linalg.api.ndarray.INDArray)
DataSet (org.nd4j.linalg.dataset.DataSet)
ClassPathResource (org.nd4j.linalg.io.ClassPathResource)
NumberedFileInputSplit (org.datavec.api.split.NumberedFileInputSplit)
Test (org.junit.Test)
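
The single-reader constructor at the end of this test is worth calling out: one CSVSequenceRecordReader supplies both features and labels, with labelIndex selecting the target column. A minimal sketch, assuming a three-column reader initialized as above:

// labelIndex = 2: column 2 becomes the regression target, columns 0-1 the features.
// numPossibleLabels = 0 is ignored because regression = true.
SequenceRecordReaderDataSetIterator singleReaderIter =
        new SequenceRecordReaderDataSetIterator(featureReader, 1, 0, 2, true);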

Example 4 with NumberedFileInputSplit

Use of org.datavec.api.split.NumberedFileInputSplit in project deeplearning4j by deeplearning4j.

From class TestDataVecDataSetFunctions, method testDataVecSequencePairDataSetFunctionVariableLength.

@Test
public void testDataVecSequencePairDataSetFunctionVariableLength() throws Exception {
    //Same sort of test as testDataVecSequencePairDataSetFunction() but with variable length time series (labels shorter, align end)
    //Convert data to a SequenceFile:
    File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
    String pathFeatures = f.getAbsolutePath();
    String folderFeatures = pathFeatures.substring(0, pathFeatures.length() - 17);
    pathFeatures = folderFeatures + "*";
    File f2 = new File("src/test/resources/csvsequencelabels/csvsequencelabelsShort_0.txt");
    String pathLabels = f2.getPath();
    String folderLabels = pathLabels.substring(0, pathLabels.length() - 28);
    pathLabels = folderLabels + "*";
    //Extract a number from the file name
    PathToKeyConverter pathConverter = new PathToKeyConverterNumber();
    JavaPairRDD<Text, BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, pathFeatures, pathLabels, pathConverter);
    Path p = Files.createTempDirectory("dl4j_testSeqPairFnVarLength");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
    //Load from sequence file:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);
    SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);
    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);
    //Map to DataSet:
    DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction(4, false, DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_END);
    JavaRDD<DataSet> data = writables.map(pairFn);
    List<DataSet> sparkData = data.collect();
    //Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
    String featuresPath = f.getPath().replaceAll("0", "%d");
    String labelsPath = f2.getPath().replaceAll("0", "%d");
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
    List<DataSet> localData = new ArrayList<>(3);
    while (iter.hasNext()) localData.add(iter.next());
    assertEquals(3, sparkData.size());
    assertEquals(3, localData.size());
    //1 example, 3 values, 4 time steps
    int[] fShapeExp = new int[] { 1, 3, 4 };
    //1 example, 4 values/classes, 4 time steps (after padding)
    int[] lShapeExp = new int[] { 1, 4, 4 };
    for (int i = 0; i < 3; i++) {
        //Check shapes etc.; data set order may differ for Spark vs. local
        DataSet dsSpark = sparkData.get(i);
        DataSet dsLocal = localData.get(i);
        //Expect mask array for labels
        assertNotNull(dsSpark.getLabelsMaskArray());
        INDArray fSpark = dsSpark.getFeatureMatrix();
        INDArray fLocal = dsLocal.getFeatureMatrix();
        INDArray lSpark = dsSpark.getLabels();
        INDArray lLocal = dsLocal.getLabels();
        assertArrayEquals(fShapeExp, fSpark.shape());
        assertArrayEquals(fShapeExp, fLocal.shape());
        assertArrayEquals(lShapeExp, lSpark.shape());
        assertArrayEquals(lShapeExp, lLocal.shape());
    }
    //Check that results are the same (order notwithstanding)
    boolean[] found = new boolean[3];
    for (int i = 0; i < 3; i++) {
        int foundIndex = -1;
        DataSet ds = sparkData.get(i);
        for (int j = 0; j < 3; j++) {
            if (ds.equals(localData.get(j))) {
                if (foundIndex != -1)
                    //Already found a match -> suggests this Spark value equals two or more local values (shouldn't happen)
                    fail();
                foundIndex = j;
                if (found[foundIndex])
                    //Another Spark value already matched this local value -> suggests duplicates in the Spark list
                    fail();
                //Mark this local value as matched
                found[foundIndex] = true;
            }
        }
    }
    int count = 0;
    for (boolean b : found) if (b)
        count++;
    //Expect all 3 and exactly 3 pairwise matches between spark and local versions
    assertEquals(3, count);
    //-------------------------------------------------
    //NOW: test same thing, but for align start...
    DataVecSequencePairDataSetFunction pairFnAlignStart = new DataVecSequencePairDataSetFunction(4, false, DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_START);
    JavaRDD<DataSet> rddDataAlignStart = writables.map(pairFnAlignStart);
    List<DataSet> sparkDataAlignStart = rddDataAlignStart.collect();
    //re-initialize to reset
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
    SequenceRecordReaderDataSetIterator iterAlignStart = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_START);
    List<DataSet> localDataAlignStart = new ArrayList<>(3);
    while (iterAlignStart.hasNext()) localDataAlignStart.add(iterAlignStart.next());
    assertEquals(3, sparkDataAlignStart.size());
    assertEquals(3, localDataAlignStart.size());
    for (int i = 0; i < 3; i++) {
        //Check shapes etc.; data set order may differ for Spark vs. local
        DataSet dsSpark = sparkDataAlignStart.get(i);
        DataSet dsLocal = localDataAlignStart.get(i);
        //Expect mask array for labels
        assertNotNull(dsSpark.getLabelsMaskArray());
        INDArray fSpark = dsSpark.getFeatureMatrix();
        INDArray fLocal = dsLocal.getFeatureMatrix();
        INDArray lSpark = dsSpark.getLabels();
        INDArray lLocal = dsLocal.getLabels();
        assertArrayEquals(fShapeExp, fSpark.shape());
        assertArrayEquals(fShapeExp, fLocal.shape());
        assertArrayEquals(lShapeExp, lSpark.shape());
        assertArrayEquals(lShapeExp, lLocal.shape());
    }
    //Check that results are the same (order notwithstanding)
    found = new boolean[3];
    for (int i = 0; i < 3; i++) {
        int foundIndex = -1;
        DataSet ds = sparkDataAlignStart.get(i);
        for (int j = 0; j < 3; j++) {
            if (ds.equals(localDataAlignStart.get(j))) {
                if (foundIndex != -1)
                    //Already found a match -> suggests this Spark value equals two or more local values (shouldn't happen)
                    fail();
                foundIndex = j;
                if (found[foundIndex])
                    //Another Spark value already matched this local value -> suggests duplicates in the Spark list
                    fail();
                //Mark this local value as matched
                found[foundIndex] = true;
            }
        }
    }
    count = 0;
    for (boolean b : found) if (b)
        count++;
    //Expect all 3 and exactly 3 pairwise matches between spark and local versions
    assertEquals(3, count);
}
Also used:
DataSet (org.nd4j.linalg.dataset.DataSet)
SequenceRecordReaderDataSetIterator (org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator)
ArrayList (java.util.ArrayList)
List (java.util.List)
Path (java.nio.file.Path)
CSVSequenceRecordReader (org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader)
SequenceRecordReader (org.datavec.api.records.reader.SequenceRecordReader)
Text (org.apache.hadoop.io.Text)
NumberedFileInputSplit (org.datavec.api.split.NumberedFileInputSplit)
INDArray (org.nd4j.linalg.api.ndarray.INDArray)
Tuple2 (scala.Tuple2)
File (java.io.File)
BaseSparkTest (org.deeplearning4j.spark.BaseSparkTest)
Test (org.junit.Test)
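
The order-insensitive comparison between the Spark and local results is repeated verbatim in several of these tests; it can be factored into a helper. A sketch of equivalent logic, assuming JUnit's static asserts are imported as in the test (countUniqueMatches is an illustrative name, not part of the test class):

// Counts Spark data sets that match exactly one local data set, failing on
// duplicate matches in either direction, like the inline loops above.
private static int countUniqueMatches(List<DataSet> spark, List<DataSet> local) {
    boolean[] found = new boolean[local.size()];
    for (DataSet ds : spark) {
        int foundIndex = -1;
        for (int j = 0; j < local.size(); j++) {
            if (ds.equals(local.get(j))) {
                assertEquals(-1, foundIndex);  // one Spark value matched two local values
                assertFalse(found[j]);         // two Spark values matched one local value
                foundIndex = j;
                found[j] = true;
            }
        }
    }
    int count = 0;
    for (boolean b : found) if (b) count++;
    return count;
}

With such a helper, each verification block would reduce to assertEquals(3, countUniqueMatches(sparkData, localData)).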

Example 5 with NumberedFileInputSplit

Use of org.datavec.api.split.NumberedFileInputSplit in project deeplearning4j by deeplearning4j.

From class TestDataVecDataSetFunctions, method testDataVecSequencePairDataSetFunction.

@Test
public void testDataVecSequencePairDataSetFunction() throws Exception {
    JavaSparkContext sc = getContext();
    //Convert data to a SequenceFile:
    File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
    String path = f.getPath();
    String folder = path.substring(0, path.length() - 17);
    path = folder + "*";
    PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
    JavaPairRDD<Text, BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);
    Path p = Files.createTempDirectory("dl4j_testSeqPairFn");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
    //Load from sequence file:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);
    SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);
    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);
    //Map to DataSet:
    DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction();
    JavaRDD<DataSet> data = writables.map(pairFn);
    List<DataSet> sparkData = data.collect();
    //Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
    String featuresPath = f.getAbsolutePath().replaceAll("0", "%d");
    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, -1, true);
    List<DataSet> localData = new ArrayList<>(3);
    while (iter.hasNext()) localData.add(iter.next());
    assertEquals(3, sparkData.size());
    assertEquals(3, localData.size());
    for (int i = 0; i < 3; i++) {
        //Check shapes etc.; data set order may differ for Spark vs. local
        DataSet dsSpark = sparkData.get(i);
        DataSet dsLocal = localData.get(i);
        assertNull(dsSpark.getFeaturesMaskArray());
        assertNull(dsSpark.getLabelsMaskArray());
        INDArray fSpark = dsSpark.getFeatureMatrix();
        INDArray fLocal = dsLocal.getFeatureMatrix();
        INDArray lSpark = dsSpark.getLabels();
        INDArray lLocal = dsLocal.getLabels();
        //1 example, 3 values, 4 time steps
        int[] s = new int[] { 1, 3, 4 };
        assertArrayEquals(s, fSpark.shape());
        assertArrayEquals(s, fLocal.shape());
        assertArrayEquals(s, lSpark.shape());
        assertArrayEquals(s, lLocal.shape());
    }
    //Check that results are the same (order notwithstanding)
    boolean[] found = new boolean[3];
    for (int i = 0; i < 3; i++) {
        int foundIndex = -1;
        DataSet ds = sparkData.get(i);
        for (int j = 0; j < 3; j++) {
            if (ds.equals(localData.get(j))) {
                if (foundIndex != -1)
                    //Already found a match -> suggests this Spark value equals two or more local values (shouldn't happen)
                    fail();
                foundIndex = j;
                if (found[foundIndex])
                    //Another Spark value already matched this local value -> suggests duplicates in the Spark list
                    fail();
                //Mark this local value as matched
                found[foundIndex] = true;
            }
        }
    }
    int count = 0;
    for (boolean b : found) if (b)
        count++;
    //Expect all 3 and exactly 3 pairwise matches between spark and local versions
    assertEquals(3, count);
}
Also used:
DataSet (org.nd4j.linalg.dataset.DataSet)
SequenceRecordReaderDataSetIterator (org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator)
ArrayList (java.util.ArrayList)
List (java.util.List)
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
Path (java.nio.file.Path)
CSVSequenceRecordReader (org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader)
SequenceRecordReader (org.datavec.api.records.reader.SequenceRecordReader)
Text (org.apache.hadoop.io.Text)
NumberedFileInputSplit (org.datavec.api.split.NumberedFileInputSplit)
INDArray (org.nd4j.linalg.api.ndarray.INDArray)
Tuple2 (scala.Tuple2)
File (java.io.File)
BaseSparkTest (org.deeplearning4j.spark.BaseSparkTest)
Test (org.junit.Test)
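
Distilled from all five examples, the local pipeline follows one pattern: initialize a pair of CSVSequenceRecordReaders with NumberedFileInputSplits, then wrap them in a SequenceRecordReaderDataSetIterator. A minimal end-to-end sketch, inside a method that declares throws Exception (the /data paths and the label count of 4 are hypothetical):

SequenceRecordReader features = new CSVSequenceRecordReader(1, ",");
SequenceRecordReader labels = new CSVSequenceRecordReader(1, ",");
features.initialize(new NumberedFileInputSplit("/data/csvsequence_%d.txt", 0, 2));
labels.initialize(new NumberedFileInputSplit("/data/csvsequencelabels_%d.txt", 0, 2));

// miniBatchSize = 1, 4 label classes, classification (regression = false)
SequenceRecordReaderDataSetIterator iter =
        new SequenceRecordReaderDataSetIterator(features, labels, 1, 4, false);
while (iter.hasNext()) {
    DataSet ds = iter.next();   // features shape [miniBatch, nIn, timeSteps]; labels one-hot
}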

Aggregations

SequenceRecordReader (org.datavec.api.records.reader.SequenceRecordReader): 14 usages
CSVSequenceRecordReader (org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader): 14 usages
NumberedFileInputSplit (org.datavec.api.split.NumberedFileInputSplit): 14 usages
Test (org.junit.Test): 14 usages
DataSet (org.nd4j.linalg.dataset.DataSet): 12 usages
ClassPathResource (org.nd4j.linalg.io.ClassPathResource): 12 usages
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 9 usages
CollectionSequenceRecordReader (org.datavec.api.records.reader.impl.collection.CollectionSequenceRecordReader): 7 usages
MultiDataSet (org.nd4j.linalg.dataset.api.MultiDataSet): 5 usages
RecordMetaData (org.datavec.api.records.metadata.RecordMetaData): 4 usages
File (java.io.File): 2 usages
Path (java.nio.file.Path): 2 usages
ArrayList (java.util.ArrayList): 2 usages
List (java.util.List): 2 usages
Text (org.apache.hadoop.io.Text): 2 usages
SequenceRecordReaderDataSetIterator (org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator): 2 usages
BaseSparkTest (org.deeplearning4j.spark.BaseSparkTest): 2 usages
MultiDataSetIterator (org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator): 2 usages
Tuple2 (scala.Tuple2): 2 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 1 usage