the class TestDataVecDataSetFunctions method testDataVecSequencePairDataSetFunctionVariableLength.
/**
 * Checks that converting paired feature/label sequences to DataSets on Spark (via
 * DataVecSequencePairDataSetFunction) gives the same results as doing it locally (via
 * SequenceRecordReaderDataSetIterator) when the label sequences are shorter than the
 * feature sequences.
 *
 * The test runs twice: first with AlignmentMode.ALIGN_END, then with AlignmentMode.ALIGN_START.
 * Each pass asserts that both sides produce 3 DataSets, that feature/label shapes are
 * {1, 3, 4} and {1, 4, 4}, and then counts pairwise matches between the Spark and local lists.
 *
 * NOTE(review): this listing looks damaged by extraction - closing braces are missing and several
 * statements are cut off (e.g. {@code writables =;}, {@code data =;}, {@code localData.add(;}).
 * The missing tokens must be restored from the real file before this compiles.
 *
 * @throws Exception if any file, Spark or record reader operation fails
 */
public void testDataVecSequencePairDataSetFunctionVariableLength() throws Exception {
//Same sort of test as testDataVecSequencePairDataSetFunction() but with variable length time series (labels shorter, align end)
//Convert data to a SequenceFile:
File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
//NOTE(review): features use getAbsolutePath() here while labels below use getPath() - confirm the mismatch is intentional
String pathFeatures = f.getAbsolutePath();
//17 == "csvsequence_0.txt".length(): strips the file name, leaving the folder (with trailing separator)
String folderFeatures = pathFeatures.substring(0, pathFeatures.length() - 17);
pathFeatures = folderFeatures + "*";
File f2 = new File("src/test/resources/csvsequencelabels/csvsequencelabelsShort_0.txt");
String pathLabels = f2.getPath();
//28 == "csvsequencelabelsShort_0.txt".length(): strips the file name, leaving the folder (with trailing separator)
String folderLabels = pathLabels.substring(0, pathLabels.length() - 28);
pathLabels = folderLabels + "*";
//Extract a number from the file name
PathToKeyConverter pathConverter = new PathToKeyConverterNumber();
//NOTE(review): 'sc' is not declared in this method - presumably a field of the test class; verify
JavaPairRDD<Text, BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, pathFeatures, pathLabels, pathConverter);
Path p = Files.createTempDirectory("dl4j_testSeqPairFnVarLength");
String outPath = p.toString() + "/out";
//Only the "out" path is registered for deletion at JVM exit; the temp directory 'p' itself is not registered here
new File(outPath).deleteOnExit();
toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
//Load from sequence file:
JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);
SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);
//NOTE(review): right-hand side is missing in this listing - presumably a map of 'fromSeq' through 'psrbf'; confirm
JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables =;
//Map to DataSet:
DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction(4, false, DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_END);
//NOTE(review): right-hand side is missing in this listing - presumably a map of 'writables' through 'pairFn'; confirm
JavaRDD<DataSet> data =;
List<DataSet> sparkData = data.collect();
//Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
//Turn the "_0" file names into a "%d" pattern for NumberedFileInputSplit.
//NOTE(review): replaceAll("0", "%d") rewrites EVERY '0' in the path, not just the index in the file name
String featuresPath = f.getPath().replaceAll("0", "%d");
String labelsPath = f2.getPath().replaceAll("0", "%d");
SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);
List<DataSet> localData = new ArrayList<>(3);
//NOTE(review): the argument to add(...) is missing in this listing - presumably the iterator's next element; confirm
while (iter.hasNext()) localData.add(;
assertEquals(3, sparkData.size());
assertEquals(3, localData.size());
//1 example, 3 values, 4 time steps
int[] fShapeExp = new int[] { 1, 3, 4 };
//1 example, 4 values/classes, 4 time steps (after padding)
int[] lShapeExp = new int[] { 1, 4, 4 };
for (int i = 0; i < 3; i++) {
//Check shapes etc. data sets order may differ for spark vs. local
DataSet dsSpark = sparkData.get(i);
DataSet dsLocal = localData.get(i);
//Expect mask array for labels
//NOTE(review): no mask-array assertion is visible in this listing; it may have been lost
INDArray fSpark = dsSpark.getFeatureMatrix();
INDArray fLocal = dsLocal.getFeatureMatrix();
INDArray lSpark = dsSpark.getLabels();
INDArray lLocal = dsLocal.getLabels();
assertArrayEquals(fShapeExp, fSpark.shape());
assertArrayEquals(fShapeExp, fLocal.shape());
assertArrayEquals(lShapeExp, lSpark.shape());
assertArrayEquals(lShapeExp, lLocal.shape());
//Check that results are the same (order notwithstanding)
//found[j] records that local data set j has been matched by some Spark data set
boolean[] found = new boolean[3];
for (int i = 0; i < 3; i++) {
int foundIndex = -1;
DataSet ds = sparkData.get(i);
for (int j = 0; j < 3; j++) {
if (ds.equals(localData.get(j))) {
if (foundIndex != -1)
//Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
foundIndex = j;
if (found[foundIndex])
//One of the other spark values was equal to this one -> suggests duplicates in Spark list
//mark this one as seen before
found[foundIndex] = true;
int count = 0;
for (boolean b : found) if (b)
//Expect all 3 and exactly 3 pairwise matches between spark and local versions
assertEquals(3, count);
//NOW: test same thing, but for align start...
DataVecSequencePairDataSetFunction pairFnAlignStart = new DataVecSequencePairDataSetFunction(4, false, DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_START);
//NOTE(review): right-hand side is missing in this listing - presumably a map of 'writables' through 'pairFnAlignStart'; confirm
JavaRDD<DataSet> rddDataAlignStart =;
List<DataSet> sparkDataAlignStart = rddDataAlignStart.collect();
//re-initialize to reset
featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
SequenceRecordReaderDataSetIterator iterAlignStart = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_START);
List<DataSet> localDataAlignStart = new ArrayList<>(3);
//NOTE(review): the argument to add(...) is missing in this listing - presumably the iterator's next element; confirm
while (iterAlignStart.hasNext()) localDataAlignStart.add(;
assertEquals(3, sparkDataAlignStart.size());
assertEquals(3, localDataAlignStart.size());
for (int i = 0; i < 3; i++) {
//Check shapes etc. data sets order may differ for spark vs. local
DataSet dsSpark = sparkDataAlignStart.get(i);
DataSet dsLocal = localDataAlignStart.get(i);
//Expect mask array for labels
INDArray fSpark = dsSpark.getFeatureMatrix();
INDArray fLocal = dsLocal.getFeatureMatrix();
INDArray lSpark = dsSpark.getLabels();
INDArray lLocal = dsLocal.getLabels();
assertArrayEquals(fShapeExp, fSpark.shape());
assertArrayEquals(fShapeExp, fLocal.shape());
assertArrayEquals(lShapeExp, lSpark.shape());
assertArrayEquals(lShapeExp, lLocal.shape());
//Check that results are the same (order notwithstanding)
found = new boolean[3];
for (int i = 0; i < 3; i++) {
int foundIndex = -1;
//NOTE(review): this pass compares sparkData / localData (the ALIGN_END lists) again rather than
//sparkDataAlignStart / localDataAlignStart, so the ALIGN_START contents are never compared - likely a copy-paste slip
DataSet ds = sparkData.get(i);
for (int j = 0; j < 3; j++) {
if (ds.equals(localData.get(j))) {
if (foundIndex != -1)
//Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
foundIndex = j;
if (found[foundIndex])
//One of the other spark values was equal to this one -> suggests duplicates in Spark list
//mark this one as seen before
found[foundIndex] = true;
count = 0;
for (boolean b : found) if (b)
//Expect all 3 and exactly 3 pairwise matches between spark and local versions
assertEquals(3, count);
the class TestDataVecDataSetFunctions method testDataVecSequencePairDataSetFunction.
/**
 * Checks that converting paired sequences to DataSets on Spark (via
 * DataVecSequencePairDataSetFunction) gives the same results as doing it locally (via
 * SequenceRecordReaderDataSetIterator).
 *
 * The same csvsequence files are used as both features and labels on both sides. The test asserts
 * that each side produces 3 DataSets, that every feature and label array has shape {1, 3, 4},
 * and then counts pairwise matches between the Spark and local lists.
 *
 * NOTE(review): this listing looks damaged by extraction - closing braces are missing and several
 * statements are cut off (e.g. {@code writables =;}, {@code data =;}, {@code localData.add(;}).
 * The missing tokens must be restored from the real file before this compiles.
 *
 * @throws Exception if any file, Spark or record reader operation fails
 */
public void testDataVecSequencePairDataSetFunction() throws Exception {
JavaSparkContext sc = getContext();
//Convert data to a SequenceFile:
File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
String path = f.getPath();
//17 == "csvsequence_0.txt".length(): strips the file name, leaving the folder (with trailing separator)
String folder = path.substring(0, path.length() - 17);
path = folder + "*";
PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
//The same glob is passed for both the first and second path arguments, so each file is paired with itself
JavaPairRDD<Text, BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);
Path p = Files.createTempDirectory("dl4j_testSeqPairFn");
String outPath = p.toString() + "/out";
//Only the "out" path is registered for deletion at JVM exit; the temp directory 'p' itself is not registered here
new File(outPath).deleteOnExit();
toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
//Load from sequence file:
JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);
SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);
//NOTE(review): right-hand side is missing in this listing - presumably a map of 'fromSeq' through 'psrbf'; confirm
JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables =;
//Map to DataSet:
DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction();
//NOTE(review): right-hand side is missing in this listing - presumably a map of 'writables' through 'pairFn'; confirm
JavaRDD<DataSet> data =;
List<DataSet> sparkData = data.collect();
//Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
//NOTE(review): replaceAll("0", "%d") rewrites EVERY '0' in the absolute path, so a '0' anywhere in a parent
//directory name would corrupt the pattern given to NumberedFileInputSplit - confirm this is acceptable
String featuresPath = f.getAbsolutePath().replaceAll("0", "%d");
SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
//Both readers read the same numbered files (indices 0..2), mirroring the Spark side where features == labels
featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
labelReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, -1, true);
List<DataSet> localData = new ArrayList<>(3);
//NOTE(review): the argument to add(...) is missing in this listing - presumably the iterator's next element; confirm
while (iter.hasNext()) localData.add(;
assertEquals(3, sparkData.size());
assertEquals(3, localData.size());
for (int i = 0; i < 3; i++) {
//Check shapes etc. data sets order may differ for spark vs. local
DataSet dsSpark = sparkData.get(i);
DataSet dsLocal = localData.get(i);
INDArray fSpark = dsSpark.getFeatureMatrix();
INDArray fLocal = dsLocal.getFeatureMatrix();
INDArray lSpark = dsSpark.getLabels();
INDArray lLocal = dsLocal.getLabels();
//Expected shape {1, 3, 4} for both features and labels
//NOTE(review): the original comment read "1 example, 3 values, 3 time steps", which does not match the last dimension (4)
int[] s = new int[] { 1, 3, 4 };
assertArrayEquals(s, fSpark.shape());
assertArrayEquals(s, fLocal.shape());
assertArrayEquals(s, lSpark.shape());
assertArrayEquals(s, lLocal.shape());
//Check that results are the same (order notwithstanding)
//found[j] records that local data set j has been matched by some Spark data set
boolean[] found = new boolean[3];
for (int i = 0; i < 3; i++) {
int foundIndex = -1;
DataSet ds = sparkData.get(i);
for (int j = 0; j < 3; j++) {
if (ds.equals(localData.get(j))) {
if (foundIndex != -1)
//Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
foundIndex = j;
if (found[foundIndex])
//One of the other spark values was equal to this one -> suggests duplicates in Spark list
//mark this one as seen before
found[foundIndex] = true;
int count = 0;
for (boolean b : found) if (b)
//Expect all 3 and exactly 3 pairwise matches between spark and local versions
assertEquals(3, count);
the class FileReadingCollectorTest method setUpClass.
/**
 * Creates the fixture files for the test class: three temp directories, each holding one temp
 * file (gzip-compressed JSON, plain JSON, and an "emptyLine" JSON file), and writes the same two
 * JSON documents, one per line, into each file as UTF-8.
 *
 * NOTE(review): the closing braces of the try-with-resources blocks and of the method are missing
 * in this listing; they must be restored from the real file.
 *
 * @throws Exception if a temp directory or file cannot be created or written
 */
public static void setUpClass() throws Exception {
Path copy_from = Files.createTempDirectory("copy_from");
Path copy_from_gz = Files.createTempDirectory("copy_from_gz");
Path copy_from_empty = Files.createTempDirectory("copy_from_empty");
// tmpFileGz, tmpFile and tmpFileEmptyLine are not declared here - presumably static fields of the test class; verify
tmpFileGz = File.createTempFile("fileReadingCollector", ".json.gz", copy_from_gz.toFile());
tmpFile = File.createTempFile("fileReadingCollector", ".json", copy_from.toFile());
tmpFileEmptyLine = File.createTempFile("emptyLine", ".json", copy_from_empty.toFile());
// Gzip-compressed variant: the writer wraps a GZIPOutputStream over the file
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(tmpFileGz)), StandardCharsets.UTF_8))) {
writer.write("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}\n");
writer.write("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}\n");
// Plain (uncompressed) variant with the same two lines
try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tmpFile), StandardCharsets.UTF_8)) {
writer.write("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}\n");
writer.write("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}\n");
// NOTE(review): despite the "emptyLine" name, the content written here is identical to the other
// files and contains no empty line - a write of an empty line may have been lost; confirm
try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tmpFileEmptyLine), StandardCharsets.UTF_8)) {
writer.write("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}\n");
writer.write("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}\n");
the class LuceneOrderedDocCollectorTest method createLuceneIndex.
/**
 * Builds a small Lucene index for the tests and returns its Directory.
 *
 * Four documents are added through addDocToLucene: the first two with the values 1 and 2
 * (loop index + 1), the last two with a null value.
 *
 * NOTE(review): the initializer of {@code index} and the closing braces are missing in this
 * listing; they must be restored from the real file.
 *
 * @return the Directory the documents were written to
 * @throws IOException if the index cannot be created or written
 */
private Directory createLuceneIndex() throws IOException {
Path tmpDir = newTempDir();
// NOTE(review): right-hand side is missing - presumably a Directory opened on tmpDir; confirm
Directory index =;
StandardAnalyzer analyzer = new StandardAnalyzer();
IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
IndexWriter w = new IndexWriter(index, cfg);
// The loop counter is a boxed Long, so 'i + 1' is a long expression that can be passed
// where addDocToLucene also accepts null (see the else branch)
for (Long i = 0L; i < 4; i++) {
if (i < 2) {
addDocToLucene(w, i + 1);
} else {
addDocToLucene(w, null);
// NOTE(review): no commit/close of the IndexWriter is visible before the return in this listing;
// verify the writer is closed in the real file so the documents are visible to readers
return index;
the class JavaMain method buildScriptCommandLine.
/**
 * Resolves the filesystem path of the demo script bundled as a classpath resource.
 *
 * When the checked system property contains "Windows", the path of "/script/demo.bat" is
 * returned. Otherwise the "/script/" resource path is given POSIX permissions rwxr-xr-x and
 * returned.
 *
 * NOTE(review): the closing braces of the if-block and of the method are missing in this listing.
 *
 * @return the path to use as the script command line
 * @throws IOException if the POSIX permissions cannot be set
 */
private static String buildScriptCommandLine() throws IOException {
// NOTE(review): the property key is the empty string, so getProperty("") returns null unless such a
// key was set explicitly, and .contains(...) would then throw NullPointerException.
// Presumably the key (likely "os.name") was lost - confirm against the real file
if (System.getProperties().getProperty("").contains("Windows")) {
// substring(1) drops the first character of the resource URL path - presumably the leading '/'
// that precedes the drive letter on Windows; verify
return Paths.get(JavaMain.class.getResource("/script/demo.bat").getPath().substring(1)).toString();
// NOTE(review): "/script/" names the directory, not a script file - the file name may have been
// truncated; as written, the permissions below are applied to whatever this resource resolves to
Path result = Paths.get(JavaMain.class.getResource("/script/").getPath());
Files.setPosixFilePermissions(result, PosixFilePermissions.fromString("rwxr-xr-x"));
return result.toString();