Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.
Class VocabConstructorTest, method testVocab.
@Test
public void testVocab() throws Exception {
    File inputFile = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // The tokenizer factory is a field in the original test class; declared locally here so the snippet compiles.
    TokenizerFactory t = new DefaultTokenizerFactory();
    Set<String> set = new HashSet<>();
    int lines = 0;
    int cnt = 0;
    while (iter.hasNext()) {
        Tokenizer tok = t.create(iter.nextSentence());
        for (String token : tok.getTokens()) {
            // Skip null, empty, or whitespace-only tokens
            if (token == null || token.trim().isEmpty())
                continue;
            cnt++;
            set.add(token);
        }
        lines++;
    }
    log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
    log.info("Set:\n" + set);
}
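For context, ClassPathResource resolves a resource bundled on the test classpath to a usable File or stream. A minimal standalone sketch of that usage, assuming the getFile() and getInputStream() accessors present in this DataVec version:

import org.datavec.api.util.ClassPathResource;
import java.io.File;
import java.io.InputStream;

public class ClassPathResourceSketch {
    public static void main(String[] args) throws Exception {
        ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
        // Materialize the resource as a File (extracted to a temporary location if it lives inside a jar)
        File file = resource.getFile();
        System.out.println("Resolved to: " + file.getAbsolutePath());
        // Alternatively, stream the resource contents directly
        try (InputStream is = resource.getInputStream()) {
            System.out.println("First byte: " + is.read());
        }
    }
}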
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.
Class SequenceVectorsTest, method testInternalVocabConstruction.
@Test
public void testInternalVocabConstruction() throws Exception {
    ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
    File file = resource.getFile();
    BasicLineIterator underlyingIterator = new BasicLineIterator(file);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    SentenceTransformer transformer =
                    new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();
    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();
    SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration())
                    .minWordFrequency(5).iterate(sequenceIterator).batchSize(250).iterations(1).epochs(1)
                    .resetModel(false).trainElementsRepresentation(true).build();
    logger.info("Fitting model...");
    vectors.fit();
    logger.info("Model ready...");
    double sim = vectors.similarity("day", "night");
    logger.info("Day/night similarity: " + sim);
    assertTrue(sim > 0.6d);
    Collection<String> labels = vectors.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + labels);
}
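A fitted SequenceVectors model is usually persisted and reloaded rather than retrained. A minimal sketch continuing from the vectors object above, assuming the WordVectorSerializer.writeSequenceVectors/readSequenceVectors helpers and the VocabWordFactory serialization factory available in this deeplearning4j version (the file path is hypothetical):

// Sketch only: persist and restore the fitted model (path and factory usage are assumptions)
VocabWordFactory factory = new VocabWordFactory();
WordVectorSerializer.writeSequenceVectors(vectors, factory, "vectors.seq");
SequenceVectors<VocabWord> restored =
                WordVectorSerializer.readSequenceVectors(factory, new File("vectors.seq"));
logger.info("Restored day/night similarity: " + restored.similarity("day", "night"));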
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.
Class TestCnnSentenceDataSetIterator, method testSentenceIterator.
@Test
public void testSentenceIterator() throws Exception {
    WordVectors w2v = WordVectorSerializer
                    .readWord2VecModel(new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile());
    int vectorSize = w2v.lookupTable().layerSize();
    //        Collection<String> words = w2v.lookupTable().getVocabCache().words();
    //        for(String s : words){
    //            System.out.println(s);
    //        }
    List<String> sentences = new ArrayList<>();
    //First sentence: all words present in the model
    sentences.add("these balance Database model");
    //Second sentence: contains a word that is not in the model
    sentences.add("into same THISWORDDOESNTEXIST are");
    int maxLength = 4;
    List<String> s1 = Arrays.asList("these", "balance", "Database", "model");
    List<String> s2 = Arrays.asList("into", "same", "are");
    List<String> labelsForSentences = Arrays.asList("Positive", "Negative");
    //Order of labels: alphabetic. Positive -> [0,1]
    INDArray expLabels = Nd4j.create(new double[][] {{0, 1}, {1, 0}});
    boolean[] alongHeightVals = new boolean[] {true, false};
    for (boolean alongHeight : alongHeightVals) {
        INDArray expectedFeatures;
        if (alongHeight) {
            expectedFeatures = Nd4j.create(2, 1, maxLength, vectorSize);
        } else {
            expectedFeatures = Nd4j.create(2, 1, vectorSize, maxLength);
        }
        //Mask: second sentence has only 3 valid positions out of maxLength = 4
        INDArray expectedFeatureMask = Nd4j.create(new double[][] {{1, 1, 1, 1}, {1, 1, 1, 0}});
        for (int i = 0; i < 4; i++) {
            if (alongHeight) {
                expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0), NDArrayIndex.point(i),
                                NDArrayIndex.all()).assign(w2v.getWordVectorMatrix(s1.get(i)));
            } else {
                expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0), NDArrayIndex.all(),
                                NDArrayIndex.point(i)).assign(w2v.getWordVectorMatrix(s1.get(i)));
            }
        }
        for (int i = 0; i < 3; i++) {
            if (alongHeight) {
                expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0), NDArrayIndex.point(i),
                                NDArrayIndex.all()).assign(w2v.getWordVectorMatrix(s2.get(i)));
            } else {
                expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0), NDArrayIndex.all(),
                                NDArrayIndex.point(i)).assign(w2v.getWordVectorMatrix(s2.get(i)));
            }
        }
        LabeledSentenceProvider p = new CollectionLabeledSentenceProvider(sentences, labelsForSentences, null);
        CnnSentenceDataSetIterator dsi = new CnnSentenceDataSetIterator.Builder().sentenceProvider(p)
                        .wordVectors(w2v).maxSentenceLength(256).minibatchSize(32)
                        .sentencesAlongHeight(alongHeight).build();
        //            System.out.println("alongHeight = " + alongHeight);
        DataSet ds = dsi.next();
        assertArrayEquals(expectedFeatures.shape(), ds.getFeatures().shape());
        assertEquals(expectedFeatures, ds.getFeatures());
        assertEquals(expLabels, ds.getLabels());
        assertEquals(expectedFeatureMask, ds.getFeaturesMaskArray());
        assertNull(ds.getLabelsMaskArray());
        INDArray s1F = dsi.loadSingleSentence(sentences.get(0));
        INDArray s2F = dsi.loadSingleSentence(sentences.get(1));
        INDArray sub1 = ds.getFeatures().get(NDArrayIndex.interval(0, 0, true), NDArrayIndex.all(),
                        NDArrayIndex.all(), NDArrayIndex.all());
        INDArray sub2;
        if (alongHeight) {
            sub2 = ds.getFeatures().get(NDArrayIndex.interval(1, 1, true), NDArrayIndex.all(),
                            NDArrayIndex.interval(0, 3), NDArrayIndex.all());
        } else {
            sub2 = ds.getFeatures().get(NDArrayIndex.interval(1, 1, true), NDArrayIndex.all(),
                            NDArrayIndex.all(), NDArrayIndex.interval(0, 3));
        }
        assertArrayEquals(sub1.shape(), s1F.shape());
        assertArrayEquals(sub2.shape(), s2F.shape());
        assertEquals(sub1, s1F);
        assertEquals(sub2, s2F);
    }
}
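Outside the assertions above, the iterator is consumed like any other DataSetIterator. A minimal sketch of that consumption loop, continuing from the dsi built in the test (shapes described for the sentencesAlongHeight == true case):

// Sketch: consume all minibatches produced by the iterator
dsi.reset();
while (dsi.hasNext()) {
    DataSet batch = dsi.next();
    // Features are [minibatch, 1, sentenceLength, vectorSize] when sentences run along the height axis,
    // where sentenceLength is the longest sentence in the batch (matching the expected tensors above)
    System.out.println("Features shape: " + Arrays.toString(batch.getFeatures().shape()));
    System.out.println("Labels shape:   " + Arrays.toString(batch.getLabels().shape()));
}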
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.
Class AsyncLabelAwareIteratorTest, method nextDocument.
@Test
public void nextDocument() throws Exception {
    SentenceIterator sentence = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
    BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(sentence).build();
    int cnt = 0;
    while (backed.hasNextDocument()) {
        backed.nextDocument();
        cnt++;
    }
    assertEquals(97162, cnt);
    backed.reset();
    AsyncLabelAwareIterator iterator = new AsyncLabelAwareIterator(backed, 64);
    cnt = 0;
    while (iterator.hasNext()) {
        iterator.next();
        cnt++;
        if (cnt == 10)
            iterator.reset();
    }
    assertEquals(97172, cnt); // 97162 documents plus the 10 consumed before the reset
}
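Beyond counting, each element the async wrapper returns is a LabelledDocument carrying the text and its generated label. A minimal sketch of inspecting a few documents, continuing from the iterator above and assuming the getContent()/getLabels() accessors on LabelledDocument in this deeplearning4j version:

// Sketch: inspect the first few documents delivered by the async wrapper
iterator.reset();
int shown = 0;
while (iterator.hasNextDocument() && shown < 3) {
    LabelledDocument doc = iterator.nextDocument();
    System.out.println(doc.getLabels() + " -> " + doc.getContent());
    shown++;
}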
Use of org.datavec.api.util.ClassPathResource in project deeplearning4j by deeplearning4j.
Class BasicLabelAwareIteratorTest, method testHasNextDocument2.
@Test
public void testHasNextDocument2() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();
    int cnt = 0;
    while (iterator.hasNextDocument()) {
        iterator.nextDocument();
        cnt++;
    }
    assertEquals(97162, cnt);
    iterator.reset();
    cnt = 0;
    while (iterator.hasNextDocument()) {
        iterator.nextDocument();
        cnt++;
    }
    assertEquals(97162, cnt);
    LabelsSource generator = iterator.getLabelsSource();
    // Important: resetting the iterator should not increase the number of labels attained
    assertEquals(97162, generator.getLabels().size());
    assertEquals("DOCZ_0", generator.getLabels().get(0));
}
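The LabelsSource used above can also be exercised on its own: given a template it hands out sequential labels and remembers what it has generated. A minimal sketch, assuming the nextLabel() and getLabels() methods on LabelsSource:

// Sketch: generating labels from a template, independent of any iterator
LabelsSource source = new LabelsSource("DOCZ_");
String first = source.nextLabel();   // "DOCZ_0"
String second = source.nextLabel();  // "DOCZ_1"
System.out.println(first + ", " + second + "; total generated: " + source.getLabels().size());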