Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.
From the class VocabConstructorTest, method testVocab.
/**
 * Reads the raw-sentences corpus line by line, tokenizes each line with the
 * externally defined factory {@code t}, and logs the number of non-blank
 * tokens, the number of lines, and the set of distinct tokens.
 *
 * <p>The method makes no assertions; it only fails if an exception is thrown.
 */
@Test
public void testVocab() throws Exception {
    File corpus = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus);
    Set<String> distinctTokens = new HashSet<>();
    int lineCount = 0;
    int tokenCount = 0;
    for (; sentences.hasNext(); lineCount++) {
        Tokenizer tokenizer = t.create(sentences.nextSentence());
        for (String candidate : tokenizer.getTokens()) {
            // Only tokens with at least one non-whitespace character are counted.
            if (candidate != null && !candidate.trim().isEmpty()) {
                tokenCount++;
                // Set.add ignores duplicates, so no prior membership check is needed.
                distinctTokens.add(candidate);
            }
        }
    }
    log.info("Total number of tokens: [" + tokenCount + "], lines: [" + lineCount + "], set size: [" + distinctTokens.size() + "]");
    log.info("Set:\n" + distinctTokens);
}
Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.
From the class SequenceVectorsTest, method testInternalVocabConstruction.
/**
 * Builds a {@code SequenceVectors} model over the raw-sentences corpus without
 * supplying a vocabulary, fits it, and asserts that the similarity between
 * "day" and "night" exceeds 0.6. The ten nearest words to "day" are logged
 * but not asserted on.
 */
@Test
public void testInternalVocabConstruction() throws Exception {
    File corpus = new ClassPathResource("big/raw_sentences.txt").getFile();
    BasicLineIterator lineIterator = new BasicLineIterator(corpus);

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // Lines -> tokenized sentences -> sequences of VocabWord.
    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(lineIterator)
            .tokenizerFactory(tokenizerFactory)
            .build();
    AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    SequenceVectors<VocabWord> model = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration())
            .minWordFrequency(5)
            .iterate(sequences)
            .batchSize(250)
            .iterations(1)
            .epochs(1)
            .resetModel(false)
            .trainElementsRepresentation(true)
            .build();

    logger.info("Fitting model...");
    model.fit();
    logger.info("Model ready...");

    double dayNightSimilarity = model.similarity("day", "night");
    logger.info("Day/night similarity: " + dayNightSimilarity);
    assertTrue(dayNightSimilarity > 0.6d);

    Collection<String> nearestToDay = model.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + nearestToDay);
}
Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.
From the class AsyncLabelAwareIteratorTest, method nextDocument.
/**
 * Counts the documents produced by a {@code BasicLabelAwareIterator} over the
 * raw-sentences corpus, then wraps the same iterator in an
 * {@code AsyncLabelAwareIterator} and counts again while resetting once after
 * the tenth document.
 */
@Test
public void nextDocument() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator lines = new BasicLineIterator(corpus);
    BasicLabelAwareIterator source = new BasicLabelAwareIterator.Builder(lines).build();

    // First pass: plain synchronous iteration.
    int seen = 0;
    for (; source.hasNextDocument(); seen++) {
        source.nextDocument();
    }
    assertEquals(97162, seen);

    // Second pass: asynchronous wrapper with a buffer size argument of 64.
    source.reset();
    AsyncLabelAwareIterator async = new AsyncLabelAwareIterator(source, 64);
    seen = 0;
    while (async.hasNext()) {
        async.next();
        if (++seen == 10) {
            // Reset exactly once, right after the tenth document.
            async.reset();
        }
    }
    // Expected total is the first-pass count plus the 10 documents read before the reset.
    assertEquals(97172, seen);
}
Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.
From the class BasicLabelAwareIteratorTest, method testHasNextDocument2.
/**
 * Iterates the raw-sentences corpus twice through a
 * {@code BasicLabelAwareIterator} using the label template {@code "DOCZ_"},
 * with a reset in between, and checks that both passes yield the same number
 * of documents and that the labels source holds that same number of labels,
 * the first of which is {@code "DOCZ_0"}.
 */
@Test
public void testHasNextDocument2() throws Exception {
    final int expectedDocuments = 97162;

    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator lines = new BasicLineIterator(corpus.getAbsolutePath());
    BasicLabelAwareIterator documents = new BasicLabelAwareIterator.Builder(lines)
            .setLabelTemplate("DOCZ_")
            .build();

    // First pass.
    int seen = 0;
    for (; documents.hasNextDocument(); seen++) {
        documents.nextDocument();
    }
    assertEquals(expectedDocuments, seen);

    // Second pass after reset.
    documents.reset();
    seen = 0;
    for (; documents.hasNextDocument(); seen++) {
        documents.nextDocument();
    }
    assertEquals(expectedDocuments, seen);

    LabelsSource labelsSource = documents.getLabelsSource();
    // Key check: iterating again after reset must not increase the number of labels.
    assertEquals(expectedDocuments, labelsSource.getLabels().size());
    assertEquals("DOCZ_0", labelsSource.getLabels().get(0));
}
Use of org.deeplearning4j.text.sentenceiterator.BasicLineIterator in project deeplearning4j by deeplearning4j.
From the class ManualTests, method testWord2VecPlot.
/**
 * Trains a {@code Word2Vec} model on the raw-sentences corpus, then sleeps and
 * finally fails with "Not implemented".
 *
 * <p>NOTE(review): the sleep is 10,000,000,000 ms (roughly 115 days) and is
 * followed by an unconditional {@code fail}. This presumably kept the process
 * alive for the plotting calls that are now commented out; confirm whether
 * the sleep and the failure are still wanted.
 */
@Test
public void testWord2VecPlot() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus.getAbsolutePath());

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(2)
            .batchSize(1000)
            .learningRate(0.025)
            .layerSize(100)
            .seed(42)
            .sampling(0)
            .negativeSample(0)
            .windowSize(5)
            .modelUtils(new BasicModelUtils<VocabWord>())
            .useAdaGrad(false)
            .iterate(sentences)
            .workers(10)
            .tokenizerFactory(tokenizerFactory)
            .build();
    model.fit();

    // Disabled plotting step, kept for reference:
    // UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
    // vec.getLookupTable().plotVocab(100, connectionInfo);

    Thread.sleep(10_000_000_000L);
    fail("Not implemented");
}
Aggregations