use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class SequenceVectorsTest method testGlove1.
/**
 * Trains GloVe element vectors over the {@code big/raw_sentences.txt} corpus through
 * {@link SequenceVectors}, logs a handful of word-pair similarities, and asserts that
 * the similarity between "day" and "night" exceeds 0.6.
 *
 * <p>The test is currently disabled via {@code @Ignore}.
 *
 * @throws Exception if the corpus resource cannot be resolved or training fails
 */
@Ignore
@Test
public void testGlove1() throws Exception {
    logger.info("Max available memory: " + Runtime.getRuntime().maxMemory());

    // Corpus: one sentence per line, read from the classpath.
    ClassPathResource corpusResource = new ClassPathResource("big/raw_sentences.txt");
    File corpusFile = corpusResource.getFile();
    BasicLineIterator lineIterator = new BasicLineIterator(corpusFile);

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // Lines -> token sequences -> sequence iterator consumed by SequenceVectors.
    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(lineIterator)
            .tokenizerFactory(tokenizerFactory)
            .build();
    AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    VectorsConfiguration config = new VectorsConfiguration();
    config.setWindow(5);
    config.setLearningRate(0.06);
    config.setLayersSize(100);

    SequenceVectors<VocabWord> model = new SequenceVectors.Builder<VocabWord>(config)
            .iterate(sequences)
            .iterations(1)
            .epochs(45)
            .elementsLearningAlgorithm(new GloVe.Builder<VocabWord>()
                    .shuffle(true)
                    .symmetric(true)
                    .learningRate(0.05)
                    .alpha(0.75)
                    .xMax(100.0)
                    .build())
            .resetModel(true)
            .trainElementsRepresentation(true)
            .trainSequencesRepresentation(false)
            .build();
    model.fit();

    // Informational similarity checks; only the last one below is asserted.
    double dayNight = model.similarity("day", "night");
    logger.info("Day/night similarity: " + dayNight);

    double dayAnother = model.similarity("day", "another");
    logger.info("Day/another similarity: " + dayAnother);

    double nightYear = model.similarity("night", "year");
    logger.info("Night/year similarity: " + nightYear);

    double nightMe = model.similarity("night", "me");
    logger.info("Night/me similarity: " + nightMe);

    double dayKnow = model.similarity("day", "know");
    logger.info("Day/know similarity: " + dayKnow);

    double bestPolice = model.similarity("best", "police");
    logger.info("Best/police similarity: " + bestPolice);

    Collection<String> nearestToDay = model.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + nearestToDay);

    // Similarity is queried again here, exactly as the assertion originally did.
    double assertedDayNight = model.similarity("day", "night");
    assertTrue(assertedDayNight > 0.6d);
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class DefaultDocumentIteratorTest method testDocumentIterator.
/**
 * Reads the first document produced by a {@link FileDocumentIterator} over the
 * {@code /reuters/5250} classpath resource and checks that its leading tokens are
 * "PEARSON CONCENTRATES ON FOUR SECTORS", in that order.
 *
 * @throws Exception if the resource cannot be resolved or the document cannot be read
 */
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();
    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());

    // Expected leading tokens of the document.
    String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");

    // try-with-resources: the stream is closed even when an assertion below fails.
    try (InputStream doc = iter.nextDocument()) {
        TokenizerFactory t = new DefaultTokenizerFactory();
        Tokenizer next = t.create(doc);

        int count = 0;
        while (next.hasMoreTokens() && count < list.length) {
            String token = next.nextToken();
            assertEquals(list[count++], token);
        }

        // Guard against a vacuous pass when the tokenizer yields fewer tokens than expected.
        assertEquals(list.length, count);
    }
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class DefaulTokenizerTests method testDefaultStreamTokenizer.
/**
 * Tokenizes a short sentence through the stream-based tokenizer created by
 * {@link DefaultTokenizerFactory} and checks that both {@code countTokens()} and a
 * full iteration report five tokens.
 *
 * @throws Exception if tokenization fails
 */
@Test
public void testDefaultStreamTokenizer() throws Exception {
    String sentence = "Mary had a little lamb.";
    TokenizerFactory factory = new DefaultTokenizerFactory();
    Tokenizer streamTokenizer = factory.create(new ByteArrayInputStream(sentence.getBytes()));

    assertEquals(5, streamTokenizer.countTokens());

    // Walk every token, logging each one and counting how many were produced.
    int seen = 0;
    for (; streamTokenizer.hasMoreTokens(); seen++) {
        String current = streamTokenizer.nextToken();
        log.info(current);
    }

    assertEquals(5, seen);
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class DefaulTokenizerTests method testDefaultTokenizer2.
/**
 * Compares the string-based and stream-based tokenizers created by
 * {@link DefaultTokenizerFactory}: first token by token on a short sentence, then by
 * total token count on the {@code reuters/5250} resource (counts may differ by at most one).
 *
 * @throws Exception if the resource cannot be read or tokenization fails
 */
@Test
public void testDefaultTokenizer2() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    // Result intentionally unused. NOTE(review): presumably this checks that counting
    // does not disturb the subsequent nextToken() iteration — confirm.
    tokenizer2.countTokens();
    while (tokenizer.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        assertEquals(tok1, tok2);
    }

    System.out.println("-----------------------------------------------");

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();

    // Close the resource stream once counted; previously it was left open.
    int stringCount2;
    try (java.io.InputStream resourceStream = resource.getInputStream()) {
        stringCount2 = t.create(resourceStream).countTokens();
    }

    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: " + Math.abs(stringCount - stringCount2));
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class DefaulTokenizerTests method testDefaultTokenizer1.
/**
 * Compares the string-based and stream-based tokenizers created by
 * {@link DefaultTokenizerFactory}: iterates the stream tokenizer over a short sentence,
 * logging and comparing each token pair, then compares total token counts on the
 * {@code reuters/5250} resource (counts may differ by at most one).
 *
 * @throws Exception if the resource cannot be read or tokenization fails
 */
@Test
public void testDefaultTokenizer1() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    // 1-based position, used only for the log message.
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();

    // Close the resource stream once counted; previously it was left open.
    int stringCount2;
    try (java.io.InputStream resourceStream = resource.getInputStream()) {
        stringCount2 = t.create(resourceStream).countTokens();
    }

    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
Aggregations