Use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
The class PerformanceTests, method testWord2VecCBOWBig. (The live code uses a KoreanTokenizerFactory; DefaultTokenizerFactory appears only in a commented-out alternative.)
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");

    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());

    // CBOW with hierarchic softmax over a large Korean corpus, tokenized in parallel by 8 workers
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025)
                    .layerSize(150).seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true)
                    .windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false)
                    .iterate(iter).workers(8).allowParallelTokenization(true).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    long time1 = System.currentTimeMillis();
    vec.fit();
    long time2 = System.currentTimeMillis();
    log.info("Total execution time: {}", (time2 - time1));
}
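The commented-out lines above show how the language-specific tokenizer would be swapped for the default one. A minimal sketch of that variant, assuming one of the corpus paths from the test; CommonPreprocessor lowercases each token and strips punctuation before training:

    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    // Whitespace tokenizer plus a preprocessor that lowercases and strips punctuation
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // The rest of the Word2Vec builder chain stays unchanged: .iterate(iter).tokenizerFactory(t)...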
Use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
The class DefaulTokenizerTests, method testDefaultTokenizer3.
@Test
public void testDefaultTokenizer3() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();

    // The same factory should tokenize a String and an InputStream identically
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [{}], token1: '{}', token2: '{}'", position, tok1, tok2);
        position++;
        assertEquals(tok1, tok2);
    }
}
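For reference, DefaultTokenizerFactory splits on whitespace only, so trailing punctuation stays attached to its word (the NGram test below relies on "lamb." keeping its period). A minimal sketch of inspecting the output directly:

    TokenizerFactory t = new DefaultTokenizerFactory();
    // Expected: [Mary, had, a, little, lamb.] -- note the period kept on "lamb."
    List<String> tokens = t.create("Mary had a little lamb.").getTokens();
    log.info("Tokens: {}", tokens);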
Use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
The class NGramTokenizerTest, method testNGramTokenizer.
@Test
public void testNGramTokenizer() throws Exception {
    String toTokenize = "Mary had a little lamb.";

    // minN = 1, maxN = 2: emit both unigrams and bigrams
    TokenizerFactory factory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 1, 2);
    Tokenizer tokenizer = factory.create(toTokenize);
    Tokenizer tokenizer2 = factory.create(toTokenize);
    while (tokenizer.hasMoreTokens()) {
        assertEquals(tokenizer.nextToken(), tokenizer2.nextToken());
    }

    // 5 unigrams + 4 bigrams = 9 tokens
    int stringCount = factory.create(toTokenize).countTokens();
    List<String> tokens = factory.create(toTokenize).getTokens();
    assertEquals(9, stringCount);

    assertTrue(tokens.contains("Mary"));
    assertTrue(tokens.contains("had"));
    assertTrue(tokens.contains("a"));
    assertTrue(tokens.contains("little"));
    assertTrue(tokens.contains("lamb."));
    assertTrue(tokens.contains("Mary had"));
    assertTrue(tokens.contains("had a"));
    assertTrue(tokens.contains("a little"));
    assertTrue(tokens.contains("little lamb."));

    // minN = maxN = 2: bigrams only
    factory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 2, 2);
    tokens = factory.create(toTokenize).getTokens();
    assertEquals(4, tokens.size());
    assertTrue(tokens.contains("Mary had"));
    assertTrue(tokens.contains("had a"));
    assertTrue(tokens.contains("a little"));
    assertTrue(tokens.contains("little lamb."));
}
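The expected counts are simple arithmetic: a sentence of n base tokens yields n - k + 1 k-grams, so a (1, 2) factory over 5 tokens produces 5 + 4 = 9 tokens and a (2, 2) factory produces 4. A hypothetical helper (not part of deeplearning4j) making that explicit:

    // Hypothetical: number of tokens NGramTokenizerFactory(base, minN, maxN)
    // produces from a sentence of n base tokens.
    static int expectedNGramCount(int n, int minN, int maxN) {
        int count = 0;
        for (int k = minN; k <= maxN; k++) {
            count += Math.max(0, n - k + 1);
        }
        return count;
    }
    // expectedNGramCount(5, 1, 2) == 9; expectedNGramCount(5, 2, 2) == 4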
Use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
The class ParagraphVectorsTest, method testParagraphVectorsDM.
@Test
public void testParagraphVectorsDM() throws Exception {
    ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    // PV-DM with hierarchic softmax; word vectors are trained alongside document vectors
    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(2).seed(119)
                    .epochs(3).layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5)
                    .iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t)
                    .negativeSample(0).useHierarchicSoftmax(true).sampling(0).workers(1)
                    .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    vec.fit();

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");
    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: {}", similarity1);
    // assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: {}", similarity2);
    // assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: {}", similarity3);
    // assertTrue(similarity3 > 0.6d);

    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: {}", similarityX);
    assertTrue(similarityX < 0.5d);

    // testing DM inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
}
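Inference can also be exercised after a save/load round trip. A minimal sketch, assuming WordVectorSerializer's paragraph-vectors read/write methods and a hypothetical model path; the tokenizer factory has to be re-attached after deserialization before inferVector can process new text:

    File modelFile = new File("/tmp/paravec_model.zip"); // hypothetical path
    WordVectorSerializer.writeParagraphVectors(vec, modelFile);
    ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(modelFile);
    restored.setTokenizerFactory(t); // inference tokenizes its input, so the factory is needed again
    INDArray inferred = restored.inferVector("This is my work");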
Use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
The class ParagraphVectorsTest, method testGoogleModelForInference.
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
                    new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // No training here: frozen Google News word vectors are reused, and only inference is run
    ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10)
                    .useHierarchicSoftmax(false).trainWordVectors(false)
                    .useExistingWordVectors(googleVectors).negativeSample(10)
                    .sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");
    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
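The loaded Google vectors implement the generic WordVectors interface, so they can also be queried directly, independent of the ParagraphVectors wrapper. A minimal sketch:

    // Nearest-neighbour and similarity queries against the pretrained space
    Collection<String> nearest = googleVectors.wordsNearest("day", 10);
    log.info("Closest to 'day': {}", nearest);
    log.info("day/night similarity: {}", googleVectors.similarity("day", "night"));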