use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class ParagraphVectorsTest method testParagraphVectorsWithWordVectorsModelling1.
@Test
public void testParagraphVectorsWithWordVectorsModelling1() throws Exception {
ClassPathResource resource = new ClassPathResource("/big/raw_sentences.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
// InMemoryLookupCache cache = new InMemoryLookupCache(false);
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
LabelsSource source = new LabelsSource("DOC_");
ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(3).epochs(1).layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter).trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).sampling(0).build();
vec.fit();
int cnt1 = cache.wordFrequency("day");
int cnt2 = cache.wordFrequency("me");
assertNotEquals(1, cnt1);
assertNotEquals(1, cnt2);
assertNotEquals(cnt1, cnt2);
/*
We have few lines that contain pretty close words invloved.
These sentences should be pretty close to each other in vector space
*/
// line 3721: This is my way .
// line 6348: This is my case .
// line 9836: This is my house .
// line 12493: This is my world .
// line 16393: This is my work .
// this is special sentence, that has nothing common with previous sentences
// line 9853: We now have one .
assertTrue(vec.hasWord("DOC_3720"));
double similarityD = vec.similarity("day", "night");
log.info("day/night similarity: " + similarityD);
double similarityW = vec.similarity("way", "work");
log.info("way/work similarity: " + similarityW);
double similarityH = vec.similarity("house", "world");
log.info("house/world similarity: " + similarityH);
double similarityC = vec.similarity("case", "way");
log.info("case/way similarity: " + similarityC);
double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
log.info("9835/12492 similarity: " + similarity1);
// assertTrue(similarity1 > 0.7d);
double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
log.info("3720/16392 similarity: " + similarity2);
// assertTrue(similarity2 > 0.7d);
double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
log.info("6347/3720 similarity: " + similarity3);
// assertTrue(similarity2 > 0.7d);
// likelihood in this case should be significantly lower
// however, since corpus is small, and weight initialization is random-based, sometimes this test CAN fail
double similarityX = vec.similarity("DOC_3720", "DOC_9852");
log.info("3720/9852 similarity: " + similarityX);
assertTrue(similarityX < 0.5d);
double sim119 = vec.similarityToLabel("This is my case .", "DOC_6347");
double sim120 = vec.similarityToLabel("This is my case .", "DOC_3720");
log.info("1/2: " + sim119 + "/" + sim120);
//assertEquals(similarity3, sim119, 0.001);
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class ParagraphVectorsTest method testParagraphVectorsReducedLabels1.
/**
* This test is not indicative.
* there's no need in this test within travis, use it manually only for problems detection
*
* @throws Exception
*/
@Test
@Ignore
public void testParagraphVectorsReducedLabels1() throws Exception {
ClassPathResource resource = new ClassPathResource("/labeled");
File file = resource.getFile();
LabelAwareIterator iter = new FileLabelAwareIterator.Builder().addSourceFolder(file).build();
TokenizerFactory t = new DefaultTokenizerFactory();
/**
* Please note: text corpus is REALLY small, and some kind of "results" could be received with HIGH epochs number, like 30.
* But there's no reason to keep at that high
*/
ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).epochs(3).layerSize(100).stopWords(new ArrayList<String>()).windowSize(5).iterate(iter).tokenizerFactory(t).build();
vec.fit();
//WordVectorSerializer.writeWordVectors(vec, "vectors.txt");
INDArray w1 = vec.lookupTable().vector("I");
INDArray w2 = vec.lookupTable().vector("am");
INDArray w3 = vec.lookupTable().vector("sad.");
INDArray words = Nd4j.create(3, vec.lookupTable().layerSize());
words.putRow(0, w1);
words.putRow(1, w2);
words.putRow(2, w3);
INDArray mean = words.isMatrix() ? words.mean(0) : words;
log.info("Mean" + Arrays.toString(mean.dup().data().asDouble()));
log.info("Array" + Arrays.toString(vec.lookupTable().vector("negative").dup().data().asDouble()));
double simN = Transforms.cosineSim(mean, vec.lookupTable().vector("negative"));
log.info("Similarity negative: " + simN);
double simP = Transforms.cosineSim(mean, vec.lookupTable().vector("neutral"));
log.info("Similarity neutral: " + simP);
double simV = Transforms.cosineSim(mean, vec.lookupTable().vector("positive"));
log.info("Similarity positive: " + simV);
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class SequenceVectorsTest method testInternalVocabConstruction.
@Test
public void testInternalVocabConstruction() throws Exception {
ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
File file = resource.getFile();
BasicLineIterator underlyingIterator = new BasicLineIterator(file);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();
AbstractSequenceIterator<VocabWord> sequenceIterator = new AbstractSequenceIterator.Builder<>(transformer).build();
SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration()).minWordFrequency(5).iterate(sequenceIterator).batchSize(250).iterations(1).epochs(1).resetModel(false).trainElementsRepresentation(true).build();
logger.info("Fitting model...");
vectors.fit();
logger.info("Model ready...");
double sim = vectors.similarity("day", "night");
logger.info("Day/night similarity: " + sim);
assertTrue(sim > 0.6d);
Collection<String> labels = vectors.wordsNearest("day", 10);
logger.info("Nearest labels to 'day': " + labels);
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class ManualTests method testWord2VecPlot.
@Test
public void testWord2VecPlot() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025).layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10).tokenizerFactory(t).build();
vec.fit();
// UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
// vec.getLookupTable().plotVocab(100, connectionInfo);
Thread.sleep(10000000000L);
fail("Not implemented");
}
use of org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory in project deeplearning4j by deeplearning4j.
the class Word2VecTest method testSparkW2VonBiggerCorpus.
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest").set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g").set("spark.executor.memory", "8g");
// Set SparkContext
JavaSparkContext sc = new JavaSparkContext(sparkConf);
// Path of data part-00000
//String dataPath = new ClassPathResource("/big/raw_sentences.txt").getFile().getAbsolutePath();
// String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
// Read in data
JavaRDD<String> corpus = sc.textFile(dataPath);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new LowCasePreProcessor());
Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1).tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5).learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5).useUnknown(true).build();
word2Vec.train(corpus);
sc.stop();
WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
Aggregations