Use of org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory in project deeplearning4j by deeplearning4j.
From the class KoreanTokenizerTest, the method testKoreanTokenizer:
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.junit.Test;

import static org.junit.Assert.assertEquals;

@Test
public void testKoreanTokenizer() throws Exception {
    // "It is the world's first commercial-grade open-source deep learning library."
    String toTokenize = "세계 최초의 상용 수준 오픈소스 딥러닝 라이브러리입니다";
    TokenizerFactory t = new KoreanTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    String[] expect = { "세계", "최초", "의", "상용", "수준", "오픈소스", "딥", "러닝", "라이브러리", "입니", "다" };
    assertEquals(expect.length, tokenizer.countTokens());
    // JUnit's assertEquals takes (expected, actual); iterate over the expected
    // array so the loop bound does not depend on the tokenizer's internal state.
    for (int i = 0; i < expect.length; ++i) {
        assertEquals(expect[i], tokenizer.nextToken());
    }
}
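Outside of a unit test, the same factory drives an ordinary token loop. Below is a minimal standalone sketch, assuming only the dl4j Tokenizer interface used above (hasMoreTokens()/nextToken()); the class name KoreanTokenizerDemo is illustrative, not part of the project:

import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class KoreanTokenizerDemo {
    public static void main(String[] args) {
        TokenizerFactory factory = new KoreanTokenizerFactory();
        Tokenizer tokenizer = factory.create("세계 최초의 상용 수준 오픈소스 딥러닝 라이브러리입니다");
        // Drain the tokenizer one token at a time; hasMoreTokens()/nextToken()
        // come from the dl4j Tokenizer interface.
        while (tokenizer.hasMoreTokens()) {
            System.out.println(tokenizer.nextToken());
        }
    }
}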
Use of org.deeplearning4j.text.tokenization.tokenizerfactory.KoreanTokenizerFactory in project deeplearning4j by deeplearning4j.
From the class PerformanceTests, the method testWord2VecCBOWBig:
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    // Hard-coded local corpus path from the original benchmark; adjust to your environment.
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");
    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());
    // CBOW with hierarchical softmax (negative sampling disabled), 8 parallel workers.
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(1).iterations(5).learningRate(0.025)
            .layerSize(150).seed(42)
            .sampling(0).negativeSample(0).useHierarchicSoftmax(true)
            .windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
            .useAdaGrad(false).iterate(iter).workers(8)
            .allowParallelTokenization(true).tokenizerFactory(t)
            .elementsLearningAlgorithm(new CBOW<VocabWord>())
            .build();
    long time1 = System.currentTimeMillis();
    vec.fit();
    long time2 = System.currentTimeMillis();
    log.info("Total execution time: {} ms", (time2 - time1));
}
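The benchmark only times fit(). To sanity-check the trained vectors, a few lines like the following could be appended before the method returns. This is a hedged sketch, not part of the original test: wordsNearest(String, int) and WordVectorSerializer.writeWord2VecModel(Word2Vec, String) are public dl4j API (exact serializer overloads vary by version), and the query word and output path are placeholders:

// Hypothetical follow-up; requires java.util.Collection and
// org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.
Collection<String> nearest = vec.wordsNearest("세계", 10); // 10 most similar words
log.info("Words nearest to '세계': {}", nearest);
// Persist the trained vectors for reuse (placeholder path).
WordVectorSerializer.writeWord2VecModel(vec, "namuwiki_w2v.model");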