Example use of org.deeplearning4j.text.tokenization.tokenizerfactory.NGramTokenizerFactory in the project deeplearning4j by deeplearning4j.
The method testNGramTokenizer of the class NGramTokenizerTest.
/**
 * Exercises {@code NGramTokenizerFactory} wrapped around a {@code DefaultTokenizerFactory}.
 *
 * <p>First, with the factory built with arguments (1, 2), two tokenizers created from the
 * same sentence are walked in lock-step and must yield equal tokens; the token count must
 * be 9 and the token list must contain every single word and every adjacent word pair.
 * Then, with arguments (2, 2), exactly the four word pairs are expected.
 */
@Test
public void testNGramTokenizer() throws Exception {
    final String sentence = "Mary had a little lamb.";
    final String[] singleWords = {"Mary", "had", "a", "little", "lamb."};
    final String[] wordPairs = {"Mary had", "had a", "a little", "little lamb."};

    TokenizerFactory ngramFactory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 1, 2);

    // Two independent tokenizers over the same input must agree token by token.
    Tokenizer reference = ngramFactory.create(sentence);
    Tokenizer candidate = ngramFactory.create(sentence);
    while (reference.hasMoreTokens()) {
        assertEquals(reference.nextToken(), candidate.nextToken());
    }

    int tokenCount = ngramFactory.create(sentence).countTokens();
    List<String> produced = ngramFactory.create(sentence).getTokens();
    assertEquals(9, tokenCount);
    for (String word : singleWords) {
        assertTrue(produced.contains(word));
    }
    for (String pair : wordPairs) {
        assertTrue(produced.contains(pair));
    }

    // Rebuild with (2, 2): only the word pairs should remain.
    ngramFactory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 2, 2);
    produced = ngramFactory.create(sentence).getTokens();
    assertEquals(4, produced.size());
    for (String pair : wordPairs) {
        assertTrue(produced.contains(pair));
    }
}
Example use of org.deeplearning4j.text.tokenization.tokenizerfactory.NGramTokenizerFactory in the project deeplearning4j by deeplearning4j.
The method getTokenizerFactory of the class TokenizerFunction.
/**
 * Reflectively builds the tokenizer factory named by {@code tokenizerFactoryClazz}.
 *
 * <p>If {@code tokenizerPreprocessorClazz} is non-null and non-empty, an instance of that
 * class is created and installed on the factory via {@code setTokenPreProcessor}. If
 * {@code nGrams} is greater than 1, the factory is wrapped in an
 * {@link NGramTokenizerFactory} using {@code nGrams} for both size arguments.
 *
 * <p>The factory is assembled in a local variable and stored in the
 * {@code tokenizerFactory} field only once it is fully configured, so a failure never
 * leaves a partially configured factory behind.
 *
 * @return the fully configured factory (also stored in {@code tokenizerFactory})
 * @throws IllegalStateException if a configured class cannot be loaded, is not of the
 *         expected type, cannot be instantiated through a no-argument constructor, or
 *         fails during configuration; the original failure is attached as the cause
 */
private TokenizerFactory getTokenizerFactory() {
    try {
        TokenPreProcess preProcessor = null;
        // token preprocess CAN be undefined
        if (tokenizerPreprocessorClazz != null && !tokenizerPreprocessorClazz.isEmpty()) {
            // asSubclass verifies the type up front instead of relying on an unchecked cast.
            preProcessor = Class.forName(tokenizerPreprocessorClazz)
                    .asSubclass(TokenPreProcess.class)
                    .getDeclaredConstructor()
                    .newInstance();
        }

        TokenizerFactory built = Class.forName(tokenizerFactoryClazz)
                .asSubclass(TokenizerFactory.class)
                .getDeclaredConstructor()
                .newInstance();
        if (preProcessor != null) {
            built.setTokenPreProcessor(preProcessor);
        }
        if (nGrams > 1) {
            built = new NGramTokenizerFactory(built, nGrams, nGrams);
        }

        // Publish to the field only after every configuration step succeeded.
        tokenizerFactory = built;
        return built;
    } catch (ReflectiveOperationException | RuntimeException e) {
        // Fail fast with context rather than printing the trace and returning a
        // null or half-configured factory that would fail later, far from the cause.
        throw new IllegalStateException("Unable to create tokenizer factory '" + tokenizerFactoryClazz
                + "' with token preprocessor '" + tokenizerPreprocessorClazz + "'", e);
    }
}
Aggregations