Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.
Class DefaulTokenizerTests, method testDefaultTokenizer1.
/**
 * Verifies that a {@link TokenizerFactory} yields the same tokens whether it is given a
 * string or an input stream over the same text.
 *
 * <p>First, a short sentence is tokenized both ways and the tokens are compared pairwise;
 * both tokenizers must also run out of tokens at the same time. Second, the
 * {@code reuters/5250} classpath resource is tokenized both ways and the two token counts
 * are required to differ by at most one.
 *
 * @throws Exception if the classpath resource cannot be read
 */
@Test
public void testDefaultTokenizer1() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        // Report a missing string token as an assertion failure instead of relying on
        // whatever nextToken() does when the tokenizer is exhausted.
        assertTrue(tokenizer.hasMoreTokens());
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }
    // The loop is driven by the stream tokenizer only, so without this check any surplus
    // tokens left in the string tokenizer would go unnoticed.
    assertTrue(!tokenizer.hasMoreTokens());

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2;
    // Close the resource stream once the tokens have been counted; the original leaked it.
    try (java.io.InputStream in = resource.getInputStream()) {
        stringCount2 = t.create(in).countTokens();
    }
    // A difference of at most one token between the two code paths is tolerated.
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
Use of org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory in project deeplearning4j by deeplearning4j.
Class TokenizerFunction, method getTokenizerFactory.
/**
 * Builds the tokenizer factory described by this object's configuration and stores it in
 * the {@code tokenizerFactory} field.
 *
 * <p>The class named by {@code tokenizerFactoryClazz} is instantiated reflectively through
 * its no-argument constructor. If {@code tokenizerPreprocessorClazz} is non-empty, that
 * class is instantiated the same way and installed on the factory as its token
 * pre-processor. When {@code nGrams} is greater than 1 the factory is wrapped in an
 * {@link NGramTokenizerFactory} using {@code nGrams} for both n-gram bounds.
 *
 * <p>The field is only assigned once the factory is fully configured, so a failure never
 * leaves a half-configured factory behind.
 *
 * @return the newly configured tokenizer factory (also stored in {@code tokenizerFactory})
 * @throws IllegalStateException if either configured class cannot be loaded, is not of the
 *         expected type, or cannot be instantiated; the original failure is the cause
 */
private TokenizerFactory getTokenizerFactory() {
    try {
        TokenPreProcess tokenPreProcessInst = null;
        // token preprocess CAN be undefined
        if (tokenizerPreprocessorClazz != null && !tokenizerPreprocessorClazz.isEmpty()) {
            // asSubclass() rejects a wrongly typed class right here, instead of the
            // unchecked cast that would only fail later at the point of use.
            tokenPreProcessInst = Class.forName(tokenizerPreprocessorClazz)
                    .asSubclass(TokenPreProcess.class)
                    .getDeclaredConstructor()
                    .newInstance();
        }

        // getDeclaredConstructor().newInstance() replaces the deprecated Class.newInstance(),
        // which rethrows checked constructor exceptions without declaring them.
        TokenizerFactory factory = Class.forName(tokenizerFactoryClazz)
                .asSubclass(TokenizerFactory.class)
                .getDeclaredConstructor()
                .newInstance();

        if (tokenPreProcessInst != null) {
            factory.setTokenPreProcessor(tokenPreProcessInst);
        }
        if (nGrams > 1) {
            factory = new NGramTokenizerFactory(factory, nGrams, nGrams);
        }

        tokenizerFactory = factory;
    } catch (ReflectiveOperationException | ClassCastException e) {
        // Fail fast with the cause attached rather than printing the stack trace and
        // handing back a null or stale factory.
        throw new IllegalStateException("Unable to create tokenizer factory [" + tokenizerFactoryClazz
                + "] with token pre-processor [" + tokenizerPreprocessorClazz + "]", e);
    }
    return tokenizerFactory;
}
Aggregations