use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class Windows method windows.
/**
 * Tokenizes the given text, keeps only the tokens for which the supplied
 * word vectors return a non-null vector, and builds windows from what is left.
 * Window construction itself is delegated to {@code windows(List, int)}.
 *
 * @param words the text to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory used to create the tokenizer for {@code words}
 * @param windowSize the window size to generate
 * @param vectors word vectors used to filter tokens; a token is dropped when
 *                {@code vectors.getWordVectorMatrix(token)} returns {@code null}
 * @return the list of windows built from the retained tokens
 * @throws IllegalStateException if no token survives the filtering
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize, WordVectors vectors) {
    Tokenizer source = tokenizerFactory.create(words);
    List<String> knownTokens = new ArrayList<>();

    while (source.hasMoreTokens()) {
        String candidate = source.nextToken();
        // A token without a vector (and no UNK fallback) cannot be used, so it is skipped.
        if (vectors.getWordVectorMatrix(candidate) == null) {
            continue;
        }
        knownTokens.add(candidate);
    }

    if (knownTokens.isEmpty()) {
        throw new IllegalStateException("No tokens found for windows");
    }

    return windows(knownTokens, windowSize);
}
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class PosUimaTokenizerFactoryTest method testCreate2.
/**
 * Builds a {@code PosUimaTokenizerFactory} with the single tag "NN" and a flag of
 * {@code true}, tokenizes "some test string", and expects exactly the tokens
 * "test" and "string", in that order.
 */
@Test
public void testCreate2() throws Exception {
    PosUimaTokenizerFactory posFactory = new PosUimaTokenizerFactory(Arrays.asList("NN"), true);
    List<String> produced = posFactory.create("some test string").getTokens();

    // Printed to make the actual tokens visible in the test output.
    System.out.println("Tokens: " + produced);

    Assert.assertEquals(2, produced.size());
    Assert.assertEquals("test", produced.get(0));
    Assert.assertEquals("string", produced.get(1));
}
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class BagOfWordsVectorizer method transform.
/**
 * Tokenizes the given text with the configured tokenizer factory and
 * delegates to the token-list overload of {@code transform}.
 *
 * @param text the text to tokenize and transform
 * @return the result of transforming the token list
 */
@Override
public INDArray transform(String text) {
    return transform(tokenizerFactory.create(text).getTokens());
}
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class SentenceTransformer method transformToSequence.
/**
 * Converts a piece of text into a sequence of {@code VocabWord} elements.
 * The text is tokenized with the configured tokenizer factory; tokens that are
 * null, empty, or whitespace-only are skipped. Every remaining token becomes a
 * {@code VocabWord} created with the value 1.0. The sequence id is taken from
 * {@code sentenceCounter}, which is incremented on every call.
 *
 * @param object the text to tokenize
 * @return the populated sequence
 */
@Override
public Sequence<VocabWord> transformToSequence(String object) {
    Sequence<VocabWord> result = new Sequence<>();

    for (String candidate : tokenizerFactory.create(object).getTokens()) {
        boolean usable = candidate != null && !candidate.isEmpty() && !candidate.trim().isEmpty();
        if (usable) {
            result.addElement(new VocabWord(1.0, candidate));
        }
    }

    result.setSequenceId(sentenceCounter.getAndIncrement());
    return result;
}
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class NGramTokenizerFactory method create.
/**
 * Creates an n-gram tokenizer for the given text. A base tokenizer is obtained
 * from the wrapped tokenizer factory, given this factory's token pre-processor,
 * and then wrapped in an {@code NGramTokenizer} configured with {@code minN}
 * and {@code maxN}.
 *
 * @param toTokenize the text to tokenize; must be neither null nor empty
 * @return an n-gram tokenizer over the text
 * @throws IllegalArgumentException if {@code toTokenize} is null or empty
 */
@Override
public Tokenizer create(String toTokenize) {
    boolean nothingToTokenize = toTokenize == null || toTokenize.isEmpty();
    if (nothingToTokenize) {
        throw new IllegalArgumentException("Unable to proceed; no sentence to tokenize");
    }

    Tokenizer base = tokenizerFactory.create(toTokenize);
    base.setTokenPreProcessor(preProcess);
    return new NGramTokenizer(base, minN, maxN);
}
Aggregations