use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class VocabConstructorTest method testVocab.
@Test
public void testVocab() throws Exception {
    File inputFile = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    Set<String> set = new HashSet<>();
    int lines = 0;
    int cnt = 0;
    // t is the test class's TokenizerFactory field (the full test initializes it; a DefaultTokenizerFactory is a reasonable assumption)
    while (iter.hasNext()) {
        Tokenizer tok = t.create(iter.nextSentence());
        for (String token : tok.getTokens()) {
            if (token == null || token.isEmpty() || token.trim().isEmpty())
                continue;
            cnt++;
            // Set.add is a no-op for duplicates, so no contains() check is needed
            set.add(token);
        }
        lines++;
    }
    log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
    log.info("Set:\n" + set);
}
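Token counts like the ones above are sensitive to casing and punctuation. A minimal sketch of attaching a pre-processor to the factory before counting; the choice of DefaultTokenizerFactory and CommonPreprocessor here is an assumption, not something the test above mandates:

// imports assumed: org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory,
//                  org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());   // lowercases tokens and strips punctuation
Tokenizer tok = t.create("An Example, with Punctuation!");
System.out.println(tok.getTokens());                // e.g. [an, example, with, punctuation]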
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class ContextLabelRetriever method stringWithLabels.
/**
 * Returns a stripped sentence with the indices of words
 * with certain kinds of labels.
 * @param sentence the sentence to process
 * @param tokenizerFactory the tokenizer factory used to split the sentence into tokens
 * @return a pair of the post-processed sentence
 * with labels stripped and the spans of
 * the labels
 */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence, TokenizerFactory tokenizerFactory) {
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            if (endLabel != null)
                throw new IllegalStateException("Tried parsing sentence; found an end label when the begin label has not been cleared");
            currLabel = token;
            //tokens seen so far carry no label; record them as NONE before starting the new label
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();
            }
        } else if (token.matches(END_LABEL)) {
            if (currLabel == null)
                throw new IllegalStateException("Found an ending label with no matching begin label");
            endLabel = token;
        } else
            currTokens.add(token);
        if (currLabel != null && endLabel != null) {
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            assert !currLabel.isEmpty() : "Current label is empty!";
            assert !endLabel.isEmpty() : "End label is empty!";
            assert currLabel.equals(endLabel) : "Current label begin and end did not match for the parse. Was: " + currLabel + " ending with " + endLabel;
            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            //clear out the tokens and reset the label state for the next span
            currTokens.clear();
            currLabel = null;
            endLabel = null;
        }
    }
    //any remaining tokens carry no label; record them as NONE
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();
    }
    //now join the output
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        //spaces between separate parts of the sentence
        if (strippedSentence.length() > 0)
            strippedSentence.append(" ");
        strippedSentence.append(joinedSentence);
        int begin = strippedSentence.toString().indexOf(joinedSentence);
        int end = begin + joinedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }
    return new Pair<>(strippedSentence.toString(), map);
}
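For context, a minimal sketch of how this method might be called. The sentence, the PERSON label, and the DefaultTokenizerFactory are assumptions for illustration, and the Pair import depends on the deeplearning4j version in use (org.deeplearning4j.berkeley.Pair in older releases):

// hypothetical labeled input; assumes the tokenizer keeps <PERSON> and </PERSON> as single tokens
String labeled = "<PERSON> John </PERSON> went to the store .";
TokenizerFactory factory = new DefaultTokenizerFactory();
Pair<String, MultiDimensionalMap<Integer, Integer, String>> result =
        ContextLabelRetriever.stringWithLabels(labeled, factory);
String stripped = result.getFirst();                                        // "John went to the store ."
MultiDimensionalMap<Integer, Integer, String> spans = result.getSecond();   // character spans mapped to label names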
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class DefaultTokenizerFactory method create.
@Override
public Tokenizer create(InputStream toTokenize) {
    Tokenizer t = new DefaultStreamTokenizer(toTokenize);
    t.setTokenPreProcessor(tokenPreProcess);
    return t;
}
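A minimal caller-side sketch of this overload, assuming an in-memory stream; the sample text is hypothetical:

// imports assumed: java.io.ByteArrayInputStream, java.nio.charset.StandardCharsets
TokenizerFactory factory = new DefaultTokenizerFactory();
InputStream in = new ByteArrayInputStream("one two three".getBytes(StandardCharsets.UTF_8));
Tokenizer tokenizer = factory.create(in);   // a DefaultStreamTokenizer wired to the factory's pre-processor, if one was set
while (tokenizer.hasMoreTokens())
    System.out.println(tokenizer.nextToken());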
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class Windows method windows.
/**
 * Constructs a list of windows of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the input stream of words to tokenize and construct windows from
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized input
 */
public static List<Window> windows(InputStream words, int windowSize) {
    Tokenizer tokenizer = new DefaultStreamTokenizer(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens())
        list.add(tokenizer.nextToken());
    return windows(list, windowSize);
}
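A minimal usage sketch, assuming a short in-memory sentence (the text and window size are arbitrary):

// imports assumed: java.io.ByteArrayInputStream, java.nio.charset.StandardCharsets,
//                  org.deeplearning4j.text.movingwindow.Windows, org.deeplearning4j.text.movingwindow.Window
InputStream in = new ByteArrayInputStream(
        "the quick brown fox jumps over the lazy dog".getBytes(StandardCharsets.UTF_8));
List<Window> windows = Windows.windows(in, 5);
// typically one Window per token: a focus word plus its padded left/right context
System.out.println(windows.size());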
use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.
the class Windows method windows.
/**
 * Constructs a list of windows of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the input stream of words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized input
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens())
        list.add(tokenizer.nextToken());
    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");
    return windows(list, windowSize);
}
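This overload lets the caller control tokenization and normalization, and it rejects empty input. A minimal sketch, where pairing a DefaultTokenizerFactory with a CommonPreprocessor is an assumption rather than a requirement:

TokenizerFactory factory = new DefaultTokenizerFactory();
factory.setTokenPreProcessor(new CommonPreprocessor());   // assumption: lowercase and strip punctuation before windowing
InputStream in = new ByteArrayInputStream("The Quick Brown Fox over the Lazy Dog".getBytes(StandardCharsets.UTF_8));
List<Window> windows = Windows.windows(in, factory, 3);   // throws IllegalStateException if the stream yields no tokens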