
Example 6 with Tokenizer

use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.

the class VocabConstructorTest method testVocab.

@Test
public void testVocab() throws Exception {
    File inputFile = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // "t" is a TokenizerFactory field on the original test class; declared locally here so the snippet is self-contained
    TokenizerFactory t = new DefaultTokenizerFactory();
    Set<String> set = new HashSet<>();
    int lines = 0;
    int cnt = 0;
    while (iter.hasNext()) {
        Tokenizer tok = t.create(iter.nextSentence());
        for (String token : tok.getTokens()) {
            if (token == null || token.isEmpty() || token.trim().isEmpty())
                continue;
            cnt++;
            // Set.add ignores duplicates, so no contains() check is needed
            set.add(token);
        }
        lines++;
    }
    log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
    log.info("Set:\n" + set);
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) File(java.io.File) Tokenizer(org.deeplearning4j.text.tokenization.tokenizer.Tokenizer) ClassPathResource(org.datavec.api.util.ClassPathResource) SentenceIterator(org.deeplearning4j.text.sentenceiterator.SentenceIterator) Test(org.junit.Test)

Example 7 with Tokenizer

use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.

the class ContextLabelRetriever method stringWithLabels.

/**
     * Returns a stripped sentence together with the character spans of
     * the labeled regions it contained.
     * @param sentence the sentence to process
     * @param tokenizerFactory the tokenizer factory used to split the sentence
     * @return a pair of the post-processed sentence with the label markers
     * stripped and a map from (begin, end) character spans to their labels
     */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence, TokenizerFactory tokenizerFactory) {
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            if (endLabel != null)
                throw new IllegalStateException("Tried parsing sentence; found an end label when the begin label has not been cleared");
            currLabel = token;
            //tokens seen so far carry no label; record them as NONE before starting the new label
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();
            }
        } else if (token.matches(END_LABEL)) {
            if (currLabel == null)
                throw new IllegalStateException("Found an ending label with no matching begin label");
            endLabel = token;
        } else
            currTokens.add(token);
        if (currLabel != null && endLabel != null) {
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            assert !currLabel.isEmpty() : "Current label is empty!";
            assert !endLabel.isEmpty() : "End label is empty!";
            assert currLabel.equals(endLabel) : "Current label begin and end did not match for the parse. Was: " + currLabel + " ending with " + endLabel;
            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            //clear out the tokens and reset the label state for the next span
            currTokens.clear();
            currLabel = null;
            endLabel = null;
        }
    }
    //any trailing tokens carry no label; record them as NONE
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();
    }
    //now join the output
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        //spaces between separate parts of the sentence
        if (strippedSentence.length() > 0)
            strippedSentence.append(" ");
        strippedSentence.append(joinedSentence);
        int begin = strippedSentence.toString().indexOf(joinedSentence);
        int end = begin + joinedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }
    return new Pair<>(strippedSentence.toString(), map);
}
Also used : ArrayList(java.util.ArrayList) List(java.util.List) Tokenizer(org.deeplearning4j.text.tokenization.tokenizer.Tokenizer) Pair(org.deeplearning4j.berkeley.Pair)
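
For orientation, a minimal usage sketch follows. The BEGIN_LABEL and END_LABEL patterns are not shown in the snippet; XML-style markers such as <PERSON> ... </PERSON> are assumed here (the replaceAll("[<>/]", "") calls above suggest that form), and the package of ContextLabelRetriever itself is likewise an assumption.

import org.deeplearning4j.berkeley.Pair;
import org.deeplearning4j.text.movingwindow.ContextLabelRetriever; // assumed package
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class ContextLabelSketch {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();
        // Hypothetical labeled sentence; assumes the begin/end label regexes match <X> and </X> tags
        String labeled = "<PERSON> Alice </PERSON> went to the market";
        Pair<String, ?> result = ContextLabelRetriever.stringWithLabels(labeled, factory);
        // First element: the sentence with the label markers stripped
        System.out.println(result.getFirst());
        // Second element: (begin, end) character spans mapped to their label, e.g. PERSON or NONE
        System.out.println(result.getSecond());
    }
}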

Example 8 with Tokenizer

use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.

the class DefaultTokenizerFactory method create.

@Override
public Tokenizer create(InputStream toTokenize) {
    Tokenizer t = new DefaultStreamTokenizer(toTokenize);
    //attach the pre-processor configured on this factory
    t.setTokenPreProcessor(tokenPreProcess);
    return t;
}
Also used : DefaultStreamTokenizer(org.deeplearning4j.text.tokenization.tokenizer.DefaultStreamTokenizer) DefaultTokenizer(org.deeplearning4j.text.tokenization.tokenizer.DefaultTokenizer) Tokenizer(org.deeplearning4j.text.tokenization.tokenizer.Tokenizer)
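
A brief, hedged sketch of calling this factory method: wrap some text in an InputStream, obtain a Tokenizer, and drain it. The sample text and class name are illustrative only; the factory and tokenizer calls are the ones shown in the snippets above.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class StreamTokenizeSketch {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();
        InputStream in = new ByteArrayInputStream(
                "These are some words to tokenize".getBytes(StandardCharsets.UTF_8));
        // create(InputStream) returns a DefaultStreamTokenizer, as shown above
        Tokenizer tokenizer = factory.create(in);
        while (tokenizer.hasMoreTokens()) {
            System.out.println(tokenizer.nextToken());
        }
    }
}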

Example 9 with Tokenizer

use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.

the class Windows method windows.

/**
     * Constructs a list of windows of size windowSize.
     * Note that padding for each window is created as well.
     * @param words the words to tokenize and construct windows from
     * @param windowSize the window size to generate
     * @return the list of windows for the tokenized string
     */
public static List<Window> windows(InputStream words, int windowSize) {
    Tokenizer tokenizer = new DefaultStreamTokenizer(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) list.add(tokenizer.nextToken());
    return windows(list, windowSize);
}
Also used : DefaultStreamTokenizer(org.deeplearning4j.text.tokenization.tokenizer.DefaultStreamTokenizer) ArrayList(java.util.ArrayList) StringTokenizer(java.util.StringTokenizer) Tokenizer(org.deeplearning4j.text.tokenization.tokenizer.Tokenizer)
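
A short usage sketch of this overload; the Windows and Window classes are assumed here to live in org.deeplearning4j.text.movingwindow, which the snippet does not show.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.deeplearning4j.text.movingwindow.Window;   // assumed package
import org.deeplearning4j.text.movingwindow.Windows;  // assumed package

public class WindowsSketch {
    public static void main(String[] args) {
        InputStream words = new ByteArrayInputStream(
                "the quick brown fox jumps over the lazy dog".getBytes(StandardCharsets.UTF_8));
        // Tokenizes the stream with a DefaultStreamTokenizer and builds padded windows of size 5
        List<Window> windows = Windows.windows(words, 5);
        for (Window window : windows) {
            System.out.println(window);
        }
    }
}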

Example 10 with Tokenizer

use of org.deeplearning4j.text.tokenization.tokenizer.Tokenizer in project deeplearning4j by deeplearning4j.

the class Windows method windows.

/**
     * Constructs a list of windows of size windowSize.
     * Note that padding for each window is created as well.
     * @param words the words to tokenize and construct windows from
     * @param tokenizerFactory the tokenizer factory to use
     * @param windowSize the window size to generate
     * @return the list of windows for the tokenized string
     * @throws IllegalStateException if the stream yields no tokens
     */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) list.add(tokenizer.nextToken());
    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");
    return windows(list, windowSize);
}
Also used : ArrayList(java.util.ArrayList) StringTokenizer(java.util.StringTokenizer) DefaultStreamTokenizer(org.deeplearning4j.text.tokenization.tokenizer.DefaultStreamTokenizer) Tokenizer(org.deeplearning4j.text.tokenization.tokenizer.Tokenizer)
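
The only differences from the previous overload are that the caller supplies the TokenizerFactory and that an empty token stream now fails fast. A minimal sketch, again assuming the org.deeplearning4j.text.movingwindow package for Windows and Window:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.deeplearning4j.text.movingwindow.Window;   // assumed package
import org.deeplearning4j.text.movingwindow.Windows;  // assumed package
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class WindowsWithFactorySketch {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();
        InputStream words = new ByteArrayInputStream(
                "some raw text to turn into windows".getBytes(StandardCharsets.UTF_8));
        // Uses the supplied factory instead of a hard-coded DefaultStreamTokenizer and
        // throws IllegalStateException if the stream yields no tokens
        List<Window> windows = Windows.windows(words, factory, 3);
        System.out.println(windows.size() + " windows");
    }
}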

Aggregations

Tokenizer (org.deeplearning4j.text.tokenization.tokenizer.Tokenizer) 17
ArrayList (java.util.ArrayList) 5
DefaultStreamTokenizer (org.deeplearning4j.text.tokenization.tokenizer.DefaultStreamTokenizer) 5
Test (org.junit.Test) 5
StringTokenizer (java.util.StringTokenizer) 4
File (java.io.File) 2
ClassPathResource (org.datavec.api.util.ClassPathResource) 2
TokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory) 2
InputStream (java.io.InputStream) 1
List (java.util.List) 1
Pair (org.deeplearning4j.berkeley.Pair) 1
Sequence (org.deeplearning4j.models.sequencevectors.sequence.Sequence) 1
VocabWord (org.deeplearning4j.models.word2vec.VocabWord) 1
BasicLineIterator (org.deeplearning4j.text.sentenceiterator.BasicLineIterator) 1
SentenceIterator (org.deeplearning4j.text.sentenceiterator.SentenceIterator) 1
DefaultTokenizer (org.deeplearning4j.text.tokenization.tokenizer.DefaultTokenizer) 1
JapaneseTokenizer (org.deeplearning4j.text.tokenization.tokenizer.JapaneseTokenizer) 1
NGramTokenizer (org.deeplearning4j.text.tokenization.tokenizer.NGramTokenizer) 1
UimaTokenizer (org.deeplearning4j.text.tokenization.tokenizer.UimaTokenizer) 1
DefaultTokenizerFactory (org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory) 1