Search in sources :

Example 31 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class CountClosedTags method countTrainingTags.

/**
   * Feeds the sentences of one tagged file into the word counters.
   * The first {@code (int) (sentences * trainingRatio)} sentences are
   * added to both trainingWords and allWords; every sentence after
   * that is added to allWords only.
   *
   * @param file the tagged file record whose sentences are counted
   * @throws IOException declared by this method; propagated from the
   *         calls made here
   */
void countTrainingTags(TaggedFileRecord file) throws IOException {
    // Number of leading sentences that still have to go into the training counts.
    int trainingLeft = (int) (countSentences(file) * trainingRatio);
    TaggedFileReader source = file.reader();
    while (trainingLeft > 0 && source.hasNext()) {
        List<TaggedWord> sentence = source.next();
        addTaggedWords(sentence, trainingWords);
        addTaggedWords(sentence, allWords);
        trainingLeft--;
    }
    // Whatever remains only contributes to the overall counts.
    while (source.hasNext()) {
        addTaggedWords(source.next(), allWords);
    }
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) TaggedFileReader(edu.stanford.nlp.tagger.io.TaggedFileReader)

Example 32 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class MakePrefixFile method main.

/**
 * Prints a random-length prefix of every sentence in a tagged input file
 * to standard output.
 *
 * <p>Properties read from the command line arguments:
 * <ul>
 *   <li>{@code input} - passed to {@code TaggedFileRecord.createRecord}</li>
 *   <li>{@code tagSeparator} - separator used when printing; defaults to
 *       {@code TaggerConfig.TAG_SEPARATOR}</li>
 *   <li>{@code fullSentence} - when true, the complete sentence is printed
 *       after its prefix; defaults to false</li>
 * </ul>
 *
 * <p>The prefix length is drawn from 1 to the sentence length inclusive.
 * Empty sentences are skipped, since no prefix can be drawn from them.
 *
 * @param args command line arguments, converted to properties
 */
public static void main(String[] args) {
    Properties config = StringUtils.argsToProperties(args);
    log.info(config);
    boolean fullSentence = PropertiesUtils.getBool(config, "fullSentence", false);
    Random random = new Random();
    String tagSeparator = config.getProperty("tagSeparator", TaggerConfig.TAG_SEPARATOR);
    TaggedFileRecord record = TaggedFileRecord.createRecord(config, config.getProperty("input"));
    for (List<TaggedWord> sentence : record.reader()) {
        if (sentence.isEmpty()) {
            // Random.nextInt requires a positive bound; calling it with 0
            // would throw IllegalArgumentException and abort the whole run.
            continue;
        }
        int len = random.nextInt(sentence.size()) + 1;
        System.out.println(SentenceUtils.listToString(sentence.subList(0, len), false, tagSeparator));
        if (fullSentence) {
            System.out.println(SentenceUtils.listToString(sentence, false, tagSeparator));
        }
    }
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) Random(java.util.Random) Properties(java.util.Properties) TaggedFileRecord(edu.stanford.nlp.tagger.io.TaggedFileRecord)

Example 33 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class ReadDataTagged method loadFile.

/**
 * Reads every sentence from the given reader and records it in this
 * object's training data.
 *
 * <p>For each sentence this method applies {@code maxentTagger.wordFunction}
 * to the words (when one is set), registers each word under its tag in
 * {@code maxentTagger.tagTokens}, and appends an end-of-sentence marker.
 * It then adds one {@code WordTag} to {@code pairs} and one
 * {@code DataWordTag} to {@code v} per position (marker included), and
 * increments the matching entry in {@code wordTagCounts}. The running
 * totals {@code numElements}, {@code totalWords} and {@code totalSentences}
 * are updated as well.
 *
 * @param reader source of tagged sentences
 * @param wordTagCounts per-word tag counters, updated in place
 */
private void loadFile(TaggedFileReader reader, Map<String, IntCounter<String>> wordTagCounts) {
    log.info("Loading tagged words from " + reader.filename());
    int sentencesRead = 0;
    int wordsRead = 0;
    int longest = Integer.MIN_VALUE;
    int shortest = Integer.MAX_VALUE;
    for (List<TaggedWord> rawSentence : reader) {
        List<TaggedWord> sentence = rawSentence;
        if (maxentTagger.wordFunction != null) {
            // Rewrite each word through the configured function, keeping its tag.
            List<TaggedWord> mapped = new ArrayList<>(rawSentence.size());
            for (TaggedWord original : rawSentence) {
                mapped.add(new TaggedWord(maxentTagger.wordFunction.apply(original.word()), original.tag()));
            }
            sentence = mapped;
        }
        final int length = sentence.size();

        // Parallel word/tag columns for this sentence, ending with the EOS marker.
        List<String> words = new ArrayList<>();
        List<String> tags = new ArrayList<>();
        for (TaggedWord token : sentence) {
            if (token == null) {
                continue;
            }
            String tokenWord = token.word();
            String tokenTag = token.tag();
            words.add(tokenWord);
            tags.add(tokenTag);
            if (!maxentTagger.tagTokens.containsKey(tokenTag)) {
                maxentTagger.tagTokens.put(tokenTag, Generics.<String>newHashSet());
            }
            maxentTagger.tagTokens.get(tokenTag).add(tokenWord);
        }
        longest = Math.max(longest, length);
        shortest = Math.min(shortest, length);
        words.add(Tagger.EOS_WORD);
        tags.add(Tagger.EOS_TAG);
        numElements += length + 1;

        // Start offset passed to History for this sentence; each earlier
        // sentence contributed its words plus one EOS entry.
        final int start = totalWords + totalSentences;
        for (int pos = 0; pos <= length; pos++) {
            // The History is built before the current pair is appended to pairs.
            History history = new History(start, start + length, start + pos, pairs, maxentTagger.extractors);
            String tag = tags.get(pos);
            String word = words.get(pos);
            pairs.add(new WordTag(word, tag));
            int tagIndex = maxentTagger.addTag(tag);
            v.add(new DataWordTag(history, tagIndex, tag));
            wordTagCounts.computeIfAbsent(word, unused -> new IntCounter<>()).incrementCount(tag, 1);
        }

        totalSentences++;
        totalWords += length;
        sentencesRead++;
        wordsRead += length;
        if (sentencesRead % 100000 == 0) {
            log.info("Read " + sentencesRead + " sentences, min " + shortest + " words, max " + longest + " words ... [still reading]");
        }
    }
    log.info("Read " + wordsRead + " words from " + reader.filename() + " [done].");
    log.info("Read " + sentencesRead + " sentences, min " + shortest + " words, max " + longest + " words.");
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) ArrayList(java.util.ArrayList) WordTag(edu.stanford.nlp.ling.WordTag)

Example 34 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class CollocationFinder method getStemmedWordTagsFromTree.

/**
   * Converts the tagged yield of a tree into stemmed WordTags.
   *
   * @param t a tree
   * @return one WordTag per entry of {@code t.taggedYield()}, in order,
   * each produced by {@code Morphology.stemStatic} from the entry's word and tag
   */
private static List<WordTag> getStemmedWordTagsFromTree(Tree t) {
    List<WordTag> stemmed = Generics.newArrayList();
    for (TaggedWord leaf : t.taggedYield()) {
        stemmed.add(Morphology.stemStatic(leaf.word(), leaf.tag()));
    }
    return stemmed;
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) WordTag(edu.stanford.nlp.ling.WordTag)

Example 35 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

the class CollocationFinder method getNonStemmedWordTagsFromTree.

/**
 * Converts the tagged yield of a tree into WordTags without stemming.
 *
 * @param t a tree
 * @return one WordTag per entry of {@code t.taggedYield()}, in order,
 * carrying that entry's word and tag unchanged
 */
private static List<WordTag> getNonStemmedWordTagsFromTree(Tree t) {
    List<WordTag> result = Generics.newArrayList();
    for (TaggedWord leaf : t.taggedYield()) {
        result.add(new WordTag(leaf.word(), leaf.tag()));
    }
    return result;
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) WordTag(edu.stanford.nlp.ling.WordTag)

Aggregations

TaggedWord (edu.stanford.nlp.ling.TaggedWord): 43, HasWord (edu.stanford.nlp.ling.HasWord): 9, CoreLabel (edu.stanford.nlp.ling.CoreLabel): 5, DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor): 5, MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger): 5, Tree (edu.stanford.nlp.trees.Tree): 5, ArrayList (java.util.ArrayList): 5, Label (edu.stanford.nlp.ling.Label): 4, WordTag (edu.stanford.nlp.ling.WordTag): 4, List (java.util.List): 4, HasTag (edu.stanford.nlp.ling.HasTag): 3, TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord): 3, File (java.io.File): 3, StringReader (java.io.StringReader): 3, Word (edu.stanford.nlp.ling.Word): 2, Morphology (edu.stanford.nlp.process.Morphology): 2, ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2, GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure): 2, BufferedReader (java.io.BufferedReader): 2, IOException (java.io.IOException): 2