Search in sources :

Example 6 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

The method countTaggings of the class Treebanks.

/**
 * Counts how often each word of the treebank occurs with each tag and
 * writes the result to {@code pw}, one line per word.
 * Each line holds the word followed by tag/count pairs; every field,
 * including the last one, is terminated by a tab character.
 *
 * @param tb the treebank whose tagged yields are counted
 * @param pw the writer receiving the report
 */
private static void countTaggings(Treebank tb, final PrintWriter pw) {
    final TwoDimensionalCounter<String, String> wordTagCounts = new TwoDimensionalCounter<>();
    // Visit every tree and tally each (word, tag) pair of its tagged yield.
    tb.apply(tree -> {
        for (TaggedWord taggedWord : tree.taggedYield()) {
            wordTagCounts.incrementCount(taggedWord.word(), taggedWord.tag());
        }
    });
    for (String word : wordTagCounts.firstKeySet()) {
        pw.print(word);
        pw.print('\t');
        Counter<String> tagCounts = wordTagCounts.getCounter(word);
        for (String tag : tagCounts.keySet()) {
            pw.print(tag);
            pw.print('\t');
            pw.print(tagCounts.getCount(tag));
            pw.print('\t');
        }
        pw.println();
    }
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter)

Example 7 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

The method mergeLeavesIntoCollocatedString of the class CollocationFinder.

/**
 * Joins the words of the tree's tagged yield into a single string,
 * separating consecutive words with an underscore.
 *
 * @param t the tree whose leaves are merged
 * @return the words joined by {@code '_'}, or the empty string when the
 *         tree has no tagged words
 */
private static String mergeLeavesIntoCollocatedString(Tree t) {
    StringBuilder sb = new StringBuilder(160);
    for (TaggedWord taggedWord : t.taggedYield()) {
        sb.append(taggedWord.word()).append('_');
    }
    // Every appended word is followed by '_', so a zero length means the
    // yield was empty; trimming the trailing separator would then ask for
    // an end index of -1 and throw StringIndexOutOfBoundsException.
    if (sb.length() == 0) {
        return "";
    }
    sb.setLength(sb.length() - 1);
    return sb.toString();
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord)

Example 8 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

The method lemmatize of the class ParserGrammar.

/**
 * Builds one {@link CoreLabel} per tagged token and runs each through
 * {@code Morphology.stem}.
 * Tags are taken from the tagger when the {@code preTag} test option is
 * set, and otherwise from the tagged yield of a parse of the tokens.
 * Only works on English, because it is hard coded to use the Morphology
 * class, which is English-only.
 *
 * @param tokens the words to process
 * @return the labels, in the order of the tagged words
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
    final List<TaggedWord> taggedWords;
    if (getOp().testOptions.preTag) {
        taggedWords = loadTagger().apply(tokens);
    } else {
        taggedWords = parse(tokens).taggedYield();
    }
    Morphology morphology = new Morphology();
    List<CoreLabel> result = Generics.newArrayList();
    for (TaggedWord taggedWord : taggedWords) {
        CoreLabel coreLabel = new CoreLabel();
        coreLabel.setWord(taggedWord.word());
        coreLabel.setTag(taggedWord.tag());
        morphology.stem(coreLabel);
        result.add(coreLabel);
    }
    return result;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Morphology(edu.stanford.nlp.process.Morphology) Tree(edu.stanford.nlp.trees.Tree) List(java.util.List)

Example 9 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

The method printStats of the class ChineseCharacterBasedLexiconTraining.

/**
 * Prints statistics about singleton characters and singleton words found
 * in the given trees, followed by the distributions over singleton word
 * POS tags, singleton character radicals and word lengths.
 * Words equal to {@code Lexicon.BOUNDARY} are ignored. An item counts as a
 * singleton when its count is below 1.5.
 *
 * @param trees the trees whose tagged yields are analysed
 * @param pw the writer receiving the report
 */
public static void printStats(Collection<Tree> trees, PrintWriter pw) {
    ClassicCounter<Integer> lengthCounts = new ClassicCounter<>();
    ClassicCounter<TaggedWord> wordCounts = new ClassicCounter<>();
    ClassicCounter<Symbol> charCounts = new ClassicCounter<>();
    int treeCount = 0;
    for (Tree tree : trees) {
        treeCount++;
        for (TaggedWord taggedWord : tree.taggedYield()) {
            String word = taggedWord.word();
            if (!word.equals(Lexicon.BOUNDARY)) {
                wordCounts.incrementCount(taggedWord);
                lengthCounts.incrementCount(Integer.valueOf(word.length()));
                for (char ch : word.toCharArray()) {
                    charCounts.incrementCount(Symbol.cannonicalSymbol(ch));
                }
                // One end-of-word marker is counted per word.
                charCounts.incrementCount(Symbol.END_WORD);
            }
        }
    }

    Set<Symbol> singletonChars = Counters.keysBelow(charCounts, 1.5);
    Set<TaggedWord> singletonWords = Counters.keysBelow(wordCounts, 1.5);

    ClassicCounter<String> singletonPosCounts = new ClassicCounter<>();
    for (TaggedWord singleton : singletonWords) {
        singletonPosCounts.incrementCount(singleton.tag());
    }
    Distribution<String> singletonWordPOSDist = Distribution.getDistribution(singletonPosCounts);

    ClassicCounter<Character> singletonRadicalCounts = new ClassicCounter<>();
    for (Symbol symbol : singletonChars) {
        singletonRadicalCounts.incrementCount(Character.valueOf(RadicalMap.getRadical(symbol.getCh())));
    }
    Distribution<Character> singletonCharRadDist = Distribution.getDistribution(singletonRadicalCounts);
    Distribution<Integer> wordLengthDist = Distribution.getDistribution(lengthCounts);

    NumberFormat percent = new DecimalFormat("##.##%");

    int singletonCharCount = singletonChars.size();
    double charTokens = charCounts.totalCount();
    int charTypes = charCounts.size();
    pw.println("There are " + singletonCharCount + " singleton chars out of " + (int) charTokens + " tokens and " + charTypes + " types found in " + treeCount + " trees.");
    pw.println("Thus singletonChars comprise " + percent.format(singletonCharCount / charTokens) + " of tokens and " + percent.format((double) singletonCharCount / charTypes) + " of types.");
    pw.println();

    int singletonWordCount = singletonWords.size();
    double wordTokens = wordCounts.totalCount();
    int wordTypes = wordCounts.size();
    pw.println("There are " + singletonWordCount + " singleton words out of " + (int) wordTokens + " tokens and " + wordTypes + " types.");
    pw.println("Thus singletonWords comprise " + percent.format(singletonWordCount / wordTokens) + " of tokens and " + percent.format((double) singletonWordCount / wordTypes) + " of types.");
    pw.println();

    pw.println("Distribution over singleton word POS:");
    pw.println(singletonWordPOSDist.toString());
    pw.println();
    pw.println("Distribution over singleton char radicals:");
    pw.println(singletonCharRadDist.toString());
    pw.println();
    pw.println("Distribution over word length:");
    pw.println(wordLengthDist);
}
Also used : Symbol(edu.stanford.nlp.parser.lexparser.ChineseCharacterBasedLexicon.Symbol) DecimalFormat(java.text.DecimalFormat) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Tree(edu.stanford.nlp.trees.Tree) NumberFormat(java.text.NumberFormat)

Example 10 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

The method finishTraining of the class ChineseMaxentLexicon.

/**
 * Turns the collected tagged-word counts into a weighted dataset, builds
 * the smoothed tag distribution, and trains the classifier stored in
 * {@code scorer}.
 * Words whose count exceeds {@code trainCountThreshold} (when
 * {@code trainOnLowCount} is set) and words present in
 * {@code functionWordTags} are left out of the training data.
 */
@Override
public void finishTraining() {
    IntCounter<String> tagCounts = new IntCounter<>();
    WeightedDataset trainingData = new WeightedDataset(datumCounter.size());
    for (TaggedWord taggedWord : datumCounter.keySet()) {
        int occurrences = datumCounter.getIntCount(taggedWord);
        boolean aboveThreshold = trainOnLowCount && occurrences > trainCountThreshold;
        if (aboveThreshold || functionWordTags.containsKey(taggedWord.word())) {
            continue;
        }
        tagCounts.incrementCount(taggedWord.tag());
        // When training by type every distinct tagged word weighs 1.
        int weight = trainByType ? 1 : occurrences;
        data:
        {
            trainingData.add(new BasicDatum(featExtractor.makeFeatures(taggedWord.word()), taggedWord.tag()), weight);
        }
    }
    // The raw counts are no longer needed once the dataset is built.
    datumCounter = null;
    tagDist = Distribution.laplaceSmoothedDistribution(tagCounts, tagCounts.size(), 0.5);
    tagCounts = null;
    applyThresholds(trainingData);
    verbose("Making classifier...");
    //new ResultStoringMonitor(5, "weights"));
    QNMinimizer minimizer = new QNMinimizer();
    //    minimizer.shutUp();
    LinearClassifierFactory classifierFactory = new LinearClassifierFactory(minimizer);
    classifierFactory.setTol(tol);
    classifierFactory.setSigma(sigma);
    if (tuneSigma) {
        classifierFactory.setTuneSigmaHeldOut();
    }
    scorer = classifierFactory.trainClassifier(trainingData);
    verbose("Done training.");
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) LinearClassifierFactory(edu.stanford.nlp.classify.LinearClassifierFactory) WeightedDataset(edu.stanford.nlp.classify.WeightedDataset) QNMinimizer(edu.stanford.nlp.optimization.QNMinimizer) BasicDatum(edu.stanford.nlp.ling.BasicDatum)

Aggregations

TaggedWord (edu.stanford.nlp.ling.TaggedWord)43 HasWord (edu.stanford.nlp.ling.HasWord)9 CoreLabel (edu.stanford.nlp.ling.CoreLabel)5 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)5 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 Tree (edu.stanford.nlp.trees.Tree)5 ArrayList (java.util.ArrayList)5 Label (edu.stanford.nlp.ling.Label)4 WordTag (edu.stanford.nlp.ling.WordTag)4 List (java.util.List)4 HasTag (edu.stanford.nlp.ling.HasTag)3 TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord)3 File (java.io.File)3 StringReader (java.io.StringReader)3 Word (edu.stanford.nlp.ling.Word)2 Morphology (edu.stanford.nlp.process.Morphology)2 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)2 GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure)2 BufferedReader (java.io.BufferedReader)2 IOException (java.io.IOException)2