Search in sources :

Example 1 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class TreeSpanScoring, method countSpanErrors:

/**
 * Returns the total number of span errors between two trees: the count of
 * labeled spans (including preterminal tags) found in goldTree but not in
 * guessTree, plus the count found in guessTree but not in goldTree.  A
 * mislabeled span therefore contributes two errors.
 * <br>
 * Span labels are reduced with {@code basicCategory()} from the supplied
 * TreebankLanguagePack before comparison.
 */
public static int countSpanErrors(TreebankLanguagePack tlp, Tree goldTree, Tree guessTree) {
    Set<Constituent> goldSpans = simplifyConstituents(tlp, goldTree.constituents(LabeledConstituent.factory()));
    Set<Constituent> guessSpans = simplifyConstituents(tlp, guessTree.constituents(LabeledConstituent.factory()));
    int errors = 0;
    // A span present in one tree but absent from the other is one error in
    // each direction, so a relabeled span is charged twice overall.
    for (Constituent span : goldSpans) {
        if (!guessSpans.contains(span)) {
            errors++;
        }
    }
    for (Constituent span : guessSpans) {
        if (!goldSpans.contains(span)) {
            errors++;
        }
    }
    // constituents() leaves out preterminal spans, so compare the POS tags
    // of the yields position by position.
    List<TaggedWord> goldWords = goldTree.taggedYield();
    List<TaggedWord> guessWords = guessTree.taggedYield();
    int shared = Math.min(goldWords.size(), guessWords.size());
    for (int i = 0; i < shared; ++i) {
        String goldTag = tlp.basicCategory(goldWords.get(i).tag());
        String guessTag = tlp.basicCategory(guessWords.get(i).tag());
        if (!goldTag.equals(guessTag)) {
            // One error for the gold span missing from the guess, and one
            // for the spurious span in the guess: two errors total.
            errors += 2;
        }
    }
    return errors;
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) Constituent(edu.stanford.nlp.trees.Constituent) LabeledConstituent(edu.stanford.nlp.trees.LabeledConstituent)

Example 2 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class DependencyParser, method parseTextFile:

/**
 * Reads raw text from {@code input}, splits it into sentences, POS-tags each
 * sentence, parses every tagged sentence into typed dependencies, and prints
 * one dependency per line (with a blank line between sentences) to
 * {@code output}.  Tagging and parsing times are reported on stderr.
 */
private void parseTextFile(BufferedReader input, PrintWriter output) {
    // Configure sentence splitting/tokenization from the parser's config.
    DocumentPreprocessor splitter = new DocumentPreprocessor(input);
    splitter.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
    splitter.setEscaper(config.escaper);
    splitter.setSentenceDelimiter(config.sentenceDelimiter);
    splitter.setTokenizerFactory(config.tlp.getTokenizerFactory());
    Timing stopwatch = new Timing();
    MaxentTagger posTagger = new MaxentTagger(config.tagger);
    // First pass: tag everything so the tagging time can be reported separately.
    List<List<TaggedWord>> taggedSentences = new ArrayList<>();
    for (List<HasWord> rawSentence : splitter) {
        taggedSentences.add(posTagger.tagSentence(rawSentence));
    }
    System.err.printf("Tagging completed in %.2f sec.%n", stopwatch.stop() / 1000.0);
    stopwatch.start();
    // Second pass: parse each tagged sentence and emit its dependencies.
    int sentenceCount = 0;
    for (List<TaggedWord> sentence : taggedSentences) {
        GrammaticalStructure structure = predict(sentence);
        for (TypedDependency dependency : structure.typedDependencies()) {
            output.println(dependency);
        }
        output.println();
        sentenceCount++;
    }
    double elapsedSeconds = stopwatch.stop() / 1000.0;
    System.err.printf("Parsed %d sentences in %.2f seconds (%.2f sents/sec).%n", sentenceCount, elapsedSeconds, sentenceCount / elapsedSeconds);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TypedDependency(edu.stanford.nlp.trees.TypedDependency) TaggedWord(edu.stanford.nlp.ling.TaggedWord) MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) GrammaticalStructure(edu.stanford.nlp.trees.GrammaticalStructure) ChineseGrammaticalStructure(edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure) EnglishGrammaticalStructure(edu.stanford.nlp.trees.EnglishGrammaticalStructure) UniversalEnglishGrammaticalStructure(edu.stanford.nlp.trees.UniversalEnglishGrammaticalStructure) Collectors.toList(java.util.stream.Collectors.toList) Timing(edu.stanford.nlp.util.Timing) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor)

Example 3 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class DependencyParserDemo, method main:

/**
 * Demo entry point: tags and dependency-parses a fixed example sentence,
 * logging the resulting grammatical structure for each parsed sentence.
 * Optional flags: {@code -tagger <path>} and {@code -model <path>} override
 * the default tagger and parser model locations.
 */
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    // Flags arrive as "-flag value" pairs; anything else is rejected.
    int i = 0;
    while (i < args.length) {
        if ("-tagger".equals(args[i])) {
            taggerPath = args[i + 1];
            i += 2;
        } else if ("-model".equals(args[i])) {
            modelPath = args[i + 1];
            i += 2;
        } else {
            throw new RuntimeException("Unknown argument " + args[i]);
        }
    }
    String text = "I can almost always tell when movies use fake dinosaurs.";
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);
        // Print typed dependencies
        log.info(gs);
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) DependencyParser(edu.stanford.nlp.parser.nndep.DependencyParser) StringReader(java.io.StringReader) GrammaticalStructure(edu.stanford.nlp.trees.GrammaticalStructure) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor)

Example 4 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class SplittingGrammarExtractor, method initialBetasAndLexicon:

/**
 * Walks a (binarized) training tree, seeding the lexicon with each
 * preterminal's (word, tag) pair at its yield position and installing
 * zero-valued 1x1(x1) beta entries for every unary and binary rule seen.
 * Returns the yield position after this subtree.
 *
 * @param tree     subtree to process
 * @param position index of the next word in the sentence yield
 * @param weight   training weight passed through to the lexicon
 * @return the yield position after consuming this subtree's words
 */
private int initialBetasAndLexicon(Tree tree, int position, double weight) {
    if (tree.isLeaf()) {
        // Only reachable when an entire training tree is a single leaf.
        return position;
    }
    if (tree.isPreTerminal()) {
        // Seed the initial lexicon with this tag/word pair; state 0 because
        // no splits exist yet.
        String tag = tree.label().value();
        String word = tree.children()[0].label().value();
        TaggedWord tw = new TaggedWord(word, state(tag, 0));
        lex.train(tw, position, weight);
        return (position + 1);
    }
    int numChildren = tree.children().length;
    if (numChildren == 1) {
        String label = tree.label().value();
        String childLabel = tree.getChild(0).label().value();
        if (!unaryBetas.contains(label, childLabel)) {
            // Single unsplit state on each side, log weight 0.0.
            double[][] map = new double[1][1];
            map[0][0] = 0.0;
            unaryBetas.put(label, childLabel, map);
        }
    } else if (numChildren == 2) {
        String label = tree.label().value();
        String leftLabel = tree.getChild(0).label().value();
        String rightLabel = tree.getChild(1).label().value();
        if (!binaryBetas.contains(label, leftLabel, rightLabel)) {
            // Single unsplit state per nonterminal, log weight 0.0.
            double[][][] map = new double[1][1][1];
            map[0][0][0] = 0.0;
            binaryBetas.put(label, leftLabel, rightLabel, map);
        }
    } else {
        // Anything wider than binary means the preprocessing step failed.
        throw new RuntimeException("Trees should have been binarized, expected 1 or 2 children");
    }
    for (Tree child : tree.children()) {
        position = initialBetasAndLexicon(child, position, weight);
    }
    return position;
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord) Tree(edu.stanford.nlp.trees.Tree)

Example 5 with TaggedWord

use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class SplittingGrammarExtractor, method recalculateTemporaryBetas:

/**
 * Recursively re-accumulates split-state rule weights ("betas") for one tree,
 * folding this tree's per-node transition weights into the shared temporary
 * beta maps, retraining the temporary lexicon at preterminals, and (optionally)
 * tracking total probability mass per state label.
 * <p>
 * NOTE(review): stateWeights appear to be log-space weights — they are passed
 * through Math.exp() before summation and combined with SloppyMath.logAdd —
 * TODO confirm against the rest of SplittingGrammarExtractor.
 *
 * @param tree              subtree being processed
 * @param stateWeights      weight of each split state of this subtree's root
 * @param position          index of the next word in the sentence yield
 * @param unaryTransitions  per-node unary transition matrices, keyed by tree identity
 * @param binaryTransitions per-node binary transition tensors, keyed by tree identity
 * @param totalStateMass    if non-null, accumulates exp'd state mass per label
 * @param tempUnaryBetas    shared accumulator for unary rule betas
 * @param tempBinaryBetas   shared accumulator for binary rule betas
 * @return the yield position after consuming this subtree's words
 */
public int recalculateTemporaryBetas(Tree tree, double[] stateWeights, int position, IdentityHashMap<Tree, double[][]> unaryTransitions, IdentityHashMap<Tree, double[][][]> binaryTransitions, Map<String, double[]> totalStateMass, TwoDimensionalMap<String, String, double[][]> tempUnaryBetas, ThreeDimensionalMap<String, String, String, double[][][]> tempBinaryBetas) {
    if (tree.isLeaf()) {
        // possible to get here if we have a tree with no structure
        return position;
    }
    if (totalStateMass != null) {
        // Accumulate (exponentiated) mass for each split state of this label.
        double[] stateTotal = totalStateMass.get(tree.label().value());
        if (stateTotal == null) {
            stateTotal = new double[stateWeights.length];
            totalStateMass.put(tree.label().value(), stateTotal);
        }
        for (int i = 0; i < stateWeights.length; ++i) {
            stateTotal[i] += Math.exp(stateWeights[i]);
        }
    }
    if (tree.isPreTerminal()) {
        // fill in our new lexicon here.
        String tag = tree.label().value();
        String word = tree.children()[0].label().value();
        // We smooth by LEX_SMOOTH, if relevant.  We rescale so that sum
        // of the weights being added to the lexicon stays the same.
        double total = 0.0;
        for (double stateWeight : stateWeights) {
            total += Math.exp(stateWeight);
        }
        if (total <= 0.0) {
            // No mass at all for this word; skip training but still advance.
            return position + 1;
        }
        double scale = 1.0 / (1.0 + LEX_SMOOTH);
        double smoothing = total * LEX_SMOOTH / stateWeights.length;
        for (int state = 0; state < stateWeights.length; ++state) {
            // TODO: maybe optimize all this TaggedWord creation
            TaggedWord tw = new TaggedWord(word, state(tag, state));
            tempLex.train(tw, position, (Math.exp(stateWeights[state]) + smoothing) * scale);
        }
        return position + 1;
    }
    if (tree.children().length == 1) {
        String parentLabel = tree.label().value();
        String childLabel = tree.children()[0].label().value();
        double[][] transitions = unaryTransitions.get(tree);
        int parentStates = transitions.length;
        int childStates = transitions[0].length;
        double[][] betas = tempUnaryBetas.get(parentLabel, childLabel);
        if (betas == null) {
            // First time this rule is seen: initialize to log(0).
            betas = new double[parentStates][childStates];
            for (int i = 0; i < parentStates; ++i) {
                for (int j = 0; j < childStates; ++j) {
                    betas[i][j] = Double.NEGATIVE_INFINITY;
                }
            }
            tempUnaryBetas.put(parentLabel, childLabel, betas);
        }
        // Fold this node's transitions into the betas and, at the same time,
        // build the child's state-weight vector for the recursive call.
        double[] childWeights = neginfDoubles(childStates);
        for (int i = 0; i < parentStates; ++i) {
            for (int j = 0; j < childStates; ++j) {
                double weight = transitions[i][j];
                betas[i][j] = SloppyMath.logAdd(betas[i][j], weight + stateWeights[i]);
                childWeights[j] = SloppyMath.logAdd(childWeights[j], weight + stateWeights[i]);
            }
        }
        position = recalculateTemporaryBetas(tree.children()[0], childWeights, position, unaryTransitions, binaryTransitions, totalStateMass, tempUnaryBetas, tempBinaryBetas);
    } else {
        // length == 2
        String parentLabel = tree.label().value();
        String leftLabel = tree.children()[0].label().value();
        String rightLabel = tree.children()[1].label().value();
        double[][][] transitions = binaryTransitions.get(tree);
        int parentStates = transitions.length;
        int leftStates = transitions[0].length;
        int rightStates = transitions[0][0].length;
        double[][][] betas = tempBinaryBetas.get(parentLabel, leftLabel, rightLabel);
        if (betas == null) {
            // First time this rule is seen: initialize to log(0).
            betas = new double[parentStates][leftStates][rightStates];
            for (int i = 0; i < parentStates; ++i) {
                for (int j = 0; j < leftStates; ++j) {
                    for (int k = 0; k < rightStates; ++k) {
                        betas[i][j][k] = Double.NEGATIVE_INFINITY;
                    }
                }
            }
            tempBinaryBetas.put(parentLabel, leftLabel, rightLabel, betas);
        }
        // Fold transitions into the betas while building each child's
        // state-weight vector for the two recursive calls below.
        double[] leftWeights = neginfDoubles(leftStates);
        double[] rightWeights = neginfDoubles(rightStates);
        for (int i = 0; i < parentStates; ++i) {
            for (int j = 0; j < leftStates; ++j) {
                for (int k = 0; k < rightStates; ++k) {
                    double weight = transitions[i][j][k];
                    betas[i][j][k] = SloppyMath.logAdd(betas[i][j][k], weight + stateWeights[i]);
                    leftWeights[j] = SloppyMath.logAdd(leftWeights[j], weight + stateWeights[i]);
                    rightWeights[k] = SloppyMath.logAdd(rightWeights[k], weight + stateWeights[i]);
                }
            }
        }
        position = recalculateTemporaryBetas(tree.children()[0], leftWeights, position, unaryTransitions, binaryTransitions, totalStateMass, tempUnaryBetas, tempBinaryBetas);
        position = recalculateTemporaryBetas(tree.children()[1], rightWeights, position, unaryTransitions, binaryTransitions, totalStateMass, tempUnaryBetas, tempBinaryBetas);
    }
    return position;
}
Also used : TaggedWord(edu.stanford.nlp.ling.TaggedWord)

Aggregations

TaggedWord (edu.stanford.nlp.ling.TaggedWord)43 HasWord (edu.stanford.nlp.ling.HasWord)9 CoreLabel (edu.stanford.nlp.ling.CoreLabel)5 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)5 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 Tree (edu.stanford.nlp.trees.Tree)5 ArrayList (java.util.ArrayList)5 Label (edu.stanford.nlp.ling.Label)4 WordTag (edu.stanford.nlp.ling.WordTag)4 List (java.util.List)4 HasTag (edu.stanford.nlp.ling.HasTag)3 TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord)3 File (java.io.File)3 StringReader (java.io.StringReader)3 Word (edu.stanford.nlp.ling.Word)2 Morphology (edu.stanford.nlp.process.Morphology)2 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)2 GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure)2 BufferedReader (java.io.BufferedReader)2 IOException (java.io.IOException)2