Search in sources :

Example 41 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

The following example is from the class TreeAnnotator, method transformTreeHelper.

/**
   * Do the category splitting of the tree passed in.
   * This is initially called on the root node of a tree, and it recursively
   * calls itself on children.  A depth first left-to-right traversal is
   * done whereby a tree node's children are first transformed and then
   * the parent is transformed.  At the time of calling, the original root
   * always sits above the current node.  This routine can be assumed to,
   * and does, change the tree passed in: it destructively modifies tree nodes,
   * and makes new tree structure when it needs to.
   *
   * @param t The tree node to subcategorize.
   * @param root The root of the tree.  It must contain {@code t} or
   *     this code will throw a NullPointerException.
   * @return The annotated tree.
   */
private Tree transformTreeHelper(Tree t, Tree root) {
    if (t == null) {
        // nothing to annotate
        return null;
    }
    if (t.isLeaf()) {
        // No need to change the label of a leaf (word) node
        return t;
    }
    String cat = t.label().value();
    Tree parent;
    String parentStr;
    String grandParentStr;
    if (root == null || t.equals(root)) {
        // at the root there is no parent context
        parent = null;
        parentStr = "";
    } else {
        parent = t.parent(root);
        parentStr = parent.label().value();
    }
    if (parent == null || parent.equals(root)) {
        grandParentStr = "";
    } else {
        grandParentStr = parent.parent(root).label().value();
    }
    String baseParentStr = tlpParams.treebankLanguagePack().basicCategory(parentStr);
    String baseGrandParentStr = tlpParams.treebankLanguagePack().basicCategory(grandParentStr);
    if (t.isPreTerminal()) {
        // handle tags
        // recurse on the single word child; null root leaves the leaf untouched
        Tree childResult = transformTreeHelper(t.children()[0], null);
        // would be nicer if Word/CWT ??
        String word = childResult.value();
        if (!trainOptions.noTagSplit) {
            if (trainOptions.tagPA) {
                // parent annotation of tags, e.g. NN^NP, if enabled/selected
                String test = cat + "^" + baseParentStr;
                if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.contains(test)) {
                    cat = test;
                }
            }
            // null check guards against an NPE when the preterminal is itself
            // the root of the tree (parent was set to null above)
            if (trainOptions.markUnaryTags && parent != null && parent.numChildren() == 1) {
                cat = cat + "^U";
            }
        }
        // otherwise, leave the tags alone!
        // Label label = new CategoryWordTag(cat, word, cat);
        Label label = t.label().labelFactory().newLabel(t.label());
        label.setValue(cat);
        if (label instanceof HasCategory)
            ((HasCategory) label).setCategory(cat);
        if (label instanceof HasWord)
            ((HasWord) label).setWord(word);
        if (label instanceof HasTag)
            ((HasTag) label).setTag(cat);
        t.setLabel(label);
        // just in case word is changed
        t.setChild(0, childResult);
        if (trainOptions.noTagSplit) {
            return t;
        } else {
            // language-specific transforms
            return tlpParams.transformTree(t, root);
        }
    }
    // end isPreTerminal()
    // handle phrasal categories: transform all children first (bottom-up)
    Tree[] kids = t.children();
    for (int childNum = 0; childNum < kids.length; childNum++) {
        Tree child = kids[childNum];
        // recursive call
        Tree childResult = transformTreeHelper(child, root);
        t.setChild(childNum, childResult);
    }
    Tree headChild = hf.determineHead(t);
    if (headChild == null || headChild.label() == null) {
        throw new RuntimeException("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
    }
    Label headLabel = headChild.label();
    if (!(headLabel instanceof HasWord))
        throw new RuntimeException("TreeAnnotator: Head label lacks a Word annotation!");
    if (!(headLabel instanceof HasTag))
        throw new RuntimeException("TreeAnnotator: Head label lacks a Tag annotation!");
    String word = ((HasWord) headLabel).word();
    String tag = ((HasTag) headLabel).tag();
    // String baseTag = tlpParams.treebankLanguagePack().basicCategory(tag);
    String baseCat = tlpParams.treebankLanguagePack().basicCategory(cat);
    /* Sister annotation. Potential problem: if multiple sisters are
     * strong indicators for a single category's expansions.  This
     * happens concretely in the Chinese Treebank when NP (object)
     * has left sisters VV and AS.  Could lead to too much
     * sparseness.  The ideal solution would be to give the
     * splitting list an ordering, and take only the highest (~most
     * informative/reliable) sister annotation.
     */
    if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.length() > 0) {
        List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
        List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));
        List<String> leftAnn = new ArrayList<>();
        List<String> rightAnn = new ArrayList<>();
        for (String s : leftSis) {
            // candidate annotation of the form baseCat=l=sisterBaseCat
            leftAnn.add(baseCat + "=l=" + tlpParams.treebankLanguagePack().basicCategory(s));
        }
        for (String s : rightSis) {
            // candidate annotation of the form baseCat=r=sisterBaseCat
            rightAnn.add(baseCat + "=r=" + tlpParams.treebankLanguagePack().basicCategory(s));
        }
        // (removed a leftover empty debugging loop that iterated rightAnn to no effect)
        for (String annCat : trainOptions.sisterSplitters) {
            if (leftAnn.contains(annCat) || rightAnn.contains(annCat)) {
                // append only the "=l=..."/"=r=..." suffix, stripping the repeated base category
                cat = cat + annCat.replaceAll("^" + baseCat, "");
                break;
            }
        }
    }
    if (trainOptions.PA && !trainOptions.smoothing && baseParentStr.length() > 0) {
        // parent annotation of phrasal categories, e.g. NP^S
        String cat2 = baseCat + "^" + baseParentStr;
        if (!trainOptions.selectiveSplit || trainOptions.splitters.contains(cat2)) {
            cat = cat + "^" + baseParentStr;
        }
    }
    if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.length() > 0) {
        // grandparent annotation, e.g. NP^S~VP
        if (trainOptions.selectiveSplit) {
            String cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
            if (cat.contains("^") && trainOptions.splitters.contains(cat2)) {
                cat = cat + "~" + baseGrandParentStr;
            }
        } else {
            cat = cat + "~" + baseGrandParentStr;
        }
    }
    if (trainOptions.markUnary > 0) {
        // mark unary expansions (the depth checks skip preterminal-over-word unaries)
        if (trainOptions.markUnary == 1 && kids.length == 1 && kids[0].depth() >= 2) {
            cat = cat + "-U";
        } else if (trainOptions.markUnary == 2 && parent != null && parent.numChildren() == 1 && t.depth() >= 2) {
            cat = cat + "-u";
        }
    }
    if (trainOptions.rightRec && rightRec(t, baseCat)) {
        cat = cat + "-R";
    }
    if (trainOptions.leftRec && leftRec(t, baseCat)) {
        cat = cat + "-L";
    }
    if (trainOptions.splitPrePreT && t.isPrePreTerminal()) {
        cat = cat + "-PPT";
    }
    //    Label label = new CategoryWordTag(cat, word, tag);
    Label label = t.label().labelFactory().newLabel(t.label());
    label.setValue(cat);
    if (label instanceof HasCategory)
        ((HasCategory) label).setCategory(cat);
    if (label instanceof HasWord)
        ((HasWord) label).setWord(word);
    if (label instanceof HasTag)
        ((HasTag) label).setTag(tag);
    t.setLabel(label);
    // language-specific transforms applied last
    return tlpParams.transformTree(t, root);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) HasCategory(edu.stanford.nlp.ling.HasCategory) HasTag(edu.stanford.nlp.ling.HasTag)

Example 42 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

The following example is from the class ExhaustivePCFGParser, method extractBestParse.

/**
 * Reconstructs the best-scoring parse tree for state {@code goal} over the
 * word span {@code [start, end)}.  No backtraces are stored during chart
 * filling (to speed up parsing), so the winning derivation is re-derived
 * here by recomputing candidate rule scores and matching them against the
 * stored inside score {@code iScore[start][end][goal]}.
 *
 * @param goal Index of the grammar state (category) to build a tree for
 * @param start Start of the span, inclusive
 * @param end End of the span, exclusive
 * @return The reconstructed best parse over the span, or {@code null} if
 *     no derivation matching the stored score could be found (a warning
 *     is logged in that case)
 */
private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    // when length normalization is on, comparisons are done on per-word scores
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal]) : bestScore;
    String goalStr = stateIndex.get(goal);
    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
        if (op.testOptions.maxSpanForTags > 1) {
            // multi-word tag span: build a single leaf covering the whole span
            Tree wordNode = null;
            if (sentence != null) {
                // concatenate the surface words over [start, end) into one token
                StringBuilder word = new StringBuilder();
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                wordNode = tf.newLeaf(word.toString());
            } else if (lr != null) {
                // lattice input: find the lattice edge whose tag score matches bestScore
                List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
                for (LatticeEdge edge : latticeEdges) {
                    IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);
                    float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
                    if (matches(bestScore, tagScore + (float) edge.weight)) {
                        wordNode = tf.newLeaf(edge.word);
                        if (wordNode.label() instanceof CoreLabel) {
                            CoreLabel cl = (CoreLabel) wordNode.label();
                            cl.setBeginPosition(start);
                            cl.setEndPosition(end);
                        }
                        break;
                    }
                }
                if (wordNode == null) {
                    throw new RuntimeException("could not find matching word from lattice in parse reconstruction");
                }
            } else {
                throw new RuntimeException("attempt to get word when sentence and lattice are null!");
            }
            Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
            tagNode.setScore(bestScore);
            if (originalTags[start] != null) {
                // restore the tag the input originally carried for this position
                tagNode.label().setValue(originalTags[start].tag());
            }
            return tagNode;
        } else {
            // normal lexicon is single words case
            IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
            String contextStr = getCoreLabel(start).originalText();
            float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
            if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
                // return a pre-terminal tree
                CoreLabel terminalLabel = getCoreLabel(start);
                Tree wordNode = tf.newLeaf(terminalLabel);
                Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
                tagNode.setScore(bestScore);
                if (terminalLabel.tag() != null) {
                    tagNode.label().setValue(terminalLabel.tag());
                }
                if (tagNode.label() instanceof HasTag) {
                    ((HasTag) tagNode.label()).setTag(tagNode.label().value());
                }
                return tagNode;
            }
        }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
        for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext(); ) {
            BinaryRule br = binaryI.next();
            double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild];
            boolean matches;
            if (op.testOptions.lengthNormalization) {
                double normScore = score / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
                matches = matches(normScore, normBestScore);
            } else {
                matches = matches(score, bestScore);
            }
            if (matches) {
                // build binary split: recursively reconstruct both children
                Tree leftChildTree = extractBestParse(br.leftChild, start, split);
                Tree rightChildTree = extractBestParse(br.rightChild, split, end);
                List<Tree> children = new ArrayList<>();
                children.add(leftChildTree);
                children.add(rightChildTree);
                Tree result = tf.newTreeNode(goalStr, children);
                result.setScore(score);
                // log.info("    Found Binary node: "+result);
                return result;
            }
        }
    }
    // then check unaries over the same span
    // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext(); ) {
        UnaryRule ur = unaryI.next();
        // log.info("  Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
        double score = ur.score + iScore[start][end][ur.child];
        boolean matches;
        if (op.testOptions.lengthNormalization) {
            double normScore = score / wordsInSpan[start][end][ur.child];
            matches = matches(normScore, normBestScore);
        } else {
            matches = matches(score, bestScore);
        }
        // the child != parent check avoids an infinite self-loop on identity unaries
        if (ur.child != ur.parent && matches) {
            // build unary
            Tree childTree = extractBestParse(ur.child, start, end);
            Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
            // log.info("    Matched!  Unary node: "+result);
            result.setScore(score);
            return result;
        }
    }
    log.info("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start + ", " + end + "] looking for " + goalStr);
    return null;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) HasTag(edu.stanford.nlp.ling.HasTag) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Tree(edu.stanford.nlp.trees.Tree)

Example 43 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

The following example is from the class ParserDemo, method demoDP.

/**
   * demoDP demonstrates turning a file into tokens and then parse
   * trees.  Note that the trees are printed by calling pennPrint on
   * the Tree object.  It is also possible to pass a PrintWriter to
   * pennPrint if you want to capture the output.
   * This code will work with any supported language.
   *
   * @param lp A loaded parser model
   * @param filename Path to a text file to sentence-split, tokenize, and parse
   */
public static void demoDP(LexicalizedParser lp, String filename) {
    // This option shows loading, sentence-segmenting and tokenizing
    // a file using DocumentPreprocessor.
    // a PennTreebankLanguagePack for English
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = null;
    if (tlp.supportsGrammaticalStructures()) {
        gsf = tlp.grammaticalStructureFactory();
    }
    // DocumentPreprocessor iterates over the file one sentence at a time
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
        Tree parse = lp.apply(sentence);
        parse.pennPrint();
        System.out.println();
        if (gsf != null) {
            GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
            // wildcard type instead of a raw Collection; elements are only printed
            Collection<?> tdl = gs.typedDependenciesCCprocessed();
            System.out.println(tdl);
            System.out.println();
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Collection(java.util.Collection) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor)

Example 44 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

The following example is from the class ParserDemo2, method main.

/** This example shows a few more ways of providing input to a parser.
   *
   *  Usage: ParserDemo2 [grammar [textFile]]
   */
public static void main(String[] args) throws IOException {
    // Load the grammar given on the command line, or fall back to the
    // bundled English PCFG model.
    String grammarFile;
    if (args.length > 0) {
        grammarFile = args[0];
    } else {
        grammarFile = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    }
    String[] parserFlags = { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser parser = LexicalizedParser.loadModel(grammarFile, parserFlags);
    TreebankLanguagePack langPack = parser.getOp().langpack();
    GrammaticalStructureFactory gsFactory = langPack.grammaticalStructureFactory();
    Iterable<List<? extends HasWord>> inputSentences;
    if (args.length > 1) {
        // A text file was supplied: sentence-split it with DocumentPreprocessor.
        List<List<? extends HasWord>> collected = new ArrayList<>();
        for (List<HasWord> sent : new DocumentPreprocessor(args[1])) {
            collected.add(sent);
        }
        inputSentences = collected;
    } else {
        // Showing tokenization and parsing in code a couple of different ways.
        // 1) A pre-tokenized sentence built word by word.
        List<HasWord> prebuilt = new ArrayList<>();
        for (String token : new String[] { "This", "is", "an", "easy", "sentence", "." }) {
            prebuilt.add(new Word(token));
        }
        // 2) Raw text run through the default tokenizer for this
        //    TreebankLanguagePack.
        String rawText = "This is a slightly longer and more complex sentence requiring tokenization.";
        Tokenizer<? extends HasWord> tokenizer = langPack.getTokenizerFactory().getTokenizer(new StringReader(rawText));
        List<? extends HasWord> fromTokenizer = tokenizer.tokenize();
        // 3) A pre-tagged sentence: the parser gets the second "can" wrong
        //    without the supplied tags.
        String[] canWords = { "It", "can", "can", "it", "." };
        String[] canTags = { "PRP", "MD", "VB", "PRP", "." };
        List<TaggedWord> pretagged = new ArrayList<>();
        for (int i = 0; i < canWords.length; i++) {
            pretagged.add(new TaggedWord(canWords[i], canTags[i]));
        }
        parser.parse(pretagged).pennPrint();
        List<List<? extends HasWord>> collected = new ArrayList<>();
        collected.add(prebuilt);
        collected.add(fromTokenizer);
        collected.add(pretagged);
        inputSentences = collected;
    }
    // Parse each collected sentence and print the tree, its typed
    // dependencies, and its words.
    for (List<? extends HasWord> sent : inputSentences) {
        Tree parse = parser.parse(sent);
        parse.pennPrint();
        System.out.println();
        GrammaticalStructure gs = gsFactory.newGrammaticalStructure(parse);
        List<TypedDependency> deps = gs.typedDependenciesCCprocessed();
        System.out.println(deps);
        System.out.println();
        System.out.println("The words of the sentence:");
        for (Label leafLabel : parse.yield()) {
            if (leafLabel instanceof CoreLabel) {
                System.out.println(((CoreLabel) leafLabel).toString(CoreLabel.OutputFormat.VALUE_MAP));
            } else {
                System.out.println(leafLabel);
            }
        }
        System.out.println();
        System.out.println(parse.taggedYield());
        System.out.println();
    }
    // Parsing a raw String turns it into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    parser.parse("This is one last test!").pennPrint();
}
Also used : Word(edu.stanford.nlp.ling.Word) HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) StringReader(java.io.StringReader) HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) TaggedWord(edu.stanford.nlp.ling.TaggedWord) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor)

Example 45 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

The following example is from the class MaxMatchSegmenter, method segment.

@Override
public List<HasWord> segment(String s) {
    List<Word> segmentedWords = new ArrayList<>();
    for (int start = 0, length = s.length(); start < length; ) {
        int end = Math.min(length, start + maxLength);
        while (end > start + 1) {
            String nextWord = s.substring(start, end);
            if (words.contains(nextWord)) {
                segmentedWords.add(new Word(nextWord));
                break;
            }
            end--;
        }
        if (end == start + 1) {
            // handle non-BMP characters
            if (s.codePointAt(start) >= 0x10000) {
                segmentedWords.add(new Word(new String(s.substring(start, start + 2))));
                start += 2;
            } else {
                segmentedWords.add(new Word(new String(s.substring(start, start + 1))));
                start++;
            }
        } else {
            start = end;
        }
    }
    return new ArrayList<>(segmentedWords);
}
Also used : Word(edu.stanford.nlp.ling.Word) HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)58 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)15 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)12 StringReader (java.io.StringReader)12 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3