Search in sources:

Example 1 with CategoryWordTag

Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.

In the class NegraPennTreebankParserParams, the method transformTree:

/**
   * transformTree does all language-specific tree
   * transformations. Any parameterizations should be inside the
   * specific TreebankLangParserParams class.
   */
@Override
public Tree transformTree(Tree t, Tree root) {
    if (t == null || t.isLeaf()) {
        return t;
    }
    List<String> annotations = new ArrayList<>();
    CoreLabel lab = (CoreLabel) t.label();
    String word = lab.word();
    String tag = lab.tag();
    String cat = lab.value();
    String baseCat = treebankLanguagePack().basicCategory(cat);
    //categories -- at present there is no tag annotation!!
    if (t.isPhrasal()) {
        List<String> childBasicCats = childBasicCats(t);
        // mark vp's headed by "zu" verbs
        if (DEBUG) {
            if (markZuVP && baseCat.equals("VP")) {
                System.out.println("child basic cats: " + childBasicCats);
            }
        }
        if (markZuVP && baseCat.equals("VP") && (childBasicCats.contains("VZ") || childBasicCats.contains("VVIZU"))) {
            if (DEBUG)
                System.out.println("Marked zu VP" + t);
            annotations.add("%ZU");
        }
        // mark relative clause S's
        if (markRC && (t.label() instanceof NegraLabel) && baseCat.equals("S") && ((NegraLabel) t.label()).getEdge() != null && ((NegraLabel) t.label()).getEdge().equals("RC")) {
            if (DEBUG) {
                System.out.println("annotating this guy as RC:");
                t.pennPrint();
            }
            //throw new RuntimeException("damn, not a Negra Label");
            annotations.add("%RC");
        }
        if (markContainsV && containsVP(t)) {
            annotations.add("%vp");
        }
        if (markLP && leftPhrasal(t)) {
            annotations.add("%LP");
        }
        if (markKonjParent) {
            // this depends on functional tags being present
            for (String cCat : childBasicCats) {
                if (cCat.contains("-KONJ")) {
                    annotations.add("%konjp");
                    break;
                }
            }
        }
        if (markHDParent) {
            // this depends on functional tags being present
            for (String cCat : childBasicCats) {
                if (cCat.contains("-HD")) {
                    annotations.add("%hdp");
                    break;
                }
            }
        }
    } else {
        //t.isPreTerminal() case
        if (markColon && cat.equals("$.") && (word.equals(":") || word.equals(";"))) {
            annotations.add("-%colon");
        }
    }
    //    if(t.isPreTerminal()) {
    //      if(parent != null) {
    //        String parentVal = parent.label().value();
    //        int cutOffPtD = parentVal.indexOf('-');
    //        int cutOffPtC = parentVal.indexOf('^');
    //        int curMin = parentVal.length();
    //        if(cutOffPtD != -1) {
    //          curMin = cutOffPtD;
    //        }
    //        if(cutOffPtC != -1) {
    //          curMin = Math.min(curMin, cutOffPtC);
    //        }
    //        parentVal = parentVal.substring(0, curMin);
    //        annotations.add("^" + parentVal);
    //      }
    //    }
    // put on all the annotations
    StringBuilder catSB = new StringBuilder(cat);
    for (String annotation : annotations) {
        catSB.append(annotation);
    }
    t.setLabel(new CategoryWordTag(catSB.toString(), word, tag));
    return t;
}
Also used: CoreLabel (edu.stanford.nlp.ling.CoreLabel), NegraLabel (edu.stanford.nlp.trees.international.negra.NegraLabel), CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag)
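
For orientation, here is a minimal, self-contained sketch (not CoreNLP source; the class name and sample strings are illustrative) of the pattern transformTree uses above: collect annotation suffixes, append them to the category, and repack the node label as a CategoryWordTag so the head word and tag are preserved.

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CategoryWordTag;

public class AnnotateLabelSketch {
    public static void main(String[] args) {
        // Hypothetical node data standing in for lab.value(), lab.word(), lab.tag().
        String cat = "VP";
        String word = "lesen";
        String tag = "VVIZU";
        List<String> annotations = new ArrayList<>();
        // e.g. the %ZU mark added for "zu"-infinitive VPs in transformTree above
        annotations.add("%ZU");
        StringBuilder catSB = new StringBuilder(cat);
        for (String annotation : annotations) {
            catSB.append(annotation);
        }
        // The annotated category replaces the old one; word and tag ride along in the label.
        CategoryWordTag label = new CategoryWordTag(catSB.toString(), word, tag);
        System.out.println(label.value() + " / " + word + " / " + tag);
    }
}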

Example 2 with CategoryWordTag

Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.

In the class ExhaustiveDependencyParser, the method extractBestParse:

/** Find the best (partial) parse within the parameter constraints.
   *  @param start Sentence index of start of span (fenceposts, from 0 up)
   *  @param end   Sentence index of end of span (right side fencepost)
   *  @param hWord Sentence index of head word (left side fencepost)
   *  @param hTag  Tag assigned to hWord
   *  @return The best parse tree within the parameter constraints
   */
private Tree extractBestParse(int start, int end, int hWord, int hTag) {
    if (DEBUG) {
        log.info("Span " + start + " to " + end + " word " + wordIndex.get(words[hWord]) + "/" + hWord + " tag " + tagIndex.get(hTag) + "/" + hTag + " score " + iScore(start, end, hWord, hTag));
    }
    String headWordStr = wordIndex.get(words[hWord]);
    String headTagStr = tagIndex.get(hTag);
    Label headLabel = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
    int numTags = tagIndex.size();
    // deal with span 1
    if (end - start == 1) {
        Tree leaf = tf.newLeaf(new Word(headWordStr));
        return tf.newTreeNode(headLabel, Collections.singletonList(leaf));
    }
    // find backtrace
    List<Tree> children = new ArrayList<>();
    double bestScore = iScore(start, end, hWord, hTag);
    for (int split = start + 1; split < end; split++) {
        int binD = binDistance[hWord][split];
        if (hWord < split) {
            for (int aWord = split; aWord < end; aWord++) {
                for (int aTag = 0; aTag < numTags; aTag++) {
                    if (matches(iScore(start, split, hWord, hTag) + iScore(split, end, aWord, aTag) + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)] + headStop[aWord][dg.tagBin(aTag)][split] + headStop[aWord][dg.tagBin(aTag)][end], bestScore)) {
                        if (DEBUG) {
                            String argWordStr = wordIndex.get(words[aWord]);
                            String argTagStr = tagIndex.get(aTag);
                            log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
                        }
                        // build it
                        children.add(extractBestParse(start, split, hWord, hTag));
                        children.add(extractBestParse(split, end, aWord, aTag));
                        return tf.newTreeNode(headLabel, children);
                    }
                }
            }
        } else {
            for (int aWord = start; aWord < split; aWord++) {
                for (int aTag = 0; aTag < numTags; aTag++) {
                    if (matches(iScore(start, split, aWord, aTag) + iScore(split, end, hWord, hTag) + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)] + headStop[aWord][dg.tagBin(aTag)][start] + headStop[aWord][dg.tagBin(aTag)][split], bestScore)) {
                        if (DEBUG) {
                            String argWordStr = wordIndex.get(words[aWord]);
                            String argTagStr = tagIndex.get(aTag);
                            log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
                        }
                        children.add(extractBestParse(start, split, aWord, aTag));
                        children.add(extractBestParse(split, end, hWord, hTag));
                        // build it
                        return tf.newTreeNode(headLabel, children);
                    }
                }
            }
        }
    }
    log.info("Problem in ExhaustiveDependencyParser::extractBestParse");
    return null;
}
Also used: HasWord (edu.stanford.nlp.ling.HasWord), Word (edu.stanford.nlp.ling.Word), Label (edu.stanford.nlp.ling.Label), Tree (edu.stanford.nlp.trees.Tree), CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag)
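
A minimal sketch of the span-1 base case above, assuming LabeledScoredTreeFactory as a stand-in for the parser's tf field (the real factory may be configured differently). Note that in this dependency parser the category slot of the CategoryWordTag is simply the head word itself.

import java.util.Collections;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class DependencySpanSketch {
    public static void main(String[] args) {
        TreeFactory tf = new LabeledScoredTreeFactory();  // assumed factory, not the parser's own field
        String headWordStr = "barks";   // hypothetical head word
        String headTagStr = "VBZ";      // hypothetical head tag
        // As in extractBestParse, the category is just the head word.
        Label headLabel = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
        // Span of length 1: a single leaf under a node carrying the head label.
        Tree leaf = tf.newLeaf(new Word(headWordStr));
        Tree span1 = tf.newTreeNode(headLabel, Collections.singletonList(leaf));
        span1.pennPrint();  // prints the one-word subtree
    }
}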

Example 3 with CategoryWordTag

Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.

In the class ChineseTreebankParserParams, the method transformTree:

/**
   * transformTree does all language-specific tree
   * transformations. Any parameterizations should be inside the
   * specific TreebankLangParserParams class.
   */
@Override
public Tree transformTree(Tree t, Tree root) {
    if (t == null || t.isLeaf()) {
        return t;
    }
    String parentStr;
    String grandParentStr;
    Tree parent;
    Tree grandParent;
    if (root == null || t.equals(root)) {
        parent = null;
        parentStr = "";
    } else {
        parent = t.parent(root);
        parentStr = parent.label().value();
    }
    if (parent == null || parent.equals(root)) {
        grandParent = null;
        grandParentStr = "";
    } else {
        grandParent = parent.parent(root);
        grandParentStr = grandParent.label().value();
    }
    String baseParentStr = ctlp.basicCategory(parentStr);
    String baseGrandParentStr = ctlp.basicCategory(grandParentStr);
    CoreLabel lab = (CoreLabel) t.label();
    String word = lab.word();
    String tag = lab.tag();
    String baseTag = ctlp.basicCategory(tag);
    String category = lab.value();
    String baseCategory = ctlp.basicCategory(category);
    if (t.isPreTerminal()) {
        // it's a POS tag
        List<String> leftAunts = listBasicCategories(SisterAnnotationStats.leftSisterLabels(parent, grandParent));
        List<String> rightAunts = listBasicCategories(SisterAnnotationStats.rightSisterLabels(parent, grandParent));
        // Chinese-specific punctuation splits
        if (chineseSplitPunct && baseTag.equals("PU")) {
            if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(word)) {
                tag = tag + "-DOU";
            // System.out.println("Punct: Split dou hao"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(word)) {
                tag = tag + "-COMMA";
            // System.out.println("Punct: Split comma"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseColonAcceptFilter().test(word)) {
                tag = tag + "-COLON";
            // System.out.println("Punct: Split colon"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseQuoteMarkAcceptFilter().test(word)) {
                if (chineseSplitPunctLR) {
                    if (ChineseTreebankLanguagePack.chineseLeftQuoteMarkAcceptFilter().test(word)) {
                        tag += "-LQUOTE";
                    } else {
                        tag += "-RQUOTE";
                    }
                } else {
                    tag = tag + "-QUOTE";
                }
            // System.out.println("Punct: Split quote"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseEndSentenceAcceptFilter().test(word)) {
                tag = tag + "-ENDSENT";
            // System.out.println("Punct: Split end sent"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseParenthesisAcceptFilter().test(word)) {
                if (chineseSplitPunctLR) {
                    if (ChineseTreebankLanguagePack.chineseLeftParenthesisAcceptFilter().test(word)) {
                        tag += "-LPAREN";
                    } else {
                        tag += "-RPAREN";
                    }
                } else {
                    tag += "-PAREN";
                //printlnErr("Just used -PAREN annotation");
                //printlnErr(word);
                //throw new RuntimeException();
                }
            // System.out.println("Punct: Split paren"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseDashAcceptFilter().test(word)) {
                tag = tag + "-DASH";
            // System.out.println("Punct: Split dash"); // debugging
            } else if (ChineseTreebankLanguagePack.chineseOtherAcceptFilter().test(word)) {
                tag = tag + "-OTHER";
            } else {
                printlnErr("Unknown punct (you should add it to CTLP): " + tag + " |" + word + "|");
            }
        } else if (chineseSplitDouHao) {
            // only split DouHao
            if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(word) && baseTag.equals("PU")) {
                tag = tag + "-DOU";
            }
        }
        if (tagWordSize) {
            int l = word.length();
            tag += "-" + l + "CHARS";
        }
        if (mergeNNVV && baseTag.equals("NN")) {
            tag = "VV";
        }
        if ((chineseSelectiveTagPA || chineseVerySelectiveTagPA) && (baseTag.equals("CC") || baseTag.equals("P"))) {
            tag += "-" + baseParentStr;
        }
        if (chineseSelectiveTagPA && (baseTag.equals("VV"))) {
            tag += "-" + baseParentStr;
        }
        if (markMultiNtag && tag.startsWith("N")) {
            for (int i = 0; i < parent.numChildren(); i++) {
                if (parent.children()[i].label().value().startsWith("N") && parent.children()[i] != t) {
                    tag += "=N";
                //System.out.println("Found multi=N rewrite");
                }
            }
        }
        if (markVVsisterIP && baseTag.equals("VV")) {
            boolean seenIP = false;
            for (int i = 0; i < parent.numChildren(); i++) {
                if (parent.children()[i].label().value().startsWith("IP")) {
                    seenIP = true;
                }
            }
            if (seenIP) {
                tag += "-IP";
            //System.out.println("Found VV with IP sister"); // testing
            }
        }
        if (markPsisterIP && baseTag.equals("P")) {
            boolean seenIP = false;
            for (int i = 0; i < parent.numChildren(); i++) {
                if (parent.children()[i].label().value().startsWith("IP")) {
                    seenIP = true;
                }
            }
            if (seenIP) {
                tag += "-IP";
            }
        }
        if (markADgrandchildOfIP && baseTag.equals("AD") && baseGrandParentStr.equals("IP")) {
            tag += "~IP";
        //System.out.println("Found AD with IP grandparent"); // testing
        }
        if (gpaAD && baseTag.equals("AD")) {
            tag += "~" + baseGrandParentStr;
        //System.out.println("Found AD with grandparent " + grandParentStr); // testing
        }
        if (markPostverbalP && leftAunts.contains("VV") && baseTag.equals("P")) {
            //System.out.println("Found post-verbal P");
            tag += "^=lVV";
        }
        // end Chinese-specific tag splits
        Label label = new CategoryWordTag(tag, word, tag);
        t.setLabel(label);
    } else {
        // it's a phrasal category
        Tree[] kids = t.children();
        // Chinese-specific category splits
        List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
        List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));
        if (paRootDtr && baseParentStr.equals("ROOT")) {
            category += "^ROOT";
        }
        if (markIPsisterBA && baseCategory.equals("IP")) {
            if (leftSis.contains("BA")) {
                category += "=BA";
            //System.out.println("Found IP sister of BA");
            }
        }
        if (dominatesV && hasV(t.preTerminalYield())) {
            // mark categories containing a verb
            category += "-v";
        }
        if (markIPsisterVVorP && baseCategory.equals("IP")) {
            // todo: cdm: is just looking for "P" here selective enough??
            if (leftSis.contains("VV") || leftSis.contains("P")) {
                category += "=VVP";
            }
        }
        if (markIPsisDEC && baseCategory.equals("IP")) {
            if (rightSis.contains("DEC")) {
                category += "=DEC";
            //System.out.println("Found prenominal IP");
            }
        }
        if (baseCategory.equals("VP")) {
            // I think that was bad because it also matched VPT verb compounds
            if (chineseSplitVP == 3) {
                boolean hasCC = false;
                boolean hasPU = false;
                boolean hasLexV = false;
                for (Tree kid : kids) {
                    if (kid.label().value().startsWith("CC")) {
                        hasCC = true;
                    } else if (kid.label().value().startsWith("PU")) {
                        hasPU = true;
                    } else if (StringUtils.lookingAt(kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
                        hasLexV = true;
                    }
                }
                if (hasCC || (hasPU && !hasLexV)) {
                    category += "-CRD";
                //System.out.println("Found coordinate VP"); // testing
                } else if (hasLexV) {
                    category += "-COMP";
                //System.out.println("Found complementing VP"); // testing
                } else {
                    category += "-ADJT";
                //System.out.println("Found adjoining VP"); // testing
                }
            } else if (chineseSplitVP >= 1) {
                boolean hasBA = false;
                for (Tree kid : kids) {
                    if (kid.label().value().startsWith("BA")) {
                        hasBA = true;
                    } else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) {
                        for (Tree kidkid : kid.children()) {
                            if (kidkid.label().value().startsWith("BA")) {
                                hasBA = true;
                            }
                        }
                    }
                }
                if (hasBA) {
                    category += "-BA";
                }
            }
        }
        if (markVPadjunct && baseParentStr.equals("VP")) {
            // cdm 2008: This used to use startsWith("VP") but changed to baseCat
            Tree[] sisters = parent.children();
            boolean hasVPsister = false;
            boolean hasCC = false;
            boolean hasPU = false;
            boolean hasLexV = false;
            for (Tree sister : sisters) {
                if (tlp.basicCategory(sister.label().value()).equals("VP")) {
                    hasVPsister = true;
                }
                if (sister.label().value().startsWith("CC")) {
                    hasCC = true;
                }
                if (sister.label().value().startsWith("PU")) {
                    hasPU = true;
                }
                if (StringUtils.lookingAt(sister.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
                    hasLexV = true;
                }
            }
            if (hasVPsister && !(hasCC || hasPU || hasLexV)) {
                category += "-VPADJ";
            //System.out.println("Found adjunct of VP"); // testing
            }
        }
        if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
            if (rightSis.contains("NP")) {
                category += "=MODIFIERNP";
            //System.out.println("Found NP modifier of NP"); // testing
            }
        }
        if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
            if (rightSis.isEmpty() && (leftSis.contains("ADJP") || leftSis.contains("NP") || leftSis.contains("DNP") || leftSis.contains("QP") || leftSis.contains("CP") || leftSis.contains("PP"))) {
                category += "=MODIFIEDNP";
            //System.out.println("Found modified NP"); // testing
            }
        }
        if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
            if (rightSis.contains("CC") || rightSis.contains("PU") || leftSis.contains("CC") || leftSis.contains("PU")) {
                category += "=CONJ";
            //System.out.println("Found NP conjunct"); // testing
            }
        }
        if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) {
            Tree[] sisters = parent.children();
            boolean hasCommaSis = false;
            boolean hasIPSis = false;
            for (Tree sister : sisters) {
                if (ctlp.basicCategory(sister.label().value()).equals("PU") && ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(sister.children()[0].label().toString())) {
                    hasCommaSis = true;
                //System.out.println("Found CommaSis"); // testing
                }
                if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) {
                    hasIPSis = true;
                }
            }
            if (hasCommaSis && hasIPSis) {
                category += "-CONJ";
            //System.out.println("Found IP conjunct"); // testing
            }
        }
        if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) {
            category += "-U";
        //System.out.println("Found unary IP"); //testing
        }
        if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) {
            category += "-U";
        //System.out.println("Found unary CP"); //testing
        }
        if (splitBaseNP && baseCategory.equals("NP")) {
            if (t.isPrePreTerminal()) {
                category = category + "-B";
            }
        }
        if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) {
            //System.out.println("Found post-verbal PP");
            category += "=lVV";
        }
        if ((markADgrandchildOfIP || gpaAD) && listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) {
            category += "^ADVP";
        }
        if (markCC) {
            // marginal conjunctions which don't conjoin 2 things.
            for (int i = 1; i < kids.length - 1; i++) {
                String cat2 = kids[i].label().value();
                if (cat2.startsWith("CC")) {
                    category += "-CC";
                }
            }
        }
        Label label = new CategoryWordTag(category, word, tag);
        t.setLabel(label);
    }
    return t;
}
Also used: CoreLabel (edu.stanford.nlp.ling.CoreLabel), Label (edu.stanford.nlp.ling.Label), CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag)
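
The preterminal branch above is essentially a tag-splitting step: refine the POS tag with a marker and then store the refined tag in both the category and tag slots of the new label. A minimal sketch of that step in isolation (class name, token, and the comparison are illustrative; the real code goes through the ChineseTreebankLanguagePack filters and parameter flags):

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.Label;

public class TagSplitSketch {
    public static void main(String[] args) {
        String word = "，";   // hypothetical full-width comma token
        String tag = "PU";    // its Chinese Treebank POS tag
        // Mirror of the chineseSplitPunct branch: append a split marker to the tag.
        if (word.equals("，")) {
            tag = tag + "-COMMA";
        }
        // The refined tag is used for both the category and the tag slot.
        Label label = new CategoryWordTag(tag, word, tag);
        System.out.println(label.value() + " for word |" + word + "|");
    }
}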

Example 4 with CategoryWordTag

Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.

In the class BiLexPCFGParser, the method extractParse:

protected Tree extractParse(Edge edge) {
    String head = wordIndex.get(words[edge.head]);
    String tag = tagIndex.get(edge.tag);
    String state = stateIndex.get(edge.state);
    Label label = new CategoryWordTag(state, head, tag);
    if (edge.backEdge == null && edge.backHook == null) {
        // leaf, but needs word terminal
        Tree leaf;
        if (originalLabels[edge.head] != null) {
            leaf = tf.newLeaf(originalLabels[edge.head]);
        } else {
            leaf = tf.newLeaf(head);
        }
        List<Tree> childList = Collections.singletonList(leaf);
        return tf.newTreeNode(label, childList);
    }
    if (edge.backHook == null) {
        // unary
        List<Tree> childList = Collections.singletonList(extractParse(edge.backEdge));
        return tf.newTreeNode(label, childList);
    }
    // binary
    List<Tree> children = new ArrayList<>();
    if (edge.backHook.isPreHook()) {
        children.add(extractParse(edge.backEdge));
        children.add(extractParse(edge.backHook.backEdge));
    } else {
        children.add(extractParse(edge.backHook.backEdge));
        children.add(extractParse(edge.backEdge));
    }
    return tf.newTreeNode(label, children);
}
Also used: CoreLabel (edu.stanford.nlp.ling.CoreLabel), Label (edu.stanford.nlp.ling.Label), Tree (edu.stanford.nlp.trees.Tree), CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag)
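
A minimal sketch of the tree shapes extractParse emits, with hard-coded strings in place of chart indices and LabeledScoredTreeFactory assumed as the concrete factory: each state node is labelled with a CategoryWordTag carrying the state, head word, and head tag, and backtraced subtrees are attached as its children.

import java.util.Arrays;
import java.util.Collections;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class ExtractParseShapesSketch {
    public static void main(String[] args) {
        TreeFactory tf = new LabeledScoredTreeFactory();  // assumed factory
        // Leaf case: a word terminal wrapped in its preterminal state node.
        Tree dt = tf.newTreeNode(new CategoryWordTag("DT", "the", "DT"),
                Collections.singletonList(tf.newLeaf(new Word("the"))));
        Tree nn = tf.newTreeNode(new CategoryWordTag("NN", "dog", "NN"),
                Collections.singletonList(tf.newLeaf(new Word("dog"))));
        // Binary case: two backtraced subtrees become children of the parent
        // state, with the head word and tag ("dog"/NN) recorded in the label.
        Tree np = tf.newTreeNode(new CategoryWordTag("NP", "dog", "NN"),
                Arrays.asList(dt, nn));
        np.pennPrint();
    }
}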

Example 5 with CategoryWordTag

Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.

In the class TreeAnnotatorAndBinarizer, the method addRoot:

/**
   * Changes the ROOT label, and adds a Lexicon.BOUNDARY daughter to it.
   * This is needed for the dependency parser.
   * <i>Note:</i> This is a destructive operation on the tree passed in!!
   *
   * @param t The current tree into which a boundary is inserted
   */
public void addRoot(Tree t) {
    if (t.isLeaf()) {
        log.info("Warning: tree is leaf: " + t);
        t = tf.newTreeNode(tlp.startSymbol(), Collections.singletonList(t));
    }
    t.setLabel(new CategoryWordTag(tlp.startSymbol(), Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG));
    List<Tree> preTermChildList = new ArrayList<>();
    //CategoryWordTag(Lexicon.BOUNDARY,Lexicon.BOUNDARY,""));
    Tree boundaryTerm = tf.newLeaf(new Word(Lexicon.BOUNDARY));
    preTermChildList.add(boundaryTerm);
    Tree boundaryPreTerm = tf.newTreeNode(new CategoryWordTag(Lexicon.BOUNDARY_TAG, Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG), preTermChildList);
    List<Tree> childList = t.getChildrenAsList();
    childList.add(boundaryPreTerm);
    t.setChildren(childList);
}
Also used: Word (edu.stanford.nlp.ling.Word), CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag)
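
A minimal standalone sketch of what addRoot produces, using a toy one-word tree in place of a real parse; LabeledScoredTreeFactory is assumed as the factory and the boundary constants are taken to live in edu.stanford.nlp.parser.lexparser.Lexicon, as referenced by the method above.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.Lexicon;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class AddBoundarySketch {
    public static void main(String[] args) {
        TreeFactory tf = new LabeledScoredTreeFactory();  // assumed factory
        // A toy one-word tree standing in for the parse passed to addRoot.
        Tree preTerm = tf.newTreeNode(new CategoryWordTag("UH", "hello", "UH"),
                Collections.singletonList(tf.newLeaf(new Word("hello"))));
        Tree root = tf.newTreeNode(new CategoryWordTag("ROOT", "hello", "UH"),
                Collections.singletonList(preTerm));
        // As in addRoot: build the boundary preterminal and append it to the root.
        Tree boundaryTerm = tf.newLeaf(new Word(Lexicon.BOUNDARY));
        Tree boundaryPreTerm = tf.newTreeNode(
                new CategoryWordTag(Lexicon.BOUNDARY_TAG, Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG),
                Collections.singletonList(boundaryTerm));
        List<Tree> childList = new ArrayList<>(root.getChildrenAsList());
        childList.add(boundaryPreTerm);
        root.setChildren(childList);
        root.pennPrint();  // the boundary daughter now follows the original children
    }
}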

Aggregations

CategoryWordTag (edu.stanford.nlp.ling.CategoryWordTag): 6 usages
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 3 usages
Label (edu.stanford.nlp.ling.Label): 3 usages
Word (edu.stanford.nlp.ling.Word): 3 usages
Tree (edu.stanford.nlp.trees.Tree): 3 usages
HasWord (edu.stanford.nlp.ling.HasWord): 1 usage
NegraLabel (edu.stanford.nlp.trees.international.negra.NegraLabel): 1 usage