Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.
The class NegraPennTreebankParserParams, method transformTree:
/**
* transformTree does all language-specific tree
* transformations. Any parameterizations should be inside the
* specific TreebankLangParserParams class.
*/
@Override
public Tree transformTree(Tree t, Tree root) {
if (t == null || t.isLeaf()) {
return t;
}
List<String> annotations = new ArrayList<>();
CoreLabel lab = (CoreLabel) t.label();
String word = lab.word();
String tag = lab.tag();
String cat = lab.value();
String baseCat = treebankLanguagePack().basicCategory(cat);
//categories -- at present there is no tag annotation!!
if (t.isPhrasal()) {
List<String> childBasicCats = childBasicCats(t);
// mark vp's headed by "zu" verbs
if (DEBUG) {
if (markZuVP && baseCat.equals("VP")) {
System.out.println("child basic cats: " + childBasicCats);
}
}
if (markZuVP && baseCat.equals("VP") && (childBasicCats.contains("VZ") || childBasicCats.contains("VVIZU"))) {
if (DEBUG)
System.out.println("Marked zu VP" + t);
annotations.add("%ZU");
}
// mark relative clause S's
if (markRC && (t.label() instanceof NegraLabel) && baseCat.equals("S") && ((NegraLabel) t.label()).getEdge() != null && ((NegraLabel) t.label()).getEdge().equals("RC")) {
if (DEBUG) {
System.out.println("annotating this guy as RC:");
t.pennPrint();
}
//throw new RuntimeException("damn, not a Negra Label");
annotations.add("%RC");
}
if (markContainsV && containsVP(t)) {
annotations.add("%vp");
}
if (markLP && leftPhrasal(t)) {
annotations.add("%LP");
}
if (markKonjParent) {
// this depends on functional tags being present
for (String cCat : childBasicCats) {
if (cCat.contains("-KONJ")) {
annotations.add("%konjp");
break;
}
}
}
if (markHDParent) {
// this depends on functional tags being present
for (String cCat : childBasicCats) {
if (cCat.contains("-HD")) {
annotations.add("%hdp");
break;
}
}
}
} else {
//t.isPreTerminal() case
if (markColon && cat.equals("$.") && (word.equals(":") || word.equals(";"))) {
annotations.add("-%colon");
}
}
// if(t.isPreTerminal()) {
// if(parent != null) {
// String parentVal = parent.label().value();
// int cutOffPtD = parentVal.indexOf('-');
// int cutOffPtC = parentVal.indexOf('^');
// int curMin = parentVal.length();
// if(cutOffPtD != -1) {
// curMin = cutOffPtD;
// }
// if(cutOffPtC != -1) {
// curMin = Math.min(curMin, cutOffPtC);
// }
// parentVal = parentVal.substring(0, curMin);
// annotations.add("^" + parentVal);
// }
// }
// put on all the annotations
StringBuilder catSB = new StringBuilder(cat);
for (String annotation : annotations) {
catSB.append(annotation);
}
t.setLabel(new CategoryWordTag(catSB.toString(), word, tag));
return t;
}
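The relabelling at the end of transformTree is plain string concatenation onto the original category, after which the node gets a CategoryWordTag carrying the word and tag alongside the annotated category. A minimal standalone sketch of that final step (the word, tag, and "%ZU" suffix below are illustrative values, not taken from a real NEGRA tree):

import edu.stanford.nlp.ling.CategoryWordTag;

public class NegraAnnotationSketch {
  public static void main(String[] args) {
    String cat = "VP";        // original phrasal category
    String word = "zu";       // hypothetical head word
    String tag = "PTKZU";     // hypothetical head tag
    StringBuilder catSB = new StringBuilder(cat);
    catSB.append("%ZU");      // suffix added when a VZ or VVIZU child was seen
    CategoryWordTag label = new CategoryWordTag(catSB.toString(), word, tag);
    System.out.println(label.value() + " / " + label.word() + " / " + label.tag());
    // prints: VP%ZU / zu / PTKZU
  }
}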
Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.
The class ExhaustiveDependencyParser, method extractBestParse:
/** Find the best (partial) parse within the parameter constraints.
* @param start Sentence index of start of span (fenceposts, from 0 up)
* @param end Sentence index of end of span (right side fencepost)
* @param hWord Sentence index of head word (left side fencepost)
* @param hTag Tag assigned to hWord
* @return The best parse tree within the parameter constraints
*/
private Tree extractBestParse(int start, int end, int hWord, int hTag) {
if (DEBUG) {
log.info("Span " + start + " to " + end + " word " + wordIndex.get(words[hWord]) + "/" + hWord + " tag " + tagIndex.get(hTag) + "/" + hTag + " score " + iScore(start, end, hWord, hTag));
}
String headWordStr = wordIndex.get(words[hWord]);
String headTagStr = tagIndex.get(hTag);
Label headLabel = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
int numTags = tagIndex.size();
// deal with span 1
if (end - start == 1) {
Tree leaf = tf.newLeaf(new Word(headWordStr));
return tf.newTreeNode(headLabel, Collections.singletonList(leaf));
}
// find backtrace
List<Tree> children = new ArrayList<>();
double bestScore = iScore(start, end, hWord, hTag);
for (int split = start + 1; split < end; split++) {
int binD = binDistance[hWord][split];
if (hWord < split) {
for (int aWord = split; aWord < end; aWord++) {
for (int aTag = 0; aTag < numTags; aTag++) {
if (matches(iScore(start, split, hWord, hTag) + iScore(split, end, aWord, aTag) + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)] + headStop[aWord][dg.tagBin(aTag)][split] + headStop[aWord][dg.tagBin(aTag)][end], bestScore)) {
if (DEBUG) {
String argWordStr = wordIndex.get(words[aWord]);
String argTagStr = tagIndex.get(aTag);
log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
}
// build it
children.add(extractBestParse(start, split, hWord, hTag));
children.add(extractBestParse(split, end, aWord, aTag));
return tf.newTreeNode(headLabel, children);
}
}
}
} else {
for (int aWord = start; aWord < split; aWord++) {
for (int aTag = 0; aTag < numTags; aTag++) {
if (matches(iScore(start, split, aWord, aTag) + iScore(split, end, hWord, hTag) + headScore[binD][hWord][dg.tagBin(hTag)][aWord][dg.tagBin(aTag)] + headStop[aWord][dg.tagBin(aTag)][start] + headStop[aWord][dg.tagBin(aTag)][split], bestScore)) {
if (DEBUG) {
String argWordStr = wordIndex.get(words[aWord]);
String argTagStr = tagIndex.get(aTag);
log.info(headWordStr + "|" + headTagStr + " -> " + argWordStr + "|" + argTagStr + " " + bestScore);
}
children.add(extractBestParse(start, split, aWord, aTag));
children.add(extractBestParse(split, end, hWord, hTag));
// build it
return tf.newTreeNode(headLabel, children);
}
}
}
}
}
log.info("Problem in ExhaustiveDependencyParser::extractBestParse");
return null;
}
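The recursion bottoms out in the span-1 case near the top of the method. A self-contained sketch of that base case, using the same label and tree-factory types (the word and tag values here are made up; the real parser reads them from its word and tag indices):

import java.util.Collections;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.CategoryWordTagFactory;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class SpanOneSketch {
  public static void main(String[] args) {
    TreeFactory tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
    String headWordStr = "runs";  // hypothetical head word
    String headTagStr = "VBZ";    // hypothetical head tag
    // As in extractBestParse, the head word doubles as the node's category.
    CategoryWordTag headLabel = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
    Tree leaf = tf.newLeaf(new Word(headWordStr));
    Tree node = tf.newTreeNode(headLabel, Collections.singletonList(leaf));
    node.pennPrint();  // a one-level tree whose category is the head word itself
  }
}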
Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.
The class ChineseTreebankParserParams, method transformTree:
/**
* transformTree does all language-specific tree
* transformations. Any parameterizations should be inside the
* specific TreebankLangParserParams class.
*/
@Override
public Tree transformTree(Tree t, Tree root) {
if (t == null || t.isLeaf()) {
return t;
}
String parentStr;
String grandParentStr;
Tree parent;
Tree grandParent;
if (root == null || t.equals(root)) {
parent = null;
parentStr = "";
} else {
parent = t.parent(root);
parentStr = parent.label().value();
}
if (parent == null || parent.equals(root)) {
grandParent = null;
grandParentStr = "";
} else {
grandParent = parent.parent(root);
grandParentStr = grandParent.label().value();
}
String baseParentStr = ctlp.basicCategory(parentStr);
String baseGrandParentStr = ctlp.basicCategory(grandParentStr);
CoreLabel lab = (CoreLabel) t.label();
String word = lab.word();
String tag = lab.tag();
String baseTag = ctlp.basicCategory(tag);
String category = lab.value();
String baseCategory = ctlp.basicCategory(category);
if (t.isPreTerminal()) {
// it's a POS tag
List<String> leftAunts = listBasicCategories(SisterAnnotationStats.leftSisterLabels(parent, grandParent));
List<String> rightAunts = listBasicCategories(SisterAnnotationStats.rightSisterLabels(parent, grandParent));
// Chinese-specific punctuation splits
if (chineseSplitPunct && baseTag.equals("PU")) {
if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(word)) {
tag = tag + "-DOU";
// System.out.println("Punct: Split dou hao"); // debugging
} else if (ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(word)) {
tag = tag + "-COMMA";
// System.out.println("Punct: Split comma"); // debugging
} else if (ChineseTreebankLanguagePack.chineseColonAcceptFilter().test(word)) {
tag = tag + "-COLON";
// System.out.println("Punct: Split colon"); // debugging
} else if (ChineseTreebankLanguagePack.chineseQuoteMarkAcceptFilter().test(word)) {
if (chineseSplitPunctLR) {
if (ChineseTreebankLanguagePack.chineseLeftQuoteMarkAcceptFilter().test(word)) {
tag += "-LQUOTE";
} else {
tag += "-RQUOTE";
}
} else {
tag = tag + "-QUOTE";
}
// System.out.println("Punct: Split quote"); // debugging
} else if (ChineseTreebankLanguagePack.chineseEndSentenceAcceptFilter().test(word)) {
tag = tag + "-ENDSENT";
// System.out.println("Punct: Split end sent"); // debugging
} else if (ChineseTreebankLanguagePack.chineseParenthesisAcceptFilter().test(word)) {
if (chineseSplitPunctLR) {
if (ChineseTreebankLanguagePack.chineseLeftParenthesisAcceptFilter().test(word)) {
tag += "-LPAREN";
} else {
tag += "-RPAREN";
}
} else {
tag += "-PAREN";
//printlnErr("Just used -PAREN annotation");
//printlnErr(word);
//throw new RuntimeException();
}
// System.out.println("Punct: Split paren"); // debugging
} else if (ChineseTreebankLanguagePack.chineseDashAcceptFilter().test(word)) {
tag = tag + "-DASH";
// System.out.println("Punct: Split dash"); // debugging
} else if (ChineseTreebankLanguagePack.chineseOtherAcceptFilter().test(word)) {
tag = tag + "-OTHER";
} else {
printlnErr("Unknown punct (you should add it to CTLP): " + tag + " |" + word + "|");
}
} else if (chineseSplitDouHao) {
// only split DouHao
if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(word) && baseTag.equals("PU")) {
tag = tag + "-DOU";
}
}
if (tagWordSize) {
int l = word.length();
tag += "-" + l + "CHARS";
}
if (mergeNNVV && baseTag.equals("NN")) {
tag = "VV";
}
if ((chineseSelectiveTagPA || chineseVerySelectiveTagPA) && (baseTag.equals("CC") || baseTag.equals("P"))) {
tag += "-" + baseParentStr;
}
if (chineseSelectiveTagPA && (baseTag.equals("VV"))) {
tag += "-" + baseParentStr;
}
if (markMultiNtag && tag.startsWith("N")) {
for (int i = 0; i < parent.numChildren(); i++) {
if (parent.children()[i].label().value().startsWith("N") && parent.children()[i] != t) {
tag += "=N";
//System.out.println("Found multi=N rewrite");
}
}
}
if (markVVsisterIP && baseTag.equals("VV")) {
boolean seenIP = false;
for (int i = 0; i < parent.numChildren(); i++) {
if (parent.children()[i].label().value().startsWith("IP")) {
seenIP = true;
}
}
if (seenIP) {
tag += "-IP";
//System.out.println("Found VV with IP sister"); // testing
}
}
if (markPsisterIP && baseTag.equals("P")) {
boolean seenIP = false;
for (int i = 0; i < parent.numChildren(); i++) {
if (parent.children()[i].label().value().startsWith("IP")) {
seenIP = true;
}
}
if (seenIP) {
tag += "-IP";
}
}
if (markADgrandchildOfIP && baseTag.equals("AD") && baseGrandParentStr.equals("IP")) {
tag += "~IP";
//System.out.println("Found AD with IP grandparent"); // testing
}
if (gpaAD && baseTag.equals("AD")) {
tag += "~" + baseGrandParentStr;
//System.out.println("Found AD with grandparent " + grandParentStr); // testing
}
if (markPostverbalP && leftAunts.contains("VV") && baseTag.equals("P")) {
//System.out.println("Found post-verbal P");
tag += "^=lVV";
}
// end Chinese-specific tag splits
Label label = new CategoryWordTag(tag, word, tag);
t.setLabel(label);
} else {
// it's a phrasal category
Tree[] kids = t.children();
// Chinese-specific category splits
List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));
if (paRootDtr && baseParentStr.equals("ROOT")) {
category += "^ROOT";
}
if (markIPsisterBA && baseCategory.equals("IP")) {
if (leftSis.contains("BA")) {
category += "=BA";
//System.out.println("Found IP sister of BA");
}
}
if (dominatesV && hasV(t.preTerminalYield())) {
// mark categories containing a verb
category += "-v";
}
if (markIPsisterVVorP && baseCategory.equals("IP")) {
// todo: cdm: is just looking for "P" here selective enough??
if (leftSis.contains("VV") || leftSis.contains("P")) {
category += "=VVP";
}
}
if (markIPsisDEC && baseCategory.equals("IP")) {
if (rightSis.contains("DEC")) {
category += "=DEC";
//System.out.println("Found prenominal IP");
}
}
if (baseCategory.equals("VP")) {
// I think that was bad because it also matched VPT verb compounds
if (chineseSplitVP == 3) {
boolean hasCC = false;
boolean hasPU = false;
boolean hasLexV = false;
for (Tree kid : kids) {
if (kid.label().value().startsWith("CC")) {
hasCC = true;
} else if (kid.label().value().startsWith("PU")) {
hasPU = true;
} else if (StringUtils.lookingAt(kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
hasLexV = true;
}
}
if (hasCC || (hasPU && !hasLexV)) {
category += "-CRD";
//System.out.println("Found coordinate VP"); // testing
} else if (hasLexV) {
category += "-COMP";
//System.out.println("Found complementing VP"); // testing
} else {
category += "-ADJT";
//System.out.println("Found adjoining VP"); // testing
}
} else if (chineseSplitVP >= 1) {
boolean hasBA = false;
for (Tree kid : kids) {
if (kid.label().value().startsWith("BA")) {
hasBA = true;
} else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) {
for (Tree kidkid : kid.children()) {
if (kidkid.label().value().startsWith("BA")) {
hasBA = true;
}
}
}
}
if (hasBA) {
category += "-BA";
}
}
}
if (markVPadjunct && baseParentStr.equals("VP")) {
// cdm 2008: This used to use startsWith("VP") but changed to baseCat
Tree[] sisters = parent.children();
boolean hasVPsister = false;
boolean hasCC = false;
boolean hasPU = false;
boolean hasLexV = false;
for (Tree sister : sisters) {
if (tlp.basicCategory(sister.label().value()).equals("VP")) {
hasVPsister = true;
}
if (sister.label().value().startsWith("CC")) {
hasCC = true;
}
if (sister.label().value().startsWith("PU")) {
hasPU = true;
}
if (StringUtils.lookingAt(sister.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
hasLexV = true;
}
}
if (hasVPsister && !(hasCC || hasPU || hasLexV)) {
category += "-VPADJ";
//System.out.println("Found adjunct of VP"); // testing
}
}
if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
if (rightSis.contains("NP")) {
category += "=MODIFIERNP";
//System.out.println("Found NP modifier of NP"); // testing
}
}
if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
if (rightSis.isEmpty() && (leftSis.contains("ADJP") || leftSis.contains("NP") || leftSis.contains("DNP") || leftSis.contains("QP") || leftSis.contains("CP") || leftSis.contains("PP"))) {
category += "=MODIFIEDNP";
//System.out.println("Found modified NP"); // testing
}
}
if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
if (rightSis.contains("CC") || rightSis.contains("PU") || leftSis.contains("CC") || leftSis.contains("PU")) {
category += "=CONJ";
//System.out.println("Found NP conjunct"); // testing
}
}
if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) {
Tree[] sisters = parent.children();
boolean hasCommaSis = false;
boolean hasIPSis = false;
for (Tree sister : sisters) {
if (ctlp.basicCategory(sister.label().value()).equals("PU") && ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(sister.children()[0].label().toString())) {
hasCommaSis = true;
//System.out.println("Found CommaSis"); // testing
}
if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) {
hasIPSis = true;
}
}
if (hasCommaSis && hasIPSis) {
category += "-CONJ";
//System.out.println("Found IP conjunct"); // testing
}
}
if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) {
category += "-U";
//System.out.println("Found unary IP"); //testing
}
if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) {
category += "-U";
//System.out.println("Found unary CP"); //testing
}
if (splitBaseNP && baseCategory.equals("NP")) {
if (t.isPrePreTerminal()) {
category = category + "-B";
}
}
if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) {
//System.out.println("Found post-verbal PP");
category += "=lVV";
}
if ((markADgrandchildOfIP || gpaAD) && listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) {
category += "^ADVP";
}
if (markCC) {
// don't count marginal conjunctions which don't conjoin 2 things.
for (int i = 1; i < kids.length - 1; i++) {
String cat2 = kids[i].label().value();
if (cat2.startsWith("CC")) {
category += "-CC";
}
}
}
Label label = new CategoryWordTag(category, word, tag);
t.setLabel(label);
}
return t;
}
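The tag splits above only append suffixes to the POS tag before the preterminal is relabelled with a CategoryWordTag. A minimal sketch of the comma split (the full-width comma literal and the equals test merely stand in for ChineseTreebankLanguagePack.chineseCommaAcceptFilter(); they are illustrative, not the pack's actual filter logic):

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.Label;

public class ChinesePunctSplitSketch {
  public static void main(String[] args) {
    String tag = "PU";
    String word = "\uFF0C";        // hypothetical full-width comma token
    if ("\uFF0C".equals(word)) {   // stands in for chineseCommaAcceptFilter().test(word)
      tag = tag + "-COMMA";
    }
    Label label = new CategoryWordTag(tag, word, tag);  // preterminals use the tag as category
    System.out.println(label.value());  // prints: PU-COMMA
  }
}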
Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.
The class BiLexPCFGParser, method extractParse:
protected Tree extractParse(Edge edge) {
String head = wordIndex.get(words[edge.head]);
String tag = tagIndex.get(edge.tag);
String state = stateIndex.get(edge.state);
Label label = new CategoryWordTag(state, head, tag);
if (edge.backEdge == null && edge.backHook == null) {
// leaf, but needs word terminal
Tree leaf;
if (originalLabels[edge.head] != null) {
leaf = tf.newLeaf(originalLabels[edge.head]);
} else {
leaf = tf.newLeaf(head);
}
List<Tree> childList = Collections.singletonList(leaf);
return tf.newTreeNode(label, childList);
}
if (edge.backHook == null) {
// unary
List<Tree> childList = Collections.singletonList(extractParse(edge.backEdge));
return tf.newTreeNode(label, childList);
}
// binary
List<Tree> children = new ArrayList<>();
if (edge.backHook.isPreHook()) {
children.add(extractParse(edge.backEdge));
children.add(extractParse(edge.backHook.backEdge));
} else {
children.add(extractParse(edge.backHook.backEdge));
children.add(extractParse(edge.backEdge));
}
return tf.newTreeNode(label, children);
}
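Every node extracted here carries a CategoryWordTag whose category is the grammar state and whose word and tag are the lexical head and its tag, so head information stays attached to the recovered parse without a second head-finding pass. A brief sketch of building one such node (the state, head, and tag values are illustrative, not taken from a real chart):

import java.util.Collections;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.CategoryWordTagFactory;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class EdgeLabelSketch {
  public static void main(String[] args) {
    TreeFactory tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
    CategoryWordTag label = new CategoryWordTag("NP", "dog", "NN");  // state, head word, head tag
    Tree leaf = tf.newLeaf("dog");
    Tree node = tf.newTreeNode(label, Collections.singletonList(leaf));
    node.pennPrint();  // the extracted constituent
    System.out.println(label.value() + " headed by " + label.word() + "/" + label.tag());
  }
}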
Use of edu.stanford.nlp.ling.CategoryWordTag in project CoreNLP by stanfordnlp.
The class TreeAnnotatorAndBinarizer, method addRoot:
/**
* Changes the ROOT label, and adds a Lexicon.BOUNDARY daughter to it.
* This is needed for the dependency parser.
* <i>Note:</i> This is a destructive operation on the tree passed in!!
*
* @param t The current tree into which a boundary is inserted
*/
public void addRoot(Tree t) {
if (t.isLeaf()) {
log.info("Warning: tree is leaf: " + t);
t = tf.newTreeNode(tlp.startSymbol(), Collections.singletonList(t));
}
t.setLabel(new CategoryWordTag(tlp.startSymbol(), Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG));
List<Tree> preTermChildList = new ArrayList<>();
Tree boundaryTerm = tf.newLeaf(new Word(Lexicon.BOUNDARY)); //CategoryWordTag(Lexicon.BOUNDARY,Lexicon.BOUNDARY,""));
preTermChildList.add(boundaryTerm);
Tree boundaryPreTerm = tf.newTreeNode(new CategoryWordTag(Lexicon.BOUNDARY_TAG, Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG), preTermChildList);
List<Tree> childList = t.getChildrenAsList();
childList.add(boundaryPreTerm);
t.setChildren(childList);
}
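A standalone sketch of the same boundary insertion, outside TreeAnnotatorAndBinarizer (the "ROOT" start symbol and the example tree are assumptions here; the real method gets them from its TreebankLanguagePack and from the caller):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.CategoryWordTagFactory;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.Lexicon;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class AddBoundarySketch {
  public static void main(String[] args) {
    TreeFactory tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
    Tree t = Tree.valueOf("(ROOT (NP (NN example)))");  // hypothetical input tree
    // Relabel the root with the start symbol ("ROOT" assumed) and the boundary word/tag.
    t.setLabel(new CategoryWordTag("ROOT", Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG));
    // Build the boundary preterminal and append it as the last daughter, as addRoot does.
    Tree boundaryTerm = tf.newLeaf(new Word(Lexicon.BOUNDARY));
    Tree boundaryPreTerm = tf.newTreeNode(
        new CategoryWordTag(Lexicon.BOUNDARY_TAG, Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG),
        Collections.singletonList(boundaryTerm));
    List<Tree> childList = new ArrayList<>(t.getChildrenAsList());
    childList.add(boundaryPreTerm);
    t.setChildren(childList);
    t.pennPrint();  // the boundary token now appears as the rightmost daughter of the root
  }
}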