Use of edu.cmu.lti.chineseNLP.util.Tree in project lucida by claritylab.
From class EnglishFeatureExtractor, method addSyntacticFeatures.
/**
 * Adds the syntactic binary features of a question to the given instance.
 * <ul>
 * <li>MAIN_VERB : the head word of the parse tree, replaced by its WordNet
 *     verb lemma when one is found, or "-" when the tree has no head word
 * <li>WH_DET : added when the question matches one of the wh-patterns
 *     immediately followed by the focus text
 * <li>FOCUS_ADJ : the head word of the first RB/JJ preterminal that is
 *     preceded by a WRB preterminal
 * </ul>
 * Nothing is added (and an error is logged) when <code>parseTree</code> is
 * <code>null</code>.
 *
 * @param instance  the instance that receives the features
 * @param terms     the terms of the question, in order
 * @param parseTree the syntactic parse of the question, may be <code>null</code>
 * @param focusTerm the focus term of the question, may be <code>null</code>
 */
private static void addSyntacticFeatures(MutableInstance instance, List<Term> terms, String parseTree, Term focusTerm) {
    if (parseTree == null) {
        log.error("Syntactic parse of the question is null.");
        return;
    }
    Tree tree = TreeHelper.buildTree(parseTree, Tree.ENGLISH);

    // MAIN_VERB
    TreeHelper.markHeadNode(tree);
    String mainVerb = tree.getHeadWord();
    if (mainVerb == null) {
        // No head word: skip the dictionary lookup instead of provoking an exception with a null key.
        mainVerb = "-";
    } else {
        try {
            IndexWord word = Dictionary.getInstance().lookupIndexWord(POS.VERB, mainVerb);
            String lemma = (word != null) ? word.getLemma() : null;
            if (lemma != null) {
                mainVerb = lemma;
            }
        } catch (Exception e) {
            // Best effort: fall back to the surface form of the verb.
            log.warn("Failed to get lemma for verb '" + mainVerb + "'", e);
        }
    }
    instance.addBinary(new Feature("MAIN_VERB" + "." + mainVerb));

    // WH_DET
    if (focusTerm != null && focusTerm.getText() != null) {
        String focus = focusTerm.getText();
        StringBuilder joined = new StringBuilder();
        for (Term term : terms) {
            joined.append(term.getText()).append(" ");
        }
        String question = joined.toString().trim();
        // The focus is literal question text, not a pattern: quote it so that regex
        // metacharacters in it can neither break compilation nor match unintended text.
        String tail = SPACE_PTRN + Pattern.quote(focus) + REST_PTRN;
        for (String ptrn : whPtrns) {
            Matcher m = Pattern.compile(ptrn + tail).matcher(question);
            if (m.matches()) {
                instance.addBinary(new Feature("WH_DET" + ".+"));
                break;
            }
        }
    }

    // FOCUS_ADJ
    Tree focusNode = TreeHelper.findFirstPreterminalWithPrecedingPreterminal(tree, "RB|JJ", "WRB");
    if (focusNode != null) {
        instance.addBinary(new Feature("FOCUS_ADJ" + "." + focusNode.getHeadWord()));
    }
}
Use of edu.cmu.lti.chineseNLP.util.Tree in project lucida by claritylab.
From class EnglishFeatureExtractor, method createInstance.
/**
 * Builds an Instance for a question from its terms and its syntactic parse.
 * The instance is keyed by the space-joined string form of the terms, and
 * every feature added to it is a binary feature of one of these types:
 *
 * <p>Word-level features:
 * <ul>
 * <li>UNIGRAM : individual words in the question
 * <li>BIGRAM : pairs of adjacent words in the question
 * <li>WH_WORD : the wh-word in the question if one exists
 * </ul>
 *
 * <p>Syntactic features:
 * <ul>
 * <li>MAIN_VERB : the syntactic head of the sentence, as defined in
 * {@link edu.cmu.lti.chineseNLP.util.TreeHelper TreeHelper}
 * <li>FOCUS_ADJ : the adjective following a wh-word (e.g. 'long' in 'How long is it?')
 * <li>WH_DET : whether or not the wh-word is the determiner of a noun phrase, as in 'which printer'
 * </ul>
 *
 * <p>Semantic features:
 * <ul>
 * <li>FOCUS_TYPE : the semantic type of the focus word
 * </ul>
 *
 * @param terms     the terms of the question, in order
 * @param parseTree the syntactic parse of the question
 * @return the populated instance
 */
public Instance createInstance(List<Term> terms, String parseTree) {
    StringBuilder joined = new StringBuilder();
    for (Term term : terms) {
        joined.append(String.valueOf(term)).append(" ");
    }
    MutableInstance instance = new MutableInstance(joined.toString().trim());

    // Locate the focus word from the parse before extracting features.
    log.debug("Parse: " + parseTree);
    Term focus = FocusFinder.findFocusTerm(TreeHelper.buildTree(parseTree, Tree.ENGLISH));
    if (focus != null) {
        log.debug("Focus: " + focus.getText());
    }

    addWordLevelFeatures(instance, terms, focus);
    addSyntacticFeatures(instance, terms, parseTree, focus);
    addSemanticFeatures(instance, focus);
    return instance;
}
Use of edu.cmu.lti.chineseNLP.util.Tree in project lucida by claritylab.
From class FocusFinder, method findFocusNode.
/**
 * Finds the focus, or target, word of a question using the configured tree
 * templates and default rules that look for question words.
 *
 * <p>The following steps are tried in order:
 * <ol>
 * <li> Look for the head word of a phrase that has a wh-word child and
 * more than one child (phrases labelled <code>SBAR</code> are skipped)
 * <ul>
 * <li> if that word is "kind", "type" or "genre" (singular or plural), use
 * the object of a modifying of-<code>PP</code> instead, when there is one
 * <li> if the head is a <code>POS</code> node and the phrase has more than
 * two children, use the child just before the head
 * </ul>
 * <li> Try to extract a node using the tree templates, in the order they
 * are stored
 * <li> Using only preterminals, look for the <code>NN*</code> or
 * <code>JJ</code> nodes that follow a <code>WDT</code> or, failing that,
 * a <code>WP</code>
 * <li> Using only preterminals, collect the consecutive <code>NN*</code>
 * nodes that directly follow the terminals "how many" or "how much"
 * </ol>
 *
 * <p>The focus is returned as soon as one is found.
 *
 * @param tree The syntactic parse tree of the question
 * @return the focus node, or <code>null</code> if none was found
 */
public static Tree findFocusNode(Tree tree) {
    Tree focus = null;
    TreeHelper.markHeadNode(tree);

    // look for a WH word with siblings anywhere in the tree
    for (String whLabel : whLabels) {
        Tree whPhrase = TreeHelper.findNodeWithChild(tree, whLabel);
        if (whPhrase == null || whPhrase.numOfChildren() <= 1 || whPhrase.getLabel().equals("SBAR")) {
            continue;
        }
        Tree head = getHeadWordOrPhrase(whPhrase);
        if (head == null) {
            // No usable head for this phrase: try the next wh-label rather than fail.
            continue;
        }
        TreeHelper.markHeadNode(head);
        String headStr = head.getHeadWord();
        if (headStr != null && headStr.matches("kinds?|types?|genres?")) {
            // "what kind of X": the real focus is X, the object of the of-PP that
            // hangs off the head's parent or, for (WH)NP parents, its grandparent.
            Tree parent = TreeHelper.locateParent(whPhrase.getHeadNode(), tree);
            Tree pp = null;
            if (parent != null) {
                pp = parent.getChild("PP");
                if (pp == null && parent.getLabel().matches("WHNP|NP")) {
                    // The parent may be the root, in which case there is no grandparent.
                    Tree grandParent = TreeHelper.locateParent(parent, tree);
                    if (grandParent != null) {
                        pp = grandParent.getChild("PP");
                    }
                }
            }
            if (pp != null) {
                Tree preposition = pp.getChild("IN");
                Tree object = pp.getChild("NP");
                if (preposition != null && object != null && "of".equals(preposition.getHeadWord())) {
                    focus = getHeadWordOrPhrase(object);
                    break;
                }
            }
        }
        if (head.getLabel().equals("POS") && whPhrase.numOfChildren() > 2) {
            return getHeadWordOrPhrase(whPhrase.getChild(whPhrase.getHeadNodeChildIndex() - 1));
        }
        return head;
    }
    if (focus != null) {
        return focus;
    }

    // use syntactic patterns for when the focus word is apart from the WH word.
    for (Tree template : treeTemplates) {
        Tree node = TreeHelper.extractNode(tree, template);
        if (node == null) {
            continue;
        }
        focus = getHeadWordOrPhrase(node);
        if (focus != null) {
            return focus;
        }
    }

    // look for a JJ or last consecutive NN preceded by WDT
    List<Tree> nodes = TreeHelper.getPreterminalsAfter(tree, "NN.?|JJ", "WDT");
    if (nodes == null) {
        nodes = TreeHelper.getPreterminalsAfter(tree, "NN.?|JJ", "WP");
    }
    if (nodes != null) {
        focus = Tree.newNode("NP", nodes);
        return getHeadWordOrPhrase(focus);
    }

    // look for a NN* preceded by 'how many' / 'how much'
    List<Tree> tags = TreeHelper.getPreterminals(tree).asList();
    nodes = new ArrayList<Tree>();
    // Peek at the following token by index: this neither runs past the end when
    // "how" is the last token nor skips a token that is itself "how".
    for (int i = 0; i + 1 < tags.size(); i++) {
        boolean isHow = tags.get(i).getHeadWord().toLowerCase().equals("how");
        if (isHow && tags.get(i + 1).getHeadWord().toLowerCase().matches("many|much")) {
            for (int j = i + 2; j < tags.size() && tags.get(j).getLabel().matches("NN.?"); j++) {
                nodes.add(tags.get(j));
            }
            break;
        }
    }
    if (nodes.size() > 0) {
        focus = Tree.newNode("NP", nodes);
        return getHeadWordOrPhrase(focus);
    }
    return null;
}
Use of edu.cmu.lti.chineseNLP.util.Tree in project lucida by claritylab.
From class FocusFinder, method findFocusTerm.
/**
 * Joins the given Terms into a question string, parses it with
 * <code>StanfordParser</code>, and searches the resulting parse tree for
 * the focus of the question.
 *
 * <p>Any exception raised along the way is printed to standard error and
 * reported as "no focus".
 *
 * @param terms The list of Terms in the question.
 * @return the focus as a Term whose part of speech is the label of the
 *         focus node, or <code>null</code> if no focus was found or an
 *         error occurred
 */
public static Term findFocusTerm(List<Term> terms) {
    try {
        StringBuilder text = new StringBuilder();
        for (Term term : terms) {
            text.append(String.valueOf(term)).append(" ");
        }
        Tree parse = TreeHelper.buildTree(StanfordParser.parse(text.toString()), Tree.ENGLISH);
        Tree focusNode = findFocusNode(parse);
        if (focusNode == null) {
            return null;
        }
        Term focus = new Term(0, 0, TreeHelper.getLeaves(focusNode));
        focus.setPOS(focusNode.getLabel());
        return focus;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
Use of edu.cmu.lti.chineseNLP.util.Tree in project lucida by claritylab.
From class FocusFinder, method findFocusWord.
/**
 * Joins the given Terms into a question string, parses it with
 * <code>StanfordParser</code>, and searches the resulting parse tree for
 * the focus of the question.
 *
 * <p>Any exception raised along the way is printed to standard error and
 * reported as "no focus".
 *
 * @param terms The list of Terms in the question.
 * @return the leaves of the focus node as a String, or <code>null</code>
 *         if no focus was found or an error occurred
 */
public static String findFocusWord(List<Term> terms) {
    try {
        StringBuilder text = new StringBuilder();
        for (Term term : terms) {
            text.append(String.valueOf(term)).append(" ");
        }
        Tree parse = TreeHelper.buildTree(StanfordParser.parse(text.toString()), Tree.ENGLISH);
        Tree focusNode = findFocusNode(parse);
        return (focusNode == null) ? null : TreeHelper.getLeaves(focusNode);
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
Aggregations