use of edu.cmu.lti.javelin.qa.Term in project lucida by claritylab.
the class EnglishFeatureExtractor method addSyntacticFeatures.
/**
 * Adds the syntactic features MAIN_VERB, WH_DET and FOCUS_ADJ to an instance.
 * <ul>
 * <li>MAIN_VERB : the head word of the parse tree, replaced by its WordNet
 * verb lemma when the dictionary lookup yields one, or "-" when the tree
 * has no head word
 * <li>WH_DET : added when the space-joined term texts match, as a whole, one
 * of the wh-patterns followed by SPACE_PTRN, the literal focus text and
 * REST_PTRN
 * <li>FOCUS_ADJ : the head word of the node returned by
 * TreeHelper.findFirstPreterminalWithPrecedingPreterminal(tree, "RB|JJ", "WRB"),
 * if any
 * </ul>
 * If the parse tree string is null, an error is logged and no feature is added.
 *
 * @param instance the instance that receives the binary features
 * @param terms the Terms of the question, in order
 * @param parseTree the syntactic parse of the question, may be null
 * @param focusTerm the focus of the question, may be null
 */
private static void addSyntacticFeatures(MutableInstance instance, List<Term> terms, String parseTree, Term focusTerm) {
    if (parseTree == null) {
        log.error("Syntactic parse of the question is null.");
        return;
    }
    Tree tree = TreeHelper.buildTree(parseTree, Tree.ENGLISH);

    // MAIN_VERB
    TreeHelper.markHeadNode(tree);
    String mainVerb = tree.getHeadWord();
    if (mainVerb == null) {
        // No head word: skip the dictionary lookup instead of querying it with null.
        mainVerb = "-";
    } else {
        try {
            IndexWord word = Dictionary.getInstance().lookupIndexWord(POS.VERB, mainVerb);
            String lemma = (word != null) ? word.getLemma() : null;
            if (lemma != null) {
                mainVerb = lemma;
            }
        } catch (Exception e) {
            // Best effort: fall back to the unlemmatized head word.
            log.warn("Failed to get lemma for verb '" + mainVerb + "'", e);
        }
    }
    instance.addBinary(new Feature("MAIN_VERB" + "." + mainVerb));

    // WH_DET
    if (focusTerm != null && focusTerm.getText() != null) {
        StringBuilder joined = new StringBuilder();
        for (Term term : terms) {
            joined.append(term.getText()).append(' ');
        }
        String question = joined.toString().trim();
        // The focus is plain text, not a pattern: quote it so that regex
        // metacharacters in it (e.g. "U.S.", "C++") are matched literally.
        String quotedFocus = Pattern.quote(focusTerm.getText());
        for (String ptrn : whPtrns) {
            if (Pattern.compile(ptrn + SPACE_PTRN + quotedFocus + REST_PTRN).matcher(question).matches()) {
                instance.addBinary(new Feature("WH_DET" + ".+"));
                break;
            }
        }
    }

    // FOCUS_ADJ
    Tree focusNode = TreeHelper.findFirstPreterminalWithPrecedingPreterminal(tree, "RB|JJ", "WRB");
    if (focusNode != null) {
        instance.addBinary(new Feature("FOCUS_ADJ" + "." + focusNode.getHeadWord()));
    }
}
use of edu.cmu.lti.javelin.qa.Term in project lucida by claritylab.
the class EnglishFeatureExtractor method createInstance.
/**
 * Builds an Instance for a question given as a list of Terms plus its
 * syntactic parse. The instance is populated with binary features by three
 * helpers, called in this order:
 *
 * <p>Word-level features (addWordLevelFeatures):
 * <ul>
 * <li>UNIGRAM : individual words in the question
 * <li>BIGRAM : pairs of adjacent words in the question
 * <li>WH_WORD : the wh-word in the question if one exists
 * </ul>
 *
 * <p>Syntactic features (addSyntacticFeatures):
 * <ul>
 * <li>MAIN_VERB : the syntactic head of the sentence, as defined in
 * {@link edu.cmu.lti.chineseNLP.util.TreeHelper TreeHelper}
 * <li>FOCUS_ADJ : the adjective following a wh-word (e.g. 'long' in 'How long is it?')
 * <li>WH_DET : whether or not the wh-word is the determiner of a noun phrase, as in 'which printer'
 * </ul>
 *
 * <p>Semantic features (addSemanticFeatures):
 * <ul>
 * <li>FOCUS_TYPE : the semantic type of the focus word
 * </ul>
 *
 * @param terms the Terms of the question, in order
 * @param parseTree the syntactic parse tree of the question
 * @return the populated Instance
 */
public Instance createInstance(List<Term> terms, String parseTree) {
    // The instance is keyed by the space-joined string forms of the terms.
    // NOTE(review): this uses each Term's string conversion, whereas the
    // feature helpers use getText() -- confirm the two agree.
    StringBuilder joined = new StringBuilder();
    for (Term t : terms) {
        joined.append(String.valueOf(t)).append(" ");
    }
    MutableInstance instance = new MutableInstance(joined.toString().trim());

    // Locate the focus word in the parsed question.
    log.debug("Parse: " + parseTree);
    Term focus = FocusFinder.findFocusTerm(TreeHelper.buildTree(parseTree, Tree.ENGLISH));
    if (focus != null) {
        log.debug("Focus: " + focus.getText());
    }

    addWordLevelFeatures(instance, terms, focus);
    addSyntacticFeatures(instance, terms, parseTree, focus);
    addSemanticFeatures(instance, focus);
    return instance;
}
use of edu.cmu.lti.javelin.qa.Term in project lucida by claritylab.
the class EnglishFeatureExtractor method addWordLevelFeatures.
/**
 * Adds the word-level features UNIGRAM, BIGRAM, WH_WORD and OF_HEAD to an
 * instance.
 * <ul>
 * <li>UNIGRAM : one feature per term; whitespace inside a term's text is
 * replaced by "_" and a null text is represented as "-"
 * <li>BIGRAM : one feature per pair of adjacent terms, joined by "-"
 * <li>WH_WORD : group 1 of the first wh-pattern that matches the question,
 * lower-cased; patterns anchored at the start of the question are tried
 * first, then the patterns are searched for anywhere in the question
 * <li>OF_HEAD : the first entry of OF_HEAD_WORDS that occurs in the question
 * as "&lt;word&gt;[s] of &lt;focus text&gt;"
 * </ul>
 *
 * @param instance the instance that receives the binary features
 * @param terms the Terms of the question, in order
 * @param focus the focus of the question, may be null (then no OF_HEAD
 * feature is added)
 */
private static void addWordLevelFeatures(MutableInstance instance, List<Term> terms, Term focus) {
    String[] words = new String[terms.size()];
    for (int i = 0; i < words.length; i++) {
        String text = terms.get(i).getText();
        words[i] = (text != null) ? text.replaceAll("\\s+", "_") : "-";
    }

    // UNIGRAM
    for (String word : words) {
        instance.addBinary(new Feature("UNIGRAM" + "." + word));
    }

    // BIGRAM
    for (int i = 0; i < words.length - 1; i++) {
        instance.addBinary(new Feature("BIGRAM" + "." + words[i] + "-" + words[i + 1]));
    }

    // WH_WORD
    StringBuilder joined = new StringBuilder();
    for (Term term : terms) {
        joined.append(term.getText()).append(' ');
    }
    String question = joined.toString().trim();

    String whWord = null;
    // first look at sentence beginning
    for (String ptrn : whPtrns) {
        Matcher m = Pattern.compile("^" + ptrn + REST_PTRN).matcher(question);
        if (m.matches()) {
            whWord = m.group(1).toLowerCase().replaceAll("\\s+", "_");
            break;
        }
    }
    if (whWord == null) {
        // then look anywhere in the sentence
        for (String ptrn : whPtrns) {
            Matcher m = Pattern.compile(ptrn + REST_PTRN).matcher(question);
            if (m.find()) {
                whWord = m.group(1).toLowerCase().replaceAll("\\s+", "_");
                break;
            }
        }
    }
    if (whWord != null) {
        instance.addBinary(new Feature("WH_WORD" + "." + whWord));
    }

    // OF_HEAD
    // Without a focus text there is nothing to look for; a null text would
    // otherwise end up in the pattern as the literal string "null".
    if (focus == null || focus.getText() == null) {
        return;
    }
    // The focus is plain text, not a pattern: quote it so that regex
    // metacharacters in it are matched literally.
    String quotedFocus = Pattern.quote(focus.getText());
    for (String word : OF_HEAD_WORDS) {
        if (Pattern.compile(word + "s? of " + quotedFocus).matcher(question).find()) {
            instance.addBinary(new Feature("OF_HEAD" + "." + word));
            break;
        }
    }
}
use of edu.cmu.lti.javelin.qa.Term in project lucida by claritylab.
the class FeatureExtractor method createInstance.
/**
 * Convenience method that tokenizes the given question by whitespace, creates
 * Terms, and calls {@link #createInstance(List, String)}.
 * <p>
 * Leading and trailing whitespace is ignored, so an empty or blank question
 * yields an empty list of Terms rather than a single Term with empty text.
 *
 * @param question the question to create an Instance from
 * @param parseTree the syntactic parse tree of the question
 * @return the Instance created by {@link #createInstance(List, String)}
 */
public Instance createInstance(String question, String parseTree) {
    List<Term> terms = new ArrayList<Term>();
    // split() keeps a leading empty string when the input starts with a
    // delimiter (or is empty), so trim first and skip empty tokens.
    for (String token : question.trim().split("\\s+")) {
        if (token.length() > 0) {
            terms.add(new Term(0, 0, token));
        }
    }
    return createInstance(terms, parseTree);
}
use of edu.cmu.lti.javelin.qa.Term in project lucida by claritylab.
the class FocusFinder method findFocusTerm.
/**
 * Finds the focus of a question given as a list of Terms. The string forms
 * of the Terms are joined with spaces, the result is parsed with
 * StanfordParser, and the focus node is looked up in the resulting tree.
 * <p>
 * Any exception raised along the way is printed to standard error and
 * reported to the caller as "no focus".
 *
 * @param terms The list of Terms in the question.
 * @return the focus as a Term whose POS is set to the label of the focus
 * node, or null if no focus is found or an error occurs
 */
public static Term findFocusTerm(List<Term> terms) {
    try {
        StringBuilder sentence = new StringBuilder();
        for (Term term : terms) {
            sentence.append(String.valueOf(term)).append(" ");
        }
        Tree focusNode = findFocusNode(
                TreeHelper.buildTree(StanfordParser.parse(sentence.toString()), Tree.ENGLISH));
        if (focusNode == null) {
            return null;
        }
        // Offsets are not known here, hence 0/0.
        Term focus = new Term(0, 0, TreeHelper.getLeaves(focusNode));
        focus.setPOS(focusNode.getLabel());
        return focus;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
Aggregations