use of info.ephyra.questionanalysis.Term in project lucida by claritylab.
the class PredicateExtractionFilter method getAllVerbForms.
/**
 * Gets all forms of the verbs and expansions of predicates with missing
 * arguments. The verb forms are associated with their weights: the forms of
 * the predicate verb itself get the weight 1, the forms of an expansion get
 * the weight stored for that expansion.
 * <p>
 * Predicates without missing arguments and predicates without a verb term
 * are skipped. Note that the keys of the returned table are arrays, which
 * are hashed by identity; the table is therefore effectively a collection of
 * (verb forms, weight) pairs and must be iterated rather than queried with a
 * freshly built array.
 *
 * @param ps predicates
 * @return verb forms and their weights
 */
private Hashtable<String[], Double> getAllVerbForms(Predicate[] ps) {
    Hashtable<String[], Double> allVerbForms = new Hashtable<String[], Double>();

    for (Predicate p : ps) {
        // get verbs from predicates with missing arguments only
        if (!p.hasMissingArgs())
            continue;

        // a predicate may lack a verb term (Predicate.simScore guards
        // against this as well), in which case there is nothing to expand
        Term verbTerm = p.getVerbTerm();
        if (verbTerm == null)
            continue;

        // all forms of the predicate verb, falling back to the surface form
        // if WordNet does not know a lemma
        String verb = verbTerm.getText();
        String infinitive = WordNet.getLemma(verb, WordNet.VERB);
        if (infinitive == null)
            infinitive = verb;
        allVerbForms.put(VerbFormConverter.getAllForms(infinitive), 1d);

        // all forms of the expansions of the verb
        Map<String, Double> expansionsMap = verbTerm.getExpansions();
        if (expansionsMap == null)
            continue;
        for (Map.Entry<String, Double> entry : expansionsMap.entrySet()) {
            String expansion = entry.getKey();
            Double weight = entry.getValue();
            // Hashtable rejects null values, so an expansion without a
            // weight cannot be recorded
            if (weight == null)
                continue;

            infinitive = WordNet.getLemma(expansion, WordNet.VERB);
            if (infinitive == null)
                infinitive = expansion;
            allVerbForms.put(VerbFormConverter.getAllForms(infinitive), weight);
        }
    }

    return allVerbForms;
}
use of info.ephyra.questionanalysis.Term in project lucida by claritylab.
the class PredicateExtractionFilter method apply.
/**
 * Extracts relevant predicates from documents.
 * <p>
 * Results are passed through unchanged if their query is not extracted with
 * <code>FactoidsFromPredicatesFilter</code>, if the question has no
 * predicates or if the result already has a non-zero score. For the
 * remaining results the method
 * <ol>
 * <li>collects the sentences that contain a form of a relevant verb, using
 * the weight of the verb as a preliminary score,</li>
 * <li>keeps at most <code>MAX_SENTENCES</code> of the sentences accepted by
 * <code>checkSentence</code>, visiting them in the order produced by
 * <code>ScoreSorterFilter</code>,</li>
 * <li>annotates the predicates in these sentences and returns one result for
 * each predicate that is similar to a question predicate with missing
 * arguments.</li>
 * </ol>
 *
 * @param results array of <code>Result</code> objects containing documents
 * @return array of <code>Result</code> objects containing predicates
 */
public Result[] apply(Result[] results) {
    if (results.length == 0)
        return results;

    ArrayList<Result> allResults = new ArrayList<Result>();

    // extract relevant sentences
    // - get sentences that contain relevant verbs,
    //   use weights of verbs as confidence scores
    HashSet<Result> ssSet = new HashSet<Result>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing
        // approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() != 0) {
            allResults.add(result);
            continue;
        }

        // get all verb forms and build patterns; each pattern is compiled
        // once here instead of once per paragraph and sentence
        Hashtable<String[], Double> verbFormsMap = getAllVerbForms(ps);
        ArrayList<java.util.regex.Pattern> verbPatterns = new ArrayList<java.util.regex.Pattern>();
        ArrayList<Double> verbWeights = new ArrayList<Double>();
        for (String[] verbForms : verbFormsMap.keySet()) {
            // quote the forms so that characters such as '.' or '(' are
            // matched literally and cannot break the pattern
            String[] quotedForms = new String[verbForms.length];
            for (int i = 0; i < verbForms.length; i++)
                quotedForms[i] = java.util.regex.Pattern.quote(verbForms[i]);
            String verbPattern = "(?i).*?\\b(" + StringUtils.concat(quotedForms, "|") + ")\\b.*+";
            verbPatterns.add(java.util.regex.Pattern.compile(verbPattern));
            verbWeights.add(verbFormsMap.get(verbForms));
        }

        String[] paragraphs = result.getAnswer().split("\\n");
        for (String p : paragraphs) {
            // paragraph does not contain relevant verb?
            boolean contains = false;
            for (java.util.regex.Pattern verbPattern : verbPatterns) {
                if (verbPattern.matcher(p).matches()) {
                    contains = true;
                    break;
                }
            }
            if (!contains)
                continue;

            String[] sentences = LingPipe.sentDetect(p);
            for (String s : sentences) {
                // sentence does not contain relevant verb?
                // (a matching verb with weight 0 is treated the same way)
                double weight = 0d;
                for (int i = 0; i < verbPatterns.size(); i++) {
                    if (verbPatterns.get(i).matcher(s).matches()) {
                        weight = verbWeights.get(i);
                        break;
                    }
                }
                if (weight == 0d)
                    continue;

                // replace whitespaces by single blanks and trim
                s = s.replaceAll("\\s++", " ").trim();

                // create sentence-level result object
                Result sentence = result.getCopy();
                sentence.setAnswer(s);
                sentence.setScore((float) weight);
                ssSet.add(sentence);
            }
        }
    }

    // - check if these sentences are relevant,
    //   get MAX_SENTENCES sentences with most relevant verbs
    Result[] ss = ssSet.toArray(new Result[ssSet.size()]);
    ss = (new ScoreSorterFilter()).apply(ss);
    ArrayList<Result> ssList = new ArrayList<Result>();
    for (Result s : ss) {
        // the verb weight was only needed for sorting
        s.setScore(0);
        if (checkSentence(s))
            ssList.add(s);
        // get at most MAX_SENTENCES sentences
        if (ssList.size() >= MAX_SENTENCES)
            break;
    }
    ss = ssList.toArray(new Result[ssList.size()]);
    if (ss.length == 0)
        return allResults.toArray(new Result[allResults.size()]);

    // annotate predicates in sentences
    String[] sentences = new String[ss.length];
    for (int i = 0; i < ss.length; i++)
        sentences[i] = ss[i].getAnswer();
    String[][] ass = ASSERT.annotatePredicates(sentences);

    // extract predicates from annotations
    for (int i = 0; i < ass.length; i++) {
        Term[] terms = ss[i].getTerms();
        Predicate[] questionPs = ss[i].getQuery().getAnalyzedQuestion().getPredicates();

        for (int j = 0; j < ass[i].length; j++) {
            // build predicate
            Predicate predicate;
            try {
                predicate = new Predicate(sentences[i], ass[i][j], terms);
            } catch (ParseException ignored) {
                // best effort: an annotation that cannot be parsed is
                // skipped, the remaining annotations are still processed
                continue;
            }

            // calculate similarity score,
            // compare to predicates with missing arguments only
            double simScore = 0;
            Predicate simPredicate = null;
            for (Predicate questionP : questionPs) {
                if (questionP.hasMissingArgs()) {
                    double currSimScore = predicate.simScore(questionP);
                    if (currSimScore > simScore) {
                        simScore = currSimScore;
                        simPredicate = questionP;
                    }
                }
            }

            // keep predicate if it is similar to a question predicate
            if (simScore > 0) {
                // simScore() remembers the last non-zero comparison only,
                // so store the best match explicitly
                predicate.setSimScore(simScore);
                predicate.setSimPredicate(simPredicate);

                Result result = ss[i].getCopy();
                result.setAnswer(ass[i][j]);
                result.setSentence(sentences[i]);
                result.setPredicate(predicate);
                allResults.add(result);
            }
        }
    }

    return allResults.toArray(new Result[allResults.size()]);
}
use of info.ephyra.questionanalysis.Term in project lucida by claritylab.
the class BagOfTermsG method getQueryString.
/**
 * Forms a query string from the terms in the question.
 * <p>
 * Each term contributes one phrase unless a term with the same normalized
 * text was already seen. Texts containing whitespace are put in quotation
 * marks. If the reduced expansions of a term are not empty, the phrase
 * becomes a parenthesized disjunction <code>(term OR exp1 OR exp2 ...)</code>.
 * The phrases are joined with spaces.
 * <p>
 * Individual keywords are currently not added to the query: neither the
 * variant that expands the keywords (not supported by Web search engines)
 * nor the variant that adds them unexpanded is active, so <code>kws</code>
 * is not used.
 *
 * @param terms terms in the question
 * @param kws keywords in the question
 * @return query string
 */
private String getQueryString(Term[] terms, String[] kws) {
    HashSet<String> seenNorms = new HashSet<String>();
    ArrayList<String> queryParts = new ArrayList<String>();

    for (Term term : terms) {
        String text = term.getText();

        // each normalized term text is used at most once
        if (!seenNorms.add(StringUtils.normalize(text)))
            continue;

        // compound phrases are wrapped in quotation marks
        StringBuilder phrase = new StringBuilder();
        if (text.matches(".*?\\s.*+"))
            phrase.append("\"").append(text).append("\"");
        else
            phrase.append(text);

        // append the expansions that survive the reduction as alternatives
        Map<String, Double> reduced = TermExpander.reduceExpansionsQuery(term.getExpansions(), true);
        if (reduced != null && reduced.size() > 0) {
            phrase.insert(0, "(");
            for (String expansion : reduced.keySet()) {
                phrase.append(" OR ");
                if (expansion.matches(".*?\\s.*+"))
                    phrase.append("\"").append(expansion).append("\"");
                else
                    phrase.append(expansion);
            }
            phrase.append(")");
        }

        queryParts.add(phrase.toString());
    }

    // build query string
    return StringUtils.concatWithSpaces(queryParts.toArray(new String[queryParts.size()]));
}
use of info.ephyra.questionanalysis.Term in project lucida by claritylab.
the class BagOfWordsG method generateQueries.
/**
 * Generates a single query from the terms and keywords of the analyzed
 * question.
 * <p>
 * No query is generated for a factoid question that has neither answer
 * types nor predicates.
 *
 * @param aq analyzed question
 * @return <code>Query</code> objects, an empty array if no query is generated
 */
public Query[] generateQueries(AnalyzedQuestion aq) {
    String[] answerTypes = aq.getAnswerTypes();
    Predicate[] predicates = aq.getPredicates();

    // a query is worthwhile if the answer type is known, predicates could
    // be extracted or the question is not a factoid question
    boolean worthQuerying = answerTypes.length > 0 || predicates.length > 0 || !aq.isFactoid();
    if (!worthQuerying)
        return new Query[0];

    // create the query from the query string and set the extraction
    // techniques
    Query query = new Query(getQueryString(aq.getTerms(), aq.getKeywords()), aq, SCORE);
    query.setExtractionTechniques(EXTRACTION_TECHNIQUES);

    return new Query[] { query };
}
use of info.ephyra.questionanalysis.Term in project lucida by claritylab.
the class Predicate method simScore.
/**
 * Calculates a similarity score for this predicate and a given predicate.
 * <p>
 * The score is the product of the similarity of the verbs and a
 * Jaccard-style similarity of the arguments. It is 0 if either predicate
 * has no verb term or no argument terms, or if one of the two factors is 0.
 * Only when the score is not 0 are the score and the given predicate also
 * stored in this predicate.
 *
 * @param p predicate to compare with
 * @return similarity score
 */
public double simScore(Predicate p) {
    // verbs: both predicates need a verb term
    Term otherVerb = p.getVerbTerm();
    if (verbTerm == null || otherVerb == null)
        return 0;
    double verbSim = otherVerb.simScore(verbTerm.getLemma());
    if (verbSim == 0)
        return 0;

    // arguments: both predicates need at least one argument term
    Term[] otherArgs = p.getArgTerms();
    if (argTerms.length == 0 || otherArgs.length == 0)
        return 0;

    // each own argument either adds its best match against the other
    // arguments to the intersection or enlarges the union by one
    double matched = 0;
    int unmatched = 0;
    for (Term ownArg : argTerms) {
        String lemma = ownArg.getLemma();
        double best = 0;
        for (Term otherArg : otherArgs) {
            best = Math.max(best, otherArg.simScore(lemma));
        }
        if (best > 0) {
            matched += best;
        } else {
            unmatched++;
        }
    }
    double argSim = matched / (otherArgs.length + unmatched);
    if (argSim == 0)
        return 0;

    // store the result together with the predicate it was computed for
    simScore = verbSim * argSim;
    simPredicate = p;
    return simScore;
}
Aggregations