Use of info.ephyra.nlp.semantics.Predicate in project lucida by claritylab.
The class PredicateG, method getQueryString.
/**
* Forms a query string from the predicates, terms and individual keywords.
*
* @param predicates predicates in the question
* @param terms terms in the question
* @param kws keywords in the question
* @return query string
*/
public String getQueryString(Predicate[] predicates, Term[] terms, String[] kws) {
    ArrayList<String> phraseL = new ArrayList<String>();
    HashSet<String> normSet = new HashSet<String>();

    // get predicate verbs and arguments
    for (Predicate predicate : predicates) {
        String[] verbArgs = predicate.getVerbArgs();
        for (String verbArg : verbArgs) {
            String[] parts = verbArg.split("\t");
            for (String part : parts)
                if (!part.matches("(?i)" + IGNORE) &&   // no words in IGNORE
                        !FunctionWords.lookup(part) &&  // no function words
                        normSet.add(StringUtils.normalize(part))) {
                    // drop quotation marks
                    String noQuotes = part.replace("\"", "");
                    // add quotation marks for compound phrases
                    if (noQuotes.matches(".*?\\s.*+"))
                        noQuotes = "\"" + noQuotes + "\"";
                    String phrase = noQuotes;

//                  // append expansions
//                  Map<String, Double> expMap =
//                      TermExpander.expandPhrase(part, terms);
//                  if (expMap.size() > 0) {
//                      String[] expansions =
//                          expMap.keySet().toArray(new String[expMap.size()]);
//                      phrase = "(" + phrase;
//                      for (String expansion : expansions) {
//                          // drop quotation marks
//                          expansion = expansion.replace("\"", "");
//                          // add quotation marks for compound phrases
//                          if (expansion.matches(".*?\\s.*+"))
//                              expansion = "\"" + expansion + "\"";
//
//                          phrase += " OR " + expansion;
//                      }
//                      phrase += ")";
//                  }

                    phraseL.add(phrase);
                }
        }
    }

    // get terms
//  for (Term term : terms) {
//      String text = term.getText();
//      if (normSet.add(StringUtils.normalize(text))) {
//          // add quotation marks for compound phrases
//          if (text.matches(".*?\\s.*+"))
//              text = "\"" + text + "\"";
//
//          String phrase = text;
//
//          // append expansions
//          Map<String, Double> expMap = term.getExpansions();
//          expMap = TermExpander.reduceExpansionsQuery(expMap, true);
//          if (expMap != null && expMap.size() > 0) {
//              String[] expansions =
//                  expMap.keySet().toArray(new String[expMap.size()]);
//              phrase = "(" + phrase;
//              for (String expansion : expansions) {
//                  // add quotation marks for compound phrases
//                  if (expansion.matches(".*?\\s.*+"))
//                      expansion = "\"" + expansion + "\"";
//
//                  phrase += " OR " + expansion;
//              }
//              phrase += ")";
//          }
//
//          phraseL.add(phrase);
//      }
//  }

    // get individual keywords
    // - expand keywords (not supported by Web search engines!)
//  for (Term term : terms) {
//      String phrase;
//      Map<String, Double> expMap = term.getExpansions();
//      expMap = TermExpander.reduceExpansionsQuery(expMap, true);
//      boolean newKeyword = false;  // term/expansion contains new keyword?
//
//      if (expMap.size() == 0) {
//          String[] keywords =
//              KeywordExtractor.getKeywords(term.getText());
//          List<String> uniqueL = new ArrayList<String>();
//          for (String keyword : keywords)
//              if (normSet.add(StringUtils.normalize(keyword)))
//                  uniqueL.add(keyword);
//          String[] unique = uniqueL.toArray(new String[uniqueL.size()]);
//          phrase = StringUtils.concatWithSpaces(unique);
//          if (unique.length > 0) newKeyword = true;
//      } else {
//          // form AND query from keywords in term
//          String[] keywords =
//              KeywordExtractor.getKeywords(term.getText());
//          String and = StringUtils.concat(keywords, " AND ");
//          if (keywords.length > 1)
//              and = "(" + and + ")";
//          for (String keyword : keywords)
//              if (normSet.add(StringUtils.normalize(keyword)))
//                  newKeyword = true;
//
//          phrase = and;
//
//          // append expansions
//          if (expMap != null && expMap.size() > 0) {
//              String[] expansions =
//                  expMap.keySet().toArray(new String[expMap.size()]);
//              phrase = "(" + phrase;
//              for (String expansion : expansions) {
//                  // form AND query from keywords in expansion
//                  keywords = KeywordExtractor.getKeywords(expansion);
//                  and = StringUtils.concat(keywords, " AND ");
//                  if (keywords.length > 1)
//                      and = "(" + and + ")";
//                  for (String keyword : keywords)
//                      if (normSet.add(StringUtils.normalize(keyword)))
//                          newKeyword = true;
//
//                  phrase += " OR " + and;
//              }
//              phrase += ")";
//          }
//      }
//
//      // add phrase to the query if the term or one of its expansions has
//      // multiple tokens and thus the keyword query is different from the
//      // term query
//      if (newKeyword) phraseL.add(phrase);
//  }
    // - do not expand keywords
//  for (String kw : kws)
//      if (normSet.add(StringUtils.normalize(kw)))
//          phraseL.add(kw);

    // build query string
    String[] phrases = phraseL.toArray(new String[phraseL.size()]);
    String queryString = StringUtils.concatWithSpaces(phrases);

    // include context keywords in the query string
    for (String kw : kws)
        if (!StringUtils.equalsCommonNorm(queryString, kw))
            queryString += " " + kw;

    return queryString;
}
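The heart of this method is the deduplication against normalized forms and the quoting of compound phrases. Below is a minimal standalone sketch of just that logic, using only JDK classes; QueryStringDemo is a hypothetical class, and plain lower-casing stands in for StringUtils.normalize().

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical demo class, not part of Ephyra.
public class QueryStringDemo {
    public static void main(String[] args) {
        String[] parts = {"invented", "the telephone", "Invented"};
        Set<String> normSet = new HashSet<String>();
        List<String> phraseL = new ArrayList<String>();
        for (String part : parts) {
            // plain lower-casing stands in for StringUtils.normalize()
            if (!normSet.add(part.toLowerCase()))
                continue;  // skip duplicates
            // drop stray quotation marks, then quote compound phrases
            String phrase = part.replace("\"", "");
            if (phrase.matches(".*?\\s.*+"))
                phrase = "\"" + phrase + "\"";
            phraseL.add(phrase);
        }
        // prints: invented "the telephone"
        System.out.println(String.join(" ", phraseL));
    }
}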
Use of info.ephyra.nlp.semantics.Predicate in project lucida by claritylab.
The class PredicateG, method generateQueries.
/**
* Generates queries from predicate-argument structures extracted from the
* question string.
*
* @param aq analyzed question
* @return <code>Query</code> objects
*/
public Query[] generateQueries(AnalyzedQuestion aq) {
    // only generate a query if predicates could be extracted
    Predicate[] ps = aq.getPredicates();
    if (ps.length == 0)
        return new Query[0];

    // create query string
    Term[] terms = aq.getTerms();
    String[] kws = aq.getKeywords();
    String queryString = getQueryString(ps, terms, kws);

    // create query, set answer types and predicates
    Query[] queries = new Query[1];
    queries[0] = new Query(queryString, aq, SCORE);
    queries[0].setExtractionTechniques(EXTRACTION_TECHNIQUES);

    return queries;
}
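A hedged usage sketch of this generator: QuestionAnalysis.analyze() is assumed here as the entry point to Ephyra's question analysis stage, and the no-argument constructor and the getQueryString() accessor on Query are likewise assumptions; all pipeline initialization is omitted.

// sketch only, not a verified invocation of the Ephyra API
AnalyzedQuestion aq = QuestionAnalysis.analyze("Who invented the telephone?");
Query[] queries = new PredicateG().generateQueries(aq);
// at most one query is produced; none if no predicates were extracted
for (Query query : queries)
    System.out.println(query.getQueryString());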
Use of info.ephyra.nlp.semantics.Predicate in project lucida by claritylab.
The class PredicateExtractor, method getPredicates.
/**
* Extracts the predicates from a question string.
*
* @param qn normalized question string
* @param verbMod question string with modified verbs
* @param ats expected answer types
* @param terms question terms
* @return predicate-argument structures
*/
public static Predicate[] getPredicates(String qn, String verbMod, String[] ats, Term[] terms) {
    // check if the question contains a predicate
    if (!containsPredicate(qn))
        return new Predicate[0];

    // transform the question into a statement
    String statement = questionToStatement(qn, verbMod, ats);

    // annotate and extract predicates
    String[][] ass = ASSERT.annotatePredicates(new String[] { statement });
    String[] as = (ass.length > 0) ? ass[0] : new String[0];
    List<Predicate> predicates = new ArrayList<Predicate>();
    for (int i = 0; i < as.length; i++) {
        // build predicate
        Predicate predicate = null;
        try {
            predicate = new Predicate(statement, as[i], terms);
        } catch (ParseException e) {
            // skip annotations that cannot be parsed
            // System.exit(1);
            continue;
        }
        predicates.add(predicate);
    }

    // drop placeholder arguments; the non-short-circuiting '|' ensures
    // that dropArgs() is invoked for every argument type
    boolean missingArgs = false;
    for (Predicate p : predicates) {
        if (p.dropArgs(PERSON_R) | p.dropArgs(THING_R) | p.dropArgs(DATE_TIME_R) |
                p.dropArgs(DURATION_R) | p.dropArgs(LOCATION_R) | p.dropArgs(PURPOSE_R) |
                p.dropArgs(MANNER_R) | p.dropArgs(QUANTIFICATION_R) | p.dropArgs(UNKNOWN_R))
            missingArgs = true;
    }

    // only return the predicates if at least one argument is missing
    // (else the answer extraction does not work)
    return (missingArgs) ? predicates.toArray(new Predicate[predicates.size()])
                         : new Predicate[0];
}
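Note the non-short-circuiting | in the dropArgs chain above: every dropArgs() call must run for its side effect of removing placeholder arguments, even after an earlier call has returned true. A self-contained illustration of the difference (DropArgsDemo is a hypothetical class, not Ephyra code):

public class DropArgsDemo {
    // stand-in for dropArgs(): has a side effect and returns true
    static boolean drop(String label) {
        System.out.println("dropping " + label);
        return true;
    }

    public static void main(String[] args) {
        boolean nonShortCircuit = drop("PERSON") | drop("LOCATION");  // both calls run
        boolean shortCircuit = drop("PERSON") || drop("LOCATION");    // second call skipped
        System.out.println(nonShortCircuit + " " + shortCircuit);     // true true
    }
}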
Use of info.ephyra.nlp.semantics.Predicate in project lucida by claritylab.
The class BagOfTermsG, method generateQueries.
/**
* Generates a "bag of terms" query from the terms in the question string.
*
* @param aq analyzed question
* @return <code>Query</code> objects
*/
public Query[] generateQueries(AnalyzedQuestion aq) {
    // only generate a query if the answer type is known, predicates could
    // be extracted, or the question is not a factoid question
    String[] ats = aq.getAnswerTypes();
    Predicate[] ps = aq.getPredicates();
    if (ats.length == 0 && ps.length == 0 && aq.isFactoid())
        return new Query[0];

    // create query string
    Term[] terms = aq.getTerms();
    String[] kws = aq.getKeywords();
    String queryString = getQueryString(terms, kws);

    // create query, set answer types
    Query[] queries = new Query[1];
    queries[0] = new Query(queryString, aq, SCORE);
    queries[0].setExtractionTechniques(EXTRACTION_TECHNIQUES);

    return queries;
}
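Usage mirrors PredicateG above, with the same caveats (QuestionAnalysis.analyze(), the no-argument constructor and getQueryString() are assumptions, not verified Ephyra API):

// sketch only: a factoid question with no known answer type and no
// extracted predicates yields an empty query array
AnalyzedQuestion aq = QuestionAnalysis.analyze("How far is the Moon from Earth?");
Query[] queries = new BagOfTermsG().generateQueries(aq);
for (Query query : queries)
    System.out.println(query.getQueryString());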
Use of info.ephyra.nlp.semantics.Predicate in project lucida by claritylab.
The class WebDocumentFetcher, method apply.
/**
* Fetches the top <code>MAX_DOCS</code> documents containing the given
* search engine snippets. The original snippets are dropped.
*
* @param results array of <code>Result</code> objects containing snippets
* @return array of <code>Result</code> objects containing entire documents
*/
public Result[] apply(Result[] results) {
    // documents containing the search engine snippets
    docs = new ArrayList<Result>();

    // start document fetchers
    HashSet<String> urls = new HashSet<String>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) ||
                ps.length == 0 ||
                result.getScore() > Float.NEGATIVE_INFINITY)
            continue;

        // if the result is not a web document, then just make a copy
        if (!result.getDocID().contains(":")) {
            Result newResult = result.getCopy();
            newResult.setScore(0);
            docs.add(newResult);
            continue;
        }

        // fetch at most MAX_DOCS documents
        if (urls.size() >= MAX_DOCS)
            break;
        String url = result.getDocID();

        // no forbidden document type
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
            continue;

        // only HTTP connections
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection))
                continue;
        } catch (IOException e) {
            continue;
        }

        // no duplicate documents
        if (!urls.add(url))
            continue;

        // if caching is enabled, try to read the document from the cache
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            String[] entries = cache.read(url);
            if (entries != null) {
                StringBuilder sb = new StringBuilder();
                for (String entry : entries) {
                    sb.append(entry);
                    sb.append("\n");
                }
                String docText = sb.toString();
                Result doc = new Result(docText, result.getQuery(), url,
                        result.getHitPos());
                doc.setScore(0);
                docs.add(doc);
                continue;
            }
        }

        (new WebDocumentFetcher()).start(this, result);
    }

    // wait until all fetchers are done
    waitForDocs();

    // keep the old results
    Result[] newResults = docs.toArray(new Result[docs.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    for (int i = 0; i < results.length; i++)
        allResults[i] = results[i];
    for (int i = 0; i < newResults.length; i++)
        allResults[results.length + i] = newResults[i];

    return allResults;
}
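The pre-fetch screening (HTTP only, no duplicates) can be exercised in isolation. Note that URL.openConnection() only creates the connection object without contacting the server, which is what makes this check cheap. A standalone sketch with JDK classes only (UrlScreenDemo is a hypothetical name, not Ephyra code):

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;

public class UrlScreenDemo {
    public static void main(String[] args) {
        String[] candidates = {
            "http://example.org/a.html",
            "ftp://example.org/b.txt",     // not HTTP: skipped
            "http://example.org/a.html"    // duplicate: skipped
        };
        HashSet<String> urls = new HashSet<String>();
        for (String url : candidates) {
            // only HTTP connections
            try {
                URLConnection conn = (new URL(url)).openConnection();
                if (!(conn instanceof HttpURLConnection))
                    continue;
            } catch (IOException e) {
                continue;
            }
            // no duplicate documents
            if (!urls.add(url))
                continue;
            System.out.println("would fetch: " + url);
        }
    }
}

The manual array concatenation at the end of apply() could equivalently use System.arraycopy; the loop form is kept here as in the source.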