Use of info.ephyra.querygeneration.Query in project lucida by claritylab.
The class WebTermImportanceFilter, method main.
public static void main(String[] args) {
TEST_TARGET_GENERATION = true;
MsgPrinter.enableStatusMsgs(true);
MsgPrinter.enableErrorMsgs(true);
// create tokenizer
MsgPrinter.printStatusMsg("Creating tokenizer...");
if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
MsgPrinter.printErrorMsg("Could not create tokenizer.");
// LingPipe.createTokenizer();
// create sentence detector
// MsgPrinter.printStatusMsg("Creating sentence detector...");
// if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
// MsgPrinter.printErrorMsg("Could not create sentence detector.");
// LingPipe.createSentenceDetector();
// create stemmer
MsgPrinter.printStatusMsg("Creating stemmer...");
SnowballStemmer.create();
// create part of speech tagger
MsgPrinter.printStatusMsg("Creating POS tagger...");
if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz", "res/nlp/postagger/opennlp/tagdict"))
MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
// if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
// "train-wsj-0-18.holder"))
// MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
// create chunker
MsgPrinter.printStatusMsg("Creating chunker...");
if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/EnglishChunk.bin.gz"))
MsgPrinter.printErrorMsg("Could not create chunker.");
// create named entity taggers
MsgPrinter.printStatusMsg("Creating NE taggers...");
NETagger.loadListTaggers("res/nlp/netagger/lists/");
NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
MsgPrinter.printStatusMsg(" ...loading models");
// MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
MsgPrinter.printStatusMsg(" ...done");
WebTermImportanceFilter wtif = new TargetGeneratorTest(NO_NORMALIZATION);
TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
for (TRECTarget target : targets) {
String question = target.getTargetDesc();
// query generation
MsgPrinter.printGeneratingQueries();
String qn = QuestionNormalizer.normalize(question);
// print normalized question string
MsgPrinter.printNormalization(qn);
// log normalized question string
Logger.logNormalization(qn);
String[] kws = KeywordExtractor.getKeywords(qn);
AnalyzedQuestion aq = new AnalyzedQuestion(question);
aq.setKeywords(kws);
aq.setFactoid(false);
Query[] queries = new BagOfWordsG().generateQueries(aq);
for (int q = 0; q < queries.length; q++) queries[q].setOriginalQueryString(question);
Result[] results = new Result[1];
results[0] = new Result("This would be the answer", queries[0]);
wtif.apply(results);
}
}
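For reference, the query-generation half of the target loop above can be distilled into a standalone sketch. This is a minimal sketch, not code from the repository: the import paths are assumed from the usual Ephyra source layout, and the NLP components (tokenizer, stemmer, POS tagger, NE taggers) must already have been initialized as in main() above.
import info.ephyra.querygeneration.Query;
import info.ephyra.querygeneration.generators.BagOfWordsG;   // path assumed
import info.ephyra.questionanalysis.AnalyzedQuestion;        // path assumed
import info.ephyra.questionanalysis.KeywordExtractor;        // path assumed
import info.ephyra.questionanalysis.QuestionNormalizer;      // path assumed
import info.ephyra.search.Result;                            // path assumed

public class QueryGenerationSketch {
    public static void main(String[] args) {
        String question = "What is the capital of France?";
        // normalize the question and extract keywords, as in the target loop above
        String qn = QuestionNormalizer.normalize(question);
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);
        // generate bag-of-words queries and keep the original question string
        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (Query q : queries) q.setOriginalQueryString(question);
        if (queries.length == 0) return;
        // wrap a placeholder answer in a Result, ready to be handed to a filter
        Result seed = new Result("This would be the answer", queries[0]);
        System.out.println(queries.length + " queries; seed answer: " + seed.getAnswer());
    }
}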
Use of info.ephyra.querygeneration.Query in project lucida by claritylab.
The class WebDocumentFetcher, method apply.
/**
* Fetches the top <code>MAX_DOCS</code> documents containing the given
* search engine snippets. The original snippets are dropped.
*
* @param results array of <code>Result</code> objects containing snippets
* @return array of <code>Result</code> objects containing entire documents
*/
public Result[] apply(Result[] results) {
// documents containing the search engine snippets
docs = new ArrayList<Result>();
// start document fetchers
HashSet<String> urls = new HashSet<String>();
for (Result result : results) {
// only apply this filter to results for the semantic parsing
// approach
Query query = result.getQuery();
Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
continue;
// if result is not a web document then just make a copy
if (!result.getDocID().contains(":")) {
Result newResult = result.getCopy();
newResult.setScore(0);
docs.add(newResult);
continue;
}
// fetch at most MAX_DOCS documents
if (urls.size() >= MAX_DOCS)
break;
String url = result.getDocID();
// no forbidden document type
if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
continue;
// only HTTP connections
try {
URLConnection conn = (new URL(url)).openConnection();
if (!(conn instanceof HttpURLConnection))
continue;
} catch (IOException e) {
continue;
}
// no duplicate document
if (!urls.add(url))
continue;
// if caching is enabled, try to read document from cache
if (CACHING) {
FileCache cache = new FileCache(CACHE_DIR);
String[] entries = cache.read(url);
if (entries != null) {
StringBuilder sb = new StringBuilder();
for (String entry : entries) {
sb.append(entry);
sb.append("\n");
}
String docText = sb.toString();
Result doc = new Result(docText, result.getQuery(), url, result.getHitPos());
doc.setScore(0);
docs.add(doc);
continue;
}
}
(new WebDocumentFetcher()).start(this, result);
}
// wait until all fetchers are done
waitForDocs();
// keep old results
Result[] newResults = docs.toArray(new Result[docs.size()]);
Result[] allResults = new Result[results.length + newResults.length];
for (int i = 0; i < results.length; i++) allResults[i] = results[i];
for (int i = 0; i < newResults.length; i++) allResults[results.length + i] = newResults[i];
return allResults;
}
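The per-URL screening in the loop above (forbidden document types, HTTP-only connections, duplicate suppression) can be illustrated in isolation. This is a minimal sketch using only standard java.net calls; the FORBIDDEN_DOCS value below is a hypothetical stand-in for the class constant, not its actual definition.
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;

public class UrlScreeningSketch {
    // hypothetical stand-in for the FORBIDDEN_DOCS constant used above
    private static final String FORBIDDEN_DOCS = "\\.(pdf|ps|doc)$";

    /** Applies the same checks as the fetcher loop above and returns true if the URL passes. */
    static boolean accept(String url, HashSet<String> seen) {
        // skip forbidden document types
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS)) return false;
        // only HTTP connections are fetched (openConnection does not contact the server)
        try {
            URLConnection conn = new URL(url).openConnection();
            if (!(conn instanceof HttpURLConnection)) return false;
        } catch (IOException e) {
            return false;
        }
        // skip duplicate documents
        return seen.add(url);
    }

    public static void main(String[] args) {
        HashSet<String> seen = new HashSet<String>();
        System.out.println(accept("http://example.org/page.html", seen)); // true
        System.out.println(accept("http://example.org/page.html", seen)); // false (duplicate)
        System.out.println(accept("ftp://example.org/file.txt", seen));   // false (not HTTP)
    }
}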
Use of info.ephyra.querygeneration.Query in project lucida by claritylab.
The class FactoidsFromPredicatesFilter, method apply.
/**
* Extracts factoids from the predicates within the answer strings of the
* <code>Result</code> objects and creates a new <code>Result</code> for
* each extracted unique answer.
*
* @param results array of <code>Result</code> objects containing predicates
* @return array of <code>Result</code> objects containing factoids
*/
public Result[] apply(Result[] results) {
// old results that are passed along the pipeline
ArrayList<Result> oldResults = new ArrayList<Result>();
// extracted factoid answers and corresponding results
Hashtable<String, Result> factoids = new Hashtable<String, Result>();
// extracted factoid answers and maximum weights of predicates
Hashtable<String, Double> maxScores = new Hashtable<String, Double>();
for (Result result : results) {
// only apply this filter to results for the semantic parsing
// approach
Query query = result.getQuery();
Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
if (!query.extractWith(ID) || ps.length == 0 || result.getScore() != 0) {
oldResults.add(result);
continue;
}
Predicate p = result.getPredicate();
Predicate questionP = p.getSimPredicate();
double simScore = p.getSimScore();
Map<String, String[]> nes = result.getNes();
// get answer strings
ArrayList<String> answers = new ArrayList<String>();
if (nes != null) {
// - allow entities in all arguments
for (String ne : nes.keySet()) for (String arg : p.getArgs()) if (arg.contains(ne)) {
answers.add(ne);
break;
}
// - allow entities in missing arguments only
// for (String ne : nes.keySet())
// for (String missing : questionP.getMissingArgs()) {
// String arg = p.get(missing);
// if (arg != null && arg.contains(ne)) {
// answers.add(ne);
// break;
// }
// }
} else {
// arguments as factoid answers
for (String missing : questionP.getMissingArgs()) {
String arg = p.get(missing);
if (arg != null)
answers.add(arg);
}
}
// create result objects
for (String answer : answers) {
String norm = StringUtils.normalize(answer);
Result factoid = factoids.get(norm);
if (factoid == null) {
// new answer
// query, doc ID and sentence can be ambiguous
factoid = new Result(answer, result.getQuery(), result.getDocID());
factoid.setSentence(result.getSentence());
factoid.addExtractionTechnique(ID);
factoids.put(norm, factoid);
maxScores.put(norm, simScore);
} else if (simScore > maxScores.get(norm)) {
// remember document ID of predicate with highest score
factoid.setDocID(result.getDocID());
maxScores.put(norm, simScore);
}
if (nes != null)
for (String neType : nes.get(answer)) factoid.addNeType(neType);
factoid.incScore((float) simScore);
}
}
// keep old results
Result[] newResults = factoids.values().toArray(new Result[factoids.size()]);
Result[] allResults = new Result[oldResults.size() + newResults.length];
oldResults.toArray(allResults);
for (int i = 0; i < newResults.length; i++) allResults[oldResults.size() + i] = newResults[i];
return allResults;
}
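The bookkeeping above follows a common dedup-and-accumulate idiom: normalize the answer string, keep one entry per normalized form, add up scores across mentions, and remember the document of the highest-scoring predicate. A minimal, Ephyra-free sketch of the same idiom with plain strings (lower-casing stands in for StringUtils.normalize):
import java.util.HashMap;
import java.util.Map;

public class FactoidDedupSketch {
    public static void main(String[] args) {
        // (answer, simScore, docID) triples standing in for Result objects
        String[][] hits = {
            {"Albert Einstein", "0.8", "doc1"},
            {"albert einstein", "0.9", "doc2"},   // same answer, higher predicate score
            {"Niels Bohr", "0.5", "doc3"}
        };
        Map<String, Double> totalScores = new HashMap<>(); // accumulated scores per factoid
        Map<String, Double> maxScores = new HashMap<>();   // best predicate score per factoid
        Map<String, String> bestDoc = new HashMap<>();     // doc ID of the best-scoring predicate
        for (String[] hit : hits) {
            String norm = hit[0].toLowerCase();            // stand-in for StringUtils.normalize
            double score = Double.parseDouble(hit[1]);
            totalScores.merge(norm, score, Double::sum);   // analogue of incScore
            if (score > maxScores.getOrDefault(norm, Double.NEGATIVE_INFINITY)) {
                maxScores.put(norm, score);                // analogue of maxScores in the filter
                bestDoc.put(norm, hit[2]);                 // analogue of setDocID
            }
        }
        totalScores.forEach((answer, total) ->
            System.out.println(answer + " total=" + total + " bestDoc=" + bestDoc.get(answer)));
    }
}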
Use of info.ephyra.querygeneration.Query in project lucida by claritylab.
The class AnswerTypeFilter, method apply.
/**
* Extracts NEs of particular types from the answer strings of the
* <code>Result</code> objects and creates a new <code>Result</code> for
* each extracted unique answer.
*
* @param results array of <code>Result</code> objects
* @return extended array of <code>Result</code> objects
*/
public Result[] apply(Result[] results) {
// extracted factoid answers and corresponding results
Hashtable<String, Result> factoids = new Hashtable<String, Result>();
for (Result result : results) {
// only apply this filter to results for the answer type testing
// approach
Query query = result.getQuery();
String[] answerTypes = query.getAnalyzedQuestion().getAnswerTypes();
if (!query.extractWith(ID) || answerTypes.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
continue;
// split answer string into sentences and tokenize sentences
String answer = result.getAnswer();
String[] sentences = OpenNLP.sentDetect(answer);
String[][] tokens = new String[sentences.length][];
for (int i = 0; i < sentences.length; i++) tokens[i] = NETagger.tokenize(sentences[i]);
for (String answerType : answerTypes) {
// get IDs of the taggers for the most specific NE type that can
// be tagged
String[] neTypes = answerType.split("->");
int[] neIds = new int[0];
for (String neType : neTypes) {
int[] thisIds = NETagger.getNeIds(neType);
if (thisIds.length > 0)
neIds = thisIds;
}
// extract NEs of that type
for (int neId : neIds) {
String neType = NETagger.getNeType(neId);
String[][] nes = NETagger.extractNes(tokens, neId);
for (int i = 0; i < sentences.length; i++) {
// untokenize NEs
for (int j = 0; j < nes[i].length; j++) nes[i][j] = OpenNLP.untokenize(nes[i][j], sentences[i]);
// create new result for each unique normalized NE
for (String ne : nes[i]) {
String norm = StringUtils.normalize(ne);
Result factoid = factoids.get(norm);
if (factoid == null) {
// new answer
// query, doc ID and sentence can be ambiguous
factoid = new Result(ne, result.getQuery(), result.getDocID());
factoid.setSentence(sentences[i]);
factoid.addExtractionTechnique(ID);
factoids.put(norm, factoid);
}
factoid.addNeType(neType);
factoid.incScore(1);
// TODO consider query score, #keywords, hit pos
}
}
}
}
}
// keep old results
Result[] newResults = factoids.values().toArray(new Result[factoids.size()]);
Result[] allResults = new Result[results.length + newResults.length];
for (int i = 0; i < results.length; i++) allResults[i] = results[i];
for (int i = 0; i < newResults.length; i++) allResults[results.length + i] = newResults[i];
return allResults;
}
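The answer-type handling above walks a type hierarchy string such as "NEproperName->NEperson" and keeps the tagger IDs of the most specific level for which taggers exist. A minimal sketch of that selection; the lookup table is a hypothetical stand-in for NETagger.getNeIds(), and the type names are illustrative only.
import java.util.HashMap;
import java.util.Map;

public class AnswerTypeSketch {
    // hypothetical stand-in for NETagger.getNeIds(): type name -> tagger IDs
    private static final Map<String, int[]> NE_IDS = new HashMap<>();
    static {
        NE_IDS.put("NEproperName", new int[] {3});
        NE_IDS.put("NEperson", new int[] {7, 8});
        // no entry for "NEscientist": no tagger exists for that level
    }

    /** Returns the tagger IDs of the most specific level in the hierarchy that can be tagged. */
    static int[] mostSpecificTaggable(String answerType) {
        String[] neTypes = answerType.split("->");
        int[] neIds = new int[0];
        for (String neType : neTypes) {
            int[] thisIds = NE_IDS.getOrDefault(neType, new int[0]);
            if (thisIds.length > 0) neIds = thisIds;   // keep the deepest taggable level
        }
        return neIds;
    }

    public static void main(String[] args) {
        // falls back to the NEperson taggers since NEscientist cannot be tagged
        int[] ids = mostSpecificTaggable("NEproperName->NEperson->NEscientist");
        System.out.println(ids.length + " tagger IDs, first = " + ids[0]);
    }
}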
Use of info.ephyra.querygeneration.Query in project lucida by claritylab.
The class PredicateExtractionFilter, method apply.
/**
* Extracts relevant predicates from documents.
*
* @param results array of <code>Result</code> objects containing documents
* @return array of <code>Result</code> objects containing predicates
*/
public Result[] apply(Result[] results) {
if (results.length == 0)
return results;
ArrayList<Result> allResults = new ArrayList<Result>();
// extract relevant sentences
// - get sentences that contain relevant verbs,
// use weights of verbs as confidence scores
HashSet<Result> ssSet = new HashSet<Result>();
for (Result result : results) {
// only apply this filter to results for the semantic parsing
// approach
Query query = result.getQuery();
Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() != 0) {
allResults.add(result);
continue;
}
// get all verb forms and build patterns
Hashtable<String[], Double> verbFormsMap = getAllVerbForms(ps);
ArrayList<String> verbPatterns = new ArrayList<String>();
ArrayList<Double> verbWeights = new ArrayList<Double>();
for (String[] verbForms : verbFormsMap.keySet()) {
String verbPattern = "(?i).*?\\b(" + StringUtils.concat(verbForms, "|") + ")\\b.*+";
verbPatterns.add(verbPattern);
verbWeights.add(verbFormsMap.get(verbForms));
}
String[] paragraphs = result.getAnswer().split("\\n");
for (String p : paragraphs) {
// paragraph does not contain relevant verb?
boolean contains = false;
for (String verbPattern : verbPatterns) {
if (p.matches(verbPattern)) {
contains = true;
break;
}
}
if (!contains)
continue;
String[] sentences = LingPipe.sentDetect(p);
for (String s : sentences) {
// sentence does not contain relevant verb?
Double weight = 0d;
for (int i = 0; i < verbPatterns.size(); i++) {
if (s.matches(verbPatterns.get(i))) {
weight = verbWeights.get(i);
break;
}
}
if (weight == 0d)
continue;
// replace whitespaces by single blanks and trim
s = s.replaceAll("\\s++", " ").trim();
// create sentence-level result object
Result sentence = result.getCopy();
sentence.setAnswer(s);
sentence.setScore(weight.floatValue());
ssSet.add(sentence);
}
}
}
// - check if these sentences are relevant,
// get MAX_SENTENCES sentences with most relevant verbs
Result[] ss = ssSet.toArray(new Result[ssSet.size()]);
ss = (new ScoreSorterFilter()).apply(ss);
ArrayList<Result> ssList = new ArrayList<Result>();
for (Result s : ss) {
s.setScore(0);
if (checkSentence(s))
ssList.add(s);
// get at most MAX_SENTENCES sentences
if (ssList.size() >= MAX_SENTENCES)
break;
}
ss = ssList.toArray(new Result[ssList.size()]);
if (ss.length == 0)
return allResults.toArray(new Result[allResults.size()]);
// annotate predicates in sentences
String[] sentences = new String[ss.length];
for (int i = 0; i < ss.length; i++) sentences[i] = ss[i].getAnswer();
String[][] ass = ASSERT.annotatePredicates(sentences);
// extract predicates from annotations
for (int i = 0; i < ass.length; i++) {
Term[] terms = ss[i].getTerms();
Predicate[] questionPs = ss[i].getQuery().getAnalyzedQuestion().getPredicates();
for (int j = 0; j < ass[i].length; j++) {
// build predicate
Predicate predicate = null;
try {
predicate = new Predicate(sentences[i], ass[i][j], terms);
} catch (ParseException e) {
// System.exit(1);
continue;
}
// calculate similarity score
double simScore = 0;
Predicate simPredicate = null;
for (Predicate questionP : questionPs) // compare to predicates with missing arguments only
if (questionP.hasMissingArgs()) {
double currSimScore = predicate.simScore(questionP);
if (currSimScore > simScore) {
simScore = currSimScore;
simPredicate = questionP;
}
}
// keep predicate if it is similar to a question predicate
if (simScore > 0) {
predicate.setSimScore(simScore);
predicate.setSimPredicate(simPredicate);
Result result = ss[i].getCopy();
result.setAnswer(ass[i][j]);
result.setSentence(sentences[i]);
result.setPredicate(predicate);
allResults.add(result);
}
}
}
return allResults.toArray(new Result[allResults.size()]);
}
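The sentence screening above builds one regular expression per group of verb forms and assigns each matching sentence the weight of the first group it matches. A minimal sketch of that pattern construction and matching, with hard-coded verb forms and weights standing in for getAllVerbForms(ps), and String.join in place of StringUtils.concat:
import java.util.ArrayList;
import java.util.List;

public class VerbPatternSketch {
    public static void main(String[] args) {
        // hypothetical verb forms and weights standing in for getAllVerbForms(ps)
        String[][] verbForms = { {"invent", "invented", "invents"}, {"discover", "discovered"} };
        double[] verbWeights = { 0.9, 0.6 };
        // build one pattern per verb group, as in the filter above
        List<String> verbPatterns = new ArrayList<>();
        for (String[] forms : verbForms)
            verbPatterns.add("(?i).*?\\b(" + String.join("|", forms) + ")\\b.*+");
        String[] sentences = {
            "Alexander Fleming discovered penicillin in 1928.",
            "Penicillin is an antibiotic."
        };
        for (String s : sentences) {
            double weight = 0d;
            for (int i = 0; i < verbPatterns.size(); i++)
                if (s.matches(verbPatterns.get(i))) { weight = verbWeights[i]; break; }
            // sentences without a relevant verb keep weight 0 and would be skipped by the filter
            System.out.println(weight + "  " + s.replaceAll("\\s++", " ").trim());
        }
    }
}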