use of info.ephyra.search.Result in project lucida by claritylab.
the class WebDocumentFetcher method run.
/**
* Fetches the document text and returns it to the
* <code>WebDocumentFetcherFilter</code>.
*/
public void run() {
    // fetch document text, retry up to RETRIES times
    String docText = null;
    int retries = RETRIES;
    boolean cached = false;
    do {
        // fetch document and convert to plain text
        try {
            docText = HTMLConverter.url2text(snippet.getDocID());
            if (docText == null)
                MsgPrinter.printHttpError("Document " + snippet.getDocID() + " not available.");
        } catch (SocketTimeoutException e) {
            docText = null;
            MsgPrinter.printHttpError("Connection to " + snippet.getDocID() + " timed out.");
        }
        retries--;
        // retrieve cached document if original document unavailable
        if (docText == null && retries < 0 && snippet.getCacheID() != null
                && !snippet.getCacheID().equals(snippet.getDocID())) {
            MsgPrinter.printErrorMsg("\nCould not fetch original source, trying cached source instead...");
            snippet.setDocID(snippet.getCacheID());
            retries = RETRIES;
            cached = true;
        }
    } while (docText == null && retries >= 0);
    // pass document to WebDocumentFetcherFilter
    if (docText != null) {
        Result doc = new Result(docText, snippet.getQuery(), snippet.getDocID(), snippet.getHitPos());
        doc.setScore(0);
        filter.addDoc(doc, cached);
    } else {
        MsgPrinter.printErrorMsg("\nCould not fetch document.");
        filter.addDoc(null, cached);
        // System.exit(1);
    }
}
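
The loop above retries the original URL up to RETRIES times, then switches once to the cached copy before giving up. Below is a minimal, self-contained sketch of the same retry-then-fallback pattern; fetch(), the RETRIES value, and the URLs are illustrative stand-ins, not Ephyra's API:

import java.io.IOException;

public class RetryFetchSketch {

    // assumption: mirrors the RETRIES constant used by WebDocumentFetcher
    static final int RETRIES = 2;

    // hypothetical fetcher; returns null when the document is unavailable
    static String fetch(String url) throws IOException {
        return null; // always fails here, to exercise the fallback path
    }

    static String fetchWithFallback(String docUrl, String cacheUrl) {
        String text = null;
        int retries = RETRIES;
        do {
            try {
                text = fetch(docUrl);
            } catch (IOException e) {
                text = null;
            }
            retries--;
            // retries exhausted: switch to the cached copy exactly once
            // (the equals() guard prevents switching a second time)
            if (text == null && retries < 0 && cacheUrl != null && !cacheUrl.equals(docUrl)) {
                docUrl = cacheUrl;
                retries = RETRIES;
            }
        } while (text == null && retries >= 0);
        return text;
    }

    public static void main(String[] args) {
        System.out.println(fetchWithFallback("http://example.org/doc", "http://example.org/cache"));
    }
}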
use of info.ephyra.search.Result in project lucida by claritylab.
the class WikipediaTermImportanceFilter method main.
public static void main(String[] args) {
    TEST_TERM_DOWMLOD = true;
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);
    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    // LingPipe.createTokenizer();
    // // create sentence detector
    // MsgPrinter.printStatusMsg("Creating sentence detector...");
    // if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create sentence detector.");
    // LingPipe.createSentenceDetector();
    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();
    // // create part of speech tagger
    // MsgPrinter.printStatusMsg("Creating POS tagger...");
    // if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
    //         "res/nlp/postagger/opennlp/tagdict"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    // if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //         "train-wsj-0-18.holder"))
    //     MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
    // // create chunker
    // MsgPrinter.printStatusMsg("Creating chunker...");
    // if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
    //         "EnglishChunk.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create chunker.");
    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg(" ...loading models");
    // if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    // if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
    //     MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg(" ...done");
    WikipediaTermImportanceFilter wtif =
            new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();
        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        // print and log the normalized question string
        MsgPrinter.printNormalization(qn);
        Logger.logNormalization(qn);
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);
        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++)
            queries[q].setOriginalQueryString(question);
        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);
        wtif.apply(results);
    }
}
use of info.ephyra.search.Result in project lucida by claritylab.
the class EphyraTREC13To16 method runAndEval.
/**
 * Initializes Ephyra, asks the questions or loads the answers from a log
 * file, evaluates the answers if patterns are available, and logs and
 * saves the answers.
 */
private static void runAndEval() {
    // initialize Ephyra
    EphyraTREC13To16 ephyra = new EphyraTREC13To16();
    // evaluate for multiple thresholds
    boolean firstThreshold = true;
    // for (float fAbsThresh = FACTOID_ABS_THRESH;
    //         fAbsThresh <= 1; fAbsThresh += 0.01) {
    float fAbsThresh = FACTOID_ABS_THRESH;
    // for (float lRelThresh = LIST_REL_THRESH;
    //         lRelThresh <= 1; lRelThresh += 0.01) {
    float lRelThresh = LIST_REL_THRESH;
    for (TRECTarget target : targets) {
        MsgPrinter.printTarget(target.getTargetDesc());
        // normalize target description, determine target types
        if (firstThreshold)
            TargetPreprocessor.preprocess(target);
        String targetDesc = target.getTargetDesc();
        String condensedTarget = target.getCondensedTarget();
        TRECQuestion[] questions = target.getQuestions();
        // condensed target is used as contextual information
        QuestionAnalysis.setContext(condensedTarget);
        for (int i = 0; i < questions.length; i++) {
            MsgPrinter.printQuestion(questions[i].getQuestionString());
            String id = questions[i].getId();
            String type = questions[i].getType();
            String qs;
            if (type.equals("FACTOID") || type.equals("LIST")) {
                // resolve coreferences in factoid and list questions
                if (firstThreshold) {
                    MsgPrinter.printResolvingCoreferences();
                    CorefResolver.resolvePronounsToTarget(target, i);
                }
                qs = questions[i].getQuestionString();
            } else {
                qs = targetDesc;
            }
            // set pattern used to evaluate answers for overlap analysis
            OverlapAnalysisFilter.setPattern(null);
            if (type.equals("FACTOID")) {
                for (TRECPattern pattern : factoidPatterns) {
                    if (pattern.getId().equals(id)) {
                        OverlapAnalysisFilter.setPattern(pattern);
                        break;
                    }
                }
            }
            // ask Ephyra or load answer from log file
            Result[] results = null;
            if ((type.equals("FACTOID") && factoidLog)
                    || (type.equals("LIST") && listLog)
                    || (type.equals("OTHER") && otherLog)) {
                results = TREC13To16Parser.loadResults(qs, type, inputLogFile);
            }
            if (results == null) {
                // answer not loaded from log file
                if (type.equals("FACTOID")) {
                    Logger.logFactoidStart(qs);
                    results = ephyra.askFactoid(qs, FACTOID_MAX_ANSWERS, FACTOID_ABS_THRESH);
                    // results = new Result[0];
                    Logger.logResults(results);
                    Logger.logFactoidEnd();
                } else if (type.equals("LIST")) {
                    Logger.logListStart(qs);
                    results = ephyra.askList(qs, LIST_REL_THRESH);
                    // results = new Result[0];
                    Logger.logResults(results);
                    Logger.logListEnd();
                } else {
                    Logger.logDefinitionalStart(qs);
                    results = ephyra.askOther(target);
                    // results = new Result[0];
                    Logger.logResults(results);
                    Logger.logDefinitionalEnd();
                }
            }
            // calculate question score if patterns are available
            boolean[] correct = null;
            if (type.equals("FACTOID") && factoidPatterns != null)
                correct = evalFactoidQuestion(id, results, fAbsThresh);
            else if (type.equals("LIST") && listPatterns != null)
                correct = evalListQuestion(id, results, lRelThresh);
            // update target data structure
            TRECAnswer[] answers = new TRECAnswer[results.length];
            for (int j = 0; j < results.length; j++) {
                String answer = results[j].getAnswer();
                String supportDoc = results[j].getDocID();
                answers[j] = new TRECAnswer(id, answer, supportDoc);
            }
            questions[i].setAnswers(answers);
            if (results.length > 0) {
                QuestionInterpretation qi = results[0].getQuery().getInterpretation();
                if (qi != null)
                    questions[i].setInterpretation(qi);
            }
            if (answers.length == 0) {
                // no answer found
                answers = new TRECAnswer[1];
                if (type.equals("FACTOID"))
                    answers[0] = new TRECAnswer(id, null, "NIL");
                else
                    answers[0] = new TRECAnswer(id, "No answers found.", "XIE19960101.0001");
            }
            // save answers to output file
            TREC13To16Parser.saveAnswers("log/" + runTag, answers, correct, runTag);
        }
        // calculate target scores if patterns are available
        if (factoidPatterns != null)
            evalFactoidTarget();
        if (listPatterns != null)
            evalListTarget();
    }
    // calculate component scores and log scores if patterns are available
    if (factoidPatterns != null)
        evalFactoidTotal(fAbsThresh);
    if (listPatterns != null)
        evalListTotal(lRelThresh);
    firstThreshold = false;
    // }
    // }
}
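
Two different threshold semantics appear in runAndEval(): fAbsThresh is, by its name, an absolute score cutoff for factoid answers, while lRelThresh is a relative cutoff (evalListQuestion below keeps results scoring at least relThresh times the top score). A standalone sketch of the distinction; the scores and threshold values here are illustrative:

import java.util.ArrayList;
import java.util.List;

public class ThresholdSketch {
    public static void main(String[] args) {
        float[] scores = { 0.9f, 0.5f, 0.2f }; // sorted, best first

        // absolute threshold (factoid): keep scores >= fAbsThresh
        float fAbsThresh = 0.4f;
        List<Float> factoid = new ArrayList<>();
        for (float s : scores)
            if (s >= fAbsThresh)
                factoid.add(s);

        // relative threshold (list): keep scores >= lRelThresh * top score
        float lRelThresh = 0.5f;
        float top = scores[0];
        List<Float> list = new ArrayList<>();
        for (float s : scores)
            if (s >= lRelThresh * top)
                list.add(s);

        System.out.println("factoid kept: " + factoid); // [0.9, 0.5]
        System.out.println("list kept: " + list);       // [0.9, 0.5], cutoff 0.45
    }
}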
use of info.ephyra.search.Result in project lucida by claritylab.
the class EphyraTREC13To16 method askOther.
// Layout 2
// /**
// * Initializes the pipeline for 'other' questions.
// */
// protected void initOther() {
// // query generation
// QueryGeneration.clearQueryGenerators();
//
// // search
// // - knowledge miners for unstructured knowledge sources
// Search.clearKnowledgeMiners();
// for (String[] indriIndices : IndriKM.getIndriIndices())
// Search.addKnowledgeMiner(new IndriKM(indriIndices, false));
// for (String[] indriServers : IndriKM.getIndriServers())
// Search.addKnowledgeMiner(new IndriKM(indriServers, true));
// // - knowledge annotators for (semi-)structured knowledge sources
// Search.clearKnowledgeAnnotators();
//
// // answer extraction and selection
// // (the filters are applied in this order)
// AnswerSelection.clearFilters();
//
// // initialize scores
// AnswerSelection.addFilter(new ScoreResetterFilter());
//
// // extract sentences from snippets
// AnswerSelection.addFilter(new SentenceExtractionFilter());
//
// // cut meaningless introductions from sentences
// AnswerSelection.addFilter(new CutKeywordsFilter());
// AnswerSelection.addFilter(new CutStatementProviderFilter());
// AnswerSelection.addFilter(new SentenceSplitterFilter());
// AnswerSelection.addFilter(new CutKeywordsFilter());
//
// // remove duplicates
// AnswerSelection.addFilter(new DuplicateSnippetFilter());
//
// // throw out enumerations of proper names
// AnswerSelection.addFilter(new ProperNameFilter());
//
// // throw out direct speech snippets, rarely contain useful information
// AnswerSelection.addFilter(new DirectSpeechFilter());
//
// AnswerSelection.addFilter(
//         new WikipediaGoogleWebTermImportanceFilter(
//                 WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//                 WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//                 false
//         )
// );
// AnswerSelection.addFilter(new ScoreSorterFilter());
//
// // cut off result
// AnswerSelection.addFilter(new ResultLengthFilter(3000));
// }
// Layout 3
// /**
// * Initializes the pipeline for 'other' questions.
// */
// protected void initOther() {
// // query generation
// QueryGeneration.clearQueryGenerators();
//
// // search
// // - knowledge miners for unstructured knowledge sources
// Search.clearKnowledgeMiners();
// for (String[] indriIndices : IndriKM.getIndriIndices())
// Search.addKnowledgeMiner(new IndriDocumentKM(indriIndices, false));
// for (String[] indriServers : IndriKM.getIndriServers())
// Search.addKnowledgeMiner(new IndriDocumentKM(indriServers, true));
// // - knowledge annotators for (semi-)structured knowledge sources
// Search.clearKnowledgeAnnotators();
//
// // answer extraction and selection
// // (the filters are applied in this order)
// AnswerSelection.clearFilters();
//
// // initialize scores
// AnswerSelection.addFilter(new ScoreResetterFilter());
//
// // extract sentences from snippets
// AnswerSelection.addFilter(new SentenceExtractionFilter());
//
// // cut meaningless introductions from sentences
// AnswerSelection.addFilter(new CutKeywordsFilter());
// AnswerSelection.addFilter(new CutStatementProviderFilter());
// AnswerSelection.addFilter(new SentenceSplitterFilter());
// AnswerSelection.addFilter(new CutKeywordsFilter());
//
// // remove duplicates
// AnswerSelection.addFilter(new DuplicateSnippetFilter());
//
// // throw out enumerations of proper names
// AnswerSelection.addFilter(new ProperNameFilter());
//
// // throw out direct speech snippets, rarely contain useful information
// AnswerSelection.addFilter(new DirectSpeechFilter());
//
// // sort out snippets containing no new terms
// AnswerSelection.addFilter(new TermFilter());
//
// AnswerSelection.addFilter(
//         new WikipediaGoogleWebTermImportanceFilter(
//                 WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//                 WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//                 false
//         )
// );
// AnswerSelection.addFilter(new ScoreSorterFilter());
//
// // cut off result
// AnswerSelection.addFilter(new ResultLengthFilter(3000));
// }
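
Both commented-out layouts configure AnswerSelection as an ordered chain of filters, each of which rewrites the candidate list before the next one runs. A minimal, self-contained sketch of that pattern (generic names and string candidates; not Ephyra's actual classes):

import java.util.ArrayList;
import java.util.List;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;

public class FilterChainSketch {
    // a filter maps a list of candidate answers to a (possibly shorter) list
    private static final List<UnaryOperator<List<String>>> FILTERS = new ArrayList<>();

    static void addFilter(UnaryOperator<List<String>> f) {
        FILTERS.add(f);
    }

    static List<String> apply(List<String> candidates) {
        for (UnaryOperator<List<String>> f : FILTERS)
            candidates = f.apply(candidates); // filters run in insertion order
        return candidates;
    }

    public static void main(String[] args) {
        // drop duplicates, then cap the result size, loosely mirroring the
        // DuplicateSnippetFilter / ResultLengthFilter steps above
        addFilter(c -> c.stream().distinct().collect(Collectors.toList()));
        addFilter(c -> c.subList(0, Math.min(2, c.size())));
        System.out.println(apply(List.of("a", "b", "a", "c"))); // [a, b]
    }
}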
/**
* Asks Ephyra an 'other' question.
*
* @param question other question
* @return array of results
*/
public final Result[] askOther(String question) {
    // initialize pipeline
    initOther();
    // query generation
    MsgPrinter.printGeneratingQueries();
    String qn = QuestionNormalizer.normalize(question);
    // print and log the normalized question string
    MsgPrinter.printNormalization(qn);
    Logger.logNormalization(qn);
    String[] kws = KeywordExtractor.getKeywords(qn);
    AnalyzedQuestion aq = new AnalyzedQuestion(question);
    aq.setKeywords(kws);
    aq.setFactoid(false);
    BagOfWordsG gen = new BagOfWordsG();
    Query[] queries = gen.generateQueries(aq);
    for (int q = 0; q < queries.length; q++)
        queries[q].setOriginalQueryString(question);
    // print and log the query strings
    MsgPrinter.printQueryStrings(queries);
    Logger.logQueryStrings(queries);
    // search
    MsgPrinter.printSearching();
    Result[] results = Search.doSearch(queries);
    // answer selection
    MsgPrinter.printSelectingAnswers();
    results = AnswerSelection.getResults(results, Integer.MAX_VALUE, 0);
    return results;
}
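
A hedged usage sketch for askOther(); it assumes the pipeline resources set up in the main() method shown earlier (tokenizer, stemmer, NE taggers) have already been initialized, and that EphyraTREC13To16's constructor is accessible from the calling package:

import info.ephyra.search.Result;

public class AskOtherSketch {
    public static void main(String[] args) {
        EphyraTREC13To16 ephyra = new EphyraTREC13To16();
        // illustrative 'other' question; any target description works here
        Result[] results = ephyra.askOther("Describe the Hubble Space Telescope.");
        for (Result r : results)
            System.out.println(r.getScore() + "\t" + r.getAnswer());
    }
}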
use of info.ephyra.search.Result in project lucida by claritylab.
the class EphyraTREC13To16 method evalListQuestion.
/**
* Calculates the score for a single list question.
*
* @param qid ID of the question
* @param results the results from Ephyra
* @param relThresh relative confidence threshold for results
* @return for each answer a flag that is true iff the answer is correct
*/
private static boolean[] evalListQuestion(String qid, Result[] results, float relThresh) {
    // get pattern
    TRECPattern pattern = null;
    for (TRECPattern listPattern : listPatterns) {
        if (listPattern.getId().equals(qid)) {
            pattern = listPattern;
            break;
        }
    }
    // pattern not available
    if (pattern == null)
        return new boolean[0];
    // get results with a score of at least relThresh * top score
    ArrayList<Result> resultList = new ArrayList<Result>();
    if (results.length > 0) {
        float topScore = results[0].getScore();
        for (Result result : results)
            if (result.getScore() >= relThresh * topScore)
                resultList.add(result);
    }
    // F measure
    float f = 0;
    // correct results
    boolean[] correct = new boolean[resultList.size()];
    if (resultList.size() > 0) {
        String[] regexs = pattern.getRegexs();
        // total number of known answers
        int total = regexs.length;
        // number of returned results
        int returned = resultList.size();
        // number of answers covered by the results
        int covered = 0;
        for (String regex : regexs) {
            boolean found = false;
            for (int i = 0; i < resultList.size(); i++) {
                String answer = resultList.get(i).getAnswer();
                if (answer.matches(".*?" + regex + ".*+")) {
                    if (!found) {
                        covered++;
                        found = true;
                    }
                    correct[i] = true;
                }
            }
        }
        if (covered > 0) {
            float recall = ((float) covered) / total;
            float precision = ((float) covered) / returned;
            f = (2 * recall * precision) / (recall + precision);
        }
    }
    listQuestionScores.add(f);
    return correct;
}
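
For intuition, a worked instance of the F computation above, with illustrative counts: 4 known answers, 5 returned results, and 2 answers covered gives recall 2/4 = 0.5, precision 2/5 = 0.4, and F = 2 * 0.5 * 0.4 / 0.9, approximately 0.444:

public class FMeasureSketch {
    public static void main(String[] args) {
        int total = 4;    // known answers (pattern regexes)
        int returned = 5; // results above the relative threshold
        int covered = 2;  // known answers matched by some result

        float recall = ((float) covered) / total;       // 0.5
        float precision = ((float) covered) / returned; // 0.4
        float f = (2 * recall * precision) / (recall + precision);
        System.out.println(f); // prints 0.44444445
    }
}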