Use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.
The class ScoreNormalizationFilter, method readSerializedResults.
/**
 * Reads serialized results from a file.
 *
 * @param input input file
 * @return result objects
 */
private static Result[] readSerializedResults(File input) {
    ArrayList<Result> results = new ArrayList<Result>();
    try {
        FileInputStream fis = new FileInputStream(input);
        ObjectInputStream ois = new ObjectInputStream(fis);
        // the first serialized object must be the AnalyzedQuestion;
        // verify its type, then discard it
        if (!(ois.readObject() instanceof AnalyzedQuestion)) {
            MsgPrinter.printErrorMsg("First serialized object is not an " +
                    "AnalyzedQuestion.");
            System.exit(1);
        }
        try {
            while (true) results.add((Result) ois.readObject());
        } catch (EOFException e) {
            /* end of file reached */
        }
        ois.close();
    } catch (Exception e) {
        MsgPrinter.printErrorMsg("Could not read serialized results:");
        MsgPrinter.printErrorMsg(e.toString());
        System.exit(1);
    }
    return results.toArray(new Result[results.size()]);
}
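The reader expects a fixed stream layout: one AnalyzedQuestion header object followed by any number of Result objects, terminated by end of file. A matching writer might look like the following sketch (hypothetical, not part of the project; it assumes the usual java.io imports and that AnalyzedQuestion and Result implement Serializable):

// Hypothetical writer counterpart illustrating the file layout the reader
// above expects: an AnalyzedQuestion header, then the Result objects, then EOF.
private static void writeSerializedResults(File output, AnalyzedQuestion aq,
        Result[] results) throws IOException {
    ObjectOutputStream oos =
            new ObjectOutputStream(new FileOutputStream(output));
    try {
        oos.writeObject(aq);          // header object
        for (Result result : results)
            oos.writeObject(result);  // payload objects
    } finally {
        oos.close();                  // also closes the FileOutputStream
    }
}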
Use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.
The class PredicateExtractionFilter, method checkSentence.
/**
 * Decides if predicates should be extracted from this sentence. If the
 * sentence passes the tests, NEs of the expected answer types and terms
 * are extracted and added to the result.
 *
 * @param sentence sentence-level result
 * @return <code>true</code> iff the sentence is relevant
 */
private boolean checkSentence(Result sentence) {
    AnalyzedQuestion aq = sentence.getQuery().getAnalyzedQuestion();
    String s = sentence.getAnswer();
    // check the length of the sentence against thresholds
    if (s.length() > MAX_SENT_LENGTH_CHARS)
        return false;
    String[] tokens = NETagger.tokenize(s);
    if (tokens.length > MAX_SENT_LENGTH_TOKENS)
        return false;
    // // check if the sentence contains a matching verb term
    // boolean match = false;
    // Predicate[] questionPs = aq.getPredicates();
    // String[] tokens = OpenNLP.tokenize(s);
    // String[] pos = OpenNLP.tagPos(tokens);
    // for (int i = 0; i < tokens.length; i++) {
    //     // look for verbs only
    //     if (!pos[i].startsWith("VB") || !pos[i].matches("[a-zA-Z]*"))
    //         continue;
    //     Term sentenceTerm = new Term(tokens[i], pos[i]);
    //
    //     for (Predicate questionP : questionPs) {
    //         // compare to predicates with missing arguments only
    //         if (!questionP.hasMissingArgs()) continue;
    //         Term predicateTerm = questionP.getVerbTerm();
    //
    //         if (predicateTerm.simScore(sentenceTerm.getLemma()) > 0) {
    //             match = true;
    //             break;
    //         }
    //     }
    //
    //     if (match) break;
    // }
    // if (!match) return false;
    // -> checked in apply() (performance optimized)
    // check if the sentence contains NEs of the expected types
    String[] answerTypes = aq.getAnswerTypes();
    if (answerTypes.length != 0) {
        // answer type known
        boolean newNE = false;
        Map<String, String[]> extracted = extractNes(s, answerTypes);
        String questionNorm = StringUtils.normalize(aq.getQuestion());
        for (String ne : extracted.keySet()) {
            String neNorm = StringUtils.normalize(ne);
            if (!StringUtils.isSubsetKeywords(neNorm, questionNorm)) {
                newNE = true;
                break;
            }
        }
        // no NEs that are not in the question
        if (!newNE)
            return false;
        sentence.setNes(extracted);
    }
    // check if the sentence contains a matching argument term
    // - single-token terms are extracted first to avoid dictionary lookups
    boolean match = false;
    Term[] singleTerms = TermExtractor.getSingleTokenTerms(s);
    Predicate[] questionPs = aq.getPredicates();
    for (Term singleTerm : singleTerms) {
        for (Predicate questionP : questionPs) {
            // compare to predicates with missing arguments only
            if (!questionP.hasMissingArgs())
                continue;
            Term[] predicateTerms = questionP.getArgTerms();
            for (Term predicateTerm : predicateTerms) {
                if (predicateTerm.simScore(singleTerm.getLemma()) > 0) {
                    match = true;
                    break;
                }
            }
            if (match)
                break;
        }
        if (match)
            break;
    }
    if (!match)
        return false;
    // - multi-token terms are extracted from sentences that pass the test
    Dictionary[] dicts = QuestionAnalysis.getDictionaries();
    Term[] multiTerms = TermExtractor.getTerms(s, dicts);
    sentence.setTerms(multiTerms);
    return true;
}
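The checks run cheapest first: length thresholds, then the NE novelty test, then single-token term matching, and only for surviving sentences the dictionary-backed multi-token term extraction. The NE novelty test keeps a sentence only if it contributes at least one named entity not already covered by the question. A standalone sketch of that idea (hypothetical; simple lowercasing stands in for StringUtils.normalize and a token-subset check for StringUtils.isSubsetKeywords):

import java.util.Set;

// Hypothetical, self-contained version of the NE novelty test: a sentence
// passes iff some extracted NE contains a keyword the question lacks.
class NeNoveltyCheck {
    static boolean hasNovelNe(Set<String> extractedNes, String question) {
        String questionNorm = normalize(question);
        for (String ne : extractedNes) {
            boolean subset = true;
            for (String token : normalize(ne).split("\\s+"))
                if (!questionNorm.contains(token)) { subset = false; break; }
            if (!subset) return true;  // NE adds information beyond the question
        }
        return false;  // every NE is already covered by the question
    }

    // crude stand-in for StringUtils.normalize()
    private static String normalize(String s) {
        return s.toLowerCase().replaceAll("[^a-z0-9 ]", " ").trim();
    }
}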
Use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.
The class WikipediaTermImportanceFilter, method main.
public static void main(String[] args) {
    TEST_TERM_DOWMLOD = true;  // enable the filter's test mode
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);
    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    // LingPipe.createTokenizer();
    // // create sentence detector
    // MsgPrinter.printStatusMsg("Creating sentence detector...");
    // if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create sentence detector.");
    // LingPipe.createSentenceDetector();
    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();
    // // create part of speech tagger
    // MsgPrinter.printStatusMsg("Creating POS tagger...");
    // if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
    //         "res/nlp/postagger/opennlp/tagdict"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    // if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //         "train-wsj-0-18.holder"))
    //     MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
    // // create chunker
    // MsgPrinter.printStatusMsg("Creating chunker...");
    // if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
    //         "EnglishChunk.bin.gz"))
    //     MsgPrinter.printErrorMsg("Could not create chunker.");
    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg(" ...loading models");
    // if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
    //     MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    // if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
    //     MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg(" ...done");
    WikipediaTermImportanceFilter wtif =
            new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();
        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        // print normalized question string
        MsgPrinter.printNormalization(qn);
        // log normalized question string
        Logger.logNormalization(qn);
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);
        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++)
            queries[q].setOriginalQueryString(question);
        // apply the filter to a single dummy result
        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);
        wtif.apply(results);
    }
}
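The driver takes the path to a TREC target file as its only command-line argument. A hypothetical launcher (the target file path is an assumption, not taken from the project):

// Hypothetical launcher for the test driver above; the target file path
// must point to an actual TREC target file.
public class RunImportanceFilterTest {
    public static void main(String[] args) {
        WikipediaTermImportanceFilter.main(new String[] { "res/testdata/targets.txt" });
    }
}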
Use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.
The class EphyraTREC13To16, method askOther.
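Two alternative pipeline layouts are kept commented out ahead of the active method: Layout 2 mines snippets with IndriKM, while Layout 3 mines full documents with IndriDocumentKM and adds a TermFilter that sorts out snippets containing no new terms; both end with the Wikipedia/Google term importance filter, score sorting, and a 3000-character result cutoff.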
// Layout 2
// /**
// * Initializes the pipeline for 'other' questions.
// */
// protected void initOther() {
// // query generation
// QueryGeneration.clearQueryGenerators();
//
// // search
// // - knowledge miners for unstructured knowledge sources
// Search.clearKnowledgeMiners();
// for (String[] indriIndices : IndriKM.getIndriIndices())
// Search.addKnowledgeMiner(new IndriKM(indriIndices, false));
// for (String[] indriServers : IndriKM.getIndriServers())
// Search.addKnowledgeMiner(new IndriKM(indriServers, true));
// // - knowledge annotators for (semi-)structured knowledge sources
// Search.clearKnowledgeAnnotators();
//
// // answer extraction and selection
// // (the filters are applied in this order)
// AnswerSelection.clearFilters();
//
// // initialize scores
// AnswerSelection.addFilter(new ScoreResetterFilter());
//
// // extract sentences from snippets
// AnswerSelection.addFilter(new SentenceExtractionFilter());
//
// // cut meaningless introductions from sentences
// AnswerSelection.addFilter(new CutKeywordsFilter());
// AnswerSelection.addFilter(new CutStatementProviderFilter());
// AnswerSelection.addFilter(new SentenceSplitterFilter());
// AnswerSelection.addFilter(new CutKeywordsFilter());
//
// // remove duplicates
// AnswerSelection.addFilter(new DuplicateSnippetFilter());
//
// // throw out enumerations of proper names
// AnswerSelection.addFilter(new ProperNameFilter());
//
// // throw out direct speech snippets, which rarely contain useful information
// AnswerSelection.addFilter(new DirectSpeechFilter());
//
// AnswerSelection.addFilter(
// new WikipediaGoogleWebTermImportanceFilter(
// WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
// WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
// false
// )
// );
// AnswerSelection.addFilter(new ScoreSorterFilter());
//
// // cut off result
// AnswerSelection.addFilter(new ResultLengthFilter(3000));
// }
// Layout 3
// /**
// * Initializes the pipeline for 'other' questions.
// */
// protected void initOther() {
// // query generation
// QueryGeneration.clearQueryGenerators();
//
// // search
// // - knowledge miners for unstructured knowledge sources
// Search.clearKnowledgeMiners();
// for (String[] indriIndices : IndriKM.getIndriIndices())
// Search.addKnowledgeMiner(new IndriDocumentKM(indriIndices, false));
// for (String[] indriServers : IndriKM.getIndriServers())
// Search.addKnowledgeMiner(new IndriDocumentKM(indriServers, true));
// // - knowledge annotators for (semi-)structured knowledge sources
// Search.clearKnowledgeAnnotators();
//
// // answer extraction and selection
// // (the filters are applied in this order)
// AnswerSelection.clearFilters();
//
// // initialize scores
// AnswerSelection.addFilter(new ScoreResetterFilter());
//
// // extract sentences from snippets
// AnswerSelection.addFilter(new SentenceExtractionFilter());
//
// // cut meaningless introductions from sentences
// AnswerSelection.addFilter(new CutKeywordsFilter());
// AnswerSelection.addFilter(new CutStatementProviderFilter());
// AnswerSelection.addFilter(new SentenceSplitterFilter());
// AnswerSelection.addFilter(new CutKeywordsFilter());
//
// // remove duplicates
// AnswerSelection.addFilter(new DuplicateSnippetFilter());
//
// // throw out enumerations of proper names
// AnswerSelection.addFilter(new ProperNameFilter());
//
// // throw out direct speech snippets, which rarely contain useful information
// AnswerSelection.addFilter(new DirectSpeechFilter());
//
// // sort out snippets containing no new terms
// AnswerSelection.addFilter(new TermFilter());
//
// AnswerSelection.addFilter(
// new WikipediaGoogleWebTermImportanceFilter(
// WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
// WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
// false
// )
// );
// AnswerSelection.addFilter(new ScoreSorterFilter());
//
// // cut off result
// AnswerSelection.addFilter(new ResultLengthFilter(3000));
// }
/**
 * Asks Ephyra an 'other' question.
 *
 * @param question other question
 * @return array of results
 */
public final Result[] askOther(String question) {
    // initialize pipeline
    initOther();
    // query generation
    MsgPrinter.printGeneratingQueries();
    String qn = QuestionNormalizer.normalize(question);
    // print normalized question string
    MsgPrinter.printNormalization(qn);
    // log normalized question string
    Logger.logNormalization(qn);
    String[] kws = KeywordExtractor.getKeywords(qn);
    AnalyzedQuestion aq = new AnalyzedQuestion(question);
    aq.setKeywords(kws);
    aq.setFactoid(false);
    BagOfWordsG gen = new BagOfWordsG();
    Query[] queries = gen.generateQueries(aq);
    for (int q = 0; q < queries.length; q++)
        queries[q].setOriginalQueryString(question);
    // print query strings
    MsgPrinter.printQueryStrings(queries);
    // log query strings
    Logger.logQueryStrings(queries);
    // search
    MsgPrinter.printSearching();
    Result[] results = Search.doSearch(queries);
    // answer selection
    MsgPrinter.printSelectingAnswers();
    results = AnswerSelection.getResults(results, Integer.MAX_VALUE, 0);
    return results;
}
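A minimal caller sketch for askOther() (hypothetical: it assumes EphyraTREC13To16 can be constructed with its default setup and that Result exposes getScore() alongside the getAnswer() accessor used above):

// Hypothetical usage: run the 'other' pipeline on one target description
// and print each returned snippet with its score.
EphyraTREC13To16 ephyra = new EphyraTREC13To16();  // assumed default setup
Result[] results = ephyra.askOther("The Hubble Space Telescope");
for (Result result : results)
    System.out.println(result.getScore() + "\t" + result.getAnswer());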