Example 6 with AnalyzedQuestion

use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.

the class ScoreNormalizationFilter method readSerializedResults.

/**
	 * Reads serialized results from a file.
	 * 
	 * @param input input file
	 * @return result objects
	 */
private static Result[] readSerializedResults(File input) {
    ArrayList<Result> results = new ArrayList<Result>();
    try {
        FileInputStream fis = new FileInputStream(input);
        ObjectInputStream ois = new ObjectInputStream(fis);
        // the first serialized object must be an AnalyzedQuestion;
        // verify its type, then discard it
        if (!(ois.readObject() instanceof AnalyzedQuestion)) {
            MsgPrinter.printErrorMsg("First serialized object is not an AnalyzedQuestion.");
            System.exit(1);
        }
        try {
            while (true) results.add((Result) ois.readObject());
        } catch (EOFException e) {
            /* end of file reached */
        }
        ois.close();
    } catch (Exception e) {
        MsgPrinter.printErrorMsg("Could not read serialized results:");
        MsgPrinter.printErrorMsg(e.toString());
        System.exit(1);
    }
    return results.toArray(new Result[results.size()]);
}
Also used : ArrayList(java.util.ArrayList) EOFException(java.io.EOFException) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) Result(info.ephyra.search.Result) ObjectInputStream(java.io.ObjectInputStream)
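
For reference, a file in this layout could be produced by a companion writer such as the sketch below. The method is hypothetical (it does not appear in the project), but it writes exactly what readSerializedResults expects: one AnalyzedQuestion followed by any number of Result objects. It additionally assumes java.io.FileOutputStream and java.io.ObjectOutputStream.

/**
	 * Hypothetical companion writer: serializes results in the layout that
	 * readSerializedResults expects.
	 * 
	 * @param output output file
	 * @param aq analyzed question, written first
	 * @param results result objects, written in order
	 */
private static void writeSerializedResults(File output, AnalyzedQuestion aq, Result[] results) {
    try {
        FileOutputStream fos = new FileOutputStream(output);
        ObjectOutputStream oos = new ObjectOutputStream(fos);
        // the analyzed question always comes first
        oos.writeObject(aq);
        // followed by the result objects
        for (Result result : results) oos.writeObject(result);
        oos.close();
    } catch (IOException e) {
        MsgPrinter.printErrorMsg("Could not write serialized results:");
        MsgPrinter.printErrorMsg(e.toString());
    }
}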

Example 7 with AnalyzedQuestion

use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.

the class PredicateExtractionFilter method checkSentence.

/**
	 * Decides if predicates should be extracted from this sentence. If the
	 * sentence passes the tests, NEs of the expected answer types and terms
	 * are extracted and added to the result.
	 * 
	 * @param sentence sentence-level result
	 * @return <code>true</code> iff the sentence is relevant
	 */
private boolean checkSentence(Result sentence) {
    AnalyzedQuestion aq = sentence.getQuery().getAnalyzedQuestion();
    String s = sentence.getAnswer();
    // check the length of the sentence against thresholds
    if (s.length() > MAX_SENT_LENGTH_CHARS)
        return false;
    String[] tokens = NETagger.tokenize(s);
    if (tokens.length > MAX_SENT_LENGTH_TOKENS)
        return false;
    //		// check if the sentence contains a matching verb term
    //		boolean match = false;
    //		Predicate[] questionPs = aq.getPredicates();
    //		String[] tokens = OpenNLP.tokenize(s);
    //		String[] pos = OpenNLP.tagPos(tokens);
    //		for (int i = 0; i < tokens.length; i++) {
    //			// look for verbs only
    //			if (!pos[i].startsWith("VB") || !pos[i].matches("[a-zA-Z]*"))
    //				continue;
    //			Term sentenceTerm = new Term(tokens[i], pos[i]);
    //			
    //			for (Predicate questionP : questionPs) {
    //				// compare to predicates with missing arguments only
    //				if (!questionP.hasMissingArgs()) continue;
    //				Term predicateTerm = questionP.getVerbTerm();
    //				
    //				if (predicateTerm.simScore(sentenceTerm.getLemma()) > 0) {
    //					match = true;
    //					break;
    //				}
    //			}
    //			
    //			if (match) break;
    //		}
    //		if (!match) return false;
    //		-> checked in apply() (performance optimized)
    // check if the sentence contains NEs of the expected types
    String[] answerTypes = aq.getAnswerTypes();
    if (answerTypes.length != 0) {
        // answer type known
        boolean newNE = false;
        Map<String, String[]> extracted = extractNes(s, answerTypes);
        String questionNorm = StringUtils.normalize(aq.getQuestion());
        for (String ne : extracted.keySet()) {
            String neNorm = StringUtils.normalize(ne);
            if (!StringUtils.isSubsetKeywords(neNorm, questionNorm)) {
                newNE = true;
                break;
            }
        }
        // no NEs that are not in the question
        if (!newNE)
            return false;
        sentence.setNes(extracted);
    }
    // check if the sentence contains a matching argument term
    // - single-token terms are extracted first to avoid dictionary lookups
    boolean match = false;
    Term[] singleTerms = TermExtractor.getSingleTokenTerms(s);
    Predicate[] questionPs = aq.getPredicates();
    for (Term singleTerm : singleTerms) {
        for (Predicate questionP : questionPs) {
            // compare to predicates with missing arguments only
            if (!questionP.hasMissingArgs())
                continue;
            Term[] predicateTerms = questionP.getArgTerms();
            for (Term predicateTerm : predicateTerms) if (predicateTerm.simScore(singleTerm.getLemma()) > 0) {
                match = true;
                break;
            }
            if (match)
                break;
        }
        if (match)
            break;
    }
    if (!match)
        return false;
    // - multi-token terms are extracted from sentences that pass the test
    Dictionary[] dicts = QuestionAnalysis.getDictionaries();
    Term[] multiTerms = TermExtractor.getTerms(s, dicts);
    sentence.setTerms(multiTerms);
    return true;
}
Also used : Dictionary(info.ephyra.util.Dictionary) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion) Term(info.ephyra.questionanalysis.Term) Predicate(info.ephyra.nlp.semantics.Predicate)
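
The answer-type branch above only admits a sentence if it contributes at least one named entity that does not already occur in the question. A minimal sketch isolating that test, reusing the project's StringUtils helpers; the method name hasNewNe and the java.util.Set parameter are illustrative, not part of the project:

/**
	 * Hypothetical helper isolating the NE-novelty test from checkSentence():
	 * returns <code>true</code> iff at least one extracted NE is not already
	 * covered by the question (compared on normalized keywords).
	 */
private static boolean hasNewNe(Set<String> nes, String question) {
    String questionNorm = StringUtils.normalize(question);
    for (String ne : nes) {
        String neNorm = StringUtils.normalize(ne);
        // an NE that is not a keyword subset of the question is new information
        if (!StringUtils.isSubsetKeywords(neNorm, questionNorm))
            return true;
    }
    return false;
}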

Example 8 with AnalyzedQuestion

use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.

the class WikipediaTermImportanceFilter method main.

public static void main(String[] args) {
    TEST_TERM_DOWMLOD = true;
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);
    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    //		LingPipe.createTokenizer();
    //		// create sentence detector
    //		MsgPrinter.printStatusMsg("Creating sentence detector...");
    //		if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //			MsgPrinter.printErrorMsg("Could not create sentence detector.");
    //		LingPipe.createSentenceDetector();
    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();
    //		// create part of speech tagger
    //		MsgPrinter.printStatusMsg("Creating POS tagger...");
    //		if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
    //									 "res/nlp/postagger/opennlp/tagdict"))
    //			MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    //		if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //				"train-wsj-0-18.holder"))
    //			MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
    //		// create chunker
    //		MsgPrinter.printStatusMsg("Creating chunker...");
    //		if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
    //								   "EnglishChunk.bin.gz"))
    //			MsgPrinter.printErrorMsg("Could not create chunker.");
    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg("  ...loading models");
    //		if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
    //			MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    //		if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
    //			MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg("  ...done");
    WikipediaTermImportanceFilter wtif = new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();
        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        // print normalized question string
        MsgPrinter.printNormalization(qn);
        // log normalized question string
        Logger.logNormalization(qn);
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);
        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++)
            queries[q].setOriginalQueryString(question);
        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);
        wtif.apply(results);
    }
}
Also used : Query(info.ephyra.querygeneration.Query) TRECTarget(info.ephyra.trec.TRECTarget) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion) BagOfWordsG(info.ephyra.querygeneration.generators.BagOfWordsG) Result(info.ephyra.search.Result)
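
To score a single ad-hoc question instead of a TREC target file, the body of the loop above can be driven directly. A minimal sketch, assuming the NLP components have been set up as in main() and that the NO_NORMALIZATION constant is reachable (it is inherited, e.g. from WebTermImportanceFilter); the sample question string is made up:

// One-off invocation mirroring the loop body of main() above.
WikipediaTermImportanceFilter wtif =
        new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
String question = "What is the Hale-Bopp comet?";
String qn = QuestionNormalizer.normalize(question);
AnalyzedQuestion aq = new AnalyzedQuestion(question);
aq.setKeywords(KeywordExtractor.getKeywords(qn));
aq.setFactoid(false);
Query[] queries = new BagOfWordsG().generateQueries(aq);
for (Query query : queries) query.setOriginalQueryString(question);
Result[] results = { new Result("This would be the answer", queries[0]) };
wtif.apply(results);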

Example 9 with AnalyzedQuestion

use of info.ephyra.questionanalysis.AnalyzedQuestion in project lucida by claritylab.

the class EphyraTREC13To16 method askOther.

// Layout 2
//	/**
//	 * Initializes the pipeline for 'other' questions.
//	 */
//	protected void initOther() {
//		// query generation
//		QueryGeneration.clearQueryGenerators();
//		
//		// search
//		// - knowledge miners for unstructured knowledge sources
//		Search.clearKnowledgeMiners();
//		for (String[] indriIndices : IndriKM.getIndriIndices())
//			Search.addKnowledgeMiner(new IndriKM(indriIndices, false));
//		for (String[] indriServers : IndriKM.getIndriServers())
//			Search.addKnowledgeMiner(new IndriKM(indriServers, true));
//		// - knowledge annotators for (semi-)structured knowledge sources
//		Search.clearKnowledgeAnnotators();
//		
//		// answer extraction and selection
//		// (the filters are applied in this order)
//		AnswerSelection.clearFilters();
//		
//		//	initialize scores
//		AnswerSelection.addFilter(new ScoreResetterFilter());
//		
//		//	extract sentences from snippets
//		AnswerSelection.addFilter(new SentenceExtractionFilter());
//		
//		//	cut meaningless introductions from sentences
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		AnswerSelection.addFilter(new CutStatementProviderFilter());
//		AnswerSelection.addFilter(new SentenceSplitterFilter());
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		
//		//	remove duplicates
//		AnswerSelection.addFilter(new DuplicateSnippetFilter());
//		
//		//	throw out enumerations of proper names
//		AnswerSelection.addFilter(new ProperNameFilter());
//		
//		//	throw out direct speech snippets, which rarely contain useful information
//		AnswerSelection.addFilter(new DirectSpeechFilter());
//		
//		AnswerSelection.addFilter(
//				new WikipediaGoogleWebTermImportanceFilter(
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					false
//				)
//			);
//		AnswerSelection.addFilter(new ScoreSorterFilter());
//		
//		//	cut off result
//		AnswerSelection.addFilter(new ResultLengthFilter(3000));
//	}
// Layout 3
//	/**
//	 * Initializes the pipeline for 'other' questions.
//	 */
//	protected void initOther() {
//		// query generation
//		QueryGeneration.clearQueryGenerators();
//		
//		// search
//		// - knowledge miners for unstructured knowledge sources
//		Search.clearKnowledgeMiners();
//		for (String[] indriIndices : IndriKM.getIndriIndices())
//			Search.addKnowledgeMiner(new IndriDocumentKM(indriIndices, false));
//		for (String[] indriServers : IndriKM.getIndriServers())
//			Search.addKnowledgeMiner(new IndriDocumentKM(indriServers, true));
//		// - knowledge annotators for (semi-)structured knowledge sources
//		Search.clearKnowledgeAnnotators();
//		
//		// answer extraction and selection
//		// (the filters are applied in this order)
//		AnswerSelection.clearFilters();
//		
//		//	initialize scores
//		AnswerSelection.addFilter(new ScoreResetterFilter());
//		
//		//	extract sentences from snippets
//		AnswerSelection.addFilter(new SentenceExtractionFilter());
//		
//		//	cut meaningless introductions from sentences
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		AnswerSelection.addFilter(new CutStatementProviderFilter());
//		AnswerSelection.addFilter(new SentenceSplitterFilter());
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		
//		//	remove duplicates
//		AnswerSelection.addFilter(new DuplicateSnippetFilter());
//		
//		//	throw out enumerations of proper names
//		AnswerSelection.addFilter(new ProperNameFilter());
//		
//		//	throw out direct speech snippets, which rarely contain useful information
//		AnswerSelection.addFilter(new DirectSpeechFilter());
//		
//		//	sort out snippets containing no new terms
//		AnswerSelection.addFilter(new TermFilter());
//		
//		AnswerSelection.addFilter(
//				new WikipediaGoogleWebTermImportanceFilter(
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					false
//				)
//			);
//		AnswerSelection.addFilter(new ScoreSorterFilter());
//		
//		//	cut off result
//		AnswerSelection.addFilter(new ResultLengthFilter(3000));
//	}
/**
	 * Asks Ephyra an 'other' question.
	 * 
	 * @param question other question
	 * @return array of results
	 */
public final Result[] askOther(String question) {
    // initialize pipeline
    initOther();
    // query generation
    MsgPrinter.printGeneratingQueries();
    String qn = QuestionNormalizer.normalize(question);
    // print normalized question string
    MsgPrinter.printNormalization(qn);
    // log normalized question string
    Logger.logNormalization(qn);
    String[] kws = KeywordExtractor.getKeywords(qn);
    AnalyzedQuestion aq = new AnalyzedQuestion(question);
    aq.setKeywords(kws);
    aq.setFactoid(false);
    BagOfWordsG gen = new BagOfWordsG();
    Query[] queries = gen.generateQueries(aq);
    for (int q = 0; q < queries.length; q++)
        queries[q].setOriginalQueryString(question);
    // print query strings
    MsgPrinter.printQueryStrings(queries);
    // log query strings
    Logger.logQueryStrings(queries);
    // search
    MsgPrinter.printSearching();
    Result[] results = Search.doSearch(queries);
    // answer selection
    MsgPrinter.printSelectingAnswers();
    results = AnswerSelection.getResults(results, Integer.MAX_VALUE, 0);
    return results;
}
Also used : Query(info.ephyra.querygeneration.Query) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion) BagOfWordsG(info.ephyra.querygeneration.generators.BagOfWordsG) Result(info.ephyra.search.Result)
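
A caller hands askOther the raw question or target description and receives the selected results. A minimal sketch of such a call; the parameterless constructor and the Result accessor getScore() are assumptions (getAnswer() appears elsewhere in the project):

// Hypothetical driver for askOther(); construction details may differ.
EphyraTREC13To16 ephyra = new EphyraTREC13To16();
Result[] results = ephyra.askOther("Hale-Bopp comet");
for (Result result : results)
    System.out.println(result.getScore() + "\t" + result.getAnswer());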

Aggregations

AnalyzedQuestion (info.ephyra.questionanalysis.AnalyzedQuestion): 9 usages
Result (info.ephyra.search.Result): 8 usages
Query (info.ephyra.querygeneration.Query): 3 usages
BagOfWordsG (info.ephyra.querygeneration.generators.BagOfWordsG): 3 usages
TRECTarget (info.ephyra.trec.TRECTarget): 2 usages
ArrayList (java.util.ArrayList): 2 usages
Predicate (info.ephyra.nlp.semantics.Predicate): 1 usage
Term (info.ephyra.questionanalysis.Term): 1 usage
Dictionary (info.ephyra.util.Dictionary): 1 usage
EOFException (java.io.EOFException): 1 usage
FileInputStream (java.io.FileInputStream): 1 usage
IOException (java.io.IOException): 1 usage
ObjectInputStream (java.io.ObjectInputStream): 1 usage