Search in sources :

Example 61 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

Source: method run of class WebDocumentFetcher.

/**
 * Fetches the document text and returns it to the
 * <code>WebDocumentFetcherFilter</code>.
 */
public void run() {
    // attempt to fetch the document text, allowing up to RETRIES retries
    String docText = null;
    int retries = RETRIES;
    boolean cached = false;
    while (true) {
        // download the document and convert it to plain text
        try {
            docText = HTMLConverter.url2text(snippet.getDocID());
            if (docText == null)
                MsgPrinter.printHttpError("Document " + snippet.getDocID() + " not available.");
        } catch (SocketTimeoutException e) {
            docText = null;
            MsgPrinter.printHttpError("Connection to " + snippet.getDocID() + " timed out.");
        }
        retries--;
        if (docText != null)
            break;
        // retries left for the current document ID
        if (retries >= 0)
            continue;
        // all retries exhausted: fall back to the cached copy, if any
        String cacheID = snippet.getCacheID();
        if (cacheID != null && !cacheID.equals(snippet.getDocID())) {
            MsgPrinter.printErrorMsg("\nCould not fetch original source, " + "trying cached source instead...");
            snippet.setDocID(cacheID);
            retries = RETRIES;
            cached = true;
        } else {
            // no distinct cached copy available, give up
            break;
        }
    }
    // hand the document over to the WebDocumentFetcherFilter
    if (docText != null) {
        Result doc = new Result(docText, snippet.getQuery(), snippet.getDocID(), snippet.getHitPos());
        doc.setScore(0);
        filter.addDoc(doc, cached);
    } else {
        MsgPrinter.printErrorMsg("\nCould not fetch document.");
        filter.addDoc(null, cached);
    }
}
Also used : SocketTimeoutException(java.net.SocketTimeoutException) Result(info.ephyra.search.Result)

Example 62 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

Source: method main of class WikipediaTermImportanceFilter.

/**
 * Test entry point: sets up the NLP tools required by the filter, reads
 * TREC targets from the file named by the first command line argument and
 * applies the filter to a dummy result for each target.
 * 
 * @param args args[0] is the path of the file containing the TREC targets
 */
public static void main(String[] args) {
    TEST_TERM_DOWMLOD = true;
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);
    // fail fast with a usage message if the target file argument is missing,
    // instead of throwing an ArrayIndexOutOfBoundsException further down
    // (guard placed after enableErrorMsgs() so the message is printed)
    if (args.length < 1) {
        MsgPrinter.printErrorMsg("Usage: WikipediaTermImportanceFilter <targetFile>");
        return;
    }
    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    //		LingPipe.createTokenizer();
    //		// create sentence detector
    //		MsgPrinter.printStatusMsg("Creating sentence detector...");
    //		if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
    //			MsgPrinter.printErrorMsg("Could not create sentence detector.");
    //		LingPipe.createSentenceDetector();
    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();
    //		// create part of speech tagger
    //		MsgPrinter.printStatusMsg("Creating POS tagger...");
    //		if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
    //									 "res/nlp/postagger/opennlp/tagdict"))
    //			MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    //		if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
    //				"train-wsj-0-18.holder"))
    //			MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
    //		// create chunker
    //		MsgPrinter.printStatusMsg("Creating chunker...");
    //		if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
    //								   "EnglishChunk.bin.gz"))
    //			MsgPrinter.printErrorMsg("Could not create chunker.");
    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers("res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg("  ...loading models");
    //		if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
    //			MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
    //		if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
    //			MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    MsgPrinter.printStatusMsg("  ...done");
    // filter under test, configured without score normalization
    WikipediaTermImportanceFilter wtif = new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
    TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
    for (TRECTarget target : targets) {
        String question = target.getTargetDesc();
        // query generation
        MsgPrinter.printGeneratingQueries();
        String qn = QuestionNormalizer.normalize(question);
        // print normalized question string
        MsgPrinter.printNormalization(qn);
        // log normalized question string
        Logger.logNormalization(qn);
        String[] kws = KeywordExtractor.getKeywords(qn);
        AnalyzedQuestion aq = new AnalyzedQuestion(question);
        aq.setKeywords(kws);
        aq.setFactoid(false);
        Query[] queries = new BagOfWordsG().generateQueries(aq);
        for (int q = 0; q < queries.length; q++) queries[q].setOriginalQueryString(question);
        // apply the filter to a single dummy result built from the first query
        Result[] results = new Result[1];
        results[0] = new Result("This would be the answer", queries[0]);
        wtif.apply(results);
    }
}
Also used : Query(info.ephyra.querygeneration.Query) TRECTarget(info.ephyra.trec.TRECTarget) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion) BagOfWordsG(info.ephyra.querygeneration.generators.BagOfWordsG) Result(info.ephyra.search.Result)

Example 63 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

Source: method runAndEval of class EphyraTREC13To16.

/**
	 * Initializes Ephyra, asks the questions or loads the answers from a log
	 * file, evaluates the answers if patterns are available and logs and saves
	 * the answers.
	 */
private static void runAndEval() {
    // initialize Ephyra
    EphyraTREC13To16 ephyra = new EphyraTREC13To16();
    // evaluate for multiple thresholds
    // NOTE(review): the multi-threshold loops below are commented out, so the
    // method currently runs a single pass with fixed thresholds and
    // firstThreshold stays true throughout.
    boolean firstThreshold = true;
    //		for (float fAbsThresh = FACTOID_ABS_THRESH;
    //			 fAbsThresh <= 1; fAbsThresh += 0.01) {
    float fAbsThresh = FACTOID_ABS_THRESH;
    //		for (float lRelThresh = LIST_REL_THRESH;
    //			 lRelThresh <= 1; lRelThresh += 0.01) {
    float lRelThresh = LIST_REL_THRESH;
    for (TRECTarget target : targets) {
        MsgPrinter.printTarget(target.getTargetDesc());
        // normalize target description, determine target types
        // (only on the first pass: preprocessing mutates the target in place)
        if (firstThreshold)
            TargetPreprocessor.preprocess(target);
        String targetDesc = target.getTargetDesc();
        String condensedTarget = target.getCondensedTarget();
        TRECQuestion[] questions = target.getQuestions();
        // condensed target is used as contextual information
        QuestionAnalysis.setContext(condensedTarget);
        for (int i = 0; i < questions.length; i++) {
            MsgPrinter.printQuestion(questions[i].getQuestionString());
            String id = questions[i].getId();
            String type = questions[i].getType();
            String qs;
            if (type.equals("FACTOID") || type.equals("LIST")) {
                // resolve coreferences in factoid and list questions
                // (again only on the first pass, since it rewrites the question)
                if (firstThreshold) {
                    MsgPrinter.printResolvingCoreferences();
                    CorefResolver.resolvePronounsToTarget(target, i);
                }
                qs = questions[i].getQuestionString();
            } else {
                // 'OTHER' questions are asked with the target description itself
                qs = targetDesc;
            }
            // set pattern used to evaluate answers for overlap analysis
            OverlapAnalysisFilter.setPattern(null);
            if (type.equals("FACTOID")) {
                for (TRECPattern pattern : factoidPatterns) {
                    if (pattern.getId().equals(id)) {
                        OverlapAnalysisFilter.setPattern(pattern);
                        break;
                    }
                }
            }
            // ask Ephyra or load answer from log file
            Result[] results = null;
            if ((type.equals("FACTOID") && factoidLog) || (type.equals("LIST") && listLog) || (type.equals("OTHER") && otherLog)) {
                results = TREC13To16Parser.loadResults(qs, type, inputLogFile);
            }
            if (results == null) {
                // answer not loaded from log file
                if (type.equals("FACTOID")) {
                    Logger.logFactoidStart(qs);
                    results = ephyra.askFactoid(qs, FACTOID_MAX_ANSWERS, FACTOID_ABS_THRESH);
                    //						results = new Result[0];
                    Logger.logResults(results);
                    Logger.logFactoidEnd();
                } else if (type.equals("LIST")) {
                    Logger.logListStart(qs);
                    results = ephyra.askList(qs, LIST_REL_THRESH);
                    //						results = new Result[0];
                    Logger.logResults(results);
                    Logger.logListEnd();
                } else {
                    Logger.logDefinitionalStart(qs);
                    results = ephyra.askOther(target);
                    //						results = new Result[0];
                    Logger.logResults(results);
                    Logger.logDefinitionalEnd();
                }
            }
            // calculate question score if patterns are available
            boolean[] correct = null;
            if (type.equals("FACTOID") && factoidPatterns != null)
                correct = evalFactoidQuestion(id, results, fAbsThresh);
            else if (type.equals("LIST") && listPatterns != null)
                correct = evalListQuestion(id, results, lRelThresh);
            // update target data structure
            // NOTE(review): assumes askFactoid/askList/askOther never return
            // null (an empty array at worst) — otherwise this line throws an
            // NPE; TODO confirm against those implementations.
            TRECAnswer[] answers = new TRECAnswer[results.length];
            for (int j = 0; j < results.length; j++) {
                String answer = results[j].getAnswer();
                String supportDoc = results[j].getDocID();
                answers[j] = new TRECAnswer(id, answer, supportDoc);
            }
            questions[i].setAnswers(answers);
            if (results.length > 0) {
                QuestionInterpretation qi = results[0].getQuery().getInterpretation();
                if (qi != null)
                    questions[i].setInterpretation(qi);
            }
            if (answers.length == 0) {
                // no answer found: emit the TREC-mandated placeholder answer
                answers = new TRECAnswer[1];
                if (type.equals("FACTOID"))
                    answers[0] = new TRECAnswer(id, null, "NIL");
                else
                    answers[0] = new TRECAnswer(id, "No answers found.", "XIE19960101.0001");
            }
            // save answers to output file
            TREC13To16Parser.saveAnswers("log/" + runTag, answers, correct, runTag);
        }
        // calculate target scores if patterns are available
        if (factoidPatterns != null)
            evalFactoidTarget();
        if (listPatterns != null)
            evalListTarget();
    }
    // calculate component scores and log scores if patterns are available
    if (factoidPatterns != null)
        evalFactoidTotal(fAbsThresh);
    if (listPatterns != null)
        evalListTotal(lRelThresh);
    // leftover from the commented-out threshold loops above; has no effect
    // in the current single-pass version
    firstThreshold = false;
//		}
//		}
}
Also used : QuestionInterpretation(info.ephyra.questionanalysis.QuestionInterpretation) Result(info.ephyra.search.Result)

Example 64 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

Source: method askOther of class EphyraTREC13To16.

// Layout 2
//	/**
//	 * Initializes the pipeline for 'other' questions.
//	 */
//	protected void initOther() {
//		// query generation
//		QueryGeneration.clearQueryGenerators();
//		
//		// search
//		// - knowledge miners for unstructured knowledge sources
//		Search.clearKnowledgeMiners();
//		for (String[] indriIndices : IndriKM.getIndriIndices())
//			Search.addKnowledgeMiner(new IndriKM(indriIndices, false));
//		for (String[] indriServers : IndriKM.getIndriServers())
//			Search.addKnowledgeMiner(new IndriKM(indriServers, true));
//		// - knowledge annotators for (semi-)structured knowledge sources
//		Search.clearKnowledgeAnnotators();
//		
//		// answer extraction and selection
//		// (the filters are applied in this order)
//		AnswerSelection.clearFilters();
//		
//		//	initialize scores
//		AnswerSelection.addFilter(new ScoreResetterFilter());
//		
//		//	extract sentences from snippets
//		AnswerSelection.addFilter(new SentenceExtractionFilter());
//		
//		//	cut meaningless introductions from sentences
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		AnswerSelection.addFilter(new CutStatementProviderFilter());
//		AnswerSelection.addFilter(new SentenceSplitterFilter());
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		
//		//	remove duplicates
//		AnswerSelection.addFilter(new DuplicateSnippetFilter());
//		
//		//	throw out enumerations of proper names
//		AnswerSelection.addFilter(new ProperNameFilter());
//		
//		//	throw out direct speech snippets, rarely contain useful information
//		AnswerSelection.addFilter(new DirectSpeechFilter());
//		
//		AnswerSelection.addFilter(
//				new WikipediaGoogleWebTermImportanceFilter(
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					false
//				)
//			);
//		AnswerSelection.addFilter(new ScoreSorterFilter());
//		
//		//	cut off result
//		AnswerSelection.addFilter(new ResultLengthFilter(3000));
//	}
// Layout 3
//	/**
//	 * Initializes the pipeline for 'other' questions.
//	 */
//	protected void initOther() {
//		// query generation
//		QueryGeneration.clearQueryGenerators();
//		
//		// search
//		// - knowledge miners for unstructured knowledge sources
//		Search.clearKnowledgeMiners();
//		for (String[] indriIndices : IndriKM.getIndriIndices())
//			Search.addKnowledgeMiner(new IndriDocumentKM(indriIndices, false));
//		for (String[] indriServers : IndriKM.getIndriServers())
//			Search.addKnowledgeMiner(new IndriDocumentKM(indriServers, true));
//		// - knowledge annotators for (semi-)structured knowledge sources
//		Search.clearKnowledgeAnnotators();
//		
//		// answer extraction and selection
//		// (the filters are applied in this order)
//		AnswerSelection.clearFilters();
//		
//		//	initialize scores
//		AnswerSelection.addFilter(new ScoreResetterFilter());
//		
//		//	extract sentences from snippets
//		AnswerSelection.addFilter(new SentenceExtractionFilter());
//		
//		//	cut meaningless introductions from sentences
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		AnswerSelection.addFilter(new CutStatementProviderFilter());
//		AnswerSelection.addFilter(new SentenceSplitterFilter());
//		AnswerSelection.addFilter(new CutKeywordsFilter());
//		
//		//	remove duplicates
//		AnswerSelection.addFilter(new DuplicateSnippetFilter());
//		
//		//	throw out enumerations of proper names
//		AnswerSelection.addFilter(new ProperNameFilter());
//		
//		//	throw out direct speech snippets, rarely contain useful information
//		AnswerSelection.addFilter(new DirectSpeechFilter());
//		
//		//	sort out snippets containing no new terms
//		AnswerSelection.addFilter(new TermFilter());
//		
//		AnswerSelection.addFilter(
//				new WikipediaGoogleWebTermImportanceFilter(
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					WebTermImportanceFilter.LOG_LENGTH_NORMALIZATION,
//					false
//				)
//			);
//		AnswerSelection.addFilter(new ScoreSorterFilter());
//		
//		//	cut off result
//		AnswerSelection.addFilter(new ResultLengthFilter(3000));
//	}
/**
 * Asks Ephyra an 'other' question.
 * 
 * @param question other question
 * @return array of results
 */
public final Result[] askOther(String question) {
    // set up the pipeline for 'other' questions
    initOther();
    // query generation: normalize the question and report/log the result
    MsgPrinter.printGeneratingQueries();
    String normalized = QuestionNormalizer.normalize(question);
    MsgPrinter.printNormalization(normalized);
    Logger.logNormalization(normalized);
    // build an analyzed (non-factoid) question from the extracted keywords
    String[] keywords = KeywordExtractor.getKeywords(normalized);
    AnalyzedQuestion analyzed = new AnalyzedQuestion(question);
    analyzed.setKeywords(keywords);
    analyzed.setFactoid(false);
    Query[] queries = new BagOfWordsG().generateQueries(analyzed);
    for (Query query : queries) query.setOriginalQueryString(question);
    // report and log the generated query strings
    MsgPrinter.printQueryStrings(queries);
    Logger.logQueryStrings(queries);
    // search the knowledge sources
    MsgPrinter.printSearching();
    Result[] results = Search.doSearch(queries);
    // select answers from the retrieved snippets
    MsgPrinter.printSelectingAnswers();
    return AnswerSelection.getResults(results, Integer.MAX_VALUE, 0);
}
Also used : Query(info.ephyra.querygeneration.Query) AnalyzedQuestion(info.ephyra.questionanalysis.AnalyzedQuestion) BagOfWordsG(info.ephyra.querygeneration.generators.BagOfWordsG) Result(info.ephyra.search.Result)

Example 65 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

Source: method evalListQuestion of class EphyraTREC13To16.

/**
 * Calculates the score for a single list question.
 * 
 * @param qid ID of the question
 * @param results the results from Ephyra
 * @param relThresh relative confidence threshold for results
 * @return for each answer a flag that is true iff the answer is correct
 */
private static boolean[] evalListQuestion(String qid, Result[] results, float relThresh) {
    // look up the evaluation pattern for this question
    TRECPattern pattern = null;
    for (TRECPattern candidate : listPatterns) {
        if (candidate.getId().equals(qid)) {
            pattern = candidate;
            break;
        }
    }
    // without a pattern the question cannot be scored
    if (pattern == null)
        return new boolean[0];
    // keep only results scoring at least relThresh * top score
    ArrayList<Result> kept = new ArrayList<Result>();
    if (results.length > 0) {
        float cutoff = relThresh * results[0].getScore();
        for (Result result : results) {
            if (result.getScore() >= cutoff)
                kept.add(result);
        }
    }
    // F measure for this question
    float f = 0;
    // per-result correctness flags, parallel to the kept results
    boolean[] correct = new boolean[kept.size()];
    if (!kept.isEmpty()) {
        String[] regexs = pattern.getRegexs();
        // total number of known answers
        int total = regexs.length;
        // number of returned results
        int returned = kept.size();
        // number of known answers covered by at least one result
        int covered = 0;
        for (String regex : regexs) {
            boolean found = false;
            for (int i = 0; i < kept.size(); i++) {
                String answer = kept.get(i).getAnswer();
                if (answer.matches(".*?" + regex + ".*+")) {
                    // count each known answer at most once towards coverage
                    if (!found) {
                        covered++;
                        found = true;
                    }
                    correct[i] = true;
                }
            }
        }
        if (covered > 0) {
            float recall = ((float) covered) / total;
            float precision = ((float) covered) / returned;
            f = (2 * recall * precision) / (recall + precision);
        }
    }
    listQuestionScores.add(f);
    return correct;
}
Also used : ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result)

Aggregations

Result (info.ephyra.search.Result)68 ArrayList (java.util.ArrayList)36 Query (info.ephyra.querygeneration.Query)11 HashSet (java.util.HashSet)9 Hashtable (java.util.Hashtable)9 AnalyzedQuestion (info.ephyra.questionanalysis.AnalyzedQuestion)8 IOException (java.io.IOException)7 QuestionInterpretation (info.ephyra.questionanalysis.QuestionInterpretation)5 Feature (edu.cmu.minorthird.classify.Feature)4 HashMap (java.util.HashMap)4 Predicate (info.ephyra.nlp.semantics.Predicate)3 BagOfWordsG (info.ephyra.querygeneration.generators.BagOfWordsG)3 BufferedReader (java.io.BufferedReader)3 File (java.io.File)3 URL (java.net.URL)3 TRECTarget (info.ephyra.trec.TRECTarget)2 EOFException (java.io.EOFException)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 InputStreamReader (java.io.InputStreamReader)2