
Example 26 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

the class ScoreNormalizationFilter method addMaxScoreFeature.

/**
	 * Adds the maximum score of all factoid answers as a feature to the
	 * instance. (A restriction to answers from the same extractor as the
	 * given result is present in the code but commented out.)
	 */
private static void addMaxScoreFeature(MutableInstance instance, Result result, Result[] results) {
    // calculate the maximum of all finite, positive scores
    double maxScore = 0;
    //		String extractor = result.getExtractionTechniques()[0];
    for (Result r : results) {
        if (r.getScore() > 0 && r.getScore() < Float.POSITIVE_INFINITY) {
            //				if (r.extractedWith(extractor))
            maxScore = Math.max(r.getScore(), maxScore);
        }
    }
    Feature feature = new Feature(MAX_SCORE_F);
    instance.addNumeric(feature, maxScore);
}
Also used : Feature(edu.cmu.minorthird.classify.Feature) Result(info.ephyra.search.Result)
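
The two guards in the loop keep only finite, positive scores, which is how these filters identify scored factoid answers. A self-contained sketch of the same maximum computation on plain doubles (class and method names here are made up for illustration):

public class MaxScoreSketch {
    // maximum over all finite, positive scores; an empty candidate set
    // yields a maximum of 0, matching addMaxScoreFeature above
    static double maxFactoidScore(double[] scores) {
        double maxScore = 0;
        for (double s : scores)
            if (s > 0 && s < Float.POSITIVE_INFINITY)
                maxScore = Math.max(s, maxScore);
        return maxScore;
    }

    public static void main(String[] args) {
        double[] scores = { 0.4, -1, Float.POSITIVE_INFINITY, 2.5, 0 };
        System.out.println(maxFactoidScore(scores)); // 2.5
    }
}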

Example 27 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

the class SubclauseSplitterFilter method apply.

/**
	 * Splits sentences into individual subclauses in order to facilitate
	 * subsequent filtering. The idea is that redundancy detection is easier for
	 * shorter snippets than for longer ones.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return extended array of <code>Result</code> objects
	 */
public Result[] apply(Result[] results) {
    // raw results returned by the searchers
    ArrayList<Result> rawResults = new ArrayList<Result>();
    for (Result r : results) {
        if (r.getScore() != Float.NEGATIVE_INFINITY) {
            String sentence = r.getAnswer();
            // the conjunctions themselves are consumed by the split
            String[] sentences = sentence.split("(\\b(although|but|how|until|what|when|where|which|who|whom|why)\\b)");
            if (sentences.length != 0) {
                r.setAnswer(sentences[0]);
                rawResults.add(r);
                for (int s = 1; s < sentences.length; s++) {
                    Result newRes = new Result(sentences[s], r.getQuery(), r.getDocID(), r.getHitPos());
                    newRes.setScore(r.getScore());
                    rawResults.add(newRes);
                }
            } else {
                // split() yields an empty array if the sentence consists of
                // conjunctions only; keep the original result in that case
                rawResults.add(r);
            }
        }
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}
Also used : ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result)
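
Note that Java's String.split never returns captured delimiters, so the conjunctions themselves disappear from the fragments. A quick demonstration on a made-up sentence (class name hypothetical):

public class SubclauseSplitDemo {
    public static void main(String[] args) {
        // the matched conjunction is discarded even though the pattern
        // wraps it in a capturing group
        String sentence = "The dam held, although engineers warned that it was old.";
        String[] parts = sentence.split("(\\b(although|but|how|until|what|when|where|which|who|whom|why)\\b)");
        for (String part : parts)
            System.out.println("[" + part.trim() + "]");
        // [The dam held,]
        // [engineers warned that it was old.]
    }
}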

Example 28 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

the class TripletFilter method apply.

/**
	 * Increments the score of each result snippet according to the number of
	 * NP-VP-NP triplets it is the first to contain. This is meant to prefer
	 * snippets that provide new information over those that repeat information
	 * from previous snippets.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return modified array of <code>Result</code> objects
	 */
public Result[] apply(Result[] results) {
    // raw results returned by the searchers
    ArrayList<Result> rawResults = new ArrayList<Result>();
    HashSet<String> found = new HashSet<String>();
    for (Result r : results) {
        if (r.getScore() != Float.NEGATIVE_INFINITY) {
            String stemmedQuestion = SnowballStemmer.stemAllTokens(r.getQuery().getAnalyzedQuestion().getQuestion());
            String text = r.getAnswer();
            //	tokenize and tag sentence
            if (!text.endsWith("."))
                text += ".";
            String[] sentence = OpenNLP.tokenize(text);
            String[] posTags = OpenNLP.tagPos(sentence);
            String[] chunkTags = OpenNLP.tagChunks(sentence, posTags);
            chunkTags = OpenNLP.joinNounPhrases(sentence, chunkTags);
            int tripStart = -1;
            int index = 0;
            int numberOfTriplets = 0;
            //	scan sentence for NP-VP-NP triplets
            while (index < sentence.length) {
                //	find start of first NP
                while ((index < sentence.length) && !"B-NP".equals(chunkTags[index])) index++;
                if (index < sentence.length) {
                    tripStart = index;
                    int i = 1;
                    //	find start of VP; an intervening noun phrase or
                    //	unchunked token (tag "O") aborts the scan by setting
                    //	i = sentence.length
                    while (((index + i) < sentence.length) && !"B-VP".equals(chunkTags[index + i])) {
                        if ("B-NP".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else if ("O".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else
                            i++;
                    }
                    i++;
                    //	find start of second NP; a verb phrase, an unchunked
                    //	token or a subordinate clause (tag "B-SBAR") aborts
                    //	the scan
                    while (((index + i) < sentence.length) && !"B-NP".equals(chunkTags[index + i])) {
                        if ("B-VP".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else if ("O".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else if ("B-SBAR".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else
                            i++;
                    }
                    //	complete second NP
                    i++;
                    while (((index + i) < sentence.length) && "I-NP".equals(chunkTags[index + i])) i++;
                    //	remember NP-VP-NP triplet
                    if ((index + i) < sentence.length) {
                        String trip = "";
                        for (int s = tripStart; s < (tripStart + i); s++) trip += " " + sentence[s];
                        trip = SnowballStemmer.stemAllTokens(trip.trim());
                        if (!found.contains(trip)) {
                            found.add(trip);
                            if (!StringUtils.isSubsetKeywords(trip, stemmedQuestion)) {
                                //System.out.println("Triplet:\n  " + trip);
                                //									Result newRes = new Result(trip, r.getQuery(), r.getDocID(), r.getHitPos());
                                //									newRes.setScore(r.getScore() + 1);
                                //									rawResults.add(newRes);
                                numberOfTriplets++;
                            }
                        }
                    //							if (!StringUtils.isSubsetKeywords(trip, r.getQuery().getQuestion())) {
                    //								if (resultsByTriplets.containsKey(trip)) {
                    //									Result res = resultsByTriplets.get(trip);
                    //									res.setScore(res.getScore() + 1);
                    //								} else resultsByTriplets.put(trip, r);
                    //							}
                    }
                    index++;
                }
            }
            if (numberOfTriplets != 0) {
                //	20060724_2x runs
                r.incScore(numberOfTriplets);
                //					r.incScore(numberOfTriplets * (((float) results.length) / ((float) sentence.length)));	//	20060725_0x runs
                rawResults.add(r);
            }
        }
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}
Also used : ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result) HashSet(java.util.HashSet)
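
The scan is easiest to follow on a hand-labeled example. The tokens and chunk tags below are made up for illustration (real tags come from OpenNLP.tagChunks followed by joinNounPhrases), and the walk-through mirrors the index arithmetic in TripletFilter.apply:

public class TripletScanDemo {
    public static void main(String[] args) {
        // hand-labeled tokens and chunk tags, not real OpenNLP output
        String[] sentence  = { "The", "board", "approved", "the", "new", "budget", "yesterday", "." };
        String[] chunkTags = { "B-NP", "I-NP", "B-VP", "B-NP", "I-NP", "I-NP", "O", "O" };
        // tripStart = 0 (first B-NP); i = 2 hits B-VP (an intervening B-NP or
        // "O" tag would abort); i = 3 hits the second B-NP; its I-NP run ends
        // at index 6, so the triplet spans tokens 0..5
        int end = 3;
        while (end < chunkTags.length
                && ("B-NP".equals(chunkTags[end]) || "I-NP".equals(chunkTags[end]))) end++;
        StringBuilder trip = new StringBuilder();
        for (int s = 0; s < end; s++) trip.append(" ").append(sentence[s]);
        System.out.println(trip.toString().trim());
        // prints: The board approved the new budget
    }
}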

Example 29 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

the class TruncationFilter method apply.

/**
	 * Filters an array of <code>Result</code> objects.
	 * 
	 * @param results results to filter
	 * @return filtered results
	 */
public Result[] apply(Result[] results) {
    // all results that pass the filter
    ArrayList<Result> filtered = new ArrayList<Result>();
    // for each extractor, truncated answers and corresponding results
    Hashtable<String, Hashtable<String, Result>> truncated = new Hashtable<String, Hashtable<String, Result>>();
    // sort results by their scores in descending order
    results = (new ScoreSorterFilter()).apply(results);
    for (Result result : results) {
        // only truncate factoid answers; results with a non-positive or
        // infinite score are passed through unchanged
        if (result.getScore() <= 0 || result.getScore() == Float.POSITIVE_INFINITY) {
            filtered.add(result);
            continue;
        }
        // make sure that answers come from a single extractor
        String[] extractors = result.getExtractionTechniques();
        if (extractors == null || extractors.length != 1) {
            filtered.add(result);
            continue;
        }
        String extractor = extractors[0];
        // truncate result
        result = apply(result);
        // merge with similar results from same extractor
        Hashtable<String, Result> truncatedT = truncated.get(extractor);
        if (truncatedT == null) {
            truncatedT = new Hashtable<String, Result>();
            truncated.put(extractor, truncatedT);
        }
        String norm = StringUtils.normalize(result.getAnswer());
        Result similar = truncatedT.get(norm);
        if (similar == null) {
            filtered.add(result);
            truncatedT.put(norm, result);
        } else {
            // the results are score-sorted, so the retained result carries
            // the highest score; fold the duplicate's score into it
            similar.incScore(result.getScore());
        }
    }
    return filtered.toArray(new Result[filtered.size()]);
}
Also used : Hashtable(java.util.Hashtable) ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result)
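
Because the results are score-sorted first, the first occurrence of each normalized answer is also the highest-scored one; later duplicates only contribute their scores. A toy version of that merge pattern (lower-casing stands in for StringUtils.normalize, which is an assumption for illustration):

import java.util.LinkedHashMap;
import java.util.Map;

public class MergeByNormalizedAnswer {
    public static void main(String[] args) {
        String[] answers = { "Mount Everest", "mount everest", "K2" };
        double[] scores  = { 3.0, 2.0, 1.5 };           // already sorted descending
        Map<String, Double> merged = new LinkedHashMap<>();
        for (int i = 0; i < answers.length; i++) {
            String norm = answers[i].toLowerCase();     // stand-in for StringUtils.normalize
            merged.merge(norm, scores[i], Double::sum); // first kept, duplicates fold in
        }
        System.out.println(merged); // {mount everest=5.0, k2=1.5}
    }
}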

Example 30 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

the class WebDocumentFetcher method apply.

/**
	 * Fetches the top <code>MAX_DOCS</code> documents containing the given
	 * search engine snippets. The original snippets are dropped.
	 * 
	 * @param results array of <code>Result</code> objects containing snippets
	 * @return array of <code>Result</code> objects containing entire documents
	 */
public Result[] apply(Result[] results) {
    // documents containing the search engine snippets
    docs = new ArrayList<Result>();
    // start document fetchers
    HashSet<String> urls = new HashSet<String>();
    for (Result result : results) {
        // only apply this filter to unscored search engine snippets (score
        // Float.NEGATIVE_INFINITY) produced for the semantic parsing approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
            continue;
        // if result is not a web document then just make a copy
        if (!result.getDocID().contains(":")) {
            Result newResult = result.getCopy();
            newResult.setScore(0);
            docs.add(newResult);
            continue;
        }
        // fetch at most MAX_DOCS documents
        if (urls.size() >= MAX_DOCS)
            break;
        String url = result.getDocID();
        // no forbidden document type
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
            continue;
        // only HTTP connections; openConnection() does not perform any
        // network I/O, so this check is cheap
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection))
                continue;
        } catch (IOException e) {
            continue;
        }
        // no duplicate document
        if (!urls.add(url))
            continue;
        // if caching is enabled, try to read document from cache
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            String[] entries = cache.read(url);
            if (entries != null) {
                StringBuilder sb = new StringBuilder();
                for (String entry : entries) {
                    sb.append(entry);
                    sb.append("\n");
                }
                String docText = sb.toString();
                Result doc = new Result(docText, result.getQuery(), url, result.getHitPos());
                doc.setScore(0);
                docs.add(doc);
                continue;
            }
        }
        (new WebDocumentFetcher()).start(this, result);
    }
    // wait until all fetchers are done
    waitForDocs();
    // keep old results and append the fetched documents
    Result[] newResults = docs.toArray(new Result[docs.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    System.arraycopy(results, 0, allResults, 0, results.length);
    System.arraycopy(newResults, 0, allResults, results.length, newResults.length);
    return allResults;
}
Also used : Query(info.ephyra.querygeneration.Query) IOException(java.io.IOException) HttpURLConnection(java.net.HttpURLConnection) URLConnection(java.net.URLConnection) URL(java.net.URL) Result(info.ephyra.search.Result) Predicate(info.ephyra.nlp.semantics.Predicate) FileCache(info.ephyra.util.FileCache) HttpURLConnection(java.net.HttpURLConnection) HashSet(java.util.HashSet)
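
A side note on the HTTP check above: URL.openConnection() only constructs a protocol handler and performs no network I/O, so the instanceof test is a cheap way to reject non-HTTP schemes. A standalone sketch of that guard (class and method names are made up):

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

public class HttpOnlyCheck {
    // true only for URLs whose connection would be HTTP(S); HttpsURLConnection
    // extends HttpURLConnection, so https URLs pass as well
    static boolean isHttp(String url) {
        try {
            URLConnection conn = (new URL(url)).openConnection();
            return conn instanceof HttpURLConnection;
        } catch (IOException e) {
            // also covers MalformedURLException for unparseable URLs
            return false;
        }
    }

    public static void main(String[] args) {
        System.out.println(isHttp("https://example.org/a.html")); // true
        System.out.println(isHttp("ftp://example.org/a.txt"));    // false
    }
}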

Aggregations

Result (info.ephyra.search.Result) 68
ArrayList (java.util.ArrayList) 36
Query (info.ephyra.querygeneration.Query) 11
HashSet (java.util.HashSet) 9
Hashtable (java.util.Hashtable) 9
AnalyzedQuestion (info.ephyra.questionanalysis.AnalyzedQuestion) 8
IOException (java.io.IOException) 7
QuestionInterpretation (info.ephyra.questionanalysis.QuestionInterpretation) 5
Feature (edu.cmu.minorthird.classify.Feature) 4
HashMap (java.util.HashMap) 4
Predicate (info.ephyra.nlp.semantics.Predicate) 3
BagOfWordsG (info.ephyra.querygeneration.generators.BagOfWordsG) 3
BufferedReader (java.io.BufferedReader) 3
File (java.io.File) 3
URL (java.net.URL) 3
TRECTarget (info.ephyra.trec.TRECTarget) 2
EOFException (java.io.EOFException) 2
FileInputStream (java.io.FileInputStream) 2
FileOutputStream (java.io.FileOutputStream) 2
InputStreamReader (java.io.InputStreamReader) 2