Search in sources :

Example 56 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

The class SentenceSplitterFilter, method apply.

/**
	 * Splits long snippets into individual sentences in order to facilitate
	 * subsequent filtering. The idea is that redundancy detection is easier for
	 * shorter snippets than for longer ones.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return extended array of <code>Result</code> objects
	 */
public Result[] apply(Result[] results) {
    // Pass 1 splits on '.' with special handling for cut abbreviations; the
    // remaining delimiter passes share one helper instead of five verbatim
    // copies of the same loop (the original duplicated this loop six times).
    Result[] split = splitOnPeriods(results);
    split = splitOn(split, "\\?|\\!");
    split = splitOn(split, "\\;");
    split = splitOn(split, "\\-\\-");
    split = splitOn(split, "\\.\\'\\'");
    split = splitOn(split, ":");
    return split;
}

/**
 * Splits each valid result (score != -infinity) on periods, re-joining
 * fragments whose last word looks like a cut abbreviation.
 *
 * @param results results to split
 * @return results split into sentence-sized snippets
 */
private Result[] splitOnPeriods(Result[] results) {
    ArrayList<Result> rawResults = new ArrayList<Result>();
    for (Result r : results) {
        // results marked with -infinity score are dropped, as before
        if (r.getScore() == Float.NEGATIVE_INFINITY)
            continue;
        String[] sentences = r.getAnswer().split("\\.");
        if (sentences.length == 0)
            continue;
        // re-join cut abbreviations
        ArrayList<String> sentenceList = new ArrayList<String>();
        String sen = sentences[0];
        for (int s = 1; s < sentences.length; s++) {
            String end = sen.substring(sen.lastIndexOf(" ") + 1).toLowerCase();
            // NOTE(review): "(^[aeiouy])++" can only ever match a string that
            // is a single vowel; "[^aeiouy]++" (a vowel-less abbreviation like
            // "dr", "mr") may have been intended. Kept as-is to preserve
            // behavior - confirm against the original intent.
            if ((end.length() < 3) || end.matches("(^[aeiouy])++"))
                sen = sen + ". " + sentences[s];
            else {
                sentenceList.add(sen);
                sen = sentences[s];
            }
        }
        sentenceList.add(sen);
        collectSplits(r, sentenceList.toArray(new String[sentenceList.size()]), rawResults);
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}

/**
 * Splits each valid result (score != -infinity) on the given delimiter
 * pattern.
 *
 * @param results results to split
 * @param regex delimiter pattern passed to <code>String.split()</code>
 * @return results split into smaller snippets
 */
private Result[] splitOn(Result[] results, String regex) {
    ArrayList<Result> rawResults = new ArrayList<Result>();
    for (Result r : results) {
        if (r.getScore() == Float.NEGATIVE_INFINITY)
            continue;
        String[] sentences = r.getAnswer().split(regex);
        if (sentences.length == 0)
            continue;
        collectSplits(r, sentences, rawResults);
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}

/**
 * Reuses the original result for the first fragment and creates new
 * <code>Result</code> objects (same query, document, hit position and score)
 * for the remaining fragments.
 *
 * @param r original result
 * @param sentences fragments obtained by splitting the answer
 * @param out list collecting the resulting snippets
 */
private void collectSplits(Result r, String[] sentences, ArrayList<Result> out) {
    r.setAnswer(sentences[0]);
    out.add(r);
    for (int s = 1; s < sentences.length; s++) {
        Result newRes = new Result(sentences[s], r.getQuery(), r.getDocID(), r.getHitPos());
        newRes.setScore(r.getScore());
        out.add(newRes);
    }
}
Also used : ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result)

Example 57 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

The class SerializationFilter, method apply.

/**
	 * Serializes the results to the configured output file and passes them on
	 * unchanged. If the file already exists, a numeric suffix is appended to
	 * obtain an unused file name.
	 * 
	 * @param results results to filter
	 * @return filtered results
	 */
public Result[] apply(Result[] results) {
    // output file set?
    if (serialFile == null)
        return results;
    // Modify the file name if the file already exists: append "_2", then
    // keep bumping the numeric suffix until the name is unused.
    // (Comment this out to replace existing files.)
    // NOTE(review): the original declared a local "File serialFile" that
    // shadowed the field of the same name; renamed to "target" for clarity.
    String path = serialFile.getPath();
    File target = new File(path);
    if (target.exists()) {
        path = target.getPath() + "_2";
        target = new File(path);
        int i = 2;
        while (target.exists()) {
            path = target.getPath();
            path = path.replaceFirst("_" + i + "$", "_" + ++i);
            target = new File(path);
        }
    }
    // serialize results
    try {
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(target));
        try {
            for (Result result : results) oos.writeObject(result);
        } finally {
            // close the stream even if writing fails (the original only
            // closed it on the success path and leaked it otherwise)
            oos.close();
        }
    } catch (IOException e) {
        MsgPrinter.printErrorMsg("Could not write serialized results:");
        MsgPrinter.printErrorMsg(e.toString());
        System.exit(1);
    }
    return results;
}
Also used : FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) ObjectOutputStream(java.io.ObjectOutputStream) File(java.io.File) Result(info.ephyra.search.Result)

Example 58 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

The class TermFilter, method apply.

/**
	 * Filters out snippets that are likely to contain the answer to a
	 * previously asked factoid or list question. This is to prevent wasting
	 * result length with information redundant to the factoid and list
	 * questions.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return filtered array of <code>Result</code> objects
	 */
public Result[] apply(Result[] results) {
    // 1st pass: count how many times each stemmed term occurs across ALL
    // result snippets (terms of length <= 1 are ignored)
    HashMap<String, Integer> termCounters = new HashMap<String, Integer>();
    for (Result r : results) {
        String[] sentence = NETagger.tokenize(r.getAnswer());
        for (int i = 0; i < sentence.length; i++) {
            String term = SnowballStemmer.stem(sentence[i].toLowerCase());
            if (term.length() > 1) {
                Integer count = termCounters.get(term);
                termCounters.put(term, Integer.valueOf(count == null ? 1 : count.intValue() + 1));
            }
        }
    }
    // 2nd pass: keep only results contributing at least one novel content
    // term (not seen in previous results, not a question keyword, not a
    // function word)
    ArrayList<Result> rawResults = new ArrayList<Result>();
    HashSet<String> found = new HashSet<String>();
    found.addAll(previousResultTerms);
    for (Result r : results) {
        if (r.getScore() == Float.NEGATIVE_INFINITY)
            continue;
        String[] sentence = NETagger.tokenize(r.getAnswer());
        int numberOfTerms = 0;
        // key terms: novel content terms occurring in more than 1% of all
        // results; currently only read by the commented-out condition below
        int numberOfKeyTerms = 0;
        HashSet<String> resFound = new HashSet<String>();
        for (int i = 0; i < sentence.length; i++) {
            String term = SnowballStemmer.stem(sentence[i].toLowerCase());
            if (found.contains(term) || resFound.contains(term))
                continue;
            resFound.add(term);
            // evaluate the (expensive) content-term predicate only once per
            // term; the original computed the identical expression twice
            boolean contentTerm = (term.length() > 1) && !StringUtils.isSubsetKeywords(term, r.getQuery().getAnalyzedQuestion().getQuestion()) && !FunctionWords.lookup(term);
            if (contentTerm) {
                numberOfTerms++;
                // count only terms contained in at least one percent of the results
                Integer count = termCounters.get(term);
                if ((count == null ? 0 : count.intValue()) > (results.length / 100))
                    numberOfKeyTerms++;
            }
        }
        //30.50% freeze	if ((numberOfTerms > (1 + results.length / 100)) && (numberOfKeyTerms != 0)) {
        if (numberOfTerms != 0) {
            rawResults.add(r);
        }
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result) HashSet(java.util.HashSet)

Example 59 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

The class TermImportanceFilter, method apply.

/**
	 * Increments the score of each result snippet for each word in it according
	 * to the number of result snippets containing this particular word. This is
	 * sort of a centrality measure, which favors snippets that provide
	 * information given frequently and thus likely to be more important with
	 * regard to the target.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return filtered array of <code>Result</code> objects
	 */
public Result[] apply(Result[] results) {
    // 1st pass: count occurrences of each stemmed term across all valid
    // results (score != -infinity); terms of length <= 1 are ignored.
    // (The original also accumulated an unused "lengthSum"; removed.)
    HashMap<String, Integer> termCounters = new HashMap<String, Integer>();
    ArrayList<Result> rawResults = new ArrayList<Result>();
    for (Result r : results) {
        if (r.getScore() == Float.NEGATIVE_INFINITY)
            continue;
        String[] sentence = NETagger.tokenize(r.getAnswer());
        for (int i = 0; i < sentence.length; i++) {
            String term = SnowballStemmer.stem(sentence[i].toLowerCase());
            if (term.length() > 1) {
                Integer count = termCounters.get(term);
                termCounters.put(term, Integer.valueOf(count == null ? 1 : count.intValue() + 1));
            }
        }
    }
    // 2nd pass: sum the counts of sufficiently frequent content terms and
    // use the sum as a score increment; results without any such term are
    // dropped
    for (Result r : results) {
        if (r.getScore() == Float.NEGATIVE_INFINITY)
            continue;
        String[] sentence = NETagger.tokenize(r.getAnswer());
        float importance = 0;
        for (int i = 0; i < sentence.length; i++) {
            String term = sentence[i];
            if ((term.length() > 1) && !StringUtils.isSubsetKeywords(term, r.getQuery().getAnalyzedQuestion().getQuestion()) && !FunctionWords.lookup(term)) {
                term = SnowballStemmer.stem(term.toLowerCase());
                Integer boxed = termCounters.get(term);
                int count = (boxed == null ? 0 : boxed.intValue());
                // NOTE(review): results.length / 100 is integer division, so
                // the threshold is 0 for fewer than 100 results; possibly
                // results.length / 100.0 was intended - kept to preserve
                // behavior.
                if (count > Math.floor(Math.sqrt(results.length / 100)))
                    importance += count;
            }
        }
        if (importance > 0) {
            r.incScore(importance);
            rawResults.add(r);
        }
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Result(info.ephyra.search.Result)

Example 60 with Result

use of info.ephyra.search.Result in project lucida by claritylab.

The class UnnecessaryCharactersFilter, method apply.

/**
 * Strips quote, backtick and underscore characters from the answer string of
 * every valid result snippet, modifying the results in place.
 *
 * @param results array of <code>Result</code> objects
 * @return the same array with cleaned answer strings
 */
public Result[] apply(Result[] results) {
    for (int idx = 0; idx < results.length; idx++) {
        Result current = results[idx];
        // skip results that have been marked as discarded
        if (current.getScore() == Float.NEGATIVE_INFINITY)
            continue;
        String cleaned = current.getAnswer().replaceAll("(\\'|\\\"|\\`|\\_)", "");
        current.setAnswer(cleaned);
    }
    return results;
}
Also used : Result(info.ephyra.search.Result)

Aggregations

Result (info.ephyra.search.Result)68 ArrayList (java.util.ArrayList)36 Query (info.ephyra.querygeneration.Query)11 HashSet (java.util.HashSet)9 Hashtable (java.util.Hashtable)9 AnalyzedQuestion (info.ephyra.questionanalysis.AnalyzedQuestion)8 IOException (java.io.IOException)7 QuestionInterpretation (info.ephyra.questionanalysis.QuestionInterpretation)5 Feature (edu.cmu.minorthird.classify.Feature)4 HashMap (java.util.HashMap)4 Predicate (info.ephyra.nlp.semantics.Predicate)3 BagOfWordsG (info.ephyra.querygeneration.generators.BagOfWordsG)3 BufferedReader (java.io.BufferedReader)3 File (java.io.File)3 URL (java.net.URL)3 TRECTarget (info.ephyra.trec.TRECTarget)2 EOFException (java.io.EOFException)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 InputStreamReader (java.io.InputStreamReader)2