use of info.ephyra.search.Result in project lucida by claritylab.
the class SentenceSplitterFilter method apply.
/**
 * Splits long snippets into individual sentences in order to facilitate
 * subsequent filtering. The idea is that redundancy detection is easier for
 * shorter snippets than for longer ones.
 * <p>
 * Snippets are first split on periods (with a repair step that re-joins
 * fragments cut in half at an abbreviation), then on each of the remaining
 * delimiter patterns in turn. Results scored
 * <code>Float.NEGATIVE_INFINITY</code> are dropped.
 *
 * @param results array of <code>Result</code> objects
 * @return extended array of <code>Result</code> objects
 */
public Result[] apply(Result[] results) {
	// first pass: split on "." and repair abbreviations that were cut
	results = split(results, "\\.", true);
	// remaining delimiter patterns, applied in the original order
	String[] delimiters = {"\\?|\\!", "\\;", "\\-\\-", "\\.\\'\\'", ":"};
	for (String delimiter : delimiters)
		results = split(results, delimiter, false);
	return results;
}

/**
 * Splits the answer string of each result on the given delimiter pattern.
 * The first fragment replaces the original answer; every further fragment
 * becomes a new <code>Result</code> with the same query, document ID, hit
 * position and score. Results scored <code>Float.NEGATIVE_INFINITY</code>,
 * and results whose answer consists of delimiters only (the split yields no
 * fragments), are dropped.
 *
 * @param results results to split
 * @param regex delimiter pattern
 * @param repairAbbreviations re-join fragments cut at an abbreviation
 * @return split results
 */
private Result[] split(Result[] results, String regex,
		boolean repairAbbreviations) {
	ArrayList<Result> splitResults = new ArrayList<Result>();
	for (Result r : results) {
		if (r.getScore() == Float.NEGATIVE_INFINITY) continue;
		String[] sentences = r.getAnswer().split(regex);
		if (sentences.length == 0) continue;  // answer was delimiters only
		if (repairAbbreviations)
			sentences = rejoinAbbreviations(sentences);
		// first fragment replaces the original answer ...
		r.setAnswer(sentences[0]);
		splitResults.add(r);
		// ... further fragments become new results with the same metadata
		for (int s = 1; s < sentences.length; s++) {
			Result newRes = new Result(sentences[s], r.getQuery(),
					r.getDocID(), r.getHitPos());
			newRes.setScore(r.getScore());
			splitResults.add(newRes);
		}
	}
	return splitResults.toArray(new Result[splitResults.size()]);
}

/**
 * Re-joins fragments that were cut at an abbreviation rather than at a true
 * sentence boundary. A fragment is joined to its successor if its last
 * token is shorter than 3 characters or contains no vowels (e.g. "Dr",
 * "Mrs"), which suggests an abbreviation.
 *
 * @param fragments fragments produced by splitting on "."
 * @return fragments with abbreviations re-joined
 */
private String[] rejoinAbbreviations(String[] fragments) {
	ArrayList<String> sentenceList = new ArrayList<String>();
	String sen = fragments[0];
	for (int s = 1; s < fragments.length; s++) {
		String end = sen.substring(sen.lastIndexOf(" ") + 1).toLowerCase();
		// BUG FIX: the original pattern "(^[aeiouy])++" used '^' as an
		// anchor inside the group and thus matched only single vowels
		// (already covered by the length check); "[^aeiouy]++" matches
		// vowel-less tokens such as "mrs", the intended heuristic.
		if ((end.length() < 3) || end.matches("[^aeiouy]++"))
			sen = sen + ". " + fragments[s];
		else {
			sentenceList.add(sen);
			sen = fragments[s];
		}
	}
	sentenceList.add(sen);
	return sentenceList.toArray(new String[sentenceList.size()]);
}
use of info.ephyra.search.Result in project lucida by claritylab.
the class SerializationFilter method apply.
/**
 * Serializes the results to the configured output file and passes them
 * through unchanged. If the file already exists, a numeric suffix
 * ("_2", "_3", ...) is appended until an unused name is found.
 *
 * @param results results to filter
 * @return filtered results
 */
public Result[] apply(Result[] results) {
	// output file set?
	if (serialFile == null)
		return results;
	// modify file name if file already exists
	// (comment this out to replace existing files)
	// local renamed to 'target' — the original shadowed the field 'serialFile'
	String path = serialFile.getPath();
	File target = new File(path);
	if (target.exists()) {
		path = target.getPath() + "_2";
		target = new File(path);
		int i = 2;
		while (target.exists()) {
			path = target.getPath();
			// bump the trailing "_i" suffix to "_(i+1)"
			path = path.replaceFirst("_" + i + "$", "_" + ++i);
			target = new File(path);
		}
	}
	// serialize results; the stream is closed in a finally block so the
	// file handle is not leaked when writeObject() throws (the original
	// skipped oos.close() on exception)
	try {
		FileOutputStream fos = new FileOutputStream(target);
		try {
			ObjectOutputStream oos = new ObjectOutputStream(fos);
			for (Result result : results) oos.writeObject(result);
			oos.close();
		} finally {
			fos.close();  // harmless if oos.close() already closed it
		}
	} catch (IOException e) {
		MsgPrinter.printErrorMsg("Could not write serialized results:");
		MsgPrinter.printErrorMsg(e.toString());
		// NOTE(review): terminating the JVM from a filter is drastic —
		// consider propagating the error; kept to preserve behavior
		System.exit(1);
	}
	return results;
}
use of info.ephyra.search.Result in project lucida by claritylab.
the class TermFilter method apply.
/**
 * Filters out snippets that are likely to contain the answer to a
 * previously asked factoid or list question. This is to prevent wasting
 * result length with information redundant to the factoid and list
 * questions.
 * <p>
 * A result is kept only if it contributes at least one new key term: a
 * stemmed term of length &gt; 1 that has not been seen before (neither in
 * <code>previousResultTerms</code> nor earlier in this result), is not part
 * of the question, and is not a function word.
 *
 * @param results array of <code>Result</code> objects
 * @return filtered array of <code>Result</code> objects
 */
public Result[] apply(Result[] results) {
	// count how many times each stemmed term occurs across all results
	// (counts are currently unused for filtering but kept for parity with
	// the original implementation's data gathering)
	HashMap<String, Integer> termCounters = new HashMap<String, Integer>();
	for (Result r : results) {
		String[] tokens = NETagger.tokenize(r.getAnswer());
		for (int i = 0; i < tokens.length; i++) {
			String term = SnowballStemmer.stem(tokens[i].toLowerCase());
			if (term.length() > 1) {
				// autoboxing instead of the deprecated 'new Integer(...)'
				Integer count = termCounters.get(term);
				termCounters.put(term, (count == null) ? 1 : (count + 1));
			}
		}
	}
	ArrayList<Result> rawResults = new ArrayList<Result>();
	HashSet<String> found = new HashSet<String>();
	found.addAll(previousResultTerms);
	for (Result r : results) {
		if (r.getScore() == Float.NEGATIVE_INFINITY) continue;
		String[] tokens = NETagger.tokenize(r.getAnswer());
		// loop-invariant: question string fetched once per result
		String question = r.getQuery().getAnalyzedQuestion().getQuestion();
		int numberOfTerms = 0;
		HashSet<String> resFound = new HashSet<String>();
		for (int i = 0; i < tokens.length; i++) {
			String term = SnowballStemmer.stem(tokens[i].toLowerCase());
			if (found.contains(term) || resFound.contains(term)) continue;
			resFound.add(term);
			// key term: longer than one char, not part of the question,
			// not a function word (the original evaluated this condition
			// twice and kept an unused 'numberOfKeyTerms' counter that was
			// referenced only by commented-out code — both removed)
			if ((term.length() > 1)
					&& !StringUtils.isSubsetKeywords(term, question)
					&& !FunctionWords.lookup(term))
				numberOfTerms++;
		}
		if (numberOfTerms != 0)
			rawResults.add(r);
	}
	return rawResults.toArray(new Result[rawResults.size()]);
}
use of info.ephyra.search.Result in project lucida by claritylab.
the class TermImportanceFilter method apply.
/**
 * Increments the score of each result snippet for each word in it according
 * to the number of result snippets containing this particular word. This is
 * sort of a centrality measure, which favors snippets that provide
 * information given frequently and thus likely to be more important with
 * regard to the target.
 * <p>
 * Results scored <code>Float.NEGATIVE_INFINITY</code> or gaining no
 * importance are dropped.
 *
 * @param results array of <code>Result</code> objects
 * @return filtered array of <code>Result</code> objects
 */
public Result[] apply(Result[] results) {
	// count occurrences of each stemmed term across all valid results
	// (the original also accumulated an unused 'lengthSum' — removed)
	HashMap<String, Integer> termCounters = new HashMap<String, Integer>();
	ArrayList<Result> rawResults = new ArrayList<Result>();
	for (Result r : results) {
		if (r.getScore() == Float.NEGATIVE_INFINITY) continue;
		String[] tokens = NETagger.tokenize(r.getAnswer());
		for (int i = 0; i < tokens.length; i++) {
			String term = SnowballStemmer.stem(tokens[i].toLowerCase());
			if (term.length() > 1) {
				// autoboxing instead of the deprecated 'new Integer(...)'
				Integer count = termCounters.get(term);
				termCounters.put(term, (count == null) ? 1 : (count + 1));
			}
		}
	}
	// loop-invariant threshold, hoisted out of the per-token loop
	// NOTE(review): results.length / 100 is integer division, so this is
	// floor(sqrt(floor(results.length / 100))) — 0 for fewer than 100
	// results; confirm the integer division is intended
	double countThreshold = Math.floor(Math.sqrt(results.length / 100));
	for (Result r : results) {
		if (r.getScore() == Float.NEGATIVE_INFINITY) continue;
		String[] tokens = NETagger.tokenize(r.getAnswer());
		// loop-invariant: question string fetched once per result
		String question = r.getQuery().getAnalyzedQuestion().getQuestion();
		float importance = 0;
		for (int i = 0; i < tokens.length; i++) {
			String term = tokens[i];
			if ((term.length() > 1)
					&& !StringUtils.isSubsetKeywords(term, question)
					&& !FunctionWords.lookup(term)) {
				term = SnowballStemmer.stem(term.toLowerCase());
				Integer count = termCounters.get(term);
				int c = (count == null) ? 0 : count.intValue();
				// only sufficiently frequent terms contribute importance
				if (c > countThreshold)
					importance += c;
			}
		}
		if (importance > 0) {
			r.incScore(importance);
			rawResults.add(r);
		}
	}
	return rawResults.toArray(new Result[rawResults.size()]);
}
use of info.ephyra.search.Result in project lucida by claritylab.
the class UnnecessaryCharactersFilter method apply.
/**
 * Strips unnecessary characters — single quotes, double quotes, backquotes
 * and underscores — from the answer strings of all valid results.
 *
 * @param results array of <code>Result</code> objects
 * @return the same array with cleaned answer strings
 */
public Result[] apply(Result[] results) {
	for (Result result : results) {
		// skip results that have been marked as dropped
		if (result.getScore() == Float.NEGATIVE_INFINITY) continue;
		String cleaned = result.getAnswer().replaceAll("(\\'|\\\"|\\`|\\_)", "");
		result.setAnswer(cleaned);
	}
	return results;
}
Aggregations