use of info.ephyra.search.Result in project lucida by claritylab.
the class ScoreNormalizationFilter method addMaxScoreFeature.
/**
* Adds the maximum score of all factoid answers from the same extractor as
* a feature to the instance.
*/
private static void addMaxScoreFeature(MutableInstance instance, Result result, Result[] results) {
    // calculate maximum score
    double maxScore = 0;
    // String extractor = result.getExtractionTechniques()[0];
    for (Result r : results)
        if (r.getScore() > 0 && r.getScore() < Float.POSITIVE_INFINITY)
            // if (r.extractedWith(extractor))
            maxScore = Math.max(r.getScore(), maxScore);
    Feature feature = new Feature(MAX_SCORE_F);
    instance.addNumeric(feature, maxScore);
}
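The loop above reduces to taking the maximum over all positive, finite scores in the result array. Below is a minimal, self-contained sketch of that reduction over plain double values; the class name, helper method, and sample scores are illustrative and not part of the Ephyra API.

/** Minimal sketch: maximum over positive, finite scores (illustrative, not Ephyra API). */
public class MaxScoreSketch {

    // mirrors the filter's loop: ignore non-positive and infinite scores
    static double maxScore(double[] scores) {
        double max = 0;
        for (double s : scores)
            if (s > 0 && s < Double.POSITIVE_INFINITY)
                max = Math.max(s, max);
        return max;
    }

    public static void main(String[] args) {
        double[] scores = { 0.4, -1.0, Double.POSITIVE_INFINITY, 2.5, 0.0 };
        System.out.println(maxScore(scores)); // prints 2.5
    }
}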
use of info.ephyra.search.Result in project lucida by claritylab.
the class SubclauseSplitterFilter method apply.
/**
* Splits sentences into individual subclauses in order to facilitate
* subsequent filtering. The idea is that redundancy detection is easier for
* shorter snippets than for longer ones.
*
* @param results array of <code>Result</code> objects
* @return extended array of <code>Result</code> objects
*/
public Result[] apply(Result[] results) {
    // raw results returned by the searchers
    ArrayList<Result> rawResults = new ArrayList<Result>();
    for (Result r : results) {
        if (r.getScore() != Float.NEGATIVE_INFINITY) {
            String sentence = r.getAnswer();
            String[] sentences = sentence.split("(\\b(although|but|how|until|what|when|where|which|who|whom|why)\\b)");
            if (sentences.length != 0) {
                r.setAnswer(sentences[0]);
                rawResults.add(r);
                for (int s = 1; s < sentences.length; s++) {
                    Result newRes = new Result(sentences[s], r.getQuery(), r.getDocID(), r.getHitPos());
                    newRes.setScore(r.getScore());
                    rawResults.add(newRes);
                }
            } else {
                rawResults.add(r);
            }
        }
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}
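The split pattern uses common subordinating and relative words as delimiters, and since String.split does not retain delimiter matches, those words are dropped from the resulting subclauses. A minimal sketch of that behavior, using a made-up sentence:

/** Minimal sketch of the subclause split (example sentence is made up). */
public class SubclauseSplitSketch {

    public static void main(String[] args) {
        String sentence = "The bridge was closed until engineers finished the inspection, "
                + "which took several weeks.";
        String[] parts = sentence.split("(\\b(although|but|how|until|what|when|where|which|who|whom|why)\\b)");
        for (String part : parts)
            System.out.println("[" + part.trim() + "]");
        // prints:
        // [The bridge was closed]
        // [engineers finished the inspection,]
        // [took several weeks.]
    }
}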
use of info.ephyra.search.Result in project lucida by claritylab.
the class TripletFilter method apply.
/**
* Increments the score of each result snippet according to the number of
* NP-VP-NP triplets it is the first to contain. This is meant to prefer
* snippets that provide new information over those that repeat information
* from previous snippets.
*
* @param results array of <code>Result</code> objects
* @return modified array of <code>Result</code> objects
*/
public Result[] apply(Result[] results) {
    // raw results returned by the searchers
    ArrayList<Result> rawResults = new ArrayList<Result>();
    HashSet<String> found = new HashSet<String>();
    for (Result r : results) {
        if (r.getScore() != Float.NEGATIVE_INFINITY) {
            String stemmedQuestion = SnowballStemmer.stemAllTokens(r.getQuery().getAnalyzedQuestion().getQuestion());
            String text = r.getAnswer();
            // tokenize and tag sentence
            if (!text.endsWith("."))
                text += ".";
            String[] sentence = OpenNLP.tokenize(text);
            String[] posTags = OpenNLP.tagPos(sentence);
            String[] chunkTags = OpenNLP.tagChunks(sentence, posTags);
            chunkTags = OpenNLP.joinNounPhrases(sentence, chunkTags);
            int tripStart = -1;
            int index = 0;
            int numberOfTriplets = 0;
            // scan sentence for NP-VP-NP triplets
            while (index < sentence.length) {
                // find start of first NP
                while ((index < sentence.length) && !"B-NP".equals(chunkTags[index])) index++;
                if (index < sentence.length) {
                    tripStart = index;
                    int i = 1;
                    // find start of VP
                    while (((index + i) < sentence.length) && !"B-VP".equals(chunkTags[index + i])) {
                        if ("B-NP".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else if ("O".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else
                            i++;
                    }
                    i++;
                    // find start of second NP
                    while (((index + i) < sentence.length) && !"B-NP".equals(chunkTags[index + i])) {
                        if ("B-VP".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else if ("O".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else if ("B-SBAR".equals(chunkTags[index + i]))
                            i = sentence.length;
                        else
                            i++;
                    }
                    // complete second NP
                    i++;
                    while (((index + i) < sentence.length) && "I-NP".equals(chunkTags[index + i])) i++;
                    // remember NP-VP-NP triplet
                    if ((index + i) < sentence.length) {
                        String trip = "";
                        for (int s = tripStart; s < (tripStart + i); s++) trip += " " + sentence[s];
                        trip = SnowballStemmer.stemAllTokens(trip.trim());
                        if (!found.contains(trip)) {
                            found.add(trip);
                            if (!StringUtils.isSubsetKeywords(trip, stemmedQuestion)) {
                                //System.out.println("Triplet:\n " + trip);
                                // Result newRes = new Result(trip, r.getQuery(), r.getDocID(), r.getHitPos());
                                // newRes.setScore(r.getScore() + 1);
                                // rawResults.add(newRes);
                                numberOfTriplets++;
                            }
                        }
                        // if (!StringUtils.isSubsetKeywords(trip, r.getQuery().getQuestion())) {
                        // if (resultsByTriplets.containsKey(trip)) {
                        // Result res = resultsByTriplets.get(trip);
                        // res.setScore(res.getScore() + 1);
                        // } else resultsByTriplets.put(trip, r);
                        // }
                    }
                    index++;
                }
            }
            if (numberOfTriplets != 0) {
                // 20060724_2x runs
                r.incScore(numberOfTriplets);
                // r.incScore(numberOfTriplets * (((float) results.length) / ((float) sentence.length))); // 20060725_0x runs
                rawResults.add(r);
            }
        }
    }
    return rawResults.toArray(new Result[rawResults.size()]);
}
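Below is a simplified, self-contained sketch of the NP-VP-NP scan, with a hard-coded chunk-tag array standing in for the OpenNLP.tagChunks output. Unlike the filter above, it only counts adjacent NP-VP-NP spans and omits the stemming, deduplication, and question-keyword checks.

/** Simplified sketch: counting adjacent NP-VP-NP spans over hard-coded chunk tags. */
public class TripletScanSketch {

    // counts NP chunks followed directly by a VP chunk and another NP chunk;
    // the filter above additionally tolerates intervening chunks and extracts the token span
    static int countTriplets(String[] chunkTags) {
        int count = 0;
        for (int i = 0; i < chunkTags.length; i++) {
            if (!"B-NP".equals(chunkTags[i])) continue;
            int j = i + 1;
            // skip the rest of the first NP
            while (j < chunkTags.length && "I-NP".equals(chunkTags[j])) j++;
            if (j >= chunkTags.length || !"B-VP".equals(chunkTags[j])) continue;
            // skip the rest of the VP
            j++;
            while (j < chunkTags.length && "I-VP".equals(chunkTags[j])) j++;
            if (j < chunkTags.length && "B-NP".equals(chunkTags[j])) count++;
        }
        return count;
    }

    public static void main(String[] args) {
        // stands in for chunker output on "The committee approved the proposal ."
        String[] chunkTags = { "B-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O" };
        System.out.println(countTriplets(chunkTags)); // prints 1
    }
}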
use of info.ephyra.search.Result in project lucida by claritylab.
the class TruncationFilter method apply.
/**
* Filters an array of <code>Result</code> objects.
*
* @param results results to filter
* @return filtered results
*/
public Result[] apply(Result[] results) {
    // all results that pass the filter
    ArrayList<Result> filtered = new ArrayList<Result>();
    // for each extractor, truncated answers and corresponding results
    Hashtable<String, Hashtable<String, Result>> truncated = new Hashtable<String, Hashtable<String, Result>>();
    // sort results by their scores in descending order
    results = (new ScoreSorterFilter()).apply(results);
    for (Result result : results) {
        // only truncate factoid answers
        if (result.getScore() <= 0 || result.getScore() == Float.POSITIVE_INFINITY) {
            filtered.add(result);
            continue;
        }
        // make sure that answers come from a single extractor
        String[] extractors = result.getExtractionTechniques();
        if (extractors == null || extractors.length != 1) {
            filtered.add(result);
            continue;
        }
        String extractor = extractors[0];
        // truncate result
        result = apply(result);
        // merge with similar results from same extractor
        Hashtable<String, Result> truncatedT = truncated.get(extractor);
        if (truncatedT == null) {
            truncatedT = new Hashtable<String, Result>();
            truncated.put(extractor, truncatedT);
        }
        String norm = StringUtils.normalize(result.getAnswer());
        Result similar = truncatedT.get(norm);
        if (similar == null) {
            filtered.add(result);
            truncatedT.put(norm, result);
        } else {
            similar.incScore(result.getScore());
        }
    }
    return filtered.toArray(new Result[filtered.size()]);
}
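The merging step keeps the first occurrence of each normalized answer (the highest-scored one, since the results were sorted beforehand) and adds the scores of later duplicates onto it. A minimal sketch of that pattern with plain strings and scores; the normalize() stand-in, class name, and sample answers are illustrative and do not reproduce the actual StringUtils.normalize implementation.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/** Minimal sketch of merging duplicate answers by a normalized key (illustrative only). */
public class MergeByNormalizedAnswerSketch {

    // stand-in for StringUtils.normalize(): lower-case and collapse whitespace
    static String normalize(String answer) {
        return answer.toLowerCase(Locale.ROOT).trim().replaceAll("\\s+", " ");
    }

    public static void main(String[] args) {
        // answers already sorted by score in descending order, as the filter assumes
        String[] answers = { "Mount Everest", "mount  everest", "K2" };
        double[] scores = { 0.75, 0.5, 0.25 };

        Map<String, Double> merged = new LinkedHashMap<>();
        List<String> kept = new ArrayList<>();
        for (int i = 0; i < answers.length; i++) {
            String key = normalize(answers[i]);
            if (!merged.containsKey(key)) {
                // first (highest-scored) occurrence survives
                merged.put(key, scores[i]);
                kept.add(answers[i]);
            } else {
                // later duplicates only add their score to the surviving answer
                merged.put(key, merged.get(key) + scores[i]);
            }
        }
        System.out.println(kept);                        // [Mount Everest, K2]
        System.out.println(merged.get("mount everest")); // 1.25
    }
}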
use of info.ephyra.search.Result in project lucida by claritylab.
the class WebDocumentFetcher method apply.
/**
* Fetches the top <code>MAX_DOCS</code> documents containing the given
* search engine snippets. The original snippets are dropped.
*
* @param results array of <code>Result</code> objects containing snippets
* @return array of <code>Result</code> objects containing entire documents
*/
public Result[] apply(Result[] results) {
    // documents containing the search engine snippets
    docs = new ArrayList<Result>();
    // start document fetchers
    HashSet<String> urls = new HashSet<String>();
    for (Result result : results) {
        // only apply this filter to results for the semantic parsing approach
        Query query = result.getQuery();
        Predicate[] ps = query.getAnalyzedQuestion().getPredicates();
        if (!query.extractWith(FactoidsFromPredicatesFilter.ID) || ps.length == 0 || result.getScore() > Float.NEGATIVE_INFINITY)
            continue;
        // if result is not a web document then just make a copy
        if (!result.getDocID().contains(":")) {
            Result newResult = result.getCopy();
            newResult.setScore(0);
            docs.add(newResult);
            continue;
        }
        // fetch at most MAX_DOCS documents
        if (urls.size() >= MAX_DOCS)
            break;
        String url = result.getDocID();
        // no forbidden document type
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS))
            continue;
        // only HTTP connections
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection))
                continue;
        } catch (IOException e) {
            continue;
        }
        // no duplicate document
        if (!urls.add(url))
            continue;
        // if caching is enabled, try to read document from cache
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            String[] entries = cache.read(url);
            if (entries != null) {
                StringBuilder sb = new StringBuilder();
                for (String entry : entries) {
                    sb.append(entry);
                    sb.append("\n");
                }
                String docText = sb.toString();
                Result doc = new Result(docText, result.getQuery(), url, result.getHitPos());
                doc.setScore(0);
                docs.add(doc);
                continue;
            }
        }
        (new WebDocumentFetcher()).start(this, result);
    }
    // wait until all fetchers are done
    waitForDocs();
    // keep old results
    Result[] newResults = docs.toArray(new Result[docs.size()]);
    Result[] allResults = new Result[results.length + newResults.length];
    for (int i = 0; i < results.length; i++) allResults[i] = results[i];
    for (int i = 0; i < newResults.length; i++) allResults[results.length + i] = newResults[i];
    return allResults;
}
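The per-URL checks above (forbidden document types, HTTP-only connections, duplicate suppression via HashSet.add) can be illustrated in isolation. The sketch below assumes a simplified FORBIDDEN_DOCS pattern and made-up URLs; the real constant, caching, and MAX_DOCS handling are omitted.

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Set;

/** Minimal sketch of the per-URL checks (pattern and URLs are illustrative). */
public class UrlPrecheckSketch {

    // stand-in for the filter's FORBIDDEN_DOCS constant, which lists more document types
    static final String FORBIDDEN_DOCS = "\\.(pdf|ps|doc)$";

    static boolean shouldFetch(String url, Set<String> seen) {
        // no forbidden document type
        if (url.matches("(?i).*?" + FORBIDDEN_DOCS)) return false;
        // only HTTP connections
        try {
            URLConnection conn = (new URL(url)).openConnection();
            if (!(conn instanceof HttpURLConnection)) return false;
        } catch (IOException e) {
            return false;
        }
        // no duplicate document: add() returns false if the URL was seen before
        return seen.add(url);
    }

    public static void main(String[] args) {
        Set<String> seen = new HashSet<>();
        System.out.println(shouldFetch("http://example.org/page.html", seen));  // true
        System.out.println(shouldFetch("http://example.org/page.html", seen));  // false (duplicate)
        System.out.println(shouldFetch("http://example.org/report.PDF", seen)); // false (forbidden type)
    }
}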