Use of io.anserini.util.FeatureVector in project Anserini by castorini.
In the class Rm3Reranker, the method estimateRelevanceModel:
public FeatureVector estimateRelevanceModel(ScoredDocuments docs, IndexReader reader) {
  FeatureVector f = new FeatureVector();
  Set<String> vocab = Sets.newHashSet();

  int numdocs = docs.documents.length < fbDocs ? docs.documents.length : fbDocs;
  FeatureVector[] docvectors = new FeatureVector[numdocs];

  for (int i = 0; i < numdocs; i++) {
    try {
      FeatureVector docVector = FeatureVector.fromLuceneTermVector(reader.getTermVector(docs.ids[i], field), stopper);
      docVector.pruneToSize(fbTerms);
      vocab.addAll(docVector.getFeatures());
      docvectors[i] = docVector;
    } catch (IOException e) {
      e.printStackTrace();
      // Just return empty feature vector.
      return f;
    }
  }

  // Precompute the norms once and cache results.
  float[] norms = new float[docvectors.length];
  for (int i = 0; i < docvectors.length; i++) {
    norms[i] = (float) docvectors[i].computeL1Norm();
  }

  for (String term : vocab) {
    float fbWeight = 0.0f;
    for (int i = 0; i < docvectors.length; i++) {
      fbWeight += (docvectors[i].getFeatureWeight(term) / norms[i]) * docs.scores[i];
    }
    f.addFeatureWeight(term, fbWeight);
  }

  f.pruneToSize(fbTerms);
  f.scaleToUnitL1Norm();
  return f;
}
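As a usage sketch (not part of the snippet above), the relevance model returned here is typically interpolated with the original query model to form the final RM3 query, mirroring what the rerank methods shown further below do. Identifiers such as analyzer, queryText, and originalQueryWeight are assumed to be in scope:

// Sketch: combine the original query model with the estimated relevance model (RM3).
// `analyzer`, `queryText`, and `originalQueryWeight` are assumptions, not part of the snippet above.
FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, queryText)).scaleToUnitL1Norm();
FeatureVector rm = estimateRelevanceModel(docs, reader);
rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);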
Use of io.anserini.util.FeatureVector in project Anserini by castorini.
In the class Rm3Reranker, the method createdFeatureVector:
private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
  FeatureVector f = new FeatureVector();

  try {
    int numDocs = reader.numDocs();
    TermsEnum termsEnum = terms.iterator();

    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      String term = text.utf8ToString();

      if (term.length() < 2 || term.length() > 20)
        continue;
      if (this.filterTerms && !term.matches("[a-z0-9]+"))
        continue;
      // This seemingly arbitrary logic needs some explanation. See the following PR for details:
      // https://github.com/castorini/Anserini/pull/289
      //
      // We have long known that stopwords have a big impact in RM3. If we include stopwords
      // in feedback, effectiveness is affected negatively. In the previous implementation, we
      // built custom stopwords lists by selecting the top k terms from the collection. We only
      // had two stopwords lists, for gov2 and for Twitter. The gov2 list was used on all
      // collections other than Twitter.
      //
      // The logic below instead uses a df threshold: if a term appears in more than n percent
      // of the documents, then it is discarded as a feedback term. This heuristic has the
      // advantage of getting rid of collection-specific stopwords lists, but at the cost of
      // introducing an additional tuning parameter.
      //
      // Cognizant of the dangers of (essentially) tuning on test data, here's what I
      // (@lintool) did:
      //
      // + For newswire collections, I picked a number, 10%, that seemed right. This value
      //   actually increased effectiveness in most conditions across all newswire collections.
      //
      // + This 10% value worked fine on web collections; effectiveness didn't change much.
      //
      // Since this was the first and only heuristic value I selected, we're not really tuning
      // parameters.
      //
      // The 10% threshold, however, doesn't work well on tweets because tweets are much
      // shorter. Based on a list of terms in the collection sorted by df: for the Tweets2011
      // collection, I found a threshold close to a nice round number that approximated the
      // length of the current stopwords list, by eyeballing the df values. This turned out to
      // be 1%. I did this again for the Tweets2013 collection, using the same approach, and
      // obtained a value of 0.7%.
      //
      // With both values, we obtained effectiveness pretty close to the old values with the
      // custom stopwords lists.
      int df = reader.docFreq(new Term(IndexArgs.CONTENTS, term));
      float ratio = (float) df / numDocs;
      if (tweetsearch) {
        if (numDocs > 100000000) {
          // Probably Tweets2013
          if (ratio > 0.007f)
            continue;
        } else {
          if (ratio > 0.01f)
            continue;
        }
      } else if (ratio > 0.1f)
        continue;

      int freq = (int) termsEnum.totalTermFreq();
      f.addFeatureWeight(term, (float) freq);
    }
  } catch (Exception e) {
    e.printStackTrace();
    // Return empty feature vector
    return f;
  }

  return f;
}
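The df-ratio heuristic described in the comments above boils down to three thresholds. The following helper is only an illustrative sketch of that decision; the name maxDfRatio is made up and does not exist in Anserini:

// Illustrative sketch of the df-threshold heuristic described above; not Anserini code.
private static float maxDfRatio(boolean tweetsearch, int numDocs) {
  if (!tweetsearch) {
    // Newswire and web collections: discard terms appearing in more than 10% of documents.
    return 0.1f;
  }
  // Tweet collections: 0.7% for (probably) Tweets2013, 1% for Tweets2011.
  return numDocs > 100000000 ? 0.007f : 0.01f;
}

With such a helper, the filter inside the loop would reduce to: if (ratio > maxDfRatio(tweetsearch, numDocs)) continue;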
Use of io.anserini.util.FeatureVector in project Anserini by castorini.
In the class Rm3Reranker, the private overload of estimateRelevanceModel that also supports true relevance feedback (useRf):
private FeatureVector estimateRelevanceModel(ScoredDocuments docs, IndexReader reader, boolean tweetsearch, boolean useRf) {
  FeatureVector f = new FeatureVector();
  Set<String> vocab = new HashSet<>();

  int numdocs;
  if (useRf) {
    numdocs = docs.documents.length;
  } else {
    numdocs = docs.documents.length < fbDocs ? docs.documents.length : fbDocs;
  }

  List<FeatureVector> docvectors = new ArrayList<>();
  List<Float> docScores = new ArrayList<>();
  for (int i = 0; i < numdocs; i++) {
    if (useRf && docs.scores[i] <= .0) {
      continue;
    }
    try {
      FeatureVector docVector = createdFeatureVector(reader.getTermVector(docs.ids[i], field), reader, tweetsearch);
      docVector.pruneToSize(fbTerms);
      vocab.addAll(docVector.getFeatures());
      docvectors.add(docVector);
      docScores.add(Float.valueOf(docs.scores[i]));
    } catch (IOException e) {
      e.printStackTrace();
      // Just return empty feature vector.
      return f;
    }
  }

  // Precompute the norms once and cache results.
  float[] norms = new float[docvectors.size()];
  for (int i = 0; i < docvectors.size(); i++) {
    norms[i] = (float) docvectors.get(i).computeL1Norm();
  }
  for (String term : vocab) {
    float fbWeight = 0.0f;
    for (int i = 0; i < docvectors.size(); i++) {
      // Guard against zero-length feedback documents, which would otherwise cause a division
      // by zero; these can occur when a document contains only terms that were filtered out
      // and therefore not selected for feedback.
      if (norms[i] > 0.001f) {
        fbWeight += (docvectors.get(i).getFeatureWeight(term) / norms[i]) * docScores.get(i);
      }
    }
    f.addFeatureWeight(term, fbWeight);
  }
  f.pruneToSize(fbTerms);
  f.scaleToUnitL1Norm();
  return f;
}
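To make the weighting in the inner loop concrete, here is a toy calculation; the term, counts, and scores are made up purely for illustration:

// Toy illustration of the relevance-model weight computed above (all values assumed).
//   doc 1: tf("hubble") = 3, L1 norm (document length) = 100, retrieval score = 2.0
//   doc 2: tf("hubble") = 1, L1 norm (document length) =  50, retrieval score = 1.5
// fbWeight("hubble") = (3 / 100) * 2.0 + (1 / 50) * 1.5 = 0.06 + 0.03 = 0.09
float fbWeight = (3f / 100f) * 2.0f + (1f / 50f) * 1.5f; // 0.09f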
Use of io.anserini.util.FeatureVector in project Anserini by castorini.
In the class Rm3Reranker, the method rerank:
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
  assert (docs.documents.length == docs.scores.length);

  IndexSearcher searcher = context.getIndexSearcher();
  IndexReader reader = searcher.getIndexReader();

  FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.analyze(analyzer, context.getQueryText())).scaleToUnitL1Norm();

  boolean useRf = (context.getSearchArgs().rf_qrels != null);
  FeatureVector rm = estimateRelevanceModel(docs, reader, context.getSearchArgs().searchtweets, useRf);
  rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

  BooleanQuery.Builder feedbackQueryBuilder = new BooleanQuery.Builder();
  Iterator<String> terms = rm.iterator();
  while (terms.hasNext()) {
    String term = terms.next();
    float prob = rm.getFeatureWeight(term);
    feedbackQueryBuilder.add(new BoostQuery(new TermQuery(new Term(this.field, term)), prob), BooleanClause.Occur.SHOULD);
  }
  Query feedbackQuery = feedbackQueryBuilder.build();

  if (this.outputQuery) {
    LOG.info("QID: " + context.getQueryId());
    LOG.info("Original Query: " + context.getQuery().toString(this.field));
    LOG.info("Running new query: " + feedbackQuery.toString(this.field));
  }

  TopDocs rs;
  try {
    Query finalQuery = feedbackQuery;
    // If a filter is present in the context, add it as a required clause alongside the
    // feedback query; otherwise, just use the feedback query.
    if (context.getFilter() != null) {
      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
      bqBuilder.add(feedbackQuery, BooleanClause.Occur.MUST);
      finalQuery = bqBuilder.build();
    }
    // Figure out how to break the scoring ties.
    if (context.getSearchArgs().arbitraryScoreTieBreak) {
      rs = searcher.search(finalQuery, context.getSearchArgs().hits);
    } else if (context.getSearchArgs().searchtweets) {
      rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true);
    } else {
      rs = searcher.search(finalQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true);
    }
  } catch (IOException e) {
    e.printStackTrace();
    return docs;
  }

  return ScoredDocuments.fromTopDocs(rs, searcher);
}
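The constants BREAK_SCORE_TIES_BY_DOCID and BREAK_SCORE_TIES_BY_TWEETID are not shown in this snippet. A plausible definition, assuming the usual org.apache.lucene.search.Sort and SortField imports, sorts by score first and then by a deterministic secondary key; the field names "id" and "id_long" below are assumptions for illustration:

// Sketch only: score first, then a stable secondary key so ties are broken deterministically.
// The field names are assumptions, not taken from the snippet above.
private static final Sort BREAK_SCORE_TIES_BY_DOCID =
    new Sort(SortField.FIELD_SCORE, new SortField("id", SortField.Type.STRING));
private static final Sort BREAK_SCORE_TIES_BY_TWEETID =
    new Sort(SortField.FIELD_SCORE, new SortField("id_long", SortField.Type.LONG, true)); // newest tweet first

Note that the search overload used above passes true as the last argument so that document scores are still computed when hits are sorted by these criteria.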
Use of io.anserini.util.FeatureVector in project Anserini by castorini.
In the class Rm3Reranker, a variant of the method rerank that serializes the feedback query to a string and re-parses it with Lucene's QueryParser:
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
  Preconditions.checkState(docs.documents.length == docs.scores.length);

  IndexSearcher searcher = context.getIndexSearcher();
  IndexReader reader = searcher.getIndexReader();

  FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
  FeatureVector rm = estimateRelevanceModel(docs, reader);
  LOG.info("Relevance model estimated.");
  rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

  StringBuilder builder = new StringBuilder();
  Iterator<String> terms = rm.iterator();
  while (terms.hasNext()) {
    String term = terms.next();
    double prob = rm.getFeatureWeight(term);
    builder.append(term + "^" + prob + " ");
  }
  String queryText = builder.toString().trim();

  QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
  Query nq = null;
  try {
    nq = p.parse(queryText);
  } catch (ParseException e) {
    e.printStackTrace();
    return docs;
  }

  LOG.info("Running new query: " + nq);

  TopDocs rs = null;
  try {
    if (context.getFilter() == null) {
      rs = searcher.search(nq, 1000);
    } else {
      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
      bqBuilder.add(nq, BooleanClause.Occur.MUST);
      Query q = bqBuilder.build();
      rs = searcher.search(q, 1000);
    }
  } catch (IOException e) {
    e.printStackTrace();
    return docs;
  }

  return ScoredDocuments.fromTopDocs(rs, searcher);
}
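To illustrate what this variant feeds to the query parser: the interpolated feature vector is serialized as whitespace-separated term^weight pairs, which Lucene's classic QueryParser turns into boosted term queries combined with SHOULD (OR) semantics. The terms, weights, and the "contents" field name below are made-up values:

// Hypothetical serialized relevance model (terms, weights, and field name are illustrative only).
String queryText = "hubble^0.182 telescope^0.147 space^0.093";
QueryParser p = new QueryParser("contents", new WhitespaceAnalyzer());
Query nq = p.parse(queryText); // may throw ParseException
// nq renders as: (contents:hubble)^0.182 (contents:telescope)^0.147 (contents:space)^0.093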