Use of io.anserini.ltr.feature.FeatureExtractors in project Anserini by castorini.
The class BaseFeatureExtractor, method printFeatures.
/**
 * Prints feature vectors with respect to the qrels, one vector per qrel entry.
 * @param out the stream to write feature vectors to
 * @throws IOException if the index cannot be read
 */
public void printFeatures(PrintStream out) throws IOException {
  Map<String, RerankerContext> queryContextMap = buildRerankerContextMap();
  FeatureExtractors extractors = getExtractors();
  Bits liveDocs = MultiFields.getLiveDocs(reader);
  Set<String> fieldsToLoad = getFieldsToLoad();
  // We need to open a searcher
  IndexSearcher searcher = new IndexSearcher(reader);
  this.printHeader(out, extractors);
  // Iterate through all qrels; for each query, process every judged document id
  LOG.debug("Processing queries");
  for (String qid : this.qrels.getQids()) {
    LOG.debug(String.format("Processing qid: %s", qid));
    // Look up the reranker context for this query
    RerankerContext context = queryContextMap.get(qid);
    for (Map.Entry<String, Integer> entry : this.qrels.getDocMap(qid).entrySet()) {
      String docId = entry.getKey();
      int qrelScore = entry.getValue();
      // Issue a query that matches this specific document id
      TopDocs topDocs = searcher.search(docIdQuery(docId), 1);
      if (topDocs.totalHits == 0) {
        LOG.warn(String.format("Document id %s expected but not found in index, skipping...", docId));
        continue;
      }
      ScoreDoc hit = topDocs.scoreDocs[0];
      Document doc = reader.document(hit.doc, fieldsToLoad);
      // TODO: factor out for testing
      Terms terms = reader.getTermVector(hit.doc, getTermVectorField());
      if (terms == null) {
        LOG.debug(String.format("No term vectors found for doc %s, qid %s", docId, qid));
        continue;
      }
      float[] featureValues = extractors.extractAll(doc, terms, context);
      writeFeatureVector(out, qid, qrelScore, docId, featureValues);
    }
    LOG.debug(String.format("Finished processing for qid: %s", qid));
    out.flush();
  }
}
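For orientation, a minimal sketch of driving printFeatures directly, stitched together from the FeatureExtractorCli example further down this page. All paths and the single topic are illustrative, and two details are assumed rather than shown here: that FeatureExtractors.loadExtractor accepts a file path (as the CLI usage suggests) and that WebFeatureExtractor takes a Map<String, String> of topics (as the convertTopicsFormat call there implies).

  // Sketch: open an index, load qrels and an extractor definition, and dump
  // one feature vector per qrel. All paths below are illustrative.
  IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")));
  Qrels qrels = new Qrels("qrels.txt");
  FeatureExtractors extractors = FeatureExtractors.loadExtractor("extractors.json");
  Map<String, String> topics = new HashMap<>();
  topics.put("301", "international organized crime");  // illustrative topic
  WebFeatureExtractor extractor = new WebFeatureExtractor(reader, qrels, topics, extractors);
  try (PrintStream out = new PrintStream(new FileOutputStream("features.txt"))) {
    extractor.printFeatures(out);  // one feature vector per qrel entry
  }
  reader.close();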
Use of io.anserini.ltr.feature.FeatureExtractors in project Anserini by castorini.
The class BaseFeatureExtractor, method printFeatureForAllDocs.
/**
 * Iterates through all documents and prints the features for each query,
 * so that we do not scan the entire index once per query; this saves disk access.
 * @param out the stream to write feature vectors to
 * @throws IOException if the index cannot be read
 */
public void printFeatureForAllDocs(PrintStream out) throws IOException {
  Map<String, RerankerContext> queryContextMap = buildRerankerContextMap();
  FeatureExtractors extractors = getExtractors();
  Bits liveDocs = MultiFields.getLiveDocs(reader);
  Set<String> fieldsToLoad = getFieldsToLoad();
  this.printHeader(out, extractors);
  for (int docId = 0; docId < reader.maxDoc(); docId++) {
    // Only check live docs if the index has deletions
    if (reader.hasDeletions() && (liveDocs == null || !liveDocs.get(docId))) {
      LOG.warn(String.format("Document %d not in live docs", docId));
      continue;
    }
    Document doc = reader.document(docId, fieldsToLoad);
    String docIdString = doc.get(getIdField());
    // NOTE: document frequencies should not be retrieved from a per-document
    // term vector (reader.getTermVector(docId, ...)), because such a vector
    // behaves as if the document were a single-document index.
    Terms terms = MultiFields.getTerms(reader, getTermVectorField());
    if (terms == null) {
      continue;
    }
    for (Map.Entry<String, RerankerContext> entry : queryContextMap.entrySet()) {
      float[] featureValues = extractors.extractAll(doc, terms, entry.getValue());
      writeFeatureVector(out, entry.getKey(), qrels.getRelevanceGrade(entry.getKey(), docIdString), docIdString, featureValues);
    }
    out.flush();
    LOG.debug(String.format("Completed computing feature vectors for doc %d", docId));
  }
}
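The writeFeatureVector helper used by both methods is not shown on this page. A hypothetical sketch, assuming it emits the common RankLib/SVMlight training format (relevance label, qid, 1-indexed feature values, and the document id as a trailing comment):

  // Hypothetical sketch: writeFeatureVector is assumed to emit one line of
  // "<qrelScore> qid:<qid> 1:<v1> 2:<v2> ... # <docId>" per document.
  private void writeFeatureVector(PrintStream out, String qid, int qrelScore,
                                  String docId, float[] featureValues) {
    StringBuilder sb = new StringBuilder();
    sb.append(qrelScore).append(" qid:").append(qid);
    for (int i = 0; i < featureValues.length; i++) {
      sb.append(' ').append(i + 1).append(':').append(featureValues[i]);
    }
    sb.append(" # ").append(docId);
    out.println(sb.toString());
  }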
Use of io.anserini.ltr.feature.FeatureExtractors in project Anserini by castorini.
The class FeatureExtractorCli, method main.
/**
 * Requires the user to supply the index directory and the directory containing
 * the qrels and topics.
 * @param args indexDir, qrelFile, topicFile, outputFile
 */
public static void main(String[] args) throws Exception {
  long curTime = System.nanoTime();
  FeatureExtractionArgs parsedArgs = new FeatureExtractionArgs();
  CmdLineParser parser = new CmdLineParser(parsedArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    return;
  }
  Directory indexDirectory = FSDirectory.open(Paths.get(parsedArgs.indexDir));
  IndexReader reader = DirectoryReader.open(indexDirectory);
  Qrels qrels = new Qrels(parsedArgs.qrelFile);
  FeatureExtractors extractors = null;
  if (parsedArgs.extractors != null) {
    extractors = FeatureExtractors.loadExtractor(parsedArgs.extractors);
  }
  // Query parser needed to construct the query object for feature extraction in the loop
  PrintStream out = new PrintStream(new FileOutputStream(new File(parsedArgs.outputFile)));
  if (parsedArgs.collection.equals("Trec") || parsedArgs.collection.equals("Webxml")) {
    // Open the topics file and read it. The collection name ("Trec" or
    // "Webxml") doubles as the topic reader class prefix.
    String className = parsedArgs.collection;
    TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + className + "TopicReader").getConstructor(Path.class).newInstance(Paths.get(parsedArgs.topicsFile));
    SortedMap<Integer, String> topics = tr.read();
    LOG.debug(String.format("%d topics found", topics.size()));
    WebFeatureExtractor extractor = new WebFeatureExtractor(reader, qrels, convertTopicsFormat(topics), extractors);
    extractor.printFeatures(out);
  } else if (parsedArgs.collection.equals("twitter")) {
    Map<String, String> topics = MicroblogTopicSet.fromFile(new File(parsedArgs.topicsFile)).toMap();
    LOG.debug(String.format("%d topics found", topics.size()));
    TwitterFeatureExtractor extractor = new TwitterFeatureExtractor(reader, qrels, topics, extractors);
    extractor.printFeatures(out);
  } else {
    System.err.println("Unrecognized collection " + parsedArgs.collection);
  }
}
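The convertTopicsFormat helper referenced above is not shown on this page. A hypothetical sketch of what it must do, given that the Web topic readers return SortedMap<Integer, String> while the Twitter branch constructs its extractor with the Map<String, String> produced by MicroblogTopicSet.toMap():

  // Hypothetical sketch: bridge the SortedMap<Integer, String> returned by
  // tr.read() to the Map<String, String> shape the extractor constructors expect.
  private static Map<String, String> convertTopicsFormat(SortedMap<Integer, String> topics) {
    Map<String, String> converted = new LinkedHashMap<>();
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
      converted.put(String.valueOf(entry.getKey()), entry.getValue());
    }
    return converted;
  }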
Use of io.anserini.ltr.feature.FeatureExtractors in project Anserini by castorini.
The class SearchTweets, method main.
public static void main(String[] args) throws Exception {
  long curTime = System.nanoTime();
  SearchArgs searchArgs = new SearchArgs();
  CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }
  LOG.info("Reading index at " + searchArgs.index);
  Directory dir;
  if (searchArgs.inmem) {
    LOG.info("Using MMapDirectory with preload");
    dir = new MMapDirectory(Paths.get(searchArgs.index));
    ((MMapDirectory) dir).setPreload(true);
  } else {
    LOG.info("Using default FSDirectory");
    dir = FSDirectory.open(Paths.get(searchArgs.index));
  }
  IndexReader reader = DirectoryReader.open(dir);
  IndexSearcher searcher = new IndexSearcher(reader);
  if (searchArgs.ql) {
    LOG.info("Using QL scoring model");
    searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
  } else if (searchArgs.bm25) {
    LOG.info("Using BM25 scoring model");
    searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
  } else {
    LOG.error("Error: Must specify scoring model!");
    System.exit(-1);
  }
  RerankerCascade cascade = new RerankerCascade();
  if (searchArgs.rm3) {
    cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
  }
  // Retweets are always removed, with temporal tiebreaking
  cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
  if (!searchArgs.model.isEmpty() && searchArgs.extractors != null) {
    LOG.debug(String.format("RankLib model used, model loaded from %s", searchArgs.model));
    cascade.add(new RankLibReranker(searchArgs.model, StatusField.TEXT.name, searchArgs.extractors));
  }
  FeatureExtractors extractorChain = null;
  if (searchArgs.extractors != null) {
    extractorChain = FeatureExtractors.loadExtractor(searchArgs.extractors);
  }
  if (searchArgs.dumpFeatures) {
    PrintStream out = new PrintStream(searchArgs.featureFile);
    Qrels qrels = new Qrels(searchArgs.qrels);
    cascade.add(new TweetsLtrDataGenerator(out, qrels, extractorChain));
  }
  MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));
  PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
  LOG.info("Writing output to " + searchArgs.output);
  LOG.info("Initialization complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
  long totalTime = 0;
  int cnt = 0;
  for (MicroblogTopic topic : topics) {
    long curQueryTime = System.nanoTime();
    // Filter to tweets with ids no greater than the query tweet time
    Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
    Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(query, BooleanClause.Occur.MUST);
    Query q = builder.build();
    TopDocs rs = searcher.search(q, searchArgs.hits);
    List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
    RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    for (int i = 0; i < docs.documents.length; i++) {
      // Write one line per hit in standard TREC run format
      String qid = topic.getId().replaceFirst("^MB0*", "");
      out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], searchArgs.runtag));
    }
    long qtime = (System.nanoTime() - curQueryTime) / 1000000;
    LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
    totalTime += qtime;
    cnt++;
  }
  LOG.info("All queries completed!");
  LOG.info("Total elapsed time = " + totalTime + "ms");
  LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
  reader.close();
  out.close();
}
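Each stage added via cascade.add is applied in order by cascade.run. A minimal sketch of a custom pass-through stage, assuming the Reranker interface declares a single rerank(ScoredDocuments, RerankerContext) method (the exact interface is not shown on this page, and the class name is illustrative):

  // Illustrative no-op cascade stage; assumes Reranker declares
  // rerank(ScoredDocuments, RerankerContext).
  public class PassThroughReranker implements Reranker {
    @Override
    public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
      // Inspect or reorder docs.documents / docs.scores here;
      // this stage returns the input unchanged.
      return docs;
    }
  }

It would be registered like the built-in stages above: cascade.add(new PassThroughReranker());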
Use of io.anserini.ltr.feature.FeatureExtractors in project Anserini by castorini.
The class SearchWebCollection, method main.
public static void main(String[] args) throws Exception {
  SearchArgs searchArgs = new SearchArgs();
  CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }
  LOG.info("Reading index at " + searchArgs.index);
  Directory dir;
  if (searchArgs.inmem) {
    LOG.info("Using MMapDirectory with preload");
    dir = new MMapDirectory(Paths.get(searchArgs.index));
    ((MMapDirectory) dir).setPreload(true);
  } else {
    LOG.info("Using default FSDirectory");
    dir = FSDirectory.open(Paths.get(searchArgs.index));
  }
  Similarity similarity = null;
  if (searchArgs.ql) {
    LOG.info("Using QL scoring model");
    similarity = new LMDirichletSimilarity(searchArgs.mu);
  } else if (searchArgs.bm25) {
    LOG.info("Using BM25 scoring model");
    similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
  } else {
    LOG.error("Error: Must specify scoring model!");
    System.exit(-1);
  }
  RerankerCascade cascade = new RerankerCascade();
  boolean useQueryParser = false;
  if (searchArgs.rm3) {
    cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
    useQueryParser = true;
  } else {
    cascade.add(new IdentityReranker());
  }
  FeatureExtractors extractors = null;
  if (searchArgs.extractors != null) {
    extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
  }
  if (searchArgs.dumpFeatures) {
    PrintStream out = new PrintStream(searchArgs.featureFile);
    Qrels qrels = new Qrels(searchArgs.qrels);
    cascade.add(new WebCollectionLtrDataGenerator(out, qrels, extractors));
  }
  Path topicsFile = Paths.get(searchArgs.topics);
  if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
    throw new IllegalArgumentException("Topics file " + topicsFile + " does not exist or is not a (readable) file.");
  }
  TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
  SortedMap<Integer, String> topics = tr.read();
  final long start = System.nanoTime();
  SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
  searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
  searcher.close();
  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
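A minimal programmatic sketch mirroring the main() above; the constructor and search(...) call follow the signatures shown there, while the topic reader name, paths, and BM25 parameters are illustrative:

  // Sketch: run BM25 retrieval over a web collection index without the CLI.
  TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query.TrecTopicReader")
      .getConstructor(Path.class).newInstance(Paths.get("topics.301-350.txt"));  // illustrative
  SortedMap<Integer, String> topics = tr.read();
  RerankerCascade cascade = new RerankerCascade();
  cascade.add(new IdentityReranker());
  SearchWebCollection searcher = new SearchWebCollection("lucene-index");  // illustrative path
  searcher.search(topics, "run.bm25.txt", new BM25Similarity(0.9f, 0.4f), 1000,
      cascade, false, false);  // hits=1000, useQueryParser=false, keepstop=false
  searcher.close();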