Use of io.anserini.index.IndexUtils in project Anserini by castorini.
From the class PyseriniEntryPoint, method initializeWithIndex:
public void initializeWithIndex(String indexDir) throws Exception {
  Path indexPath = Paths.get(indexDir);
  // Validate the index directory before opening it.
  if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
    throw new IllegalArgumentException(indexDir + " does not exist or is not a readable directory.");
  }
  this.indexDir = indexDir;
  this.reader = DirectoryReader.open(FSDirectory.open(indexPath));
  this.indexUtils = new IndexUtils(indexDir);
}
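A minimal calling sketch for the method above. The no-argument constructor and the index path are assumptions for illustration; PyseriniEntryPoint is presumably driven from Python via a gateway, but it can also be exercised directly:

// Sketch only: the constructor and the path are illustrative assumptions.
PyseriniEntryPoint entryPoint = new PyseriniEntryPoint();
entryPoint.initializeWithIndex("/path/to/lucene-index");  // hypothetical index directory
// After this call, the entry point holds an open DirectoryReader and an
// IndexUtils bound to the given index.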
Use of io.anserini.index.IndexUtils in project Anserini by castorini.
From the class RetrieveSentences, method getRankedPassages:
public void getRankedPassages(Args args) throws Exception {
  Map<String, Float> scoredDocs = retrieveDocuments(args.query, args.hits);
  Map<String, Float> sentencesMap = new LinkedHashMap<>();

  IndexUtils util = new IndexUtils(args.index);
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

  // Split each retrieved document into sentences; each tokenized sentence
  // inherits its document's retrieval score.
  for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) {
    List<Sentence> sentences = util.getSentDocument(doc.getKey());
    for (Sentence sent : sentences) {
      List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sent.text())).tokenize();
      String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
      sentencesMap.put(answerTokens, doc.getValue());
    }
  }

  // Tokenize the query the same way, then let the passage scorer rank the sentences.
  String queryTokens = tokenizerFactory.getTokenizer(new StringReader(args.query))
      .tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
  scorer.score(queryTokens, sentencesMap);

  List<ScoredPassage> topPassages = scorer.extractTopPassages();
  for (ScoredPassage s : topPassages) {
    System.out.println(s.getSentence() + " " + s.getScore());
  }
}
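A hedged sketch of driving getRankedPassages. Only the Args fields the method body actually reads (query, hits, index) are set; constructing Args directly and the RetrieveSentences(Args) constructor, which is assumed to wire up the internal scorer field used above, are assumptions for illustration:

// Sketch only: Args is normally populated by a command-line parser, and the
// constructor is assumed to initialize the passage scorer used by the method.
RetrieveSentences.Args args = new RetrieveSentences.Args();
args.query = "What is the birthplace of Einstein?";  // hypothetical query
args.hits = 20;                                      // documents to retrieve
args.index = "/path/to/lucene-index";                // hypothetical index directory
RetrieveSentences retriever = new RetrieveSentences(args);
retriever.getRankedPassages(args);  // prints "sentence score" lines to stdout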
Use of io.anserini.index.IndexUtils in project Anserini by castorini.
From the class RetrieveSentences, method getRankedPassagesList:
public List<String> getRankedPassagesList(String query, String index, int hits, int k) throws Exception {
  Map<String, Float> scoredDocs = retrieveDocuments(query, hits);
  Map<String, Float> sentencesMap = new LinkedHashMap<>();

  IndexUtils util = new IndexUtils(index);
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

  // Split each retrieved document into sentences; each tokenized sentence
  // inherits its document's retrieval score.
  for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) {
    List<Sentence> sentences = util.getSentDocument(doc.getKey());
    for (Sentence sent : sentences) {
      List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sent.text())).tokenize();
      String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
      sentencesMap.put(answerTokens, doc.getValue());
    }
  }

  // Unlike getRankedPassages, this variant builds its own IDF-based scorer,
  // keeping the top k passages.
  scorer = new IdfPassageScorer(index, k);
  String queryTokens = tokenizerFactory.getTokenizer(new StringReader(query))
      .tokenize().stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
  scorer.score(queryTokens, sentencesMap);

  List<String> topSentences = new ArrayList<>();
  List<ScoredPassage> topPassages = scorer.extractTopPassages();
  for (ScoredPassage s : topPassages) {
    topSentences.add(s.getSentence() + "\t" + s.getScore());
    System.out.println(s.getSentence() + " " + s.getScore());
  }
  return topSentences;
}
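The list-returning variant is the easier one to call programmatically, since everything it needs arrives as parameters. A hedged usage sketch, reusing the hypothetical args from the previous sketch; all values here are illustrative assumptions:

// Sketch only: the constructor argument and parameter values are assumptions.
RetrieveSentences retriever = new RetrieveSentences(args);
List<String> passages = retriever.getRankedPassagesList(
    "What is the birthplace of Einstein?",  // query (hypothetical)
    "/path/to/lucene-index",                // index directory (hypothetical)
    20,                                     // hits: documents to retrieve
    10);                                    // k: top passages to keep
for (String passage : passages) {
  System.out.println(passage);  // each entry is "sentence<TAB>score"
}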