Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
The class RetrieveSentences, method search.
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);
  // using BM25 scoring model
  Similarity similarity = new BM25Similarity(0.9f, 0.4f);
  searcher.setSimilarity(similarity);
  EnglishAnalyzer ea = new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
      scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
    }
  }
  return scoredDocs;
}
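A minimal usage sketch (the instance name, topic numbers, and query strings below are assumptions for illustration, not from the source):

  SortedMap<Integer, String> topics = new TreeMap<>();
  topics.put(301, "international organized crime");   // hypothetical TREC-style topic
  topics.put(302, "poliomyelitis and post polio");    // hypothetical TREC-style topic

  // retrieve the top 10 documents per topic; keys are document IDs, values are BM25 scores
  Map<String, Float> scoredDocs = retrieveSentences.search(topics, 10);
  for (Map.Entry<String, Float> hit : scoredDocs.entrySet()) {
    System.out.println(hit.getKey() + " " + hit.getValue());
  }

Note that results for all topics go into a single map keyed by document ID, so a document retrieved for more than one topic keeps only the score from the last topic processed.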
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
The class IdfPassageScorer, method score.
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
  EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
  QueryParser queryParser = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzer);
  ClassicSimilarity similarity = new ClassicSimilarity();
  String escapedQuery = QueryParser.escape(query);
  Query question = queryParser.parse(escapedQuery);
  HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));
  EnglishAnalyzer englishAnalyzerWithStop = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
  QueryParser queryParserWithStop = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzerWithStop);
  Query questionWithStopWords = queryParserWithStop.parse(escapedQuery);
  HashSet<String> questionTermsIDF = new HashSet<>(Arrays.asList(questionWithStopWords.toString().trim().toLowerCase().split("\\s+")));
  // add the question terms to termIdfMap
  for (String questionTerm : questionTermsIDF) {
    try {
      TermQuery q = (TermQuery) queryParserWithStop.parse(questionTerm);
      Term t = q.getTerm();
      double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
      termIdfMap.put(questionTerm, String.valueOf(termIDF));
    } catch (Exception e) {
      continue;
    }
  }
  // avoid duplicate passages
  HashSet<String> seenSentences = new HashSet<>();
  for (Map.Entry<String, Float> sent : sentences.entrySet()) {
    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();
    String[] terms = sent.getKey().toLowerCase().split("\\s+");
    for (String term : terms) {
      try {
        TermQuery q = (TermQuery) queryParser.parse(term);
        Term t = q.getTerm();
        double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
        if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
          idf += termIDF;
          seenTerms.add(t.toString());
        }
        TermQuery q2 = (TermQuery) queryParserWithStop.parse(term);
        Term t2 = q2.getTerm();
        double termIDFwithStop = similarity.idf(reader.docFreq(t2), reader.numDocs());
        termIdfMap.put(term, String.valueOf(termIDFwithStop));
      } catch (Exception e) {
        continue;
      }
    }
    double weightedScore = idf + 0.0001 * sent.getValue();
    ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
    if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
      if (scoredPassageHeap.size() == topPassages) {
        scoredPassageHeap.pollLast();
      }
      scoredPassageHeap.add(scoredPassage);
      seenSentences.add(sent.getKey());
    }
  }
}
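The per-term weights above come from ClassicSimilarity's IDF, which computes log((docCount + 1) / (docFreq + 1)) + 1. As a rough sketch of the same computation in isolation (the document-frequency and index-size numbers are made up for illustration):

  ClassicSimilarity similarity = new ClassicSimilarity();
  long docCount = 100000;  // hypothetical number of documents in the index
  long docFreq = 250;      // hypothetical number of documents containing the term
  // idf = log((100000 + 1) / (250 + 1)) + 1, roughly 6.99 for these values
  float idf = similarity.idf(docFreq, docCount);
  System.out.println(idf);

Rarer terms therefore contribute larger increments to the passage score, while terms missing from the question set are skipped entirely.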
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
The class SearchWebCollection, method search.
/**
 * Runs the given topics against the index and writes a TREC submission file.
 *
 * @param topics queries
 * @param submissionFile path of the run file to write
 * @param similarity scoring model to use
 * @param numHits number of hits to retrieve per topic
 * @param cascade reranker cascade applied to the initial results
 * @param useQueryParser whether to parse the topic with Lucene's QueryParser instead of building a bag-of-words query
 * @param keepstopwords whether to keep stopwords during analysis
 * @throws IOException if the index cannot be read or the run file cannot be written
 * @throws ParseException if a topic cannot be parsed
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);
  final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
  PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));
  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
    // For Web Tracks 2010, 2011, and 2012, an experimental run consists of the top 10,000 documents for each topic query.
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    // Each output line follows the standard TREC run format:
    // the first column is the topic number,
    // the second column is currently unused and should always be "Q0",
    // the third column is the official document identifier of the retrieved document,
    // the fourth column is the rank at which the document is retrieved,
    // the fifth column is the score (integer or floating point) that generated the ranking,
    // and the sixth column is the "run tag", a unique identifier for the run.
    for (int i = 0; i < docs.documents.length; i++) {
      out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
    }
  }
  out.flush();
  out.close();
}
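For reference, a hypothetical output line in the TREC run format described above (the topic number, document identifier, rank, and score are made up; the run tag assumes FIELD_BODY is "body" and BM25Similarity's default toString) would look like:

  51 Q0 clueweb09-en0000-48-23794 1 12.456800 BM25_EnglishAnalyzer_body_BM25(k1=0.9,b=0.4)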
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project xodus by JetBrains.
The class ExodusLuceneTestsBase, method removeStopWord.
protected void removeStopWord(final String stopWord) {
  final HashSet<Object> stopSet = new HashSet<>();
  for (Object word : ((StopwordAnalyzerBase) analyzer).getStopwordSet()) {
    if (!stopWord.equals(new String((char[]) word))) {
      stopSet.add(word);
    }
  }
  analyzer = new EnglishAnalyzer(LUCENE_VERSION, stopSet);
}
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
The class IndexerTestBase, method buildTestIndex.
// A very simple example of how to build an index.
private void buildTestIndex() throws IOException {
  Directory dir = FSDirectory.open(tempDir1);
  Analyzer analyzer = new EnglishAnalyzer();
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir, config);

  FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);
  textOptions.setStoreTermVectors(true);
  textOptions.setStoreTermVectorPositions(true);

  Document doc1 = new Document();
  String doc1Text = "here is some text here is some more text. city.";
  doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
  doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
  doc1.add(new Field(IndexArgs.CONTENTS, doc1Text, textOptions));
  // specifically demonstrate how "contents" and "raw" might diverge:
  doc1.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc1Text)));
  writer.addDocument(doc1);

  Document doc2 = new Document();
  String doc2Text = "more texts";
  doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
  doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
  // note plural "texts", to test stemming
  doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions));
  // specifically demonstrate how "contents" and "raw" might diverge:
  doc2.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc2Text)));
  writer.addDocument(doc2);

  Document doc3 = new Document();
  String doc3Text = "here is a test";
  doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
  doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
  doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
  // specifically demonstrate how "contents" and "raw" might diverge:
  doc3.add(new StoredField(IndexArgs.RAW, String.format("{\"contents\": \"%s\"}", doc3Text)));
  writer.addDocument(doc3);

  writer.commit();
  writer.forceMerge(1);
  writer.close();
  dir.close();
}
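As a quick sanity check (a sketch only, not part of the source; the expected counts follow from EnglishAnalyzer's Porter stemming of this toy collection), the index can be read back to confirm that "texts" in doc2 was stemmed to "text":

  Directory dir = FSDirectory.open(tempDir1);
  IndexReader reader = DirectoryReader.open(dir);

  // three documents were added above
  System.out.println(reader.numDocs());  // expected: 3

  // "texts" stems to "text", so both doc1 and doc2 should contain the term
  Term stemmed = new Term(IndexArgs.CONTENTS, "text");
  System.out.println(reader.docFreq(stemmed));  // expected: 2

  reader.close();
  dir.close();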