Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
In class IndexerTest, method buildTestIndex:
// A very simple example of how to build an index.
private void buildTestIndex() throws IOException {
  Directory dir = FSDirectory.open(Paths.get(INDEX_PATH1));

  Analyzer analyzer = new EnglishAnalyzer();
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir, config);

  FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);

  Document doc1 = new Document();
  doc1.add(new TextField("docid", "doc1", Field.Store.YES));
  doc1.add(new Field("text", "here is some text here is some more text", textOptions));
  writer.addDocument(doc1);

  Document doc2 = new Document();
  doc2.add(new TextField("docid", "doc2", Field.Store.YES));
  doc2.add(new Field("text", "more text", textOptions));
  writer.addDocument(doc2);

  Document doc3 = new Document();
  doc3.add(new TextField("docid", "doc3", Field.Store.YES));
  doc3.add(new Field("text", "here is a test", textOptions));
  writer.addDocument(doc3);

  writer.commit();
  writer.forceMerge(1);
  writer.close();
}
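The snippet above only writes the index. Not part of the original test; the following is a minimal sketch of how the resulting index could be opened and queried, assuming the path literal stands in for INDEX_PATH1. Note that because the documents were analyzed with EnglishAnalyzer, a raw TermQuery must use the analyzed (stemmed, lowercased) form of the term; "text" happens to be unchanged by stemming.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class ReadTestIndex {
  public static void main(String[] args) throws Exception {
    String indexPath = "target/test-index";  // stand-in for INDEX_PATH1
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // "text" is already in its stemmed form, so a plain TermQuery matches doc1 and doc2.
      TopDocs hits = searcher.search(new TermQuery(new Term("text", "text")), 10);
      for (ScoreDoc sd : hits.scoreDocs) {
        System.out.println(searcher.doc(sd.doc).get("docid") + " score=" + sd.score);
      }
    }
  }
}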
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
In class IndexUtils, method printTermCounts:
public void printTermCounts(String termStr) throws IOException, ParseException {
  // Use an EnglishAnalyzer with an empty stopword set so the raw term is stemmed but never dropped.
  EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
  QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
  TermQuery q = (TermQuery) qp.parse(termStr);
  Term t = q.getTerm();

  System.out.println("raw term: " + termStr);
  System.out.println("stemmed term: " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
  System.out.println("collection frequency: " + reader.totalTermFreq(t));
  System.out.println("document frequency: " + reader.docFreq(t));

  PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes());
  System.out.println("postings:\n");
  while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
  }
}
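Here the QueryParser is used only as a convenient way to run the raw term through EnglishAnalyzer's stemming pipeline. Not part of the Anserini source; a minimal sketch of obtaining the stemmed form directly from the analyzer's TokenStream, where the class name StemTerm and the field name "contents" are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StemTerm {
  // Returns the EnglishAnalyzer's analyzed (stemmed, lowercased) form of a single raw term.
  static String stem(String rawTerm) throws IOException {
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    try (TokenStream ts = ea.tokenStream("contents", rawTerm)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      String stemmed = ts.incrementToken() ? termAtt.toString() : rawTerm;
      ts.end();
      return stemmed;
    }
  }

  public static void main(String[] args) throws IOException {
    System.out.println(stem("running"));  // prints "run"
  }
}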
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
In class IndexObjectTriples, method run:
private void run() throws IOException, InterruptedException {
  final long start = System.nanoTime();
  LOG.info("Starting indexer...");

  final Directory dir = FSDirectory.open(indexPath);
  final EnglishAnalyzer analyzer = new EnglishAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setCodec(new Lucene62Codec(Lucene50StoredFieldsFormat.Mode.BEST_SPEED));
  config.setUseCompoundFile(false);
  final IndexWriter writer = new IndexWriter(dir, config);

  index(writer, collectionPath);

  int numIndexed = writer.maxDoc();
  try {
    writer.commit();
  } finally {
    try {
      writer.close();
    } catch (IOException e) {
      LOG.error(e);
    }
  }

  LOG.info("Indexed documents: " + counters.indexedDocuments.get());
  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
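The actual work happens in the index(writer, collectionPath) helper, which is not part of this excerpt. As a purely hypothetical illustration of the shape of such a helper, the sketch below assumes a simple one-record-per-line collection with an "id" and "contents" field; the real Anserini helper reads RDF triples and builds its documents differently.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;

public class CollectionIndexer {
  // Hypothetical helper: one document per input line, formatted as "<id>\t<text>".
  static void index(IndexWriter writer, Path collectionPath) throws IOException {
    for (String line : Files.readAllLines(collectionPath, StandardCharsets.UTF_8)) {
      String[] parts = line.split("\t", 2);
      Document doc = new Document();
      doc.add(new StringField("id", parts[0], Field.Store.YES));     // exact-match, stored id
      doc.add(new TextField("contents", parts[1], Field.Store.NO));  // analyzed body text
      writer.addDocument(doc);
    }
  }
}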
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
In class IndexNodes, method run:
public void run() throws IOException, InterruptedException {
  final long start = System.nanoTime();
  LOG.info("Starting indexer...");

  final Directory dir = FSDirectory.open(indexPath);
  final EnglishAnalyzer analyzer = new EnglishAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setCodec(new Lucene62Codec(Lucene50StoredFieldsFormat.Mode.BEST_SPEED));
  config.setUseCompoundFile(false);
  final IndexWriter writer = new IndexWriter(dir, config);

  final AtomicInteger cnt = new AtomicInteger();
  new Freebase(inputPath).stream().map(new LuceneDocumentGenerator()).forEach(doc -> {
    try {
      writer.addDocument(doc);
      int cur = cnt.incrementAndGet();
      if (cur % 10000000 == 0) {
        LOG.info(cnt + " nodes added.");
      }
    } catch (IOException e) {
      LOG.error(e);
    }
  });

  LOG.info(cnt.get() + " nodes added.");

  int numIndexed = writer.maxDoc();
  try {
    writer.commit();
  } finally {
    try {
      writer.close();
    } catch (IOException e) {
      LOG.error(e);
    }
  }

  long duration = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(duration, "HH:mm:ss"));
}
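The Freebase/LuceneDocumentGenerator pipeline above is simply a Stream of Lucene Documents fed into writer.addDocument, with an AtomicInteger counting progress. Not from the Anserini source; a self-contained sketch of the same pattern over an in-memory list of strings, with placeholder field names:

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class StreamIndexingSketch {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();  // in-memory index, for illustration only
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new EnglishAnalyzer()));

    List<String> records = Arrays.asList("first node", "second node", "third node");
    AtomicInteger cnt = new AtomicInteger();
    records.stream()
        .map(text -> {
          Document doc = new Document();
          doc.add(new TextField("contents", text, Field.Store.NO));
          return doc;
        })
        .forEach(doc -> {
          try {
            writer.addDocument(doc);
            cnt.incrementAndGet();
          } catch (IOException e) {
            throw new RuntimeException(e);  // the Anserini code logs and continues instead
          }
        });

    System.out.println(cnt.get() + " nodes added.");
    writer.close();
  }
}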
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
In class PyseriniEntryPoint, method search:
/**
 * Searches the index for the given topics and returns a map from document id to retrieval score.
 *
 * @param topics map from query id to query string
 * @param similarity scoring function (e.g., BM25)
 * @param numHits maximum number of hits to retrieve per query
 * @param cascade reranker cascade applied to the initial retrieval results
 * @param useQueryParser if true, parse the query with Lucene's QueryParser; otherwise build a bag-of-words query
 * @param keepstopwords if true, retain stopwords during analysis
 * @return map from document id to score
 * @throws IOException if there is an error reading the index
 * @throws ParseException if a query cannot be parsed
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits,
    RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);

  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);

  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString)
        : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

    for (int i = 0; i < docs.documents.length; i++) {
      String docid = docs.documents[i].getField(FIELD_ID).stringValue();
      float score = docs.scores[i];
      scoredDocs.put(docid, score);
    }
  }
  return scoredDocs;
}
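When useQueryParser is false, AnalyzerUtils.buildBagOfWordsQuery turns the analyzed query tokens into a flat disjunction over the body field. The sketch below is not Anserini's implementation (which may differ, e.g., in how repeated tokens are weighted); it is a conceptual stand-in written with plain Lucene, with the class and method names chosen here for illustration:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class BagOfWords {
  // Conceptual stand-in for AnalyzerUtils.buildBagOfWordsQuery: one SHOULD clause per analyzed token.
  static Query bagOfWordsQuery(String field, Analyzer analyzer, String queryText) throws IOException {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    try (TokenStream ts = analyzer.tokenStream(field, queryText)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        builder.add(new TermQuery(new Term(field, termAtt.toString())), BooleanClause.Occur.SHOULD);
      }
      ts.end();
    }
    return builder.build();
  }
}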