Use of io.anserini.search.similarity.AccurateBM25Similarity in project Anserini by castorini.
From the class IndexCollection, the method run.
/**
 * Indexes every segment of the collection and reports the resulting counters.
 *
 * <p>When {@code indexPath} is set, a local Lucene {@link IndexWriter} is opened with an analyzer
 * chosen from {@code args.collectionClass}, {@code args.language} and {@code args.pretokenized},
 * and a similarity chosen from {@code args.impact} and {@code args.bm25Accurate}. One indexer task
 * per segment is then submitted to a fixed-size thread pool (Solr, Elasticsearch or local,
 * depending on {@code args.solr} / {@code args.es}), progress is logged once a minute until the
 * pool terminates, and a final commit is performed.
 *
 * @return the counters accumulated while indexing
 * @throws IOException if opening the index directory, creating the writer, committing or
 *     force-merging fails
 * @throws RuntimeException if the number of completed tasks differs from the number of segments
 */
public Counters run() throws IOException {
  final long start = System.nanoTime();
  LOG.info("============ Indexing Collection ============");

  int numThreads = args.threads;
  IndexWriter writer = null;

  // Used for LocalIndexThread
  if (indexPath != null) {
    final Directory dir = FSDirectory.open(indexPath);

    // Only the analyzer that is actually selected gets constructed; the branch order
    // (tweets first, then language, then pretokenized, then the English default) is significant.
    final IndexWriterConfig config;
    if (args.collectionClass.equals("TweetCollection")) {
      config = new IndexWriterConfig(new TweetAnalyzer(args.tweetStemming));
    } else if (args.language.equals("ar")) {
      config = new IndexWriterConfig(new ArabicAnalyzer());
    } else if (args.language.equals("bn")) {
      config = new IndexWriterConfig(new BengaliAnalyzer());
    } else if (args.language.equals("da")) {
      config = new IndexWriterConfig(new DanishAnalyzer());
    } else if (args.language.equals("de")) {
      config = new IndexWriterConfig(new GermanAnalyzer());
    } else if (args.language.equals("es")) {
      config = new IndexWriterConfig(new SpanishAnalyzer());
    } else if (args.language.equals("fi")) {
      config = new IndexWriterConfig(new FinnishAnalyzer());
    } else if (args.language.equals("fr")) {
      config = new IndexWriterConfig(new FrenchAnalyzer());
    } else if (args.language.equals("hi")) {
      config = new IndexWriterConfig(new HindiAnalyzer());
    } else if (args.language.equals("hu")) {
      config = new IndexWriterConfig(new HungarianAnalyzer());
    } else if (args.language.equals("id")) {
      config = new IndexWriterConfig(new IndonesianAnalyzer());
    } else if (args.language.equals("it")) {
      config = new IndexWriterConfig(new ItalianAnalyzer());
    } else if (args.language.equals("ja")) {
      config = new IndexWriterConfig(new JapaneseAnalyzer());
    } else if (args.language.equals("nl")) {
      config = new IndexWriterConfig(new DutchAnalyzer());
    } else if (args.language.equals("no")) {
      config = new IndexWriterConfig(new NorwegianAnalyzer());
    } else if (args.language.equals("pt")) {
      config = new IndexWriterConfig(new PortugueseAnalyzer());
    } else if (args.language.equals("ru")) {
      config = new IndexWriterConfig(new RussianAnalyzer());
    } else if (args.language.equals("sv")) {
      config = new IndexWriterConfig(new SwedishAnalyzer());
    } else if (args.language.equals("th")) {
      config = new IndexWriterConfig(new ThaiAnalyzer());
    } else if (args.language.equals("tr")) {
      config = new IndexWriterConfig(new TurkishAnalyzer());
    } else if (args.language.equals("zh") || args.language.equals("ko")) {
      config = new IndexWriterConfig(new CJKAnalyzer());
    } else if (args.language.equals("sw") || args.language.equals("te")) {
      // For Mr.TyDi: sw and te do not have custom Lucene analyzers, so just use whitespace analyzer.
      config = new IndexWriterConfig(new WhitespaceAnalyzer());
    } else if (args.pretokenized) {
      config = new IndexWriterConfig(new WhitespaceAnalyzer());
    } else {
      config = new IndexWriterConfig(
          DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords));
    }

    // Exactly one similarity is installed. Previously the accurate-BM25 choice was always
    // overwritten by the impact/default branch that followed it. Impact keeps precedence so that
    // all other flag combinations behave as before.
    if (args.impact) {
      config.setSimilarity(new ImpactSimilarity());
    } else if (args.bm25Accurate) {
      // necessary during indexing as the norm used in BM25 is already determined at index time.
      config.setSimilarity(new AccurateBM25Similarity());
    } else {
      config.setSimilarity(new BM25Similarity());
    }

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());
    writer = new IndexWriter(dir, config);
  }

  final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
  LOG.info("Thread pool with " + numThreads + " threads initialized.");
  LOG.info("Initializing collection in " + collectionPath.toString());

  List<?> segmentPaths = collection.getSegmentPaths();
  // when we want sharding to be done
  if (args.shardCount > 1) {
    segmentPaths = collection.getSegmentPaths(args.shardCount, args.shardCurrent);
  }
  final int segmentCnt = segmentPaths.size();
  LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files")));
  LOG.info("Starting to index...");

  // One task per segment; the target backend is fixed by the command-line flags.
  for (int i = 0; i < segmentCnt; i++) {
    if (args.solr) {
      executor.execute(new SolrIndexerThread(collection, (Path) segmentPaths.get(i)));
    } else if (args.es) {
      executor.execute(new ESIndexerThread(collection, (Path) segmentPaths.get(i)));
    } else {
      executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i)));
    }
  }
  executor.shutdown();

  try {
    // Wait for existing tasks to terminate
    while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
      if (segmentCnt == 1) {
        LOG.info(String.format("%,d documents indexed", counters.indexed.get()));
      } else {
        LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get()));
      }
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted
    executor.shutdownNow();
    // Preserve interrupt status
    Thread.currentThread().interrupt();
  }

  if (segmentCnt != executor.getCompletedTaskCount()) {
    throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
  }

  // For remote backends there is no local writer to ask, so fall back to the shared counter.
  long numIndexed;
  if (args.solr || args.es) {
    numIndexed = counters.indexed.get();
  } else {
    numIndexed = writer.getDocStats().maxDoc;
  }

  // Do a final commit
  if (args.solr) {
    try {
      SolrClient client = solrPool.borrowObject();
      try {
        client.commit(args.solrIndex);
      } finally {
        // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit;
        // done in a finally block so a failed commit does not leave the client checked out.
        solrPool.returnObject(client);
        solrPool.close();
      }
    } catch (Exception e) {
      LOG.error("Exception during final Solr commit: ", e);
    }
  }

  if (args.es) {
    esPool.close();
  }

  try {
    if (writer != null) {
      writer.commit();
      if (args.optimize) {
        writer.forceMerge(1);
      }
    }
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (IOException e) {
      // It is possible that this happens... but nothing much we can do at this point,
      // so just log the error and move on.
      LOG.error(e);
    }
  }

  if (numIndexed != counters.indexed.get()) {
    LOG.warn("Unexpected difference between number of indexed documents and index maxDoc.");
  }

  LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed));
  LOG.info("============ Final Counter Values ============");
  LOG.info(String.format("indexed: %,12d", counters.indexed.get()));
  LOG.info(String.format("unindexable: %,12d", counters.unindexable.get()));
  LOG.info(String.format("empty: %,12d", counters.empty.get()));
  LOG.info(String.format("skipped: %,12d", counters.skipped.get()));
  LOG.info(String.format("errors: %,12d", counters.errors.get()));

  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
  return counters;
}
Aggregations