Search in sources :

Example 1 with DefaultEnglishAnalyzer

use of io.anserini.analysis.DefaultEnglishAnalyzer in project Anserini by castorini.

The method run of the class IndexCollection.

/**
 * Indexes every segment of the configured collection and logs the final counter values.
 *
 * <p>When {@code indexPath} is set, a local Lucene {@link IndexWriter} is created whose analyzer is
 * selected from {@code args.collectionClass}, {@code args.language} and {@code args.pretokenized}.
 * One indexer task per segment is then submitted to a fixed-size thread pool (Solr, Elasticsearch
 * or local, depending on {@code args}), and the method blocks until all tasks have finished.
 *
 * @return the shared {@code counters} object updated by the indexer tasks
 * @throws IOException if opening the index directory or committing/merging the local index fails
 * @throws RuntimeException if the number of completed tasks differs from the number of segments
 */
public Counters run() throws IOException {
    final long start = System.nanoTime();
    LOG.info("============ Indexing Collection ============");
    int numThreads = args.threads;
    IndexWriter writer = null;
    // Used for LocalIndexThread
    if (indexPath != null) {
        final Directory dir = FSDirectory.open(indexPath);
        // Only the analyzer that is actually selected gets constructed; building every
        // language analyzer up front is wasted work and leaves the unused ones unclosed.
        final IndexWriterConfig config;
        if (args.collectionClass.equals("TweetCollection")) {
            config = new IndexWriterConfig(new TweetAnalyzer(args.tweetStemming));
        } else {
            switch (args.language) {
                case "ar":
                    config = new IndexWriterConfig(new ArabicAnalyzer());
                    break;
                case "bn":
                    config = new IndexWriterConfig(new BengaliAnalyzer());
                    break;
                case "da":
                    config = new IndexWriterConfig(new DanishAnalyzer());
                    break;
                case "de":
                    config = new IndexWriterConfig(new GermanAnalyzer());
                    break;
                case "es":
                    config = new IndexWriterConfig(new SpanishAnalyzer());
                    break;
                case "fi":
                    config = new IndexWriterConfig(new FinnishAnalyzer());
                    break;
                case "fr":
                    config = new IndexWriterConfig(new FrenchAnalyzer());
                    break;
                case "hi":
                    config = new IndexWriterConfig(new HindiAnalyzer());
                    break;
                case "hu":
                    config = new IndexWriterConfig(new HungarianAnalyzer());
                    break;
                case "id":
                    config = new IndexWriterConfig(new IndonesianAnalyzer());
                    break;
                case "it":
                    config = new IndexWriterConfig(new ItalianAnalyzer());
                    break;
                case "ja":
                    config = new IndexWriterConfig(new JapaneseAnalyzer());
                    break;
                case "nl":
                    config = new IndexWriterConfig(new DutchAnalyzer());
                    break;
                case "no":
                    config = new IndexWriterConfig(new NorwegianAnalyzer());
                    break;
                case "pt":
                    config = new IndexWriterConfig(new PortugueseAnalyzer());
                    break;
                case "ru":
                    config = new IndexWriterConfig(new RussianAnalyzer());
                    break;
                case "sv":
                    config = new IndexWriterConfig(new SwedishAnalyzer());
                    break;
                case "th":
                    config = new IndexWriterConfig(new ThaiAnalyzer());
                    break;
                case "tr":
                    config = new IndexWriterConfig(new TurkishAnalyzer());
                    break;
                case "zh":
                case "ko":
                    // Chinese and Korean share the CJK analyzer.
                    config = new IndexWriterConfig(new CJKAnalyzer());
                    break;
                case "sw":
                case "te":
                    // For Mr.TyDi: sw and te do not have custom Lucene analyzers, so just use whitespace analyzer.
                    config = new IndexWriterConfig(new WhitespaceAnalyzer());
                    break;
                default:
                    if (args.pretokenized) {
                        config = new IndexWriterConfig(new WhitespaceAnalyzer());
                    } else {
                        config = new IndexWriterConfig(
                            DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords));
                    }
                    break;
            }
        }
        // Exactly one similarity is chosen. Previously the bm25Accurate setting was always
        // overwritten by the impact/BM25 branch that followed it, so the flag had no effect.
        // Impact keeps precedence when both flags are set, matching the earlier outcome.
        if (args.impact) {
            config.setSimilarity(new ImpactSimilarity());
        } else if (args.bm25Accurate) {
            // necessary during indexing as the norm used in BM25 is already determined at index time.
            config.setSimilarity(new AccurateBM25Similarity());
        } else {
            config.setSimilarity(new BM25Similarity());
        }
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setRAMBufferSizeMB(args.memorybufferSize);
        config.setUseCompoundFile(false);
        config.setMergeScheduler(new ConcurrentMergeScheduler());
        writer = new IndexWriter(dir, config);
    }
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    LOG.info("Thread pool with " + numThreads + " threads initialized.");
    LOG.info("Initializing collection in " + collectionPath.toString());
    List<?> segmentPaths = collection.getSegmentPaths();
    // when we want sharding to be done
    if (args.shardCount > 1) {
        segmentPaths = collection.getSegmentPaths(args.shardCount, args.shardCurrent);
    }
    final int segmentCnt = segmentPaths.size();
    LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files")));
    LOG.info("Starting to index...");
    // One task per segment; the target backend is chosen by the solr/es flags.
    for (int i = 0; i < segmentCnt; i++) {
        final Path segmentPath = (Path) segmentPaths.get(i);
        if (args.solr) {
            executor.execute(new SolrIndexerThread(collection, segmentPath));
        } else if (args.es) {
            executor.execute(new ESIndexerThread(collection, segmentPath));
        } else {
            executor.execute(new LocalIndexerThread(writer, collection, segmentPath));
        }
    }
    executor.shutdown();
    try {
        // Wait for existing tasks to terminate, logging progress once per minute
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            if (segmentCnt == 1) {
                LOG.info(String.format("%,d documents indexed", counters.indexed.get()));
            } else {
                LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get()));
            }
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }
    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  " + executor.getCompletedTaskCount());
    }
    long numIndexed;
    if (args.solr || args.es) {
        numIndexed = counters.indexed.get();
    } else {
        // NOTE(review): writer is null here when indexPath was null and neither Solr nor ES
        // was selected - confirm that callers always provide an index path in that mode.
        numIndexed = writer.getDocStats().maxDoc;
    }
    // Do a final commit
    if (args.solr) {
        try {
            SolrClient client = solrPool.borrowObject();
            try {
                client.commit(args.solrIndex);
            } finally {
                // Return the client and close the pool even when the commit fails.
                // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit
                solrPool.returnObject(client);
                solrPool.close();
            }
        } catch (Exception e) {
            LOG.error("Exception during final Solr commit: ", e);
        }
    }
    if (args.es) {
        esPool.close();
    }
    try {
        if (writer != null) {
            writer.commit();
            if (args.optimize) {
                writer.forceMerge(1);
            }
        }
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }
    if (numIndexed != counters.indexed.get()) {
        LOG.warn("Unexpected difference between number of indexed documents and index maxDoc.");
    }
    LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed));
    LOG.info("============ Final Counter Values ============");
    LOG.info(String.format("indexed:     %,12d", counters.indexed.get()));
    LOG.info(String.format("unindexable: %,12d", counters.unindexable.get()));
    LOG.info(String.format("empty:       %,12d", counters.empty.get()));
    LOG.info(String.format("skipped:     %,12d", counters.skipped.get()));
    LOG.info(String.format("errors:      %,12d", counters.errors.get()));
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
    return counters;
}
Also used : IndonesianAnalyzer(org.apache.lucene.analysis.id.IndonesianAnalyzer) TweetAnalyzer(io.anserini.analysis.TweetAnalyzer) JapaneseAnalyzer(org.apache.lucene.analysis.ja.JapaneseAnalyzer) DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) ConcurrentMergeScheduler(org.apache.lucene.index.ConcurrentMergeScheduler) DutchAnalyzer(org.apache.lucene.analysis.nl.DutchAnalyzer) SpanishAnalyzer(org.apache.lucene.analysis.es.SpanishAnalyzer) AccurateBM25Similarity(io.anserini.search.similarity.AccurateBM25Similarity) CloudSolrClient(org.apache.solr.client.solrj.impl.CloudSolrClient) SolrClient(org.apache.solr.client.solrj.SolrClient) FrenchAnalyzer(org.apache.lucene.analysis.fr.FrenchAnalyzer) RussianAnalyzer(org.apache.lucene.analysis.ru.RussianAnalyzer) NorwegianAnalyzer(org.apache.lucene.analysis.no.NorwegianAnalyzer) GermanAnalyzer(org.apache.lucene.analysis.de.GermanAnalyzer) ItalianAnalyzer(org.apache.lucene.analysis.it.ItalianAnalyzer) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) BengaliAnalyzer(org.apache.lucene.analysis.bn.BengaliAnalyzer) ThaiAnalyzer(org.apache.lucene.analysis.th.ThaiAnalyzer) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Path(java.nio.file.Path) ImpactSimilarity(io.anserini.search.similarity.ImpactSimilarity) CJKAnalyzer(org.apache.lucene.analysis.cjk.CJKAnalyzer) IOException(java.io.IOException) HungarianAnalyzer(org.apache.lucene.analysis.hu.HungarianAnalyzer) InvalidDocumentException(io.anserini.index.generator.InvalidDocumentException) CmdLineException(org.kohsuke.args4j.CmdLineException) SkippedDocumentException(io.anserini.index.generator.SkippedDocumentException) IOException(java.io.IOException) EmptyDocumentException(io.anserini.index.generator.EmptyDocumentException) ArabicAnalyzer(org.apache.lucene.analysis.ar.ArabicAnalyzer) SwedishAnalyzer(org.apache.lucene.analysis.sv.SwedishAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) 
DanishAnalyzer(org.apache.lucene.analysis.da.DanishAnalyzer) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) AccurateBM25Similarity(io.anserini.search.similarity.AccurateBM25Similarity) FinnishAnalyzer(org.apache.lucene.analysis.fi.FinnishAnalyzer) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) PortugueseAnalyzer(org.apache.lucene.analysis.pt.PortugueseAnalyzer) TurkishAnalyzer(org.apache.lucene.analysis.tr.TurkishAnalyzer) HindiAnalyzer(org.apache.lucene.analysis.hi.HindiAnalyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 2 with DefaultEnglishAnalyzer

use of io.anserini.analysis.DefaultEnglishAnalyzer in project Anserini by castorini.

The method testTermCountsWithAnalyzer of the class IndexReaderUtilsTest.

/**
 * Checks the "collectionFreq" and "docFreq" entries returned by
 * {@code IndexReaderUtils.getTermCountsWithAnalyzer} for several terms of the index stored in
 * {@code tempDir1}, analyzed with the default {@link DefaultEnglishAnalyzer}.
 */
@Test
public void testTermCountsWithAnalyzer() throws Exception {
    // try-with-resources releases the reader and then the directory even when an assertion
    // fails; the explicit close() calls used before were only reached on success.
    try (Directory dir = FSDirectory.open(tempDir1);
         IndexReader reader = DirectoryReader.open(dir)) {
        DefaultEnglishAnalyzer analyzer = DefaultEnglishAnalyzer.newDefaultInstance();
        Map<String, Long> termCountMap;

        termCountMap = IndexReaderUtils.getTermCountsWithAnalyzer(reader, "here", analyzer);
        assertEquals(Long.valueOf(3), termCountMap.get("collectionFreq"));
        assertEquals(Long.valueOf(2), termCountMap.get("docFreq"));

        termCountMap = IndexReaderUtils.getTermCountsWithAnalyzer(reader, "more", analyzer);
        assertEquals(Long.valueOf(2), termCountMap.get("collectionFreq"));
        assertEquals(Long.valueOf(2), termCountMap.get("docFreq"));

        termCountMap = IndexReaderUtils.getTermCountsWithAnalyzer(reader, "some", analyzer);
        assertEquals(Long.valueOf(2), termCountMap.get("collectionFreq"));
        assertEquals(Long.valueOf(1), termCountMap.get("docFreq"));

        termCountMap = IndexReaderUtils.getTermCountsWithAnalyzer(reader, "test", analyzer);
        assertEquals(Long.valueOf(1), termCountMap.get("collectionFreq"));
        assertEquals(Long.valueOf(1), termCountMap.get("docFreq"));

        termCountMap = IndexReaderUtils.getTermCountsWithAnalyzer(reader, "text", analyzer);
        assertEquals(Long.valueOf(3), termCountMap.get("collectionFreq"));
        assertEquals(Long.valueOf(2), termCountMap.get("docFreq"));

        // For the two-word input only the document frequency is asserted.
        termCountMap = IndexReaderUtils.getTermCountsWithAnalyzer(reader, "some text", analyzer);
        assertEquals(Long.valueOf(1), termCountMap.get("docFreq"));
    }
}
Also used : DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Aggregations

DefaultEnglishAnalyzer (io.anserini.analysis.DefaultEnglishAnalyzer)2 Directory (org.apache.lucene.store.Directory)2 FSDirectory (org.apache.lucene.store.FSDirectory)2 TweetAnalyzer (io.anserini.analysis.TweetAnalyzer)1 EmptyDocumentException (io.anserini.index.generator.EmptyDocumentException)1 InvalidDocumentException (io.anserini.index.generator.InvalidDocumentException)1 SkippedDocumentException (io.anserini.index.generator.SkippedDocumentException)1 AccurateBM25Similarity (io.anserini.search.similarity.AccurateBM25Similarity)1 ImpactSimilarity (io.anserini.search.similarity.ImpactSimilarity)1 IOException (java.io.IOException)1 Path (java.nio.file.Path)1 ThreadPoolExecutor (java.util.concurrent.ThreadPoolExecutor)1 ArabicAnalyzer (org.apache.lucene.analysis.ar.ArabicAnalyzer)1 BengaliAnalyzer (org.apache.lucene.analysis.bn.BengaliAnalyzer)1 CJKAnalyzer (org.apache.lucene.analysis.cjk.CJKAnalyzer)1 WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)1 DanishAnalyzer (org.apache.lucene.analysis.da.DanishAnalyzer)1 GermanAnalyzer (org.apache.lucene.analysis.de.GermanAnalyzer)1 SpanishAnalyzer (org.apache.lucene.analysis.es.SpanishAnalyzer)1 FinnishAnalyzer (org.apache.lucene.analysis.fi.FinnishAnalyzer)1