Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project epadd by ePADD.
From the class Indexer, method newAnalyzer.
/**
 * main entry point for indexing. note: recomputeCards has to be called separately
 */
/*
void processDocumentCollection(List<MultiDoc> mDocs, List<Document> docs, BlobStore blobStore) throws Exception {
    log.info("Processing " + docs.size() + " documents");
    try {
        indexDocumentCollection(mDocs, docs, blobStore);
    } catch (OutOfMemoryError oome) {
        log.error("Sorry, out of memory, results may be incomplete!");
        clear();
    }
}
*/

/** preprocesses and indexes the docs. */
/*
private void indexDocumentCollection(List<MultiDoc> mDocs, List<Document> allDocs, BlobStore blobStore) throws Exception {
    this.clear();
    currentJobStartTimeMillis = System.currentTimeMillis();
    currentJobDocsetSize = allDocs.size();
    currentJobDocsProcessed = currentJobErrors = 0;

    System.gc();
    String stat1 = "Memory status before indexing " + allDocs.size() + " documents: " + Util.getMemoryStats();
    log.info(stat1);
    docClusters = mDocs;

    if (io.do_NER)
        NER.printAllTypes();

    computeClusterStats(mDocs);
    log.info("Indexing " + allDocs.size() + " documents in " + docClusters.size() + " clusters");
    int clusterCount = -1;
    int docsIndexed = 0, multiDocsIndexed = 0;
    Posting.nPostingsAllocated = 0;
    docClusters = mDocs;

    try {
        for (MultiDoc md : docClusters) {
            clusterCount++;
            log.info("-----------------------------");
            log.info("Indexing " + md.docs.size() + " documents in document cluster #" + clusterCount + ": " + md.description);

            for (Document d : md.docs) {
                if (cancel)
                    throw new CancelledException();

                String contents = "";
                if (!io.ignoreDocumentBody) {
                    try {
                        contents = d.getContents();
                    } catch (Exception e) {
                        markDataError("Exception trying to read " + d + ": " + e);
                    }
                }

                if (contents.length() > MAX_DOCUMENT_SIZE) {
                    markDataError("Document too long, size " + Util.commatize(contents.length()) + " bytes, dropping it. Begins with: " + d + Util.ellipsize(contents, 80));
                    contents = "";
                }

                String subject = d.getSubjectWithoutTitle();
                subject = EmailUtils.cleanupSubjectLine(subject);

                indexSubdoc(subject, contents, d, blobStore);

                docsIndexed++;
                currentJobDocsProcessed++;
            } // end cluster

            log.info("Finished indexing multi doc " + md);
            if (md.docs.size() > 0)
                log.info("Current stats:" + computeStats());

            multiDocsIndexed++;
            // IndexUtils.dumpDocument(clusterPrefix, clusterText); // i don't think we need to do this except for debugging
            System.out.print("."); // goes to console, that's ok...

            if (md.docs.size() > 0) {
                String stat2 = "Memory status after indexing " + docsIndexed + " of " + allDocs.size() + " documents in " + multiDocsIndexed
                        + " (non-zero) multi-docs, total text length " + stats.processedTextLength + " chars, " + stats.nProcessedNames + " names. " + Util.getMemoryStats();
                log.info(stat2);
            }
        }
    } catch (OutOfMemoryError oome) {
        String s = "REAL WARNING! SEVERE WARNING! Out of memory during indexing. Please retry with more memory!" + oome;
        s += "\n";
        log.error(s);
        // option: heroically soldier on and try to work with partial results
    }

    // imp: do this at the end to save memory. doesn't save memory during indexing, but saves mem later, when the index is being used.
    // esp. important for lens.
    NER.release_classifier(); // release memory for classifier
    log.info("Memory status after releasing classifier: " + Util.getMemoryStats());
    packIndex();

    return;
}
*/
private Analyzer newAnalyzer() {
    // we can use LimitTokenCountAnalyzer to limit the #tokens
    EnglishAnalyzer stemmingAnalyzer = new EnglishAnalyzer(MUSE_STOP_WORDS_SET);
    EnglishNumberAnalyzer snAnalyzer = new EnglishNumberAnalyzer(MUSE_STOP_WORDS_SET);

    // these are the 3 fields for stemming, everything else uses StandardAnalyzer
    Map<String, Analyzer> map = new LinkedHashMap<>();
    map.put("body", snAnalyzer);
    map.put("title", snAnalyzer);
    map.put("body_original", stemmingAnalyzer);

    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    // actually these do not need any real analyzer, they are just stored opaquely
    map.put("docId", keywordAnalyzer);
    map.put("names_offsets", keywordAnalyzer);

    // body redacted contains only names and a lot of dots, hence it requires special handling.
    // if (ModeConfig.isPublicMode()) {
    //     map.put("body", new Analyzer() {
    //         @Override
    //         protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    //             Version matchVersion = Indexer.LUCENE_VERSION;
    //             final CICTokenizer source = new StandardNumberTokenizer(matchVersion, reader);
    //             TokenStream result = new LowerCaseFilter(matchVersion, source);
    //             return new TokenStreamComponents(source, result);
    //         }
    //     });
    // }

    // do not remove any stop words.
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    return new PerFieldAnalyzerWrapper(standardAnalyzer, map);
}
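
As a hedged illustration (not part of the ePADD code), the wrapper returned by newAnalyzer routes analysis per field: text fields such as "body" get stemming and stop-word removal via ePADD's custom EnglishNumberAnalyzer, while fields such as "docId" go through KeywordAnalyzer and are kept as a single token. A helper like the following, using only the standard Lucene TokenStream API, makes the difference visible; the example tokens are illustrative only.

// Minimal sketch: list the tokens an analyzer emits for a given field.
// Requires java.util.List/ArrayList and the Lucene analysis classes already used above.
static List<String> tokensFor(Analyzer analyzer, String field, String text) throws IOException {
    List<String> out = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            out.add(term.toString());
        }
        ts.end();
    }
    return out;
}

// With the wrapper from newAnalyzer(), assuming the custom analyzer behaves like
// EnglishAnalyzer for plain words and "the" is in MUSE_STOP_WORDS_SET:
//   tokensFor(analyzer, "body",  "Reading the archives") -> roughly [read, archiv]
//   tokensFor(analyzer, "docId", "folder/msg-0001")      -> [folder/msg-0001]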
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project goci by EBISPOT.
From the class PublicationController, method buildSearch.
private CharSequence buildSearch(String author, String title) throws IOException {
    StringBuffer result = new StringBuffer();
    EnglishAnalyzer filter = new EnglishAnalyzer();
    if (author == null) {
        author = "";
    }
    if (title == null) {
        title = "";
    }
    String search = author.toLowerCase() + " " + title.toLowerCase();
    TokenStream stream = filter.tokenStream("", search.toString());
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        result.append(term.toString()).append(" ");
    }
    stream.close();
    return result.toString().trim();
}
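
For illustration, a hedged usage sketch (not from the GOCI sources): the method lower-cases the author and title, runs them through EnglishAnalyzer, and returns a space-separated string of the surviving terms, so stop words are dropped and the rest are Porter-stemmed.

// Hypothetical call; exact stems depend on Lucene's Porter stemmer, but the result
// is roughly "smith genet type 2 diabet" -- stop words ("the", "of") removed,
// remaining terms lower-cased and stemmed.
CharSequence query = buildSearch("Smith", "The Genetics of Type 2 Diabetes");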
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project orientdb by orientechnologies.
From the class OLucenePerFieldAnalyzerWrapperTest, method shouldReturnCustomAnalyzerForEachField.
@Test
public void shouldReturnCustomAnalyzerForEachField() throws Exception {
    OLucenePerFieldAnalyzerWrapper analyzer = new OLucenePerFieldAnalyzerWrapper(new StandardAnalyzer());
    analyzer.add("text_en", new EnglishAnalyzer());
    analyzer.add("text_it", new ItalianAnalyzer());

    assertThat(analyzer.getWrappedAnalyzer("text_en")).isNotNull();
    assertThat(analyzer.getWrappedAnalyzer("text_en")).isInstanceOf(EnglishAnalyzer.class);
    assertThat(analyzer.getWrappedAnalyzer("text_it")).isNotNull();
    assertThat(analyzer.getWrappedAnalyzer("text_it")).isInstanceOf(ItalianAnalyzer.class);
}
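
A small hedged follow-on (not in the OrientDB test): fields that were never registered with add() should fall back to the default analyzer passed to the wrapper's constructor, here a StandardAnalyzer. Assuming the wrapper from the test above, that could be checked with an extra assertion such as:

// Hypothetical extra assertion: an unmapped field falls back to the default analyzer.
assertThat(analyzer.getWrappedAnalyzer("any_other_field")).isInstanceOf(StandardAnalyzer.class);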
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
From the class IndexerWithEmptyDocumentTestBase, method buildTestIndex.
// A very simple example of how to build an index.
// Creates an index similar to IndexerTestBase, but adds an empty document to test error handling.
private void buildTestIndex() throws IOException {
    Directory dir = FSDirectory.open(tempDir1);
    Analyzer analyzer = new EnglishAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);

    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    textOptions.setStoreTermVectorPositions(true);

    Document doc1 = new Document();
    String doc1Text = "here is some text here is some more text. city.";
    doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
    doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
    doc1.add(new Field(IndexArgs.CONTENTS, doc1Text, textOptions));
    doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
    writer.addDocument(doc1);

    Document doc2 = new Document();
    String doc2Text = "more texts"; // Note plural, to test stemming
    doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
    doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
    doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions));
    doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
    writer.addDocument(doc2);

    Document doc3 = new Document();
    String doc3Text = "here is a test";
    doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
    doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
    doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
    doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
    writer.addDocument(doc3);

    Document doc4 = new Document();
    String doc4Text = "";
    doc4.add(new StringField(IndexArgs.ID, "doc4", Field.Store.YES));
    doc4.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc4".getBytes())));
    doc4.add(new Field(IndexArgs.CONTENTS, doc4Text, textOptions));
    doc4.add(new StoredField(IndexArgs.RAW, doc4Text));
    writer.addDocument(doc4);

    writer.commit();
    writer.forceMerge(1);
    writer.close();
    dir.close();
}
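
A hedged sketch of querying the index built above (not part of the Anserini test): because both indexing and querying go through EnglishAnalyzer, surface forms such as "city" and "texts" are reduced to the same stems ("citi", "text") on both sides, so a query still matches the stemmed postings.

// Hypothetical follow-up, assuming the index built by buildTestIndex() above
// and a test method that declares `throws Exception`.
try (Directory dir = FSDirectory.open(tempDir1);
     IndexReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    // Query with the same analyzer that was used at index time; otherwise the
    // stemmed terms stored in the index would never be matched.
    QueryParser parser = new QueryParser(IndexArgs.CONTENTS, new EnglishAnalyzer());
    assertEquals(1, searcher.count(parser.parse("city")));   // "city" -> "citi", matches doc1 only
    assertEquals(2, searcher.count(parser.parse("texts")));  // "texts" -> "text", matches doc1 and doc2
}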
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
From the class CloneIndexTest, method testCloneIndex.
@Test
public void testCloneIndex() throws Exception {
    tempDir2 = createTempDir();
    System.out.println("Cloning index:");

    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();
    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    assertEquals(2, reader.docFreq(new Term("contents", "here")));
    assertEquals(2, reader.docFreq(new Term("contents", "more")));
    assertEquals(1, reader.docFreq(new Term("contents", "some")));
    assertEquals(1, reader.docFreq(new Term("contents", "test")));
    assertEquals(2, reader.docFreq(new Term("contents", "text")));
    reader.close();
}
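
The docFreq values asserted above follow directly from how EnglishAnalyzer tokenizes the sample documents: stop words like "is" and "a" are dropped, "texts" stems to "text", and "city" stems to "citi". A small hedged debugging sketch (not in the Anserini sources) that dumps the analyzed terms of doc1 makes this visible.

// Hypothetical helper: print the terms EnglishAnalyzer produces for doc1's text.
try (Analyzer analyzer = new EnglishAnalyzer();
     TokenStream ts = analyzer.tokenStream("contents",
         "here is some text here is some more text. city.")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.print(term + " ");
    }
    ts.end();
    // Expected output is roughly: here some text here some more text citi
    // ("is" is a stop word; "city" is stemmed to "citi").
}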