Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project epadd by ePADD.
From the class Indexer, method newAnalyzer.
/**
 * main entry point for indexing. note: recomputeCards has to be called separately
 */
/*
void processDocumentCollection(List<MultiDoc> mDocs, List<Document> docs, BlobStore blobStore) throws Exception {
    log.info("Processing " + docs.size() + " documents");
    try {
        indexDocumentCollection(mDocs, docs, blobStore);
    } catch (OutOfMemoryError oome) {
        log.error("Sorry, out of memory, results may be incomplete!");
        clear();
    }
}
*/

/** preprocesses and indexes the docs. */
/*
private void indexDocumentCollection(List<MultiDoc> mDocs, List<Document> allDocs, BlobStore blobStore) throws Exception {
    this.clear();
    currentJobStartTimeMillis = System.currentTimeMillis();
    currentJobDocsetSize = allDocs.size();
    currentJobDocsProcessed = currentJobErrors = 0;

    System.gc();
    String stat1 = "Memory status before indexing " + allDocs.size() + " documents: " + Util.getMemoryStats();
    log.info(stat1);
    docClusters = mDocs;

    if (io.do_NER)
        NER.printAllTypes();

    computeClusterStats(mDocs);
    log.info("Indexing " + allDocs.size() + " documents in " + docClusters.size() + " clusters");
    int clusterCount = -1;
    int docsIndexed = 0, multiDocsIndexed = 0;
    Posting.nPostingsAllocated = 0;
    docClusters = mDocs;

    try {
        for (MultiDoc md : docClusters) {
            clusterCount++;
            log.info("-----------------------------");
            log.info("Indexing " + md.docs.size() + " documents in document cluster #" + clusterCount + ": " + md.description);

            for (Document d : md.docs) {
                if (cancel)
                    throw new CancelledException();

                String contents = "";
                if (!io.ignoreDocumentBody) {
                    try {
                        contents = d.getContents();
                    } catch (Exception e) {
                        markDataError("Exception trying to read " + d + ": " + e);
                    }
                }

                if (contents.length() > MAX_DOCUMENT_SIZE) {
                    markDataError("Document too long, size " + Util.commatize(contents.length()) + " bytes, dropping it. Begins with: " + d + Util.ellipsize(contents, 80));
                    contents = "";
                }

                String subject = d.getSubjectWithoutTitle();
                subject = EmailUtils.cleanupSubjectLine(subject);

                indexSubdoc(subject, contents, d, blobStore);

                docsIndexed++;
                currentJobDocsProcessed++;
            } // end cluster

            log.info("Finished indexing multi doc " + md);
            if (md.docs.size() > 0)
                log.info("Current stats:" + computeStats());

            multiDocsIndexed++;
            // IndexUtils.dumpDocument(clusterPrefix, clusterText); // i don't think we need to do this except for debugging
            System.out.print("."); // goes to console, that's ok...

            if (md.docs.size() > 0) {
                String stat2 = "Memory status after indexing " + docsIndexed + " of " + allDocs.size() + " documents in " + multiDocsIndexed
                        + " (non-zero) multi-docs, total text length " + stats.processedTextLength + " chars, " + stats.nProcessedNames + " names. " + Util.getMemoryStats();
                log.info(stat2);
            }
        }
    } catch (OutOfMemoryError oome) {
        String s = "REAL WARNING! SEVERE WARNING! Out of memory during indexing. Please retry with more memory!" + oome;
        s += "\n";
        log.error(s);
        // option: heroically soldier on and try to work with partial results
    }

    // imp: do this at the end to save memory. doesn't save memory during indexing, but saves mem later, when the index is being used.
    // esp. important for lens.
    NER.release_classifier(); // release memory for classifier
    log.info("Memory status after releasing classifier: " + Util.getMemoryStats());
    packIndex();

    return;
}
*/
private Analyzer newAnalyzer() {
    // we can use LimitTokenCountAnalyzer to limit the #tokens
    EnglishAnalyzer stemmingAnalyzer = new EnglishAnalyzer(MUSE_STOP_WORDS_SET);
    EnglishNumberAnalyzer snAnalyzer = new EnglishNumberAnalyzer(MUSE_STOP_WORDS_SET);

    // these are the 3 fields for stemming, everything else uses StandardAnalyzer
    Map<String, Analyzer> map = new LinkedHashMap<>();
    map.put("body", snAnalyzer);
    map.put("title", snAnalyzer);
    map.put("body_original", stemmingAnalyzer);

    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    // actually these do not need any real analyzer, they are just stored opaquely
    map.put("docId", keywordAnalyzer);
    map.put("names_offsets", keywordAnalyzer);

    // body redacted contains only names and a lot of dots, hence it requires special handling.
    // if (ModeConfig.isPublicMode()) {
    //     map.put("body", new Analyzer() {
    //         @Override
    //         protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    //             Version matchVersion = Indexer.LUCENE_VERSION;
    //             final CICTokenizer source = new StandardNumberTokenizer(matchVersion, reader);
    //             TokenStream result = new LowerCaseFilter(matchVersion, source);
    //             return new TokenStreamComponents(source, result);
    //         }
    //     });
    // }

    // do not remove any stop words.
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    return new PerFieldAnalyzerWrapper(standardAnalyzer, map);
}
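
As a hedged illustration (not part of the ePADD code), the wrapper returned by newAnalyzer routes analysis per field: text fields such as "body" get stemming and stop-word removal via ePADD's custom EnglishNumberAnalyzer, while fields such as "docId" go through KeywordAnalyzer and are kept as a single token. A helper like the following, using only the standard Lucene TokenStream API, makes the difference visible; the example tokens are illustrative only.

// Minimal sketch: list the tokens an analyzer emits for a given field.
// Requires java.util.List/ArrayList and the Lucene analysis classes already used above.
static List<String> tokensFor(Analyzer analyzer, String field, String text) throws IOException {
    List<String> out = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            out.add(term.toString());
        }
        ts.end();
    }
    return out;
}

// With the wrapper from newAnalyzer(), assuming the custom analyzer behaves like
// EnglishAnalyzer for plain words and "the" is in MUSE_STOP_WORDS_SET:
//   tokensFor(analyzer, "body",  "Reading the archives") -> roughly [read, archiv]
//   tokensFor(analyzer, "docId", "folder/msg-0001")      -> [folder/msg-0001]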
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project goci by EBISPOT.
From the class PublicationController, method buildSearch.
private CharSequence buildSearch(String author, String title) throws IOException {
    StringBuffer result = new StringBuffer();
    EnglishAnalyzer filter = new EnglishAnalyzer();
    if (author == null) {
        author = "";
    }
    if (title == null) {
        title = "";
    }
    String search = author.toLowerCase() + " " + title.toLowerCase();
    TokenStream stream = filter.tokenStream("", search.toString());
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        result.append(term.toString()).append(" ");
    }
    stream.close();
    return result.toString().trim();
}
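
For illustration, a hedged usage sketch (not from the GOCI sources): the method lower-cases the author and title, runs them through EnglishAnalyzer, and returns a space-separated string of the surviving terms, so stop words are dropped and the rest are Porter-stemmed.

// Hypothetical call; exact stems depend on Lucene's Porter stemmer, but the result
// is roughly "smith genet type 2 diabet" -- stop words ("the", "of") removed,
// remaining terms lower-cased and stemmed.
CharSequence query = buildSearch("Smith", "The Genetics of Type 2 Diabetes");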
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project orientdb by orientechnologies.
From the class OLucenePerFieldAnalyzerWrapperTest, method shouldReturnCustomAnalyzerForEachField.
@Test
public void shouldReturnCustomAnalyzerForEachField() throws Exception {
    OLucenePerFieldAnalyzerWrapper analyzer = new OLucenePerFieldAnalyzerWrapper(new StandardAnalyzer());
    analyzer.add("text_en", new EnglishAnalyzer());
    analyzer.add("text_it", new ItalianAnalyzer());

    assertThat(analyzer.getWrappedAnalyzer("text_en")).isNotNull();
    assertThat(analyzer.getWrappedAnalyzer("text_en")).isInstanceOf(EnglishAnalyzer.class);
    assertThat(analyzer.getWrappedAnalyzer("text_it")).isNotNull();
    assertThat(analyzer.getWrappedAnalyzer("text_it")).isInstanceOf(ItalianAnalyzer.class);
}
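
A small hedged follow-on (not in the OrientDB test): fields that were never registered with add() should fall back to the default analyzer passed to the wrapper's constructor, here a StandardAnalyzer. Assuming the wrapper from the test above, that could be checked with an extra assertion such as:

// Hypothetical extra assertion: an unmapped field falls back to the default analyzer.
assertThat(analyzer.getWrappedAnalyzer("any_other_field")).isInstanceOf(StandardAnalyzer.class);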
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
From the class IndexerWithEmptyDocumentTestBase, method buildTestIndex.
// A very simple example of how to build an index.
// Creates an index similar to IndexerTestBase, but adds an empty document to test error handling.
private void buildTestIndex() throws IOException {
    Directory dir = FSDirectory.open(tempDir1);
    Analyzer analyzer = new EnglishAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);

    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    textOptions.setStoreTermVectorPositions(true);

    Document doc1 = new Document();
    String doc1Text = "here is some text here is some more text. city.";
    doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
    doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
    doc1.add(new Field(IndexArgs.CONTENTS, doc1Text, textOptions));
    doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
    writer.addDocument(doc1);

    Document doc2 = new Document();
    String doc2Text = "more texts"; // Note plural, to test stemming
    doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
    doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
    doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions));
    doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
    writer.addDocument(doc2);

    Document doc3 = new Document();
    String doc3Text = "here is a test";
    doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
    doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
    doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
    doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
    writer.addDocument(doc3);

    Document doc4 = new Document();
    String doc4Text = "";
    doc4.add(new StringField(IndexArgs.ID, "doc4", Field.Store.YES));
    doc4.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc4".getBytes())));
    doc4.add(new Field(IndexArgs.CONTENTS, doc4Text, textOptions));
    doc4.add(new StoredField(IndexArgs.RAW, doc4Text));
    writer.addDocument(doc4);

    writer.commit();
    writer.forceMerge(1);
    writer.close();
    dir.close();
}
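
A hedged sketch of querying the index built above (not part of the Anserini test): because both indexing and querying go through EnglishAnalyzer, surface forms such as "city" and "texts" are reduced to the same stems ("citi", "text") on both sides, so a query still matches the stemmed postings.

// Hypothetical follow-up, assuming the index built by buildTestIndex() above
// and a test method that declares `throws Exception`.
try (Directory dir = FSDirectory.open(tempDir1);
     IndexReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    // Query with the same analyzer that was used at index time; otherwise the
    // stemmed terms stored in the index would never be matched.
    QueryParser parser = new QueryParser(IndexArgs.CONTENTS, new EnglishAnalyzer());
    assertEquals(1, searcher.count(parser.parse("city")));   // "city" -> "citi", matches doc1 only
    assertEquals(2, searcher.count(parser.parse("texts")));  // "texts" -> "text", matches doc1 and doc2
}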
Use of org.apache.lucene.analysis.en.EnglishAnalyzer in project Anserini by castorini.
From the class CloneIndexTest, method testCloneIndex.
@Test
public void testCloneIndex() throws Exception {
    tempDir2 = createTempDir();
    System.out.println("Cloning index:");

    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();
    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    assertEquals(2, reader.docFreq(new Term("contents", "here")));
    assertEquals(2, reader.docFreq(new Term("contents", "more")));
    assertEquals(1, reader.docFreq(new Term("contents", "some")));
    assertEquals(1, reader.docFreq(new Term("contents", "test")));
    assertEquals(2, reader.docFreq(new Term("contents", "text")));
    reader.close();
}
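
The docFreq values asserted above follow directly from how EnglishAnalyzer tokenizes the sample documents: stop words like "is" and "a" are dropped, "texts" stems to "text", and "city" stems to "citi". A small hedged debugging sketch (not in the Anserini sources) that dumps the analyzed terms of doc1 makes this visible.

// Hypothetical helper: print the terms EnglishAnalyzer produces for doc1's text.
try (Analyzer analyzer = new EnglishAnalyzer();
     TokenStream ts = analyzer.tokenStream("contents",
         "here is some text here is some more text. city.")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.print(term + " ");
    }
    ts.end();
    // Expected output is roughly: here some text here some more text citi
    // ("is" is a stop word; "city" is stemmed to "citi").
}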