use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jackrabbit-oak by apache.
the class IndexWriterUtils method getIndexWriterConfig.
public static IndexWriterConfig getIndexWriterConfig(IndexDefinition definition, boolean remoteDir) {
    // FIXME: Hack needed to make Lucene work in an OSGi environment
    Thread thread = Thread.currentThread();
    ClassLoader loader = thread.getContextClassLoader();
    thread.setContextClassLoader(IndexWriterConfig.class.getClassLoader());
    try {
        Analyzer definitionAnalyzer = definition.getAnalyzer();
        Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
        analyzers.put(FieldNames.SPELLCHECK, new ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
        if (!definition.isSuggestAnalyzed()) {
            analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
        }
        Analyzer analyzer = new PerFieldAnalyzerWrapper(definitionAnalyzer, analyzers);
        IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
        if (remoteDir) {
            config.setMergeScheduler(new SerialMergeScheduler());
        }
        if (definition.getCodec() != null) {
            config.setCodec(definition.getCodec());
        }
        return config;
    } finally {
        thread.setContextClassLoader(loader);
    }
}
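For context, a minimal sketch of how the returned config might be consumed. The openWriter helper, the index path, and the Lucene 4.x FSDirectory.open(File) call are assumptions for illustration, not oak code:
import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

static IndexWriter openWriter(IndexDefinition definition) throws IOException {
    Directory dir = FSDirectory.open(new File("/tmp/oak-index")); // hypothetical local index path
    IndexWriterConfig config = IndexWriterUtils.getIndexWriterConfig(definition, false); // false = local directory, default merge scheduler
    return new IndexWriter(dir, config);
}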
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project HongsCORE by ihongs.
the class LuceneRecord method getAnalyzer.
//** Low-level utilities **/
/**
 * Analyzer used for storage (indexing)
 * @return
 * @throws HongsException
 */
protected Analyzer getAnalyzer() throws HongsException {
    Map<String, Analyzer> az = new HashMap<>();
    Map<String, Map> fields = getFields();
    Analyzer ad = new StandardAnalyzer();
    for (Object ot : fields.entrySet()) {
        Map.Entry et = (Map.Entry) ot;
        Map m = (Map) et.getValue();
        String n = (String) et.getKey();
        String t = datatype(m);
        if ("search".equals(t)) {
            // full-text ("search") fields get their own analyzer;
            // all other fields fall back to the default StandardAnalyzer
            az.put(n, getAnalyzer(m));
        }
    }
    return new PerFieldAnalyzerWrapper(ad, az);
}
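The wrapper above uses a dedicated analyzer only for fields whose datatype is "search"; every other field falls back to the default StandardAnalyzer. A self-contained sketch of that per-field fallback, with hypothetical field names and the no-argument analyzer constructors of Lucene 5+:
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PerFieldFallbackDemo {
    public static void main(String[] args) throws IOException {
        Map<String, Analyzer> perField = new HashMap<>();
        perField.put("tags", new WhitespaceAnalyzer()); // field-specific analyzer
        Analyzer analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), perField);
        print(analyzer, "tags", "Foo-Bar baz"); // whitespace rules: [Foo-Bar] [baz]
        print(analyzer, "body", "Foo-Bar baz"); // fallback StandardAnalyzer: [foo] [bar] [baz]
    }

    private static void print(Analyzer analyzer, String field, String text) throws IOException {
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print("[" + term.toString() + "] ");
            }
            ts.end();
            System.out.println();
        }
    }
}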
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project epadd by ePADD.
the class Indexer method newAnalyzer.
/**
 * main entry point for indexing. note: recomputeCards has to be called
 * separately
 */
/*
 * void processDocumentCollection(List<MultiDoc> mDocs, List<Document> docs, BlobStore blobStore) throws Exception {
 *     log.info("Processing " + docs.size() + " documents");
 *     try {
 *         indexDocumentCollection(mDocs, docs, blobStore);
 *     } catch (OutOfMemoryError oome) {
 *         log.error("Sorry, out of memory, results may be incomplete!");
 *         clear();
 *     }
 * }
 *
 * /** preprocessed and indexes the docs.
 */
/*
 * private void indexDocumentCollection(List<MultiDoc> mDocs, List<Document> allDocs, BlobStore blobStore) throws Exception {
 *     this.clear();
 *     currentJobStartTimeMillis = System.currentTimeMillis();
 *     currentJobDocsetSize = allDocs.size();
 *     currentJobDocsProcessed = currentJobErrors = 0;
 *
 *     System.gc();
 *     String stat1 = "Memory status before indexing " + allDocs.size() + " documents: " + Util.getMemoryStats();
 *     log.info(stat1);
 *     docClusters = mDocs;
 *
 *     if (io.do_NER)
 *         NER.printAllTypes();
 *
 *     computeClusterStats(mDocs);
 *     log.info("Indexing " + allDocs.size() + " documents in " + docClusters.size() + " clusters");
 *     int clusterCount = -1;
 *     int docsIndexed = 0, multiDocsIndexed = 0;
 *     Posting.nPostingsAllocated = 0;
 *     docClusters = mDocs;
 *
 *     try {
 *         for (MultiDoc md : docClusters) {
 *             clusterCount++;
 *             log.info("-----------------------------");
 *             log.info("Indexing " + md.docs.size() + " documents in document cluster #" + clusterCount + ": " + md.description);
 *
 *             for (Document d : md.docs) {
 *                 if (cancel)
 *                     throw new CancelledException();
 *
 *                 String contents = "";
 *                 if (!io.ignoreDocumentBody) {
 *                     try {
 *                         contents = d.getContents();
 *                     } catch (Exception e) {
 *                         markDataError("Exception trying to read " + d + ": " + e);
 *                     }
 *                 }
 *
 *                 if (contents.length() > MAX_DOCUMENT_SIZE) {
 *                     markDataError("Document too long, size " + Util.commatize(contents.length()) + " bytes, dropping it. Begins with: " + d + Util.ellipsize(contents, 80));
 *                     contents = "";
 *                 }
 *
 *                 String subject = d.getSubjectWithoutTitle();
 *                 subject = EmailUtils.cleanupSubjectLine(subject);
 *
 *                 indexSubdoc(subject, contents, d, blobStore);
 *
 *                 docsIndexed++;
 *                 currentJobDocsProcessed++;
 *             } // end cluster
 *
 *             log.info("Finished indexing multi doc " + md);
 *             if (md.docs.size() > 0)
 *                 log.info("Current stats:" + computeStats());
 *
 *             multiDocsIndexed++;
 *             // IndexUtils.dumpDocument(clusterPrefix, clusterText); // i don't think we need to do this except for debugging
 *             System.out.toString("."); // goes to console, that's ok...
 *
 *             if (md.docs.size() > 0) {
 *                 String stat2 = ("Memory status after indexing " + docsIndexed + " of " + allDocs.size() + " documents in " + multiDocsIndexed
 *                         + " (non-zero) multi-docs, total text length " + stats.processedTextLength + " chars, " + stats.nProcessedNames + " names. " + Util.getMemoryStats());
 *                 log.info(stat2);
 *             }
 *         }
 *     } catch (OutOfMemoryError oome) {
 *         String s = "REAL WARNING! SEVERE WARNING! Out of memory during indexing. Please retry with more memory!" + oome;
 *         s += "\n";
 *         log.error(s);
 *         // option: heroically soldier on and try to work with partial results
 *     }
 *
 *     // imp: do this at the end to save memory. doesn't save memory during indexing but saves mem later, when the index is being used.
 *     // esp. important for lens.
 *     NER.release_classifier(); // release memory for classifier
 *     log.info("Memory status after releasing classifier: " + Util.getMemoryStats());
 *     packIndex();
 *
 *     return;
 * }
 */
private Analyzer newAnalyzer() {
    // we can use LimitTokenCountAnalyzer to limit the #tokens
    EnglishAnalyzer stemmingAnalyzer = new EnglishAnalyzer(MUSE_STOP_WORDS_SET);
    EnglishNumberAnalyzer snAnalyzer = new EnglishNumberAnalyzer(MUSE_STOP_WORDS_SET);
    // these are the 3 fields for stemming, everything else uses StandardAnalyzer
    Map<String, Analyzer> map = new LinkedHashMap<>();
    map.put("body", snAnalyzer);
    map.put("title", snAnalyzer);
    map.put("body_original", stemmingAnalyzer);
    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    // actually these do not need any real analyzer, they are just stored opaquely
    map.put("docId", keywordAnalyzer);
    map.put("names_offsets", keywordAnalyzer);
    // body redacted contains only names and a lot of dots, hence it requires special handling.
    // if (ModeConfig.isPublicMode()) {
    //     map.put("body", new Analyzer() {
    //         @Override
    //         protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    //             Version matchVersion = Indexer.LUCENE_VERSION;
    //             final CICTokenizer source = new StandardNumberTokenizer(matchVersion, reader);
    //             TokenStream result = new LowerCaseFilter(matchVersion, source);
    //             return new TokenStreamComponents(source, result);
    //         }
    //     });
    // }
    // do not remove any stop words.
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    return new PerFieldAnalyzerWrapper(standardAnalyzer, map);
}
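Since the wrapper keys on field names, the same instance should normally be reused at query time so that each field's query terms pass through the analyzer it was indexed with. A hedged sketch; the parseQuery helper, the default field choice, and the two-argument QueryParser constructor of Lucene 5+ are assumptions, not ePADD code:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

private Query parseQuery(String queryString) throws ParseException {
    Analyzer analyzer = newAnalyzer(); // the per-field wrapper built above
    QueryParser parser = new QueryParser("body", analyzer); // hypothetical default field
    return parser.parse(queryString);
}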
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jackrabbit-oak by apache.
the class IndexDefinition method createAnalyzer.
// ~---------------------------------------------------< Analyzer >
private Analyzer createAnalyzer() {
    Analyzer result;
    Analyzer defaultAnalyzer = LuceneIndexConstants.ANALYZER;
    if (analyzers.containsKey(LuceneIndexConstants.ANL_DEFAULT)) {
        defaultAnalyzer = analyzers.get(LuceneIndexConstants.ANL_DEFAULT);
    }
    if (!evaluatePathRestrictions()) {
        result = defaultAnalyzer;
    } else {
        Map<String, Analyzer> analyzerMap = ImmutableMap.<String, Analyzer>builder()
                .put(FieldNames.ANCESTORS,
                        new TokenizerChain(new PathHierarchyTokenizerFactory(Collections.<String, String>emptyMap())))
                .build();
        result = new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerMap);
    }
    // A negative value means no limit is applied
    if (maxFieldLength < 0) {
        return result;
    }
    return new LimitTokenCountAnalyzer(result, maxFieldLength);
}
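The final LimitTokenCountAnalyzer wrapping silently drops every token beyond maxFieldLength. A standalone sketch of that behaviour; it assumes the no-argument StandardAnalyzer of Lucene 5+ (oak-lucene's Lucene 4.x constructor takes a Version argument):
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

static int countTokens() throws IOException {
    Analyzer limited = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 2); // keep at most 2 tokens per field
    int count = 0;
    try (TokenStream ts = limited.tokenStream("text", "one two three four")) {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
        }
        ts.end();
    }
    return count; // 2 -- "three" and "four" are never emitted
}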
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project ansj_seg by NLPchina.
the class IndexAndTest method test.
@Test
public void test() throws Exception {
    DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    Directory directory = null;
    IndexWriter iwriter = null;
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    String text = "旅游和服务是最好的"; // test sentence: "tourism and service are the best"
    System.out.println(IndexAnalysis.parse(text));
    // build an in-memory index
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);
    addContent(iwriter, text);
    iwriter.commit();
    iwriter.close();
    System.out.println("索引建立完毕"); // "index build finished"
    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
    System.out.println("index ok to search!");
    for (Term t : IndexAnalysis.parse(text)) {
        System.out.println(t.getName());
        search(queryAnalyzer, directory, "\"" + t.getName() + "\"");
    }
}
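The addContent and search helpers are not shown on this page. A plausible minimal version of each, using a hypothetical "text" field and the Lucene 5/6 API; this is a sketch, not the project's actual implementation:
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

private void addContent(IndexWriter iwriter, String text) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("text", text, Field.Store.YES)); // analyzed by the PerFieldAnalyzerWrapper above
    iwriter.addDocument(doc);
}

private void search(Analyzer queryAnalyzer, Directory directory, String queryStr) throws Exception {
    try (IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new QueryParser("text", queryAnalyzer).parse(queryStr);
        TopDocs hits = searcher.search(query, 10);
        System.out.println(queryStr + " -> " + hits.totalHits + " hit(s)");
        for (ScoreDoc sd : hits.scoreDocs) {
            System.out.println(searcher.doc(sd.doc).get("text"));
        }
    }
}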