Example 6 with PerFieldAnalyzerWrapper

Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jackrabbit-oak by apache.

From class IndexWriterUtils, method getIndexWriterConfig:

public static IndexWriterConfig getIndexWriterConfig(IndexDefinition definition, boolean remoteDir) {
    // FIXME: Hack needed to make Lucene work in an OSGi environment
    Thread thread = Thread.currentThread();
    ClassLoader loader = thread.getContextClassLoader();
    thread.setContextClassLoader(IndexWriterConfig.class.getClassLoader());
    try {
        Analyzer definitionAnalyzer = definition.getAnalyzer();
        Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
        // spellcheck terms are indexed as word shingles of up to 3 words
        analyzers.put(FieldNames.SPELLCHECK, new ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
        if (!definition.isSuggestAnalyzed()) {
            analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
        }
        // every other field falls back to the analyzer from the index definition
        Analyzer analyzer = new PerFieldAnalyzerWrapper(definitionAnalyzer, analyzers);
        IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
        if (remoteDir) {
            config.setMergeScheduler(new SerialMergeScheduler());
        }
        if (definition.getCodec() != null) {
            config.setCodec(definition.getCodec());
        }
        return config;
    } finally {
        thread.setContextClassLoader(loader);
    }
}
Also used : SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) HashMap(java.util.HashMap) Analyzer(org.apache.lucene.analysis.Analyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) ShingleAnalyzerWrapper(org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)
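
The routing is easy to see with stock Lucene classes alone. A minimal sketch (the literal field name ":spellcheck" stands in for FieldNames.SPELLCHECK and is an assumption here; it also assumes the Lucene 5+ no-arg StandardAnalyzer constructor):

Map<String, Analyzer> perField = new HashMap<>();
// hypothetical field name; spellcheck terms become word shingles of up to 3 words
perField.put(":spellcheck", new ShingleAnalyzerWrapper(new StandardAnalyzer(), 3));
// any field not in the map uses the first constructor argument
Analyzer wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), perField);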

Example 7 with PerFieldAnalyzerWrapper

Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project HongsCORE by ihongs.

From class LuceneRecord, method getAnalyzer:

//** Low-level utilities **//
/**
 * Analyzer used when storing (indexing) documents.
 * @return a per-field analyzer for this record's fields
 * @throws HongsException
 */
protected Analyzer getAnalyzer() throws HongsException {
    Map<String, Analyzer> az = new HashMap<>();
    Map<String, Map> fields = getFields();
    Analyzer ad = new StandardAnalyzer();
    for (Map.Entry<String, Map> et : fields.entrySet()) {
        String n = et.getKey();
        Map    m = et.getValue();
        String t = datatype(m);
        // only full-text ("search") fields get their own analyzer;
        // everything else falls back to the StandardAnalyzer default
        if ("search".equals(t)) {
            az.put(n, getAnalyzer(m));
        }
    }
    return new PerFieldAnalyzerWrapper(ad, az);
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) CustomAnalyzer(org.apache.lucene.analysis.custom.CustomAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)
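
To verify which analyzer a field resolves to, tokenize a sample value through the wrapper. A minimal sketch, assuming analyzer holds the wrapper returned by getAnalyzer() and that "title" is a hypothetical field with datatype "search":

try (TokenStream ts = analyzer.tokenStream("title", "Hello Lucene")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // prints whatever the field-specific analyzer emits
        System.out.println(term.toString());
    }
    ts.end();
}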

Example 8 with PerFieldAnalyzerWrapper

Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project epadd by ePADD.

From class Indexer, method newAnalyzer:

private Analyzer newAnalyzer() {
    // we can use LimitTokenCountAnalyzer to limit the #tokens
    EnglishAnalyzer stemmingAnalyzer = new EnglishAnalyzer(MUSE_STOP_WORDS_SET);
    EnglishNumberAnalyzer snAnalyzer = new EnglishNumberAnalyzer(MUSE_STOP_WORDS_SET);
    // these 3 fields are stemmed; every other field falls back to the StandardAnalyzer below
    Map<String, Analyzer> map = new LinkedHashMap<>();
    map.put("body", snAnalyzer);
    map.put("title", snAnalyzer);
    map.put("body_original", stemmingAnalyzer);
    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    // actually these do not need any real analyzer, they are just stored opaquely
    map.put("docId", keywordAnalyzer);
    map.put("names_offsets", keywordAnalyzer);
    // the redacted body contains only names and many dots, so it needs the special handling below (currently disabled).
    // if(ModeConfig.isPublicMode()) {
    // map.put("body", new Analyzer() {
    // @Override
    // protected TokenStreamComponents createComponents(final String fieldName,
    // final Reader reader) {
    // Version matchVersion = Indexer.LUCENE_VERSION;
    // final CICTokenizer source = new StandardNumberTokenizer(matchVersion, reader);
    // TokenStream result = new LowerCaseFilter(matchVersion, source);
    // return new TokenStreamComponents(source, result);
    // }
    // });
    // }
    // do not remove any stop words.
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    return new PerFieldAnalyzerWrapper(standardAnalyzer, map);
}
Also used : KeywordAnalyzer(org.apache.lucene.analysis.core.KeywordAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)
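
Reusing the same wrapper at query time keeps query terms analyzed exactly like the indexed terms. A minimal sketch, assuming the classic QueryParser from the lucene-queryparser module with its Lucene 5+ two-argument constructor (the field name "body" comes from the map above):

Analyzer analyzer = newAnalyzer();
QueryParser parser = new QueryParser("body", analyzer);
// "running" is analyzed the same way it was at index time; parse may throw ParseException
Query q = parser.parse("running dogs");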

Example 9 with PerFieldAnalyzerWrapper

Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jackrabbit-oak by apache.

From class IndexDefinition, method createAnalyzer:

// ~---------------------------------------------------< Analyzer >
private Analyzer createAnalyzer() {
    Analyzer result;
    Analyzer defaultAnalyzer = LuceneIndexConstants.ANALYZER;
    if (analyzers.containsKey(LuceneIndexConstants.ANL_DEFAULT)) {
        defaultAnalyzer = analyzers.get(LuceneIndexConstants.ANL_DEFAULT);
    }
    if (!evaluatePathRestrictions()) {
        result = defaultAnalyzer;
    } else {
        Map<String, Analyzer> analyzerMap = ImmutableMap.<String, Analyzer>builder()
                .put(FieldNames.ANCESTORS,
                        new TokenizerChain(new PathHierarchyTokenizerFactory(Collections.<String, String>emptyMap())))
                .build();
        result = new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerMap);
    }
    // a negative maxFieldLength means no limit is applied
    if (maxFieldLength < 0) {
        return result;
    }
    return new LimitTokenCountAnalyzer(result, maxFieldLength);
}
Also used : LimitTokenCountAnalyzer(org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer) PathHierarchyTokenizerFactory(org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)
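
The point of PathHierarchyTokenizerFactory here is that the ancestors field gets one token per ancestor prefix, so descendant-path restrictions become cheap term lookups. A minimal sketch of the tokenizer's output, assuming the Lucene 6+ API where a Tokenizer is constructed without a Reader:

Tokenizer t = new PathHierarchyTokenizer();
t.setReader(new StringReader("/content/site/page"));
CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
t.reset();
while (t.incrementToken()) {
    System.out.println(term); // /content, then /content/site, then /content/site/page
}
t.end();
t.close();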

Example 10 with PerFieldAnalyzerWrapper

Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project ansj_seg by NLPchina.

From class IndexAndTest, method test:

@Test
public void test() throws Exception {
    DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    String text = "旅游和服务是最好的"; // "tourism and service are the best"
    System.out.println(IndexAnalysis.parse(text));
    // build an in-memory index
    Directory directory = new RAMDirectory();
    IndexWriter iwriter = new IndexWriter(directory, ic);
    addContent(iwriter, text);
    iwriter.commit();
    iwriter.close();
    System.out.println("索引建立完毕"); // "index built"
    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
    System.out.println("index ok to search!");
    for (Term t : IndexAnalysis.parse(text)) {
        System.out.println(t.getName());
        // run a phrase query for each term the analyzer produced
        search(queryAnalyzer, directory, "\"" + t.getName() + "\"");
    }
}
Also used : AnsjAnalyzer(org.ansj.lucene7.AnsjAnalyzer) Term(org.ansj.domain.Term) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) Test(org.junit.Test)
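
The test calls two helpers, addContent and search, that the snippet does not show. A hypothetical reconstruction inferred from the call sites (the field name "text" and the top-10 cutoff are assumptions):

static void addContent(IndexWriter w, String content) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("text", content, Field.Store.YES));
    w.addDocument(doc);
}

static void search(Analyzer a, Directory d, String queryStr) throws Exception {
    try (IndexReader reader = DirectoryReader.open(d)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query q = new QueryParser("text", a).parse(queryStr);
        TopDocs hits = searcher.search(q, 10);
        System.out.println(queryStr + " -> " + hits.totalHits + " hit(s)");
    }
}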

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer) 22
PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) 22
HashMap (java.util.HashMap) 12
RAMDirectory (org.apache.lucene.store.RAMDirectory) 11
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig) 10
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer) 8
Document (org.apache.lucene.document.Document) 8
TextField (org.apache.lucene.document.TextField) 8
IndexWriter (org.apache.lucene.index.IndexWriter) 8
Field (org.apache.lucene.document.Field) 7
Directory (org.apache.lucene.store.Directory) 6
Test (org.junit.Test) 6
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter) 4
Tokenizer (org.apache.lucene.analysis.Tokenizer) 4
WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer) 4
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer) 4
DirectoryReader (org.apache.lucene.index.DirectoryReader) 4
IOException (java.io.IOException) 3
Map (java.util.Map) 3
SKOSAnalyzer (at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer) 2