Search in sources :

Example 26 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

the class SimpleQueryStringBuilder method doToQuery.

/**
 * Builds the Lucene query for this simple query string.
 *
 * <p>Steps: validate that {@code all_fields} and explicit fields are not combined, resolve the
 * target fields and their boosts, pick the analyzer, then parse the query text and apply
 * {@code minimumShouldMatch}.
 *
 * @param context shard context used to resolve field names, analyzers and the default field
 * @return the parsed query, passed through {@code Queries.maybeApplyMinimumShouldMatch}
 * @throws IOException declared by the overridden method
 */
@Override
protected Query doToQuery(QueryShardContext context) throws IOException {
    final boolean allFieldsRequested = Boolean.TRUE.equals(useAllFields);
    if (allFieldsRequested && fieldsAndWeights.isEmpty() == false) {
        throw addValidationError("cannot use [all_fields] parameter in conjunction with [fields]", null);
    }

    // Work on a copy so that the builder's own settings are never modified by a query rewrite.
    Settings parserSettings = new Settings(settings);

    // "All-fields mode" applies when it was requested explicitly, OR when every one of these holds:
    // - the _all field is disabled,
    // - the default field is still "_all",
    // - the request names no fields.
    final boolean allFieldsMode = allFieldsRequested
            || (context.getMapperService().allEnabled() == false
                    && "_all".equals(context.defaultField())
                    && fieldsAndWeights.isEmpty());

    // Field names in the builder may contain wildcards; they are expanded into this map.
    final Map<String, Float> fieldBoosts;
    if (allFieldsMode) {
        fieldBoosts = QueryStringQueryBuilder.allQueryableDefaultFields(context);
        // Default to lenient parsing here so mismatched field types do not raise exceptions;
        // an explicitly configured lenient value still takes precedence.
        if (lenientSet) {
            parserSettings.lenient(settings.lenient());
        } else {
            parserSettings.lenient(true);
        }
    } else {
        fieldBoosts = new TreeMap<>();
        if (fieldsAndWeights.isEmpty()) {
            // No fields given: query the default field with the default boost.
            fieldBoosts.put(resolveIndexName(context.defaultField(), context), AbstractQueryBuilder.DEFAULT_BOOST);
        } else {
            for (Map.Entry<String, Float> requested : fieldsAndWeights.entrySet()) {
                final String nameOrPattern = requested.getKey();
                final Float boost = requested.getValue();
                if (Regex.isSimpleMatchPattern(nameOrPattern) == false) {
                    fieldBoosts.put(resolveIndexName(nameOrPattern, context), boost);
                    continue;
                }
                // Wildcard pattern: every matching index name gets the same boost.
                for (String matched : context.getMapperService().simpleMatchToIndexNames(nameOrPattern)) {
                    fieldBoosts.put(matched, boost);
                }
            }
        }
    }

    // A named analyzer must exist; without a name, use the mapper service's search analyzer.
    final Analyzer queryAnalyzer;
    if (analyzer != null) {
        queryAnalyzer = context.getIndexAnalyzers().get(analyzer);
        if (queryAnalyzer == null) {
            throw new QueryShardException(context, "[" + SimpleQueryStringBuilder.NAME + "] analyzer [" + analyzer + "] not found");
        }
    } else {
        queryAnalyzer = context.getMapperService().searchAnalyzer();
    }

    SimpleQueryParser parser = new SimpleQueryParser(queryAnalyzer, fieldBoosts, flags, parserSettings, context);
    parser.setDefaultOperator(defaultOperator.toBooleanClauseOccur());
    return Queries.maybeApplyMinimumShouldMatch(parser.parse(queryText), minimumShouldMatch);
}
Also used : Query(org.apache.lucene.search.Query) TreeMap(java.util.TreeMap) Analyzer(org.apache.lucene.analysis.Analyzer) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) Map(java.util.Map) Settings(org.elasticsearch.index.query.SimpleQueryParser.Settings)

Example 27 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project ansj_seg by NLPchina.

the class IndexAndTest method test.

/**
 * Indexes one short sentence into an in-memory directory and then runs a phrase search for
 * every term that {@code IndexAnalysis.parse} produces for that sentence.
 *
 * <p>{@code addContent} and {@code search} are helpers defined elsewhere in this class.
 *
 * @throws Exception if indexing or searching fails
 */
@Test
public void test() throws Exception {
    DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    String text = "旅游和服务是最好的";
    System.out.println(IndexAnalysis.parse(text));
    // Build the index in memory.
    Directory directory = new RAMDirectory();
    // try-with-resources closes the writer even when adding or committing fails,
    // so a failed run does not leave the writer open on the directory.
    try (IndexWriter iwriter = new IndexWriter(directory, ic)) {
        addContent(iwriter, text);
        iwriter.commit();
    }
    System.out.println("索引建立完毕");
    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
    System.out.println("index ok to search!");
    // Search for each term as a quoted phrase.
    for (Term t : IndexAnalysis.parse(text)) {
        System.out.println(t.getName());
        search(queryAnalyzer, directory, "\"" + t.getName() + "\"");
    }
}
Also used : AnsjAnalyzer(org.ansj.lucene6.AnsjAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) Term(org.ansj.domain.Term) AnsjAnalyzer(org.ansj.lucene6.AnsjAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Test(org.junit.Test)

Example 28 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project ansj_seg by NLPchina.

the class IndexTest method indexTest.

/**
 * Indexes three sentences into an in-memory directory and then runs a phrase search for
 * "和服" using a {@code dic_ansj} query analyzer.
 *
 * <p>{@code addContent} and {@code search} are helpers defined elsewhere in this class.
 *
 * @throws CorruptIndexException declared for the index operations
 * @throws LockObtainFailedException declared for the index operations
 * @throws IOException if indexing or searching fails
 * @throws ParseException if the search query cannot be parsed
 */
@Test
public void indexTest() throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    // Build the index in memory.
    Directory directory = new RAMDirectory();
    // try-with-resources closes the writer even when adding or committing fails,
    // so a failed run does not leave the writer open on the directory.
    try (IndexWriter iwriter = new IndexWriter(directory, ic)) {
        addContent(iwriter, "助推企业转型升级提供强有力的技术支持和服保障。中心的建成将使青岛的服务器承载能力突破10万台,达到世界一流水平。");
        addContent(iwriter, "涉及民生的部分商品和服务成本监审政策");
        addContent(iwriter, "我穿着和服");
        iwriter.commit();
    }
    System.out.println("索引建立完毕");
    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.dic_ansj);
    System.out.println("index ok to search!");
    search(queryAnalyzer, directory, "\"和服\"");
}
Also used : AnsjAnalyzer(org.ansj.lucene6.AnsjAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) AnsjAnalyzer(org.ansj.lucene6.AnsjAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Test(org.junit.Test)

Example 29 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project ansj_seg by NLPchina.

the class IndexTest method testDic.

/**
 * Tokenizes a long Chinese passage with a {@code dic_ansj} analyzer, prints every term and
 * reports the elapsed wall-clock time in seconds.
 *
 * @throws IOException if the token stream fails
 */
@Test
public void testDic() throws IOException {
    DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");
    Analyzer ca = new AnsjAnalyzer(TYPE.dic_ansj);
    String content = ("\n\n\n\n\n\n\n我从小就不由自主地认为自己长大以后一定得成为一个象我父亲一样的画家, 可能是父母潜移默化的影响。其实我根本不知道作为画家意味着什么,我是否喜欢,最重要的是否适合我,我是否有这个才华。其实人到中年的我还是不确定我最喜欢什么,最想做的是什么?我相信很多人和我一样有同样的烦恼。毕竟不是每个人都能成为作文里的宇航员,科学家和大教授。知道自己适合做什么,喜欢做什么,能做好什么其实是个非常困难的问题。" + "幸运的是,我想我的孩子不会为这个太过烦恼。通过老大,我慢慢发现美国高中的一个重要功能就是帮助学生分析他们的专长和兴趣,从而帮助他们选择大学的专业和未来的职业。我觉得帮助一个未成形的孩子找到她未来成长的方向是个非常重要的过程。" + "美国高中都有专门的职业顾问,通过接触不同的课程,和各种心理,个性,兴趣很多方面的问答来帮助每个学生找到最感兴趣的专业。这样的教育一般是要到高年级才开始, 可老大因为今年上计算机的课程就是研究一个职业走向的软件项目,所以她提前做了这些考试和面试。看来以后这样的教育会慢慢由电脑来测试了。老大带回家了一些试卷,我挑出一些给大家看看。这门课她花了2个多月才做完,这里只是很小的一部分。" + "在测试里有这样的一些问题:" + "你是个喜欢动手的人吗? 你喜欢修东西吗?你喜欢体育运动吗?你喜欢在室外工作吗?你是个喜欢思考的人吗?你喜欢数学和科学课吗?你喜欢一个人工作吗?你对自己的智力自信吗?你的创造能力很强吗?你喜欢艺术,音乐和戏剧吗?  你喜欢自由自在的工作环境吗?你喜欢尝试新的东西吗? 你喜欢帮助别人吗?你喜欢教别人吗?你喜欢和机器和工具打交道吗?你喜欢当领导吗?你喜欢组织活动吗?你什么和数字打交道吗?");
    // NOTE(review): the text is also passed as the first (field-name) argument, as in the
    // original; confirm whether AnsjAnalyzer relies on that before changing it.
    TokenStream ts = ca.tokenStream(content, new StringReader(content));
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    try {
        // Lucene's consumer workflow: reset() before the first incrementToken(), end() after the last.
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ts.getAttribute(CharTermAttribute.class));
        }
        ts.end();
    } finally {
        // Release the stream even if tokenizing fails part-way through.
        ts.close();
    }
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) AnsjAnalyzer(org.ansj.lucene6.AnsjAnalyzer) StringReader(java.io.StringReader) Token(org.apache.lucene.analysis.Token) AnsjAnalyzer(org.ansj.lucene6.AnsjAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Date(java.util.Date) Test(org.junit.Test)

Example 30 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project ansj_seg by NLPchina.

the class IndexTest method testIndex.

/**
 * Tokenizes a long Chinese passage with an {@code index_ansj} analyzer, prints every term and
 * reports the elapsed wall-clock time in seconds.
 *
 * @throws IOException if the token stream fails
 */
@Test
public void testIndex() throws IOException {
    Analyzer ca = new AnsjAnalyzer(TYPE.index_ansj);
    String content = ("\n\n\n\n\n\n\n我从小就不由自主地认为自己长大以后一定得成为一个象我父亲一样的画家, 可能是父母潜移默化的影响。其实我根本不知道作为画家意味着什么,我是否喜欢,最重要的是否适合我,我是否有这个才华。其实人到中年的我还是不确定我最喜欢什么,最想做的是什么?我相信很多人和我一样有同样的烦恼。毕竟不是每个人都能成为作文里的宇航员,科学家和大教授。知道自己适合做什么,喜欢做什么,能做好什么其实是个非常困难的问题。" + "幸运的是,我想我的孩子不会为这个太过烦恼。通过老大,我慢慢发现美国高中的一个重要功能就是帮助学生分析他们的专长和兴趣,从而帮助他们选择大学的专业和未来的职业。我觉得帮助一个未成形的孩子找到她未来成长的方向是个非常重要的过程。" + "美国高中都有专门的职业顾问,通过接触不同的课程,和各种心理,个性,兴趣很多方面的问答来帮助每个学生找到最感兴趣的专业。这样的教育一般是要到高年级才开始, 可老大因为今年上计算机的课程就是研究一个职业走向的软件项目,所以她提前做了这些考试和面试。看来以后这样的教育会慢慢由电脑来测试了。老大带回家了一些试卷,我挑出一些给大家看看。这门课她花了2个多月才做完,这里只是很小的一部分。" + "在测试里有这样的一些问题:" + "你是个喜欢动手的人吗? 你喜欢修东西吗?你喜欢体育运动吗?你喜欢在室外工作吗?你是个喜欢思考的人吗?你喜欢数学和科学课吗?你喜欢一个人工作吗?你对自己的智力自信吗?你的创造能力很强吗?你喜欢艺术,音乐和戏剧吗?  你喜欢自由自在的工作环境吗?你喜欢尝试新的东西吗? 你喜欢帮助别人吗?你喜欢教别人吗?你喜欢和机器和工具打交道吗?你喜欢当领导吗?你喜欢组织活动吗?你什么和数字打交道吗?");
    // NOTE(review): the text is also passed as the first (field-name) argument, as in the
    // original; confirm whether AnsjAnalyzer relies on that before changing it.
    TokenStream ts = ca.tokenStream(content, new StringReader(content));
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    try {
        // Lucene's consumer workflow: reset() before the first incrementToken(), end() after the last.
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ts.getAttribute(CharTermAttribute.class));
        }
        ts.end();
    } finally {
        // Release the stream even if tokenizing fails part-way through.
        ts.close();
    }
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) AnsjAnalyzer(org.ansj.lucene5.AnsjAnalyzer) StringReader(java.io.StringReader) Token(org.apache.lucene.analysis.Token) AnsjAnalyzer(org.ansj.lucene5.AnsjAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Date(java.util.Date) Test(org.junit.Test)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer): 1020; MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 396; Tokenizer (org.apache.lucene.analysis.Tokenizer): 265; MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 228; Document (org.apache.lucene.document.Document): 207; Directory (org.apache.lucene.store.Directory): 192; KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 176; BytesRef (org.apache.lucene.util.BytesRef): 122; Test (org.junit.Test): 119; TokenStream (org.apache.lucene.analysis.TokenStream): 107; RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 92; Term (org.apache.lucene.index.Term): 92; IndexReader (org.apache.lucene.index.IndexReader): 67; InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 65; StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 64; Input (org.apache.lucene.search.suggest.Input): 63; CharArraySet (org.apache.lucene.analysis.CharArraySet): 58; ArrayList (java.util.ArrayList): 57; IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 57; TextField (org.apache.lucene.document.TextField): 55