
Example 21 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

From the class PostingsHighlighter, method highlight:

@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    FieldMapper fieldMapper = highlighterContext.mapper;
    SearchContextHighlight.Field field = highlighterContext.field;
    if (canHighlight(fieldMapper) == false) {
        throw new IllegalArgumentException("the field [" + highlighterContext.fieldName + "] should be indexed with positions and offsets in the postings list to be used with postings highlighter");
    }
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
    if (mapperHighlighterEntry == null) {
        Encoder encoder = field.fieldOptions().encoder().equals("html") ? Encoders.HTML : Encoders.DEFAULT;
        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
        mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
    }
    List<Snippet> snippets = new ArrayList<>();
    int numberOfFragments;
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
        CustomPostingsHighlighter highlighter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            //we use a control char to separate values, which is the only char that the custom break iterator breaks the text on,
            //so we don't lose the distinction between the different values of a field and we get back a snippet per value
            String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
            CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
            highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
            //we are highlighting the whole content, one snippet per value
            numberOfFragments = fieldValues.size();
        } else {
            //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
            String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
            highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, fieldValue, field.fieldOptions().noMatchSize() > 0);
            numberOfFragments = field.fieldOptions().numberOfFragments();
        }
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.fieldType().name(), highlighterContext.query, searcher, hitContext.docId(), numberOfFragments);
        for (Snippet fieldSnippet : fieldSnippets) {
            if (Strings.hasText(fieldSnippet.getText())) {
                snippets.add(fieldSnippet);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
    if (field.fieldOptions().scoreOrdered()) {
        //let's sort the snippets by score if needed
        CollectionUtil.introSort(snippets, new Comparator<Snippet>() {

            @Override
            public int compare(Snippet o1, Snippet o2) {
                return (int) Math.signum(o2.getScore() - o1.getScore());
            }
        });
    }
    String[] fragments = new String[snippets.size()];
    for (int i = 0; i < fragments.length; i++) {
        fragments[i] = snippets.get(i).getText();
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    return null;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) Analyzer(org.apache.lucene.analysis.Analyzer) CustomSeparatorBreakIterator(org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator) CustomPostingsHighlighter(org.apache.lucene.search.postingshighlight.CustomPostingsHighlighter) Encoder(org.apache.lucene.search.highlight.Encoder) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) CustomPassageFormatter(org.apache.lucene.search.postingshighlight.CustomPassageFormatter) Snippet(org.apache.lucene.search.highlight.Snippet) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) FieldMapper(org.elasticsearch.index.mapper.FieldMapper)
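
The guard clause at the top of this method is the defining constraint of the postings highlighter: the field must be indexed with positions and offsets in the postings list. As a minimal, hedged sketch of that requirement outside Elasticsearch (plain Lucene 6.x, which ES 5.x builds on; the field name "body" and the sample text are illustrative), this is how a field gains postings offsets and gets highlighted with Lucene's stock PostingsHighlighter, which the CustomPostingsHighlighter above builds on:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.store.RAMDirectory;

public class PostingsHighlightSketch {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        // The postings highlighter refuses fields without offsets, exactly as the
        // canHighlight() guard above does; store them in the postings list.
        FieldType type = new FieldType(TextField.TYPE_STORED);
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.freeze();

        Document doc = new Document();
        doc.add(new Field("body", "the quick brown fox jumps over the lazy dog", type));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
        Query query = new TermQuery(new Term("body", "fox"));
        TopDocs topDocs = searcher.search(query, 10);

        // One highlighted fragment per hit document.
        PostingsHighlighter highlighter = new PostingsHighlighter();
        String[] fragments = highlighter.highlight("body", query, searcher, topDocs);
        System.out.println(fragments[0]); // ... <b>fox</b> ...
    }
}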

Example 22 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

From the class UnifiedHighlighter, method highlight:

@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    FieldMapper fieldMapper = highlighterContext.mapper;
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
    if (mapperHighlighterEntry == null) {
        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
        mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
    }
    List<Snippet> snippets = new ArrayList<>();
    int numberOfFragments;
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
        fieldValues = fieldValues.stream().map(obj -> {
            if (obj instanceof BytesRef) {
                return fieldMapper.fieldType().valueForDisplay(obj).toString();
            } else {
                return obj;
            }
        }).collect(Collectors.toList());
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        CustomUnifiedHighlighter highlighter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            // we use a control char to separate values, which is the only char that the custom break iterator
            // breaks the text on, so we don't lose the distinction between the different values of a field and we
            // get back a snippet per value
            String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
            org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator = new org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
            // we are highlighting the whole content, one snippet per value
            numberOfFragments = fieldValues.size();
        } else {
            //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
            String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
            BreakIterator bi = getBreakIterator(field);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize());
            numberOfFragments = field.fieldOptions().numberOfFragments();
        }
        if (field.fieldOptions().requireFieldMatch()) {
            final String fieldName = highlighterContext.fieldName;
            highlighter.setFieldMatcher((name) -> fieldName.equals(name));
        } else {
            highlighter.setFieldMatcher((name) -> true);
        }
        Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName, highlighterContext.query, hitContext.docId(), numberOfFragments);
        for (Snippet fieldSnippet : fieldSnippets) {
            if (Strings.hasText(fieldSnippet.getText())) {
                snippets.add(fieldSnippet);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
    if (field.fieldOptions().scoreOrdered()) {
        //let's sort the snippets by score if needed
        CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore()));
    }
    String[] fragments = new String[snippets.size()];
    for (int i = 0; i < fragments.length; i++) {
        fragments[i] = snippets.get(i).getText();
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    return null;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) Analyzer(org.apache.lucene.analysis.Analyzer) BreakIterator(java.text.BreakIterator) Encoder(org.apache.lucene.search.highlight.Encoder) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) CustomPassageFormatter(org.apache.lucene.search.uhighlight.CustomPassageFormatter) BytesRef(org.apache.lucene.util.BytesRef) CustomUnifiedHighlighter(org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter) Snippet(org.apache.lucene.search.highlight.Snippet) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) FieldMapper(org.elasticsearch.index.mapper.FieldMapper)
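
The unified highlighter is more forgiving than the postings highlighter: Lucene's UnifiedHighlighter (which CustomUnifiedHighlighter wraps) can fall back to re-analyzing stored field values with the supplied Analyzer when offsets are not in the index, which is why the Analyzer fetched from the document mapper matters here. A minimal hedged sketch in plain Lucene (6.4+ API; the field name "body" and the sample text are illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.RAMDirectory;

public class UnifiedHighlightSketch {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));

        Document doc = new Document();
        // A plain stored TextField: no offsets in the postings, so the highlighter
        // re-analyzes the stored value with the analyzer we pass it below.
        doc.add(new Field("body", "the quick brown fox", TextField.TYPE_STORED));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
        Query query = new TermQuery(new Term("body", "fox"));
        TopDocs topDocs = searcher.search(query, 10);

        UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
        String[] fragments = highlighter.highlight("body", query, topDocs);
        System.out.println(fragments[0]); // the quick brown <b>fox</b>
    }
}

The production code above layers a CustomSeparatorBreakIterator (or the BreakIterator chosen by getBreakIterator) on top of this to emit one snippet per field value.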

Example 23 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

From the class AbstractTermVectorsTestCase, method indexDocsWithLucene:

protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
    Map<String, Analyzer> mapping = new HashMap<>();
    for (TestFieldSetting field : testDocs[0].fieldSettings) {
        if (field.storedPayloads) {
            mapping.put(field.name, new Analyzer() {

                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new StandardTokenizer();
                    TokenFilter filter = new LowerCaseFilter(tokenizer);
                    filter = new TypeAsPayloadTokenFilter(filter);
                    return new TokenStreamComponents(tokenizer, filter);
                }
            });
        }
    }
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), mapping);
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, conf);
    for (TestDoc doc : testDocs) {
        Document d = new Document();
        d.add(new Field("id", doc.id, StringField.TYPE_STORED));
        for (int i = 0; i < doc.fieldContent.length; i++) {
            FieldType type = new FieldType(TextField.TYPE_STORED);
            TestFieldSetting fieldSetting = doc.fieldSettings[i];
            type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
            type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
            type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
            type.setStoreTermVectors(true);
            type.freeze();
            d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
        }
        writer.updateDocument(new Term("id", doc.id), d);
        writer.commit();
    }
    writer.close();
    return DirectoryReader.open(dir);
}
Also used : HashMap(java.util.HashMap) TypeAsPayloadTokenFilter(org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter) Term(org.apache.lucene.index.Term) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) FieldType(org.apache.lucene.document.FieldType) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
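
The interesting piece above is PerFieldAnalyzerWrapper: it delegates to the fallback StandardAnalyzer except for fields present in the map, which get the payload-producing custom Analyzer. A small hedged sketch of the same routing in isolation (the field names "id" and "body" are made up), printing what each analyzer emits:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PerFieldSketch {
    public static void main(String[] args) throws Exception {
        Map<String, Analyzer> perField = new HashMap<>();
        perField.put("id", new KeywordAnalyzer()); // "id" is kept as a single token
        Analyzer wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), perField);

        for (String field : new String[] { "id", "body" }) {
            try (TokenStream ts = wrapper.tokenStream(field, "Hello World 42")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(field + " -> " + term.toString());
                }
                ts.end();
            }
        }
        // id   -> Hello World 42          (KeywordAnalyzer: one verbatim token)
        // body -> hello, world, 42        (StandardAnalyzer: lowercased word tokens)
    }
}

indexDocsWithLucene uses exactly this mechanism to attach the TypeAsPayloadTokenFilter chain only to fields whose settings request stored payloads.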

Example 24 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

From the class TransportAnalyzeAction, method shardOperation:

@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, ShardId shardId) {
    try {
        final IndexService indexService;
        if (shardId != null) {
            indexService = indicesService.indexServiceSafe(shardId.getIndex());
        } else {
            indexService = null;
        }
        String field = null;
        Analyzer analyzer = null;
        if (request.field() != null) {
            if (indexService == null) {
                throw new IllegalArgumentException("No index provided, and trying to analyzer based on a specific field which requires the index parameter");
            }
            MappedFieldType fieldType = indexService.mapperService().fullName(request.field());
            if (fieldType != null) {
                if (fieldType.tokenized()) {
                    analyzer = fieldType.indexAnalyzer();
                } else if (fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
                    analyzer = ((KeywordFieldMapper.KeywordFieldType) fieldType).normalizer();
                    if (analyzer == null) {
                        // this will be KeywordAnalyzer
                        analyzer = fieldType.indexAnalyzer();
                    }
                } else {
                    throw new IllegalArgumentException("Can't process field [" + request.field() + "], Analysis requests are only supported on tokenized fields");
                }
                field = fieldType.name();
            }
        }
        if (field == null) {
            if (indexService != null) {
                field = indexService.getIndexSettings().getDefaultField();
            } else {
                field = AllFieldMapper.NAME;
            }
        }
        final AnalysisRegistry analysisRegistry = indicesService.getAnalysis();
        return analyze(request, field, analyzer, indexService != null ? indexService.getIndexAnalyzers() : null, analysisRegistry, environment);
    } catch (IOException e) {
        throw new ElasticsearchException("analysis failed", e);
    }
}
Also used : AnalysisRegistry(org.elasticsearch.index.analysis.AnalysisRegistry) KeywordFieldMapper(org.elasticsearch.index.mapper.KeywordFieldMapper) IndexService(org.elasticsearch.index.IndexService) MappedFieldType(org.elasticsearch.index.mapper.MappedFieldType) IOException(java.io.IOException) ElasticsearchException(org.elasticsearch.ElasticsearchException) NamedAnalyzer(org.elasticsearch.index.analysis.NamedAnalyzer) CustomAnalyzer(org.elasticsearch.index.analysis.CustomAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer)
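
This shardOperation backs the _analyze API, resolving the Analyzer either from a concrete field's mapping or from a named analyzer in the registry. A hedged sketch of driving it from the 5.x transport client (it assumes an already-connected client; the index and field names are illustrative, and the builder methods are the ones I believe the 5.x IndicesAdminClient exposes):

import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.client.Client;

public class AnalyzeSketch {
    static void printTokens(Client client) {
        // Analyze against a concrete field: the server picks that field's index
        // analyzer (or the keyword normalizer), as in shardOperation above.
        AnalyzeResponse byField = client.admin().indices()
                .prepareAnalyze("my_index", "Quick Brown Foxes")
                .setField("title")
                .get();
        for (AnalyzeResponse.AnalyzeToken token : byField.getTokens()) {
            System.out.println(token.getTerm() + " @ " + token.getStartOffset());
        }

        // No index or field: the server falls back to a named analyzer
        // resolved through the AnalysisRegistry.
        AnalyzeResponse byName = client.admin().indices()
                .prepareAnalyze("Quick Brown Foxes")
                .setAnalyzer("standard")
                .get();
        byName.getTokens().forEach(t -> System.out.println(t.getTerm()));
    }
}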

Example 25 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

From the class MoreLikeThisQueryBuilder, method doToQuery:

@Override
protected Query doToQuery(QueryShardContext context) throws IOException {
    Item[] likeItems = new Item[this.likeItems.length];
    for (int i = 0; i < likeItems.length; i++) {
        likeItems[i] = new Item(this.likeItems[i]);
    }
    Item[] unlikeItems = new Item[this.unlikeItems.length];
    for (int i = 0; i < unlikeItems.length; i++) {
        unlikeItems[i] = new Item(this.unlikeItems[i]);
    }
    MoreLikeThisQuery mltQuery = new MoreLikeThisQuery();
    // set similarity
    mltQuery.setSimilarity(context.getSearchSimilarity());
    // set query parameters
    mltQuery.setMaxQueryTerms(maxQueryTerms);
    mltQuery.setMinTermFrequency(minTermFreq);
    mltQuery.setMinDocFreq(minDocFreq);
    mltQuery.setMaxDocFreq(maxDocFreq);
    mltQuery.setMinWordLen(minWordLength);
    mltQuery.setMaxWordLen(maxWordLength);
    mltQuery.setMinimumShouldMatch(minimumShouldMatch);
    if (stopWords != null) {
        mltQuery.setStopWords(new HashSet<>(Arrays.asList(stopWords)));
    }
    // sets boost terms
    if (boostTerms != 0) {
        mltQuery.setBoostTerms(true);
        mltQuery.setBoostTermsFactor(boostTerms);
    }
    // set analyzer
    Analyzer analyzerObj = context.getIndexAnalyzers().get(analyzer);
    if (analyzerObj == null) {
        analyzerObj = context.getMapperService().searchAnalyzer();
    }
    mltQuery.setAnalyzer(analyzerObj);
    // set like text fields
    boolean useDefaultField = (fields == null);
    List<String> moreLikeFields = new ArrayList<>();
    if (useDefaultField) {
        moreLikeFields = Collections.singletonList(context.defaultField());
    } else {
        for (String field : fields) {
            MappedFieldType fieldType = context.fieldMapper(field);
            if (fieldType != null && SUPPORTED_FIELD_TYPES.contains(fieldType.getClass()) == false) {
                if (failOnUnsupportedField) {
                    throw new IllegalArgumentException("more_like_this only supports text/keyword fields: [" + field + "]");
                } else {
                    // skip
                    continue;
                }
            }
            moreLikeFields.add(fieldType == null ? field : fieldType.name());
        }
    }
    if (moreLikeFields.isEmpty()) {
        return null;
    }
    mltQuery.setMoreLikeFields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
    // handle like texts
    if (likeTexts.length > 0) {
        mltQuery.setLikeText(likeTexts);
    }
    if (unlikeTexts.length > 0) {
        mltQuery.setUnlikeText(unlikeTexts);
    }
    // handle items
    if (likeItems.length > 0) {
        return handleItems(context, mltQuery, likeItems, unlikeItems, include, moreLikeFields, useDefaultField);
    } else {
        return mltQuery;
    }
}
Also used : MoreLikeThisQuery(org.elasticsearch.common.lucene.search.MoreLikeThisQuery) ArrayList(java.util.ArrayList) MappedFieldType(org.elasticsearch.index.mapper.MappedFieldType) Analyzer(org.apache.lucene.analysis.Analyzer)
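
doToQuery only copies parameters that were set on the builder, so the usual entry point is QueryBuilders. A hedged sketch (5.x API; the index, type, id, and field names are illustrative) of configuring the same knobs that doToQuery transfers onto the Lucene-level MoreLikeThisQuery:

import org.elasticsearch.index.query.MoreLikeThisQueryBuilder;
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
import org.elasticsearch.index.query.QueryBuilders;

public class MltSketch {
    static MoreLikeThisQueryBuilder buildMlt() {
        // "like" input: free text plus a concrete indexed document
        String[] likeTexts = { "distributed search and analytics" };
        Item[] likeItems = { new Item("my_index", "my_type", "1") };

        return QueryBuilders
                .moreLikeThisQuery(new String[] { "title", "body" }, likeTexts, likeItems)
                // each of these ends up in a mltQuery.setXxx(...) call in doToQuery
                .minTermFreq(1)
                .minDocFreq(2)
                .maxQueryTerms(25)
                .analyzer("standard")   // resolved via context.getIndexAnalyzers() above
                .minimumShouldMatch("30%");
    }
}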

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer): 1020 usages
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 396 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 265 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 228 usages
Document (org.apache.lucene.document.Document): 207 usages
Directory (org.apache.lucene.store.Directory): 192 usages
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 176 usages
BytesRef (org.apache.lucene.util.BytesRef): 122 usages
Test (org.junit.Test): 119 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 107 usages
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 92 usages
Term (org.apache.lucene.index.Term): 92 usages
IndexReader (org.apache.lucene.index.IndexReader): 67 usages
InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 65 usages
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 64 usages
Input (org.apache.lucene.search.suggest.Input): 63 usages
CharArraySet (org.apache.lucene.analysis.CharArraySet): 58 usages
ArrayList (java.util.ArrayList): 57 usages
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 57 usages
TextField (org.apache.lucene.document.TextField): 55 usages