Examples with Analyzer - org.apache.lucene.analysis.Analyzer

Example 16 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

the class FingerprintAnalyzerTests method testReusableTokenStream.

/**
 * Runs two different inputs through the same {@link FingerprintAnalyzer} instance, one after
 * the other, and checks that each produces the expected single token.
 *
 * <p>Per the expected values below, that token holds the distinct lower-cased terms of the
 * input in sorted order, joined by the configured separator (a space).
 */
public void testReusableTokenStream() throws Exception {
    final Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);

    // First pass through the analyzer.
    final String[] expectedFirst = { "bar baz foo" };
    assertAnalyzesTo(analyzer, "foo bar baz Baz foo foo FOO. FoO", expectedFirst);

    // Second pass re-uses the very same analyzer instance.
    final String[] expectedSecond = { "123.2 abc xyz" };
    assertAnalyzesTo(analyzer, "xyz XYZ abc 123.2 abc", expectedSecond);
}

Also used : Analyzer(org.apache.lucene.analysis.Analyzer)

Example 17 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

the class FingerprintAnalyzerTests method testFingerprint.

/**
 * Checks the fingerprint produced for an input that mixes letter case, repeated terms and
 * punctuation ({@code @}, {@code $}, {@code .}): the expected result is the single token
 * {@code "bar baz foo"}.
 */
public void testFingerprint() throws Exception {
    final Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
    final String input = "foo bar@baz Baz $ foo foo FOO. FoO";
    final String[] expectedTokens = { "bar baz foo" };
    assertAnalyzesTo(analyzer, input, expectedTokens);
}

Also used : Analyzer(org.apache.lucene.analysis.Analyzer)

Example 18 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

the class MapperQueryParser method getFieldQuerySingle.

/**
 * Builds the query for a single field.
 *
 * <p>Unquoted text longer than one character that starts with {@code >} or {@code <}
 * (optionally followed by {@code =} when at least one more character follows) is turned into
 * a one-sided range query. Everything else is resolved against the field mapping: the analyzer
 * is switched for the duration of the call and always restored afterwards.
 *
 * @param field     name of the field as written in the query
 * @param queryText text to search for
 * @param quoted    whether the text was quoted in the query string
 * @return the resulting query, or {@code null} when a non-tokenized field rejects the text
 *         and the settings are lenient
 * @throws ParseException declared for the delegated range and field query builders
 */
private Query getFieldQuerySingle(String field, String queryText, boolean quoted) throws ParseException {
    if (quoted == false && queryText.length() > 1) {
        final char operator = queryText.charAt(0);
        // "=" only counts as part of the operator when a value follows it
        final boolean orEqual = queryText.length() > 2 && queryText.charAt(1) == '=';
        if (operator == '>') {
            if (orEqual) {
                return getRangeQuerySingle(field, queryText.substring(2), null, true, true, context);
            }
            return getRangeQuerySingle(field, queryText.substring(1), null, false, true, context);
        }
        if (operator == '<') {
            if (orEqual) {
                return getRangeQuerySingle(field, null, queryText.substring(2), true, true, context);
            }
            return getRangeQuerySingle(field, null, queryText.substring(1), true, false, context);
        }
    }

    currentFieldType = null;
    final Analyzer previousAnalyzer = getAnalyzer();
    try {
        if (quoted) {
            setAnalyzer(settings.quoteAnalyzer());
            // a configured suffix selects a dedicated field for quoted text, if one is mapped
            if (settings.quoteFieldSuffix() != null) {
                currentFieldType = context.fieldMapper(field + settings.quoteFieldSuffix());
            }
        }
        if (currentFieldType == null) {
            currentFieldType = context.fieldMapper(field);
        }
        if (currentFieldType == null) {
            // no mapping found: fall back to the parent implementation on the raw field name
            return super.getFieldQuery(field, queryText, quoted);
        }

        // use the field's own analyzer unless the settings force the configured one
        if (quoted) {
            if (settings.forceQuoteAnalyzer() == false) {
                setAnalyzer(context.getSearchQuoteAnalyzer(currentFieldType));
            }
        } else if (settings.forceAnalyzer() == false) {
            setAnalyzer(context.getSearchAnalyzer(currentFieldType));
        }

        Query fieldQuery = null;
        if (currentFieldType.tokenized() == false) {
            // this might be a structured field like a numeric
            try {
                fieldQuery = currentFieldType.termQuery(queryText, context);
            } catch (RuntimeException e) {
                if (settings.lenient()) {
                    return null;
                }
                throw e;
            }
        }
        if (fieldQuery != null) {
            return fieldQuery;
        }
        return super.getFieldQuery(currentFieldType.name(), queryText, quoted);
    } finally {
        setAnalyzer(previousAnalyzer);
    }
}

Also used : Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 19 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

the class CommonTermsQueryBuilder method doToQuery.

/**
 * Converts this builder into a Lucene query.
 *
 * <p>The field name is taken from the mapping when the field is mapped, otherwise the name
 * given to the builder is used as-is. The analyzer is resolved in this order: an explicitly
 * named analyzer (which must exist), the search analyzer of the mapped field, and finally the
 * mapper service's search analyzer.
 *
 * @param context shard-level context used to look up the mapping and analyzers
 * @return the query produced by {@code parseQueryString}
 * @throws IOException declared for {@code parseQueryString}
 * @throws QueryShardException if an analyzer was named explicitly but cannot be found
 */
@Override
protected Query doToQuery(QueryShardContext context) throws IOException {
    final MappedFieldType fieldType = context.fieldMapper(fieldName);
    final String field = fieldType != null ? fieldType.name() : fieldName;

    final Analyzer analyzerObj;
    if (analyzer != null) {
        // an explicitly requested analyzer has to be registered
        analyzerObj = context.getMapperService().getIndexAnalyzers().get(analyzer);
        if (analyzerObj == null) {
            throw new QueryShardException(context, "[common] analyzer [" + analyzer + "] not found");
        }
    } else if (fieldType != null) {
        analyzerObj = context.getSearchAnalyzer(fieldType);
    } else {
        analyzerObj = context.getMapperService().searchAnalyzer();
    }

    final ExtendedCommonTermsQuery commonsQuery = new ExtendedCommonTermsQuery(
        highFreqOperator.toBooleanClauseOccur(),
        lowFreqOperator.toBooleanClauseOccur(),
        cutoffFrequency,
        disableCoord,
        fieldType);
    return parseQueryString(commonsQuery, text, field, analyzerObj, lowFreqMinimumShouldMatch, highFreqMinimumShouldMatch);
}

Also used : ExtendedCommonTermsQuery(org.apache.lucene.queries.ExtendedCommonTermsQuery) MappedFieldType(org.elasticsearch.index.mapper.MappedFieldType) Occur(org.apache.lucene.search.BooleanClause.Occur) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 20 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch by elastic.

the class PlainHighlighter method highlight.

/**
 * Highlights a single field of a single hit using the Lucene plain highlighter.
 *
 * <p>A Lucene {@code Highlighter} is built once per {@link FieldMapper} and kept in the hit
 * context cache under {@code CACHE_KEY}. Every loaded field value is re-analyzed with the
 * document mapper's index analyzer and the fragments with a positive score are collected.
 * When nothing matched and {@code noMatchSize} is positive, an excerpt from the start of the
 * first value is returned instead.
 *
 * @param highlighterContext carries the field options, search context, hit context, mapper and query
 * @return the highlighted fragments, a no-match excerpt, or {@code null} when there is nothing
 *         to return (including when analysis fails with a
 *         {@code BytesRefHash.MaxBytesLengthExceededException})
 * @throws IllegalArgumentException if the configured fragmenter name is not recognised
 * @throws FetchPhaseExecutionException if loading or analyzing the field values fails for any other reason
 */
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    FieldMapper mapper = highlighterContext.mapper;
    // constant-first comparison (as for the fragmenter names below) so an unset encoder selects the default
    Encoder encoder = "html".equals(field.fieldOptions().encoder()) ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>();
        hitContext.cache().put(CACHE_KEY, mappers);
    }
    @SuppressWarnings("unchecked") Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY);
    org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
    if (entry == null) {
        // restrict scoring to this field only when a field match is required
        QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query, field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null);
        queryScorer.setExpandMultiTermQuery(true);
        Fragmenter fragmenter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            fragmenter = new NullFragmenter();
        } else if (field.fieldOptions().fragmenter() == null) {
            // no fragmenter configured: same choice as "span"
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else if ("simple".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
        } else if ("span".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else {
            throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]");
        }
        Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]);
        entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
        entry.setTextFragmenter(fragmenter);
        // always highlight across all data
        entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
        cache.put(mapper, entry);
    }
    // a HACK to make highlighter do highlighting, even though its using the single frag list builder
    int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments();
    List<TextFragment> fragsList = new ArrayList<>();
    List<Object> textsToHighlight;
    Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
    try {
        textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
        for (Object textToHighlight : textsToHighlight) {
            String text;
            if (textToHighlight instanceof BytesRef) {
                text = mapper.fieldType().valueForDisplay(textToHighlight).toString();
            } else {
                text = textToHighlight.toString();
            }
            try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                    // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                    continue;
                }
                TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
                for (TextFragment bestTextFragment : bestTextFragments) {
                    if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                        fragsList.add(bestTextFragment);
                    }
                }
            }
        }
    } catch (Exception e) {
        if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) {
            // the plain highlighter will parse the source and try to analyze it.
            return null;
        } else {
            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
    }
    if (field.fieldOptions().scoreOrdered()) {
        CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {

            @Override
            public int compare(TextFragment o1, TextFragment o2) {
                // Highest score first. Compare the scores directly: rounding the difference
                // would report scores less than 0.5 apart as equal and leave them unsorted.
                return Float.compare(o2.getScore(), o1.getScore());
            }
        });
    }
    String[] fragments;
    // number_of_fragments is set to 0 but we have a multivalued field
    if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
        fragments = new String[fragsList.size()];
        for (int i = 0; i < fragsList.size(); i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    } else {
        // refine numberOfFragments if needed
        numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
        fragments = new String[numberOfFragments];
        for (int i = 0; i < fragments.length; i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
    if (noMatchSize > 0 && textsToHighlight.size() > 0) {
        // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
        String fieldContents = textsToHighlight.get(0).toString();
        int end;
        try {
            end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents);
        } catch (Exception e) {
            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
        if (end > 0) {
            return new HighlightField(highlighterContext.fieldName, new Text[] { new Text(fieldContents.substring(0, end)) });
        }
    }
    return null;
}

Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) Formatter(org.apache.lucene.search.highlight.Formatter) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) TextFragment(org.apache.lucene.search.highlight.TextFragment) Analyzer(org.apache.lucene.analysis.Analyzer) SimpleFragmenter(org.apache.lucene.search.highlight.SimpleFragmenter) Encoder(org.apache.lucene.search.highlight.Encoder) SimpleFragmenter(org.apache.lucene.search.highlight.SimpleFragmenter) Fragmenter(org.apache.lucene.search.highlight.Fragmenter) SimpleSpanFragmenter(org.apache.lucene.search.highlight.SimpleSpanFragmenter) NullFragmenter(org.apache.lucene.search.highlight.NullFragmenter) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) BytesRefHash(org.apache.lucene.util.BytesRefHash) BytesRef(org.apache.lucene.util.BytesRef) SimpleSpanFragmenter(org.apache.lucene.search.highlight.SimpleSpanFragmenter) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) Text(org.elasticsearch.common.text.Text) NullFragmenter(org.apache.lucene.search.highlight.NullFragmenter) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) FieldMapper(org.elasticsearch.index.mapper.FieldMapper) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer): 1020, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 396, Tokenizer (org.apache.lucene.analysis.Tokenizer): 265, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 228, Document (org.apache.lucene.document.Document): 207, Directory (org.apache.lucene.store.Directory): 192, KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 176, BytesRef (org.apache.lucene.util.BytesRef): 122, Test (org.junit.Test): 119, TokenStream (org.apache.lucene.analysis.TokenStream): 107, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 92, Term (org.apache.lucene.index.Term): 92, IndexReader (org.apache.lucene.index.IndexReader): 67, InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 65, StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 64, Input (org.apache.lucene.search.suggest.Input): 63, CharArraySet (org.apache.lucene.analysis.CharArraySet): 58, ArrayList (java.util.ArrayList): 57, IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 57, TextField (org.apache.lucene.document.TextField): 55