Search in sources :

Example 16 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.

the class MockRepeatAnalyzer method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream repeatFilter = new MockRepeatFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, repeatFilter);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer)

Example 17 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.

the class MapperQueryParser method getPossiblyAnalyzedPrefixQuery.

private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!settings.analyzeWildcard()) {
        return super.getPrefixQuery(field, termStr);
    }
    List<List<String>> tlist;
    // get Analyzer from superclass and tokenize the term
    TokenStream source = null;
    try {
        try {
            source = getAnalyzer().tokenStream(field, termStr);
            source.reset();
        } catch (IOException e) {
            return super.getPrefixQuery(field, termStr);
        }
        tlist = new ArrayList<>();
        List<String> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);
        while (true) {
            try {
                if (!source.incrementToken())
                    break;
            } catch (IOException e) {
                break;
            }
            if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                tlist.add(currentPos);
                currentPos = new ArrayList<>();
            }
            currentPos.add(termAtt.toString());
        }
        if (currentPos.isEmpty() == false) {
            tlist.add(currentPos);
        }
    } finally {
        if (source != null) {
            IOUtils.closeWhileHandlingException(source);
        }
    }
    if (tlist.size() == 0) {
        return null;
    }
    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0).get(0));
    }
    // build a boolean query with prefix on the last position only.
    List<BooleanClause> clauses = new ArrayList<>();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<String> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = super.getPrefixQuery(field, plist.get(0));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            List<BooleanClause> innerClauses = new ArrayList<>();
            for (String token : plist) {
                innerClauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
            }
            posQuery = getBooleanQueryCoordDisabled(innerClauses);
        }
        clauses.add(new BooleanClause(posQuery, getDefaultOperator() == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD));
    }
    return getBooleanQuery(clauses);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BooleanClause(org.apache.lucene.search.BooleanClause) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) List(java.util.List)

Example 18 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.

the class XAnalyzingSuggester method toFiniteStrings.

// EDIT: Adrien, needed by lookup providers
// NOTE: these XForks are unmaintainable, we need to get rid of them...
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
    final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    Automaton automaton;
    try (TokenStream ts = stream) {
        automaton = toAutomaton(ts, ts2a);
    }
    LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    Set<IntsRef> set = new HashSet<>();
    for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) {
        set.add(IntsRef.deepCopyOf(string));
    }
    return Collections.unmodifiableSet(set);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRef(org.apache.lucene.util.IntsRef) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) HashSet(java.util.HashSet)

Example 19 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.

the class MoreLikeThisQuery method handleUnlike.

private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    Set<Term> skipTerms = new HashSet<>();
    // handle like text
    if (unlikeText != null) {
        for (String text : unlikeText) {
            // only use the first field to be consistent
            String fieldName = moreLikeFields[0];
            try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    skipTerms.add(new Term(fieldName, termAtt.toString()));
                }
                ts.end();
            }
        }
    }
    // handle like fields
    if (unlikeFields != null) {
        for (Fields fields : unlikeFields) {
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                final TermsEnum termsEnum = terms.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    skipTerms.add(new Term(fieldName, text.utf8ToString()));
                }
            }
        }
    }
    if (!skipTerms.isEmpty()) {
        mlt.setSkipTerms(skipTerms);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 20 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr-analysis-turkish by iorixxx.

the class TestTurkishDeASCIIfyFilter method testDeAscii.

public void testDeAscii() throws Exception {
    TokenStream stream = whitespaceMockTokenizer("kus fadil akgunduz dogalgaz ahmet");
    stream = new TurkishDeASCIIfyFilter(stream, false);
    assertTokenStreamContents(stream, new String[] { "kuş", "fadıl", "akgündüz", "doğalgaz", "ahmet" });
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TurkishDeASCIIfyFilter(org.apache.lucene.analysis.tr.TurkishDeASCIIfyFilter)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream)848 StringReader (java.io.StringReader)336 Tokenizer (org.apache.lucene.analysis.Tokenizer)244 Reader (java.io.Reader)175 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)140 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)128 Analyzer (org.apache.lucene.analysis.Analyzer)121 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)94 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)88 IOException (java.io.IOException)85 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)73 Term (org.apache.lucene.index.Term)66 Document (org.apache.lucene.document.Document)64 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)59 ArrayList (java.util.ArrayList)58 StopFilter (org.apache.lucene.analysis.StopFilter)58 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)57 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)53 Test (org.junit.Test)53 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)46