Search in sources :

Example 6 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

the class CompoundAnalysisTests method analyze.

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {

        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
        }
    }));
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    TokenStream stream = AllTokenStream.allTokenStream("_all", text, 1.0f, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}
Also used : AllTokenStream(org.elasticsearch.common.lucene.all.AllTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) IndexSettings(org.elasticsearch.index.IndexSettings) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) AllEntries(org.elasticsearch.common.lucene.all.AllEntries) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Environment(org.elasticsearch.env.Environment) AnalysisModule(org.elasticsearch.indices.analysis.AnalysisModule) MyFilterTokenFilterFactory(org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory) AnalysisProvider(org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider) AnalysisPlugin(org.elasticsearch.plugins.AnalysisPlugin)

Example 7 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

the class DocumentFieldMapperTests method assertAnalyzes.

private void assertAnalyzes(Analyzer analyzer, String field, String output) throws IOException {
    try (TokenStream tok = analyzer.tokenStream(field, new StringReader(""))) {
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        assertTrue(tok.incrementToken());
        assertEquals(output, term.toString());
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader)

Example 8 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

the class MapperQueryParser method getPossiblyAnalyzedPrefixQuery.

private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!settings.analyzeWildcard()) {
        return super.getPrefixQuery(field, termStr);
    }
    List<List<String>> tlist;
    // get Analyzer from superclass and tokenize the term
    TokenStream source = null;
    try {
        try {
            source = getAnalyzer().tokenStream(field, termStr);
            source.reset();
        } catch (IOException e) {
            return super.getPrefixQuery(field, termStr);
        }
        tlist = new ArrayList<>();
        List<String> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);
        while (true) {
            try {
                if (!source.incrementToken())
                    break;
            } catch (IOException e) {
                break;
            }
            if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                tlist.add(currentPos);
                currentPos = new ArrayList<>();
            }
            currentPos.add(termAtt.toString());
        }
        if (currentPos.isEmpty() == false) {
            tlist.add(currentPos);
        }
    } finally {
        if (source != null) {
            IOUtils.closeWhileHandlingException(source);
        }
    }
    if (tlist.size() == 0) {
        return null;
    }
    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0).get(0));
    }
    // build a boolean query with prefix on the last position only.
    List<BooleanClause> clauses = new ArrayList<>();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<String> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = super.getPrefixQuery(field, plist.get(0));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            List<BooleanClause> innerClauses = new ArrayList<>();
            for (String token : plist) {
                innerClauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
            }
            posQuery = getBooleanQueryCoordDisabled(innerClauses);
        }
        clauses.add(new BooleanClause(posQuery, getDefaultOperator() == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD));
    }
    return getBooleanQuery(clauses);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BooleanClause(org.apache.lucene.search.BooleanClause) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) List(java.util.List)

Example 9 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

the class MoreLikeThisQuery method handleUnlike.

private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    Set<Term> skipTerms = new HashSet<>();
    // handle like text
    if (unlikeText != null) {
        for (String text : unlikeText) {
            // only use the first field to be consistent
            String fieldName = moreLikeFields[0];
            try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    skipTerms.add(new Term(fieldName, termAtt.toString()));
                }
                ts.end();
            }
        }
    }
    // handle like fields
    if (unlikeFields != null) {
        for (Fields fields : unlikeFields) {
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                final TermsEnum termsEnum = terms.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    skipTerms.add(new Term(fieldName, text.utf8ToString()));
                }
            }
        }
    }
    if (!skipTerms.isEmpty()) {
        mlt.setSkipTerms(skipTerms);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 10 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project OpenGrok by OpenGrok.

the class Summarizer method getTokens.

private SToken[] getTokens(String text) throws IOException {
    //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
    //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter        
    ArrayList<SToken> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("full", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
            result.add(t);
        }
        ts.end();
    }
    return result.toArray(new SToken[result.size()]);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)151 TokenStream (org.apache.lucene.analysis.TokenStream)95 StringReader (java.io.StringReader)46 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)35 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)34 IOException (java.io.IOException)27 ArrayList (java.util.ArrayList)27 Tokenizer (org.apache.lucene.analysis.Tokenizer)25 Analyzer (org.apache.lucene.analysis.Analyzer)20 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)16 BytesRef (org.apache.lucene.util.BytesRef)15 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)13 LinkedList (java.util.LinkedList)11 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)10 Term (org.apache.lucene.index.Term)10 HashMap (java.util.HashMap)9 Token (org.apache.lucene.analysis.Token)8 Document (org.apache.lucene.document.Document)8 List (java.util.List)7 HashSet (java.util.HashSet)6