Examples with TokenStream - org.apache.lucene.analysis.TokenStream

Example 26 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class SpellCheckComponent method getTokens.

private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<>();
    assert analyzer != null;
    try (TokenStream ts = analyzer.tokenStream("", q)) {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    }
}

Example 27 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class QueryElevationComponent method getAnalyzedQuery.

String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    try (TokenStream tokens = analyzer.tokenStream("", query)) {
        tokens.reset();
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end();
        return norm.toString();
    }
}

Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)

Example 28 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class SimplePreAnalyzedParser method toFormattedString.

@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            // remove the last comma
                            tok.setLength(tok.length() - 1);
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}

Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Attribute(org.apache.lucene.util.Attribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Example 29 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TextField method analyzeMultiTerm.

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "analyzer returned no terms for multiTerm term: " + part);
        BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return bytes;
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    }
}

Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) IOException(java.io.IOException) SolrException(org.apache.solr.common.SolrException) BytesRef(org.apache.lucene.util.BytesRef)

Example 30 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project sukija by ahomansikka.

the class AppTest method test.

private boolean test(String input, String expectedOutput) throws IOException {
    Reader r = new StringReader(input);
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(r);
    t = new VoikkoFilter(t, voikko);
    t.reset();
    VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    while (t.incrementToken()) {
        System.out.println("AppTest " + termAtt.toString());
        for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
            System.out.println(sukijaAtt.getAnalysis(i).get("BASEFORM"));
        //        VoikkoUtils.printAnalysisResult (sukijaAtt.getAnalysis(i), System.out);
        }
        System.out.println("");
    }
    return true;
}

Also used : HVTokenizer(peltomaa.sukija.finnish.HVTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) VoikkoFilter(peltomaa.sukija.voikko.VoikkoFilter) StringReader(java.io.StringReader) Reader(java.io.Reader) StringReader(java.io.StringReader) VoikkoAttribute(peltomaa.sukija.attributes.VoikkoAttribute) Tokenizer(org.apache.lucene.analysis.Tokenizer) HVTokenizer(peltomaa.sukija.finnish.HVTokenizer)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream)848 StringReader (java.io.StringReader)336 Tokenizer (org.apache.lucene.analysis.Tokenizer)244 Reader (java.io.Reader)175 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)140 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)128 Analyzer (org.apache.lucene.analysis.Analyzer)121 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)94 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)88 IOException (java.io.IOException)85 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)73 Term (org.apache.lucene.index.Term)66 Document (org.apache.lucene.document.Document)64 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)59 ArrayList (java.util.ArrayList)58 StopFilter (org.apache.lucene.analysis.StopFilter)58 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)57 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)53 Test (org.junit.Test)53 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)46