
Example 1 with TypeAttribute

Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in the project elasticsearch by elastic.

From the class TransportAnalyzeAction, method simpleAnalyze:

private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);
            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), null));
            }
            stream.end();
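            // carry offsets and positions forward to the next value in request.text(),
            // adding the analyzer's position increment gap and offset gap between values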
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();
            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}
Also used: PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute), TokenStream (org.apache.lucene.analysis.TokenStream), ArrayList (java.util.ArrayList), IOException (java.io.IOException), ElasticsearchException (org.elasticsearch.ElasticsearchException), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
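
For orientation, here is a minimal standalone sketch of the same attribute-consumption pattern, assuming Lucene's StandardAnalyzer is available; the field name and sample text are arbitrary:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TypeAttributeDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "R2D2 met 42 aliens")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            // reset() is mandatory before the first incrementToken() call
            stream.reset();
            while (stream.incrementToken()) {
                // StandardTokenizer emits types such as <ALPHANUM> and <NUM>
                System.out.println(term + " -> " + type.type());
            }
            // end() records end-of-stream state such as the final offset
            stream.end();
        }
    }
}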

Example 2 with TypeAttribute

Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in the project languagetool by languagetool-org.

From the class LanguageToolFilterTest, method displayTokensWithFullDetails:

private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    System.out.println();
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
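
Note that the helper neither resets nor ends the stream, so the caller owns the TokenStream lifecycle. A minimal hypothetical caller, using a plain WhitespaceTokenizer in place of the LanguageTool filter chain exercised by the test (the method name is illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

// hypothetical helper inside LanguageToolFilterTest (displayTokensWithFullDetails is private there)
private static void printWhitespaceTokens(String text) throws IOException {
    try (Tokenizer tokenizer = new WhitespaceTokenizer()) {
        tokenizer.setReader(new StringReader(text));
        // the display helper does not call reset() or end(), so do it here
        tokenizer.reset();
        displayTokensWithFullDetails(tokenizer);
        tokenizer.end();
    }
}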

Example 3 with TypeAttribute

Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in the project lucene-solr by apache.

From the class SpellCheckComponent, method getTokens:

private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<>();
    assert analyzer != null;
    try (TokenStream ts = analyzer.tokenStream("", q)) {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), ArrayList (java.util.ArrayList), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Token (org.apache.lucene.analysis.Token), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 4 with TypeAttribute

Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in the project lucene-solr by apache.

From the class SimplePreAnalyzedParser, method toFormattedString:

@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            // remove the last comma
                            tok.setLength(tok.length() - 1);
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Attribute (org.apache.lucene.util.Attribute), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), BytesRef (org.apache.lucene.util.BytesRef)
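
Tracing the branches above, the serialized form is the version, an optional stored value wrapped in '=' characters, and then space-separated tokens, each written as the term followed by a comma-separated attribute list in the stream's attribute registration order. An illustrative value, assuming VERSION is "1" (a sketch derived from this method rather than quoted from the Solr documentation):

1 =the stored value= hello,s=0,e=5,i=1,y=word world,s=6,e=11,i=1,y=word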

Example 5 with TypeAttribute

Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in the project lucene-solr by apache.

From the class JsonPreAnalyzedParser, method parse:

@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    Object o = ObjectBuilder.fromJSON(val);
    if (!(o instanceof Map)) {
        throw new IOException("Invalid JSON type " + o.getClass().getName() + ", expected Map");
    }
    Map<String, Object> map = (Map<String, Object>) o;
    // check version
    String version = (String) map.get(VERSION_KEY);
    if (version == null) {
        throw new IOException("Missing VERSION key");
    }
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
    }
    if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
        throw new IOException("Field cannot have both stringValue and binaryValue");
    }
    res.str = (String) map.get(STRING_KEY);
    String bin = (String) map.get(BINARY_KEY);
    if (bin != null) {
        byte[] data = Base64.base64ToByteArray(bin);
        res.bin = data;
    }
    List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
    if (tokens == null) {
        return res;
    }
    int tokenStart = 0;
    int tokenEnd = 0;
    parent.clearAttributes();
    for (Object ot : tokens) {
        // automatic increment by 1 separator
        tokenStart = tokenEnd + 1;
        Map<String, Object> tok = (Map<String, Object>) ot;
        boolean hasOffsetStart = false;
        boolean hasOffsetEnd = false;
        int len = -1;
        for (Entry<String, Object> e : tok.entrySet()) {
            String key = e.getKey();
            if (key.equals(TOKEN_KEY)) {
                CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
                String str = String.valueOf(e.getValue());
                catt.append(str);
                len = str.length();
            } else if (key.equals(OFFSET_START_KEY)) {
                Object obj = e.getValue();
                hasOffsetStart = true;
                if (obj instanceof Number) {
                    tokenStart = ((Number) obj).intValue();
                } else {
                    try {
                        tokenStart = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetStart = false;
                    }
                }
            } else if (key.equals(OFFSET_END_KEY)) {
                hasOffsetEnd = true;
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenEnd = ((Number) obj).intValue();
                } else {
                    try {
                        tokenEnd = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetEnd = false;
                    }
                }
            } else if (key.equals(POSINCR_KEY)) {
                Object obj = e.getValue();
                int posIncr = 1;
                if (obj instanceof Number) {
                    posIncr = ((Number) obj).intValue();
                } else {
                    try {
                        posIncr = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
                PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
                patt.setPositionIncrement(posIncr);
            } else if (key.equals(PAYLOAD_KEY)) {
                String str = String.valueOf(e.getValue());
                if (str.length() > 0) {
                    byte[] data = Base64.base64ToByteArray(str);
                    PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
                    if (data != null && data.length > 0) {
                        p.setPayload(new BytesRef(data));
                    }
                }
            } else if (key.equals(FLAGS_KEY)) {
                try {
                    int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
                    FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
                    flags.setFlags(f);
                } catch (NumberFormatException nfe) {
                    LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
                }
            } else if (key.equals(TYPE_KEY)) {
                TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
                tattr.setType(String.valueOf(e.getValue()));
            } else {
                LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
            }
        }
        // handle offset attr
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        if (!hasOffsetEnd && len > -1) {
            tokenEnd = tokenStart + len;
        }
        offset.setOffset(tokenStart, tokenEnd);
        if (!hasOffsetStart) {
            tokenStart = tokenEnd + 1;
        }
        // capture state and add to result
        State state = parent.captureState();
        res.states.add(state.clone());
        // reset for reuse
        parent.clearAttributes();
    }
    return res;
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), ParseResult (org.apache.solr.schema.PreAnalyzedField.ParseResult), IOException (java.io.IOException), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), State (org.apache.lucene.util.AttributeSource.State), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), LinkedList (java.util.LinkedList), List (java.util.List), LinkedHashMap (java.util.LinkedHashMap), Map (java.util.Map), TreeMap (java.util.TreeMap), BytesRef (org.apache.lucene.util.BytesRef)
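
For context, a hypothetical round-trip through this parser, assuming the default key names used by Solr's JsonPreAnalyzedParser ("v" for the version, "str" for the stored value, "tokens" for the token list, and "t", "s", "e", "i", "y" for term, start offset, end offset, position increment, and type), and assuming the ParseResult fields are accessible as shown; treat this as a sketch, not authoritative API usage:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.util.AttributeSource;
import org.apache.solr.schema.JsonPreAnalyzedParser;
import org.apache.solr.schema.PreAnalyzedField.ParseResult;

public class JsonPreAnalyzedDemo {
    public static void main(String[] args) throws IOException {
        // assumed key names; the constants themselves are not shown in the excerpt above
        String json = "{\"v\":\"1\",\"str\":\"hello world\",\"tokens\":["
            + "{\"t\":\"hello\",\"s\":0,\"e\":5,\"i\":1,\"y\":\"word\"},"
            + "{\"t\":\"world\",\"s\":6,\"e\":11,\"i\":1,\"y\":\"word\"}]}";
        ParseResult res = new JsonPreAnalyzedParser()
            .parse(new StringReader(json), new AttributeSource());
        // res.str holds the stored value, res.states one captured attribute state per token
        System.out.println(res.str + " / " + res.states.size() + " token states");
    }
}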

Aggregations

TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 25 uses
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 21 uses
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 17 uses
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 17 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 12 uses
IOException (java.io.IOException): 10 uses
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 10 uses
StringReader (java.io.StringReader): 8 uses
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7 uses
BytesRef (org.apache.lucene.util.BytesRef): 6 uses
ArrayList (java.util.ArrayList): 5 uses
PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl): 5 uses
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 4 uses
LinkedList (java.util.LinkedList): 3 uses
Token (org.apache.lucene.analysis.Token): 3 uses
LinkedHashMap (java.util.LinkedHashMap): 2 uses
Map (java.util.Map): 2 uses
TreeMap (java.util.TreeMap): 2 uses
Term (org.apache.lucene.index.Term): 2 uses
SpanOrQuery (org.apache.lucene.search.spans.SpanOrQuery): 2 uses