Search in sources :

Example 1 with FlagsAttribute

use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

the class SpellCheckComponent method getTokens.

/**
 * Analyzes {@code q} with {@code analyzer} and returns one {@link Token} per token emitted by
 * the resulting stream.
 *
 * <p>Each returned token carries a copy of the term text, offsets, type, flags, payload and
 * position increment reported by the stream for that token.
 *
 * @param q the text to analyze
 * @param analyzer the analyzer producing the token stream; must not be {@code null}
 * @return the collected tokens, in the order the stream produced them
 * @throws IOException if the token stream fails
 */
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    assert analyzer != null;
    final Collection<Token> collected = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("", q)) {
        stream.reset();
        // TODO: support custom attributes
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
        final TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        final FlagsAttribute flags = stream.addAttribute(FlagsAttribute.class);
        final PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        while (stream.incrementToken()) {
            // Snapshot the current attribute values into an independent Token.
            final Token copy = new Token();
            copy.copyBuffer(term.buffer(), 0, term.length());
            copy.setOffset(offsets.startOffset(), offsets.endOffset());
            copy.setType(type.type());
            copy.setFlags(flags.getFlags());
            copy.setPayload(payload.getPayload());
            copy.setPositionIncrement(posInc.getPositionIncrement());
            collected.add(copy);
        }
        stream.end();
    }
    return collected;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Token(org.apache.lucene.analysis.Token) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 2 with FlagsAttribute

use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

the class SimplePreAnalyzedParser method toFormattedString.

/**
 * Serializes a field into the textual pre-analyzed representation.
 *
 * <p>The output starts with {@code VERSION} followed by a space. If the field type is stored and
 * the field has a string value, that value follows, wrapped in {@code '='} delimiters with every
 * embedded {@code '='} escaped as {@code \=}. If the field has a token stream, each token is then
 * appended (tokens separated by a single space) as the escaped term followed by comma-separated
 * attributes:
 * <ul>
 *   <li>{@code f=} flags in hexadecimal</li>
 *   <li>{@code s=} / {@code e=} start and end offset</li>
 *   <li>{@code p=} payload bytes in hexadecimal (omitted when the payload is null or empty)</li>
 *   <li>{@code i=} position increment</li>
 *   <li>{@code y=} escaped token type</li>
 *   <li>any other attribute as {@code <attribute class name>=<escaped toString()>}</li>
 * </ul>
 *
 * @param f the field to serialize
 * @return the formatted string
 * @throws IOException if reading the field's token stream fails
 */
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // Encode the equals sign. This must be a literal replacement: with
            // String.replaceAll the backslash in the replacement string escapes the
            // following character, so "\\=" would be emitted as a plain "=" and the
            // stored value would not be escaped at all.
            s = s.replace("=", "\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        // Reused per-token buffer holding the comma-separated attribute list.
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            // Separate consecutive tokens with a single space.
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    // Non-term attribute: prepend the separator first; the payload branch
                    // below removes it again when there is nothing to write.
                    if (tok.length() > 0) {
                        tok.append(',');
                    }
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            // remove the last comma
                            tok.setLength(tok.length() - 1);
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            // Prefer the char-based term; fall back to the bytes-based term.
            String term = (cTerm != null) ? cTerm : tTerm;
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Attribute(org.apache.lucene.util.Attribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Example 3 with FlagsAttribute

use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

the class JsonPreAnalyzedParser method parse.

/**
 * Parses the JSON pre-analyzed representation read from {@code reader}.
 *
 * <p>The whole reader is consumed into a string. An empty string yields an empty result. Otherwise
 * the content must be a JSON object whose {@code VERSION_KEY} entry equals {@code VERSION}. The
 * optional {@code STRING_KEY} and {@code BINARY_KEY} entries (mutually exclusive) populate the
 * stored string and Base64-decoded binary values. Each element of the optional {@code TOKENS_KEY}
 * list is turned into attribute values on {@code parent}, and the captured attribute state is
 * appended to the result.
 *
 * <p>Attribute values that cannot be parsed as numbers and unknown keys are logged and skipped.
 *
 * @param reader source of the JSON text
 * @param parent attribute source used as scratch space for building each token state; its
 *     attributes are cleared before the first token and after every token
 * @return the parsed stored values and token states
 * @throws IOException if reading fails, the top-level JSON value is not an object, the version is
 *     missing or unknown, or both string and binary values are present
 */
@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    Object o = ObjectBuilder.fromJSON(val);
    if (!(o instanceof Map)) {
        // instanceof is false for null as well, so guard before calling getClass()
        // to report the problem as an IOException rather than a NullPointerException.
        String actualType = (o == null) ? "null" : o.getClass().getName();
        throw new IOException("Invalid JSON type " + actualType + ", expected Map");
    }
    Map<String, Object> map = (Map<String, Object>) o;
    // check version
    String version = (String) map.get(VERSION_KEY);
    if (version == null) {
        throw new IOException("Missing VERSION key");
    }
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
    }
    if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
        throw new IOException("Field cannot have both stringValue and binaryValue");
    }
    res.str = (String) map.get(STRING_KEY);
    String bin = (String) map.get(BINARY_KEY);
    if (bin != null) {
        byte[] data = Base64.base64ToByteArray(bin);
        res.bin = data;
    }
    List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
    if (tokens == null) {
        return res;
    }
    int tokenStart = 0;
    int tokenEnd = 0;
    parent.clearAttributes();
    for (Object ot : tokens) {
        // automatic increment by 1 separator
        tokenStart = tokenEnd + 1;
        Map<String, Object> tok = (Map<String, Object>) ot;
        boolean hasOffsetEnd = false;
        // length of the token text, or -1 if this token has no TOKEN_KEY entry
        int len = -1;
        for (Entry<String, Object> e : tok.entrySet()) {
            String key = e.getKey();
            if (key.equals(TOKEN_KEY)) {
                CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
                String str = String.valueOf(e.getValue());
                catt.append(str);
                len = str.length();
            } else if (key.equals(OFFSET_START_KEY)) {
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenStart = ((Number) obj).intValue();
                } else {
                    try {
                        tokenStart = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        // keep the automatically computed start offset
                        LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
            } else if (key.equals(OFFSET_END_KEY)) {
                hasOffsetEnd = true;
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenEnd = ((Number) obj).intValue();
                } else {
                    try {
                        tokenEnd = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetEnd = false;
                    }
                }
            } else if (key.equals(POSINCR_KEY)) {
                Object obj = e.getValue();
                // falls back to an increment of 1 when the value is not a valid integer
                int posIncr = 1;
                if (obj instanceof Number) {
                    posIncr = ((Number) obj).intValue();
                } else {
                    try {
                        posIncr = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
                PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
                patt.setPositionIncrement(posIncr);
            } else if (key.equals(PAYLOAD_KEY)) {
                String str = String.valueOf(e.getValue());
                if (str.length() > 0) {
                    byte[] data = Base64.base64ToByteArray(str);
                    PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
                    if (data != null && data.length > 0) {
                        p.setPayload(new BytesRef(data));
                    }
                }
            } else if (key.equals(FLAGS_KEY)) {
                try {
                    // flags are encoded as a hexadecimal string
                    int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
                    FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
                    flags.setFlags(f);
                } catch (NumberFormatException nfe) {
                    LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
                }
            } else if (key.equals(TYPE_KEY)) {
                TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
                tattr.setType(String.valueOf(e.getValue()));
            } else {
                LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
            }
        }
        // handle offset attr
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        if (!hasOffsetEnd && len > -1) {
            // no usable explicit end offset: derive it from the token text length
            tokenEnd = tokenStart + len;
        }
        offset.setOffset(tokenStart, tokenEnd);
        // capture state and add to result
        State state = parent.captureState();
        res.states.add(state.clone());
        // reset for reuse
        parent.clearAttributes();
    }
    return res;
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) ParseResult(org.apache.solr.schema.PreAnalyzedField.ParseResult) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) State(org.apache.lucene.util.AttributeSource.State) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) LinkedList(java.util.LinkedList) List(java.util.List) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TreeMap(java.util.TreeMap) BytesRef(org.apache.lucene.util.BytesRef)

Example 4 with FlagsAttribute

use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

the class TestSnowball method testFilterTokens.

/**
 * Advances an English {@link SnowballFilter} wrapped around a {@code TestTokenStream} by one
 * token and checks the term, offsets, type, position increment, flags and payload it exposes.
 */
public void testFilterTokens() throws Exception {
    final SnowballFilter snowball = new SnowballFilter(new TestTokenStream(), "English");

    // Look up every attribute whose value is asserted below.
    final CharTermAttribute term = snowball.getAttribute(CharTermAttribute.class);
    final OffsetAttribute offsets = snowball.getAttribute(OffsetAttribute.class);
    final TypeAttribute type = snowball.getAttribute(TypeAttribute.class);
    final PayloadAttribute payload = snowball.getAttribute(PayloadAttribute.class);
    final PositionIncrementAttribute posInc = snowball.getAttribute(PositionIncrementAttribute.class);
    final FlagsAttribute flags = snowball.getAttribute(FlagsAttribute.class);

    snowball.incrementToken();

    assertEquals("accent", term.toString());
    assertEquals(2, offsets.startOffset());
    assertEquals(7, offsets.endOffset());
    assertEquals("wrd", type.type());
    assertEquals(3, posInc.getPositionIncrement());
    assertEquals(77, flags.getFlags());
    final BytesRef expectedPayload = new BytesRef(new byte[] { 0, 1, 2, 3 });
    assertEquals(expectedPayload, payload.getPayload());
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 5 with FlagsAttribute

use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

the class WikipediaTokenizerTest method testBoth.

/**
 * Runs {@link WikipediaTokenizer} in {@code BOTH} mode with categories and italics registered as
 * untokenized types, then checks the emitted terms, offsets and position increments, and in a
 * second pass the flags of every token.
 */
public void testBoth() throws Exception {
    final Set<String> untokenizedTypes = new HashSet<>();
    untokenizedTypes.add(WikipediaTokenizer.CATEGORY);
    untokenizedTypes.add(WikipediaTokenizer.ITALICS);
    final String input = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";

    // Expected output: the untokenized token for each category/italics span as well as its
    // individual tokens.
    final String[] expectedTerms = new String[] {
        "a b c d", "a", "b", "c", "d",
        "e f g", "e", "f", "g",
        "link", "here", "link", "there",
        "italics here", "italics", "here",
        "something",
        "more italics", "more", "italics",
        "h   i   j", "h", "i", "j" };
    final int[] expectedStartOffsets = new int[] {
        11, 11, 13, 15, 17,
        32, 32, 34, 36,
        42, 47, 56, 61,
        71, 71, 79,
        86,
        98, 98, 103,
        124, 124, 128, 132 };
    final int[] expectedEndOffsets = new int[] {
        18, 12, 14, 16, 18,
        37, 33, 35, 37,
        46, 51, 60, 66,
        83, 78, 83,
        95,
        110, 102, 110,
        133, 125, 129, 133 };
    final int[] expectedPosIncrements = new int[] {
        1, 0, 1, 1, 1,
        1, 0, 1, 1,
        1, 1, 1, 1,
        1, 0, 1,
        1,
        1, 0, 1,
        1, 0, 1, 1 };

    WikipediaTokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untokenizedTypes);
    tokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(tokenizer, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);

    // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
    tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untokenizedTypes);
    tokenizer.setReader(new StringReader(input));
    final int[] expectedFlags = new int[] {
        UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0,
        UNTOKENIZED_TOKEN_FLAG, 0, 0, 0,
        0, 0, 0, 0,
        UNTOKENIZED_TOKEN_FLAG, 0, 0,
        0,
        UNTOKENIZED_TOKEN_FLAG, 0, 0,
        UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
    final FlagsAttribute flags = tokenizer.addAttribute(FlagsAttribute.class);
    tokenizer.reset();
    for (int idx = 0; idx < expectedFlags.length; idx++) {
        assertTrue(tokenizer.incrementToken());
        assertEquals("flags " + idx, expectedFlags[idx], flags.getFlags());
    }
    // The stream must be exhausted once every expected flag has been matched.
    assertFalse(tokenizer.incrementToken());
    tokenizer.close();
}
Also used : WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) StringReader(java.io.StringReader) HashSet(java.util.HashSet)

Aggregations

FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 11, CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 10, OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 8, PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 8, PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 7, TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 7, TokenStream (org.apache.lucene.analysis.TokenStream): 6, BytesRef (org.apache.lucene.util.BytesRef): 5, Tokenizer (org.apache.lucene.analysis.Tokenizer): 3, BaseFormAttribute (peltomaa.sukija.attributes.BaseFormAttribute): 3, OriginalWordAttribute (peltomaa.sukija.attributes.OriginalWordAttribute): 3, HVTokenizer (peltomaa.sukija.finnish.HVTokenizer): 3, IOException (java.io.IOException): 2, HashSet (java.util.HashSet): 2, LinkedHashMap (java.util.LinkedHashMap): 2, LinkedList (java.util.LinkedList): 2, Map (java.util.Map): 2, TreeMap (java.util.TreeMap): 2, Token (org.apache.lucene.analysis.Token): 2, TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 2