
Example 6 with FlagsAttribute

Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

From the class TestSnowball, method testFilterTokens:

public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
    filter.incrementToken();
    // the English Snowball stemmer rewrites the term text;
    // every other attribute must pass through the filter unchanged
    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.getPayload());
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), BytesRef (org.apache.lucene.util.BytesRef), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
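
The TestTokenStream helper is not shown in this excerpt, but its shape can be inferred from the assertions above. Here is a minimal sketch, assuming the same imports as the test plus java.io.IOException, and assuming an input term of "accents" (any English word the Snowball stemmer reduces to "accent" would do):

private static final class TestTokenStream extends TokenStream {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    private boolean done;

    @Override
    public boolean incrementToken() {
        if (done) {
            return false;
        }
        done = true;
        clearAttributes();
        // emit the single token whose attributes the test asserts on;
        // "accents" is an assumption, not taken from the source
        termAtt.setEmpty().append("accents");
        offsetAtt.setOffset(2, 7);
        typeAtt.setType("wrd");
        posIncAtt.setPositionIncrement(3);
        payloadAtt.setPayload(new BytesRef(new byte[] { 0, 1, 2, 3 }));
        flagsAtt.setFlags(77);
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        done = false;
    }
}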

Example 7 with FlagsAttribute

Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

From the class WikipediaTokenizerTest, method testBoth:

public void testBoth() throws Exception {
    Set<String> untoks = new HashSet<>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
    // should output all the individual tokens plus the untokenized tokens as well
    WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(tf,
        // expected terms: BOTH mode emits each untokenized span followed by its individual words
        new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here", "link", "there", "italics here", "italics", "here", "something", "more italics", "more", "italics", "h   i   j", "h", "i", "j" },
        // start offsets
        new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
        // end offsets
        new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
        // position increments: 0 marks a token stacked on the previous position
        new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 });
    // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
    tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
    tf.setReader(new StringReader(test));
    int[] expectedFlags = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
    FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
    tf.reset();
    for (int i = 0; i < expectedFlags.length; i++) {
        assertTrue(tf.incrementToken());
        assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
    }
    assertFalse(tf.incrementToken());
    tf.close();
}
Also used: WikipediaTokenizer (org.apache.lucene.analysis.wikipedia.WikipediaTokenizer), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), StringReader (java.io.StringReader), HashSet (java.util.HashSet)
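
As the TODO above notes, BaseTokenStreamTestCase offers no built-in flag check, so the test reads FlagsAttribute by hand. Outside of tests, the same flag lets a consumer separate the duplicated untokenized spans from the word-level tokens that BOTH mode interleaves. A hedged sketch, reusing untoks and test from the method above; process() is a hypothetical handler, not part of the source:

WikipediaTokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tokenizer.setReader(new StringReader(test));
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
FlagsAttribute flagsAtt = tokenizer.addAttribute(FlagsAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
    // untokenized spans such as "a b c d" carry UNTOKENIZED_TOKEN_FLAG; skip them
    if ((flagsAtt.getFlags() & WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG) != 0) {
        continue;
    }
    // hypothetical handler for the word-level tokens
    process(termAtt.toString());
}
tokenizer.end();
tokenizer.close();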

Example 8 with FlagsAttribute

Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

From the class SimpleQueryConverter, method convert:

@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used: WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), Token (org.apache.lucene.analysis.Token), IOException (java.io.IOException), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), HashSet (java.util.HashSet)
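
A hedged usage sketch, not taken from the source: convert collects into a HashSet, so the returned Tokens carry the full copied attribute state but come back in no guaranteed order.

SimpleQueryConverter converter = new SimpleQueryConverter();
Collection<Token> tokens = converter.convert("quick brown fox");
for (Token tok : tokens) {
    // each Token exposes the attributes copied in convert(): flags, type, offsets, payload, increment
    System.out.println(tok + " flags=" + tok.getFlags() + " type=" + tok.type());
}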

Example 9 with FlagsAttribute

Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project sukija by ahomansikka.

From the class KeepFilterTester, method test:

public static void test(Reader reader, Writer writer, Voikko voikko, CharArraySet wordSet, String from, String to, Suggestion[] suggestion, boolean stopOnSuccess) throws IOException {
    Set<String> set = new TreeSet<String>();
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(reader);
    t = new KeepFilter(t, voikko, wordSet, from, to, suggestion);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseFormAtt = t.addAttribute(BaseFormAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    try {
        t.reset();
        while (t.incrementToken()) {
            writer.write("Sana: " + originalWordAtt.getOriginalWord() + " " + termAtt.toString() + " " + Constants.toString(flagsAtt) + " " + baseFormAtt.getBaseForms().toString() + "\n");
            writer.flush();
        }
        t.end();
    } finally {
        t.close();
    }
}
Also used: HVTokenizer (peltomaa.sukija.finnish.HVTokenizer), TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), BaseFormAttribute (peltomaa.sukija.attributes.BaseFormAttribute), TreeSet (java.util.TreeSet), OriginalWordAttribute (peltomaa.sukija.attributes.OriginalWordAttribute), Tokenizer (org.apache.lucene.analysis.Tokenizer)

Example 10 with FlagsAttribute

Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.

From the class JsonPreAnalyzedParser, method toFormattedString:

@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Attribute (org.apache.lucene.util.Attribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), TreeMap (java.util.TreeMap), LinkedList (java.util.LinkedList), LinkedHashMap (java.util.LinkedHashMap), Map (java.util.Map), BytesRef (org.apache.lucene.util.BytesRef)
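
For reference, the map this method serializes follows Solr's pre-analyzed JSON field format. A hedged sample of what toFormattedString might return for a stored two-token field, assuming the usual short keys ("v" version, "str" stored value, "t" term, "s"/"e" offsets, "i" position increment, "y" type; "f" and "p" would additionally appear as hex flags and a base64 payload when those attributes are present):

{"v":"1","str":"Hello world","tokens":[{"t":"hello","s":0,"e":5,"i":1,"y":"word"},{"t":"world","s":6,"e":11,"i":1,"y":"word"}]}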

Aggregations

FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 11 uses
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 10 uses
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 7 uses
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 7 uses
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 7 uses
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 7 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 6 uses
BytesRef (org.apache.lucene.util.BytesRef): 5 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 3 uses
BaseFormAttribute (peltomaa.sukija.attributes.BaseFormAttribute): 3 uses
OriginalWordAttribute (peltomaa.sukija.attributes.OriginalWordAttribute): 3 uses
HVTokenizer (peltomaa.sukija.finnish.HVTokenizer): 3 uses
IOException (java.io.IOException): 2 uses
HashSet (java.util.HashSet): 2 uses
LinkedHashMap (java.util.LinkedHashMap): 2 uses
LinkedList (java.util.LinkedList): 2 uses
Map (java.util.Map): 2 uses
TreeMap (java.util.TreeMap): 2 uses
TreeSet (java.util.TreeSet): 2 uses
Token (org.apache.lucene.analysis.Token): 2 uses