Example 96 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.

From the class DataflowUtils, method tokenizeQueryWithStopwords.

public static ArrayList<String> tokenizeQueryWithStopwords(String query) {
    ArrayList<String> result = new ArrayList<String>();
    CharArraySet emptyStopwords = new CharArraySet(1, true);
    Analyzer luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            // The analyzer lower-cases tokens, so find the token in the
            // lower-cased query and copy back the original-cased text.
            // Note: indexOf returns the first occurrence, so a repeated token
            // always maps to its first appearance in the query.
            int tokenIndex = query.toLowerCase().indexOf(token);
            String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
            result.add(actualQueryToken);
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    luceneAnalyzer.close();
    return result;
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) Analyzer(org.apache.lucene.analysis.Analyzer) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) IOException(java.io.IOException)
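
For reference, a minimal standalone sketch of the same pattern follows. It assumes the same Lucene line as the snippet above (CharArraySet under org.apache.lucene.analysis.util) and uses try-with-resources instead of manual close calls; the class and method names are illustrative, not part of the TextDB code.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class TokenizeSketch {

    // Tokenize a query with an empty stop-word set, so no tokens are dropped.
    public static List<String> tokenize(String query) throws IOException {
        CharArraySet emptyStopwords = new CharArraySet(1, true);
        try (StandardAnalyzer analyzer = new StandardAnalyzer(emptyStopwords);
             TokenStream stream = analyzer.tokenStream(null, new StringReader(query))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            List<String> tokens = new ArrayList<>();
            stream.reset();
            while (stream.incrementToken()) {
                tokens.add(term.toString());
            }
            stream.end();
            return tokens;
        }
    }
}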

Example 97 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.

From the class DataflowUtils, method generatePayload.

public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    try {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        tokenStream.close();
    } catch (IOException e) {
        // return empty payload
        payload.clear();
    }
    return payload;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IOException(java.io.IOException) Span(edu.uci.ics.textdb.api.span.Span) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
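
A self-contained sketch of the same offset and position-increment bookkeeping, without the project-specific Span class, might look like the following; it simply prints each token's character range, the analyzed term, and the original text recovered from the field value (class name and sample input are illustrative).

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class OffsetWalkSketch {

    public static void main(String[] args) throws IOException {
        String fieldValue = "Token attributes carry offsets and positions";
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream(null, new StringReader(fieldValue))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
            // Position increments let filters that drop tokens (e.g. stop filters)
            // leave gaps, so the running sum gives each token's position.
            int position = -1;
            stream.reset();
            while (stream.incrementToken()) {
                position += posInc.getPositionIncrement();
                int start = offset.startOffset();
                int end = offset.endOffset();
                // Offsets index into the original text, so the un-analyzed form
                // can be recovered even though the term itself is lower-cased.
                String original = fieldValue.substring(start, end);
                System.out.println(position + " [" + start + "," + end + ") "
                        + term.toString() + " <- " + original);
            }
            stream.end();
        }
    }
}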

Example 98 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.

From the class ZimbraAnalyzerTest, method toTokens.

public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList)
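
A hedged usage sketch: the helper resets the stream, collects terms, and calls end(), but closing is left to the caller. An invocation could look like the following, with an illustrative analyzer and input, since the Zimbra-specific analyzers are not needed to exercise the helper.

import java.io.StringReader;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class ToTokensUsageSketch {

    public static void main(String[] args) throws Exception {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("content", new StringReader("hello token stream"))) {
            // toTokens(...) handles reset(), incrementToken() and end();
            // try-with-resources handles close() here.
            List<String> tokens = ZimbraAnalyzerTest.toTokens(stream);
            System.out.println(tokens); // [hello, token, stream]
        }
    }
}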

Example 99 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.

From the class UniversalAnalyzerTest, method testCJK.

private void testCJK(String src) throws IOException {
    TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
    OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);
    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
    while (true) {
        boolean result = cjk.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = cjkTermAttr.toString();
        Assert.assertEquals(cjkTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
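
A note on the pattern: the standard attribute implementations (CharTermAttributeImpl, OffsetAttributeImpl, PositionIncrementAttributeImpl) override equals(), so asserting equality of two attribute instances compares their values for the token each stream is currently positioned on. On recent Lucene versions the two streams would also need reset() before the first incrementToken() and end()/close() afterwards; this test appears to target a version where that workflow was not yet enforced.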

Example 100 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.

From the class KeepFilterTester, method test.

public static void test(Reader reader, Writer writer, Voikko voikko, CharArraySet wordSet, String from, String to, Suggestion[] suggestion, boolean stopOnSuccess) throws IOException {
    Set<String> set = new TreeSet<String>();
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(reader);
    t = new KeepFilter(t, voikko, wordSet, from, to, suggestion);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseFormAtt = t.addAttribute(BaseFormAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    try {
        t.reset();
        while (t.incrementToken()) {
            writer.write("Sana: " + originalWordAtt.getOriginalWord() + " " + termAtt.toString() + " " + Constants.toString(flagsAtt) + " " + baseFormAtt.getBaseForms().toString() + "\n");
            writer.flush();
        }
        t.end();
    } finally {
        t.close();
    }
}
Also used : HVTokenizer(peltomaa.sukija.finnish.HVTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) BaseFormAttribute(peltomaa.sukija.attributes.BaseFormAttribute) TreeSet(java.util.TreeSet) OriginalWordAttribute(peltomaa.sukija.attributes.OriginalWordAttribute) Tokenizer(org.apache.lucene.analysis.Tokenizer)
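
Unlike the earlier examples, this tester closes the stream in a finally block, so the underlying reader is released even if incrementToken() throws. The Voikko-backed KeepFilter, HVTokenizer, and the BaseFormAttribute/OriginalWordAttribute types are sukija-specific, so the snippet is not runnable without that library.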

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 144
TokenStream (org.apache.lucene.analysis.TokenStream): 88
StringReader (java.io.StringReader): 42
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 33
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33
ArrayList (java.util.ArrayList): 26
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
IOException (java.io.IOException): 22
Analyzer (org.apache.lucene.analysis.Analyzer): 18
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 8
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6