Search in sources :

Example 16 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project languagetool by languagetool-org.

the class LanguageToolFilterTest method displayTokensWithFullDetails.

/**
 * Prints every remaining token of {@code stream} to standard output, grouped by position.
 *
 * <p>Each token is rendered as {@code [term:startOffset->endOffset:type]} followed by a space.
 * A token with a positive position increment starts a new line prefixed with the running
 * position; any other token is appended to the current line.
 *
 * <p>This method only calls {@code incrementToken()}; it does not reset, end or close the stream.
 *
 * @param stream the token stream to dump
 * @throws IOException if advancing the stream fails
 */
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAttr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
    int currentPosition = 0;
    for (boolean hasToken = stream.incrementToken(); hasToken; hasToken = stream.incrementToken()) {
        int step = posIncrAttr.getPositionIncrement();
        if (step > 0) {
            // A new position begins: break the line and print the position label.
            currentPosition += step;
            System.out.println();
            System.out.print(currentPosition + ": ");
        }
        StringBuilder rendered = new StringBuilder();
        rendered.append("[").append(termAttr.toString());
        rendered.append(":").append(offsetAttr.startOffset());
        rendered.append("->").append(offsetAttr.endOffset());
        rendered.append(":").append(typeAttr.type());
        rendered.append("] ");
        System.out.print(rendered.toString());
    }
    System.out.println();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 17 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.

the class DataflowUtils method tokenizeQuery.

/**
 * Tokenizes the query string using the given analyzer.
 *
 * <p>Analysis is best-effort: if consuming the token stream fails, the stack trace is
 * printed and the tokens collected up to that point are returned. The token stream is
 * always closed, whether or not consumption succeeds.
 *
 * @param luceneAnalyzer analyzer used to split the query into tokens
 * @param query the query string to tokenize
 * @return the terms produced by the analyzer, in the order the stream emitted them
 */
public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) {
    ArrayList<String> result = new ArrayList<String>();
    // Built outside the try block so that failures while creating the stream
    // (for example a null query) still propagate to the caller as before.
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    // try-with-resources guarantees close() even when reset() or incrementToken() throws.
    try (TokenStream stream = tokenStream) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(term.toString());
        }
        // Signal end-of-stream before the implicit close().
        stream.end();
    } catch (Exception e) {
        // Deliberately best-effort: report the failure and return what was collected.
        e.printStackTrace();
    }
    return result;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) IOException(java.io.IOException)

Example 18 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.

the class UniversalAnalyzerTest method testSTD.

/**
 * Checks that {@code universalAnalyzer} tokenizes {@code src} the same way as
 * {@code standardAnalyzer}.
 *
 * <p>Both streams are advanced in lock-step. They must end at the same time, and for each
 * token the term and position-increment attributes must be equal. Offset attributes are
 * compared as well when {@code assertOffset} is set.
 *
 * @param src the text to analyze with both analyzers
 * @throws IOException if advancing either token stream fails
 */
private void testSTD(String src) throws IOException {
    TokenStream expectedStream = standardAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute expectedTerm = expectedStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute expectedOffset = expectedStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute expectedPosInc = expectedStream.addAttribute(PositionIncrementAttribute.class);
    TokenStream actualStream = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute actualTerm = actualStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute actualOffset = actualStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute actualPosInc = actualStream.addAttribute(PositionIncrementAttribute.class);
    boolean hasToken;
    do {
        hasToken = expectedStream.incrementToken();
        // Both analyzers must agree on whether another token exists.
        Assert.assertEquals(hasToken, actualStream.incrementToken());
        if (hasToken) {
            // The term text doubles as the failure message for the attribute checks below.
            String message = expectedTerm.toString();
            Assert.assertEquals(expectedTerm, actualTerm);
            if (assertOffset) {
                Assert.assertEquals(message, expectedOffset, actualOffset);
            }
            Assert.assertEquals(message, expectedPosInc, actualPosInc);
        }
    } while (hasToken);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 19 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.

the class TermInfo method updateMapWithDetailsForField.

/**
 * Updates {@code term2info} with information from {@code field}.
 *
 * <p>Fields that are not indexed, or whose name has no entry in
 * {@code LuceneFields.FIELD2PREFIX}, are skipped and {@code pos} is returned unchanged.
 *
 * <p>For a tokenized field, each non-empty token is handled as follows:
 * <ol>
 *   <li>a key is built from the field's prefix character followed by the token text;</li>
 *   <li>the {@code TermInfo} stored under that key is fetched, or created and stored if absent;</li>
 *   <li>{@code pos} is advanced by the token's position increment and recorded on the entry.</li>
 * </ol>
 * Empty tokens are ignored entirely; their position increment is not added to {@code pos}.
 *
 * <p>For a non-tokenized field the whole string value is treated as the only term: an entry
 * is created if missing, but no position is recorded and {@code pos} is left unchanged.
 *
 * @param analyzer used to build a token stream when the field does not supply one
 * @param field the document field to inspect
 * @param term2info map from prefixed term to its details; updated in place
 * @param pos the current position
 * @return the new value for {@code pos}
 * @throws IOException if resetting or advancing the token stream fails
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field, Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name() + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (!field.isTokenized()) {
        // The whole field is the only "token". The info is potentially stored twice: here
        // as well as where the field itself is stored.
        String wholeValueKey = prefix + field.stringValue();
        if (term2info.get(wholeValueKey) == null) {
            term2info.put(wholeValueKey, new TermInfo());
        }
        return pos;
    }
    // Prefer the stream carried by the field; otherwise analyze its string value.
    TokenStream tokens = field.tokenStreamValue();
    if (tokens == null) {
        tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
    }
    CharTermAttribute tokenText = tokens.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute tokenPosInc = tokens.addAttribute(PositionIncrementAttribute.class);
    tokens.reset();
    while (tokens.incrementToken()) {
        if (tokenText.length() != 0) {
            String key = prefix + tokenText.toString();
            TermInfo entry = term2info.get(key);
            if (entry == null) {
                entry = new TermInfo();
                term2info.put(key, entry);
            }
            pos += tokenPosInc.getPositionIncrement();
            entry.addPosition(pos);
        }
    }
    return pos;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 20 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project zm-mailbox by Zimbra.

the class ZimbraAnalyzer method getAllTokensConcatenated.

/**
 * Analyzes {@code reader} with the {@code SINGLETON} analyzer for the given field and
 * returns all resulting tokens concatenated into one string.
 *
 * <p>Every token is followed by a single space, so a non-empty result ends with a trailing
 * space. The operation is best-effort: if an {@link IOException} occurs while consuming the
 * stream, the stack trace is printed and the tokens gathered so far are returned. The token
 * stream is always closed, whether or not consumption succeeds.
 *
 * @param fieldName name of the field the text belongs to, passed to the analyzer
 * @param reader source of the text to analyze
 * @return the space-terminated tokens in stream order; empty if no token was produced
 */
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();
    // try-with-resources guarantees close() even when reset() or incrementToken() throws.
    try (TokenStream stream = SINGLETON.tokenStream(fieldName, reader)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
    } catch (IOException e) {
        // Deliberately best-effort: report the failure and return what was collected.
        e.printStackTrace();
    }
    return toReturn.toString();
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IOException(java.io.IOException)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 144; TokenStream (org.apache.lucene.analysis.TokenStream): 88; StringReader (java.io.StringReader): 42; OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 33; PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33; ArrayList (java.util.ArrayList): 26; Tokenizer (org.apache.lucene.analysis.Tokenizer): 25; IOException (java.io.IOException): 22; Analyzer (org.apache.lucene.analysis.Analyzer): 18; PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16; BytesRef (org.apache.lucene.util.BytesRef): 15; TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13; LinkedList (java.util.LinkedList): 11; FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10; Term (org.apache.lucene.index.Term): 10; HashMap (java.util.HashMap): 8; Token (org.apache.lucene.analysis.Token): 8; Document (org.apache.lucene.document.Document): 8; List (java.util.List): 7; HashSet (java.util.HashSet): 6