
Example 21 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.

the class SuggestionTester method analyze.

public static void analyze(Reader reader, Writer writer, Voikko voikko, String suggestionFile, boolean stopOnSuccess, boolean useHyphenFilter, TokenStream t) throws IOException {
    ((Tokenizer) t).setReader(reader);
    //    t = new VoikkoFilter (t, voikko);
    // Note: the stopOnSuccess and useHyphenFilter parameters are accepted but not
    // forwarded; the filter is constructed with a literal false.
    t = new SuggestionFilter(t, voikko, suggestionFile, false);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseFormAtt = t.addAttribute(BaseFormAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    try {
        t.reset();
        while (t.incrementToken()) {
            writer.write("Sana: " + originalWordAtt.getOriginalWord() + " | " + termAtt.toString() + " | ");
            writer.write(Constants.toString(flagsAtt));
            writer.write("\n");
            writer.flush();
        }
        t.end();
    } finally {
        t.close();
    }
}
Also used : FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute), CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute), BaseFormAttribute(peltomaa.sukija.attributes.BaseFormAttribute), OriginalWordAttribute(peltomaa.sukija.attributes.OriginalWordAttribute), Tokenizer(org.apache.lucene.analysis.Tokenizer), HVTokenizer(peltomaa.sukija.finnish.HVTokenizer)
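The reset / incrementToken / end / close sequence above is the standard contract for consuming any Lucene TokenStream. A minimal self-contained sketch of the same loop, assuming Lucene 5+ (no-arg WhitespaceTokenizer) and using WhitespaceTokenizer as a stand-in for the project's HVTokenizer:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeLoop {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("kissa istuu puussa"));
        TokenStream t = tokenizer;
        // Attributes must be requested before reset().
        CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
        try {
            t.reset();
            while (t.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            t.end();
        } finally {
            t.close();
        }
    }
}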

Example 22 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.

the class BaseFormTester method test.

public static void test(Reader reader, Writer writer, Voikko voikko, boolean successOnly) throws IOException {
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(reader);
    t = new BaseFormFilter(t, voikko, successOnly);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseFormAtt = t.addAttribute(BaseFormAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    String orig = "";
    TreeSet<String> tset = new TreeSet<String>();
    FlagsAttribute flagsA = new FlagsAttributeImpl();
    try {
        t.reset();
        while (t.incrementToken()) {
            // A new original word begins: emit the group accumulated so far.
            if (!orig.equals("") && !orig.equals(originalWordAtt.getOriginalWord())) {
                writer.write("Sana: " + orig);
                if (Constants.hasFlag(flagsA, Constants.FOUND)) {
                    writer.write(" M " + toString(tset));
                }
                writer.write("\n");
                writer.flush();
                tset.clear();
            }
            orig = originalWordAtt.getOriginalWord();
            tset.addAll(baseFormAtt.getBaseForms());
            flagsA.setFlags(flagsAtt.getFlags());
        }
        writer.write("Sana: " + orig);
        if (Constants.hasFlag(flagsA, Constants.FOUND)) {
            writer.write(" M " + toString(tset));
        }
        writer.write("\n");
        writer.flush();
        t.end();
    } finally {
        t.close();
    }
/*
    try {
      t.reset();
      while (t.incrementToken()) {
        writer.write ("Sana: " + originalWordAtt.getOriginalWord()
                      + " " + termAtt.toString()
                      + " " + Constants.toString (flagsAtt)
                      + " " + baseFormAtt.getBaseForms().toString()
                      + "\n");
        writer.flush();
      }
      t.end();
    }
    finally {
      t.close();
    }
*/
}
Also used : HVTokenizer(peltomaa.sukija.finnish.HVTokenizer), TokenStream(org.apache.lucene.analysis.TokenStream), FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute), CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute), FlagsAttributeImpl(org.apache.lucene.analysis.tokenattributes.FlagsAttributeImpl), BaseFormAttribute(peltomaa.sukija.attributes.BaseFormAttribute), TreeSet(java.util.TreeSet), OriginalWordAttribute(peltomaa.sukija.attributes.OriginalWordAttribute), Tokenizer(org.apache.lucene.analysis.Tokenizer)
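The loop above groups base forms by original word: forms accumulate in the TreeSet until the original word changes, and the finished group is then flushed. The same pattern in isolation, as a hedged sketch (groupByOriginal is a hypothetical helper; it assumes any TokenStream whose filters populate the sukija OriginalWordAttribute and BaseFormAttribute):

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.analysis.TokenStream;
import peltomaa.sukija.attributes.BaseFormAttribute;
import peltomaa.sukija.attributes.OriginalWordAttribute;

class GroupingSketch {
    // Hypothetical helper: collect all base forms per original word.
    static Map<String, Set<String>> groupByOriginal(TokenStream t) throws IOException {
        OriginalWordAttribute origAtt = t.addAttribute(OriginalWordAttribute.class);
        BaseFormAttribute baseAtt = t.addAttribute(BaseFormAttribute.class);
        Map<String, Set<String>> groups = new LinkedHashMap<>();
        try {
            t.reset();
            while (t.incrementToken()) {
                groups.computeIfAbsent(origAtt.getOriginalWord(), k -> new TreeSet<>())
                      .addAll(baseAtt.getBaseForms());
            }
            t.end();
        } finally {
            t.close();
        }
        return groups;
    }
}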

Example 23 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.

the class AppTest method test.

private boolean test(String input, String expectedOutput) throws IOException {
    Reader r = new StringReader(input);
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(r);
    t = new VoikkoFilter(t, voikko);
    VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    // Request attributes before reset(), per the TokenStream contract.
    t.reset();
    while (t.incrementToken()) {
        System.out.println("AppTest " + termAtt.toString());
        for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
            System.out.println(sukijaAtt.getAnalysis(i).get("BASEFORM"));
        //        VoikkoUtils.printAnalysisResult (sukijaAtt.getAnalysis(i), System.out);
        }
        System.out.println("");
    }
    t.end();
    t.close();
    return true;
}
Also used : HVTokenizer(peltomaa.sukija.finnish.HVTokenizer), TokenStream(org.apache.lucene.analysis.TokenStream), CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute), VoikkoFilter(peltomaa.sukija.voikko.VoikkoFilter), Reader(java.io.Reader), StringReader(java.io.StringReader), VoikkoAttribute(peltomaa.sukija.attributes.VoikkoAttribute), Tokenizer(org.apache.lucene.analysis.Tokenizer)
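As written, test prints the analyses and always returns true; the expectedOutput argument is ignored. A hedged sketch of how the check could be made explicit, assuming expectedOutput is the space-joined sequence of terms:

private boolean test(String input, String expectedOutput) throws IOException {
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(new StringReader(input));
    t = new VoikkoFilter(t, voikko);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    StringBuilder actual = new StringBuilder();
    try {
        t.reset();
        while (t.incrementToken()) {
            if (actual.length() > 0) actual.append(' ');
            actual.append(termAtt.toString());
        }
        t.end();
    } finally {
        t.close();
    }
    // Compare the collected terms against the expected output.
    return actual.toString().equals(expectedOutput);
}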

Example 24 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project jackrabbit-oak by apache.

the class LuceneIndex method tokenize.

/**
     * Tries to merge back tokens that are split on relevant fulltext query
     * wildcards ('*' or '?').
     *
     * @param text the fulltext query text to tokenize
     * @param analyzer the analyzer used to produce the token stream
     * @return the list of tokens, or {@code null} if tokenization fails
     */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
        // ignore
        }
    }
    return tokens;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream), CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute), ArrayList(java.util.ArrayList), StringReader(java.io.StringReader), OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IOException(java.io.IOException)
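A hedged usage sketch (tokenize is static in LuceneIndex; StandardAnalyzer's no-arg constructor assumes a recent Lucene version):

// The analyzer splits "foo*bar" at the wildcard, and tokenize merges the
// pieces back, so "foo*bar baz" should come back as ["foo*bar", "baz"].
Analyzer analyzer = new StandardAnalyzer();
List<String> tokens = LuceneIndex.tokenize("foo*bar baz", analyzer);
System.out.println(tokens);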

Example 25 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

the class TestASCIIFoldingFilter method testUnmodifiedLetters.

// Test that we do not emit duplicated tokens when preserve original is on
public void testUnmodifiedLetters() throws Exception {
    TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    filter.reset();
    assertNextTerms("§", "§", filter, termAtt);
    assertNextTerms("¦", "¦", filter, termAtt);
    assertNextTerms("¤", "¤", filter, termAtt);
    assertNextTerms("END", "END", filter, termAtt);
    assertFalse(filter.incrementToken());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream), CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
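When a term does change under folding and preserveOriginal is true, the filter emits the folded form followed by the original at the same position. A minimal sketch using a plain WhitespaceTokenizer in place of the test helper whitespaceMockTokenizer (assuming Lucene 5+):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FoldingDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("résumé"));
        TokenStream stream = new ASCIIFoldingFilter(tokenizer, true);
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Prints the folded "resume", then the preserved original "résumé".
            System.out.println(termAtt.toString());
        }
        stream.end();
        stream.close();
    }
}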

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 144
TokenStream (org.apache.lucene.analysis.TokenStream): 88
StringReader (java.io.StringReader): 42
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 33
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33
ArrayList (java.util.ArrayList): 26
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
IOException (java.io.IOException): 22
Analyzer (org.apache.lucene.analysis.Analyzer): 18
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 8
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6