Search in sources :

Example 6 with CharArraySet

use of org.apache.lucene.analysis.util.CharArraySet in project textdb by TextDB.

In the class DataflowUtils, the method tokenizeQueryWithStopwords:

public static ArrayList<String> tokenizeQueryWithStopwords(String luceneAnalyzerStr, String query) {
    Analyzer luceneAnalyzer;
    if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.standardAnalyzerString())) {
        // use an empty stop word list for standard analyzer
        CharArraySet emptyStopwords = new CharArraySet(1, true);
        luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    } else if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.chineseAnalyzerString())) {
        // use the default smart chinese analyzer
        // because the smart chinese analyzer's default stopword list is simply a list of punctuations
        // https://lucene.apache.org/core/5_5_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html
        luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerStr);
    } else {
        throw new TexeraException("tokenizeQueryWithStopwords: analyzer " + luceneAnalyzerStr + " not recgonized");
    }
    ArrayList<String> result = new ArrayList<String>();
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            int tokenIndex = query.toLowerCase().indexOf(token);
            // Since tokens are converted to lower case,
            // get the exact token from the query string.
            String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
            result.add(actualQueryToken);
        }
        tokenStream.close();
    } catch (IOException e) {
        throw new DataflowException(e);
    } finally {
        luceneAnalyzer.close();
    }
    return result;
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StringReader(java.io.StringReader) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 7 with CharArraySet

use of org.apache.lucene.analysis.util.CharArraySet in project omegat by omegat-org.

the class LuceneArabicTokenizer method getTokenStream.

@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) throws IOException {
    if (stemsAllowed) {
        CharArraySet stopWords = stopWordsAllowed ? ArabicAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
        ArabicAnalyzer analyzer = new ArabicAnalyzer(stopWords);
        return analyzer.tokenStream("", new StringReader(strOrig));
    } else {
        return getStandardTokenStream(strOrig);
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) ArabicAnalyzer(org.apache.lucene.analysis.ar.ArabicAnalyzer) StringReader(java.io.StringReader)

Example 8 with CharArraySet

use of org.apache.lucene.analysis.util.CharArraySet in project omegat by omegat-org.

the class LuceneJapaneseTokenizer method getTokenStream.

@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) throws IOException {
    if (stemsAllowed) {
        // Blank out tags when stemming only
        strOrig = blankOutTags(strOrig);
        CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
        Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags() : Collections.emptySet();
        return new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags).tokenStream("", new StringReader(strOrig));
    } else {
        JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, false, Mode.NORMAL);
        tokenizer.setReader(new StringReader(strOrig));
        return new TagJoiningFilter(tokenizer);
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) JapaneseAnalyzer(org.apache.lucene.analysis.ja.JapaneseAnalyzer) StringReader(java.io.StringReader) JapaneseTokenizer(org.apache.lucene.analysis.ja.JapaneseTokenizer)

Example 9 with CharArraySet

use of org.apache.lucene.analysis.util.CharArraySet in project omegat by omegat-org.

the class LucenePolishTokenizer method getTokenStream.

@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) throws IOException {
    if (stemsAllowed) {
        CharArraySet stopWords = stopWordsAllowed ? PolishAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
        PolishAnalyzer analyzer = new PolishAnalyzer(stopWords);
        return analyzer.tokenStream("", new StringReader(strOrig));
    } else {
        return getStandardTokenStream(strOrig);
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) PolishAnalyzer(org.apache.lucene.analysis.pl.PolishAnalyzer) StringReader(java.io.StringReader)

Example 10 with CharArraySet

use of org.apache.lucene.analysis.util.CharArraySet in project textdb by TextDB.

In the class DataflowUtils, the method tokenizeQueryWithStopwords:

public static ArrayList<String> tokenizeQueryWithStopwords(String query) {
    ArrayList<String> result = new ArrayList<String>();
    CharArraySet emptyStopwords = new CharArraySet(1, true);
    Analyzer luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            int tokenIndex = query.toLowerCase().indexOf(token);
            // Since tokens are converted to lower case,
            // get the exact token from the query string.
            String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
            result.add(actualQueryToken);
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    luceneAnalyzer.close();
    return result;
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) IOException(java.io.IOException)

Aggregations

CharArraySet (org.apache.lucene.analysis.util.CharArraySet)12 StringReader (java.io.StringReader)6 TokenStream (org.apache.lucene.analysis.TokenStream)5 Analyzer (org.apache.lucene.analysis.Analyzer)4 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)4 IOException (java.io.IOException)3 Tokenizer (org.apache.lucene.analysis.Tokenizer)3 StopFilter (org.apache.lucene.analysis.core.StopFilter)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)2 OException (com.orientechnologies.common.exception.OException)1 OIndexException (com.orientechnologies.orient.core.index.OIndexException)1 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)1 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)1 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)1 File (java.io.File)1 Constructor (java.lang.reflect.Constructor)1 InvocationTargetException (java.lang.reflect.InvocationTargetException)1 ArrayList (java.util.ArrayList)1 ArabicAnalyzer (org.apache.lucene.analysis.ar.ArabicAnalyzer)1 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)1