Use of org.apache.lucene.analysis.util.CharArraySet in project textdb by TextDB.
From the class DataflowUtils, method tokenizeQueryWithStopwords:
public static ArrayList<String> tokenizeQueryWithStopwords(String luceneAnalyzerStr, String query) {
    Analyzer luceneAnalyzer;
    if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.standardAnalyzerString())) {
        // use an empty stop word list (initial capacity 1, case-insensitive)
        // so the standard analyzer keeps stopwords in the output
        CharArraySet emptyStopwords = new CharArraySet(1, true);
        luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    } else if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.chineseAnalyzerString())) {
        // use the default smart Chinese analyzer, because its default
        // stopword list is simply a list of punctuation marks
        // https://lucene.apache.org/core/5_5_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html
        luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerStr);
    } else {
        throw new TexeraException("tokenizeQueryWithStopwords: analyzer " + luceneAnalyzerStr + " not recognized");
    }
    ArrayList<String> result = new ArrayList<String>();
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            // The analyzer lower-cases tokens, so locate the token in the
            // lower-cased query and copy the original-cased text from there.
            // Note: indexOf finds the first occurrence, so duplicate tokens
            // all map back to the first match.
            int tokenIndex = query.toLowerCase().indexOf(token);
            // Guard against tokens the analyzer altered (e.g., normalized),
            // for which indexOf would return -1 and substring would throw.
            if (tokenIndex >= 0) {
                String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
                result.add(actualQueryToken);
            }
        }
        tokenStream.close();
    } catch (IOException e) {
        throw new DataflowException(e);
    } finally {
        luceneAnalyzer.close();
    }
    return result;
}
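For reference, a minimal usage sketch (the query string is hypothetical; assumes the textdb classpath, and that LuceneAnalyzerConstants.standardAnalyzerString() names the StandardAnalyzer branch above):

// Hypothetical call site; with the empty stopword set, "Only" is kept
// instead of being removed as an English stopword, and the original
// casing is recovered from the query string.
ArrayList<String> tokens = DataflowUtils.tokenizeQueryWithStopwords(
        LuceneAnalyzerConstants.standardAnalyzerString(), "Drink Only Soda");
// tokens -> ["Drink", "Only", "Soda"]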
Use of org.apache.lucene.analysis.util.CharArraySet in project omegat by omegat-org.
From the class LuceneArabicTokenizer, method getTokenStream:
@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) throws IOException {
    if (stemsAllowed) {
        // Stem with the Arabic analyzer; pass its default stop set only
        // when stop-word removal is requested, otherwise an empty set.
        CharArraySet stopWords = stopWordsAllowed ? ArabicAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
        ArabicAnalyzer analyzer = new ArabicAnalyzer(stopWords);
        return analyzer.tokenStream("", new StringReader(strOrig));
    } else {
        return getStandardTokenStream(strOrig);
    }
}
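For illustration, a sketch of draining the returned stream from inside a subclass (the caller and input sentence are hypothetical; reset/incrementToken/end/close is the standard Lucene TokenStream contract):

// Hypothetical caller within a LuceneArabicTokenizer subclass;
// TokenStream is Closeable, so try-with-resources closes it.
try (TokenStream ts = getTokenStream("كتب الطالب الدرس", true, true)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // one stemmed Arabic token per line
    }
    ts.end();
}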
Use of org.apache.lucene.analysis.util.CharArraySet in project omegat by omegat-org.
From the class LuceneJapaneseTokenizer, method getTokenStream:
@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) throws IOException {
    if (stemsAllowed) {
        // Blank out tags when stemming only
        strOrig = blankOutTags(strOrig);
        // Use the default stop words and stop tags only when stop-word
        // removal is requested; otherwise keep every morpheme.
        CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
        Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags() : Collections.emptySet();
        return new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags).tokenStream("", new StringReader(strOrig));
    } else {
        // No stemming: tokenize in NORMAL mode and rejoin tag fragments
        JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, false, Mode.NORMAL);
        tokenizer.setReader(new StringReader(strOrig));
        return new TagJoiningFilter(tokenizer);
    }
}
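For context, the same CharArraySet/stop-tag pattern works standalone; a sketch assuming only the lucene-analyzers-kuromoji artifact on the classpath (the class name and sample text are hypothetical):

import java.io.StringReader;
import java.util.Collections;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class KuromojiSketch {
    public static void main(String[] args) throws Exception {
        // Empty stop words and stop tags: keep every morpheme.
        JapaneseAnalyzer analyzer = new JapaneseAnalyzer(
                null, Mode.SEARCH, CharArraySet.EMPTY_SET, Collections.emptySet());
        try (TokenStream ts = analyzer.tokenStream("", new StringReader("日本語の形態素解析"))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}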
Use of org.apache.lucene.analysis.util.CharArraySet in project omegat by omegat-org.
From the class LucenePolishTokenizer, method getTokenStream:
@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) throws IOException {
    if (stemsAllowed) {
        // Stem with the Polish analyzer; pass its default stop set only
        // when stop-word removal is requested, otherwise an empty set.
        CharArraySet stopWords = stopWordsAllowed ? PolishAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
        PolishAnalyzer analyzer = new PolishAnalyzer(stopWords);
        return analyzer.tokenStream("", new StringReader(strOrig));
    } else {
        return getStandardTokenStream(strOrig);
    }
}
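The same toggle also accepts a caller-supplied stop list. A short sketch (the stop words chosen here are hypothetical examples) of building a custom CharArraySet, where the boolean argument makes lookups case-insensitive:

// Hypothetical custom stop list; 'true' means case-insensitive matching.
CharArraySet customStops = new CharArraySet(Arrays.asList("oraz", "ale"), true);
PolishAnalyzer analyzer = new PolishAnalyzer(customStops);
TokenStream ts = analyzer.tokenStream("", new StringReader("Kot oraz pies"));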
Use of org.apache.lucene.analysis.util.CharArraySet in project textdb by TextDB.
From the class DataflowUtils, the single-argument overload of method tokenizeQueryWithStopwords:
public static ArrayList<String> tokenizeQueryWithStopwords(String query) {
    ArrayList<String> result = new ArrayList<String>();
    // An empty stop word set (initial capacity 1, case-insensitive)
    // keeps stopwords in the token output.
    CharArraySet emptyStopwords = new CharArraySet(1, true);
    Analyzer luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            // Tokens are lower-cased by the analyzer, so find the token in
            // the lower-cased query and copy the original-cased text.
            int tokenIndex = query.toLowerCase().indexOf(token);
            // Guard against tokens not found verbatim, which would make
            // substring throw on a -1 index.
            if (tokenIndex >= 0) {
                String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
                result.add(actualQueryToken);
            }
        }
        tokenStream.close();
    } catch (Exception e) {
        // On failure the exception is printed and the partial result
        // collected so far is returned.
        e.printStackTrace();
    }
    luceneAnalyzer.close();
    return result;
}
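A quick sketch of what this overload returns (the input is hypothetical): StandardAnalyzer tokenization with stopwords retained and original casing recovered:

// "The" survives because the stopword set passed above is empty,
// whereas a default English stop set would drop it.
ArrayList<String> tokens = DataflowUtils.tokenizeQueryWithStopwords("Find The Book");
// tokens -> ["Find", "The", "Book"]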