Use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.
From the class MockRepeatAnalyzer, the method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream repeatFilter = new MockRepeatFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, repeatFilter);
}
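For context, a minimal sketch of consuming this analyzer's output, assuming MockRepeatAnalyzer is instantiable from a test and, as its name suggests, repeats each token; the field name and sample text are placeholders.

// Assumes MockRepeatAnalyzer has an accessible no-arg constructor (it is a test class).
try (Analyzer analyzer = new MockRepeatAnalyzer();
     TokenStream ts = analyzer.tokenStream("field", "hello world")) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // presumably each input token appears twice, e.g. "hello", "hello", ...
        System.out.println(termAtt.toString());
    }
    ts.end();
}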
Use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.
From the class MapperQueryParser, the method getPossiblyAnalyzedPrefixQuery:
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!settings.analyzeWildcard()) {
        return super.getPrefixQuery(field, termStr);
    }
    List<List<String>> tlist;
    // get Analyzer from superclass and tokenize the term
    TokenStream source = null;
    try {
        try {
            source = getAnalyzer().tokenStream(field, termStr);
            source.reset();
        } catch (IOException e) {
            return super.getPrefixQuery(field, termStr);
        }
        tlist = new ArrayList<>();
        List<String> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);
        while (true) {
            try {
                if (!source.incrementToken()) {
                    break;
                }
            } catch (IOException e) {
                break;
            }
            if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                tlist.add(currentPos);
                currentPos = new ArrayList<>();
            }
            currentPos.add(termAtt.toString());
        }
        if (currentPos.isEmpty() == false) {
            tlist.add(currentPos);
        }
    } finally {
        if (source != null) {
            IOUtils.closeWhileHandlingException(source);
        }
    }
    if (tlist.size() == 0) {
        return null;
    }
    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0).get(0));
    }
    // build a boolean query with prefix on the last position only.
    List<BooleanClause> clauses = new ArrayList<>();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<String> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = super.getPrefixQuery(field, plist.get(0));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            List<BooleanClause> innerClauses = new ArrayList<>();
            for (String token : plist) {
                innerClauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
            }
            posQuery = getBooleanQueryCoordDisabled(innerClauses);
        }
        clauses.add(new BooleanClause(posQuery,
            getDefaultOperator() == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD));
    }
    return getBooleanQuery(clauses);
}
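The core pattern is easier to see in isolation. A minimal standalone sketch, assuming a plain Lucene Analyzer and ignoring the multi-term-per-position (synonym) handling above: every token except the last becomes a TermQuery, and the last becomes a PrefixQuery. The helper name analyzedPrefix is hypothetical.

// Uses org.apache.lucene.analysis.*, org.apache.lucene.analysis.tokenattributes.CharTermAttribute,
// org.apache.lucene.index.Term, and org.apache.lucene.search.* only.
static Query analyzedPrefix(Analyzer analyzer, String field, String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
    }
    if (tokens.isEmpty()) {
        return new MatchNoDocsQuery();
    }
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    // all tokens but the last must match exactly
    for (int i = 0; i < tokens.size() - 1; i++) {
        builder.add(new TermQuery(new Term(field, tokens.get(i))), BooleanClause.Occur.MUST);
    }
    // only the last position is treated as a prefix
    builder.add(new PrefixQuery(new Term(field, tokens.get(tokens.size() - 1))), BooleanClause.Occur.MUST);
    return builder.build();
}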
Use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.
From the class XAnalyzingSuggester, the method toFiniteStrings:
// EDIT: Adrien, needed by lookup providers
// NOTE: these XForks are unmaintainable, we need to get rid of them...
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
    final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    Automaton automaton;
    try (TokenStream ts = stream) {
        automaton = toAutomaton(ts, ts2a);
    }
    LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    Set<IntsRef> set = new HashSet<>();
    for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) {
        set.add(IntsRef.deepCopyOf(string));
    }
    return Collections.unmodifiableSet(set);
}
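A hypothetical call site for the method above; suggester, analyzer, the field name, and the sample text are all assumptions. Note that toFiniteStrings closes the stream itself via try-with-resources, so the caller must not close it again.

TokenStream ts = analyzer.tokenStream("suggest", "fast foo");
Set<IntsRef> paths = suggester.toFiniteStrings(ts); // the method closes the stream
// each IntsRef encodes one path through the token graph of the analyzed input
System.out.println("expanded to " + paths.size() + " finite strings");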
Use of org.apache.lucene.analysis.TokenStream in project elasticsearch by elastic.
From the class MoreLikeThisQuery, the method handleUnlike:
private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    Set<Term> skipTerms = new HashSet<>();
    // handle unlike text
    if (unlikeText != null) {
        for (String text : unlikeText) {
            // only use the first field to be consistent
            String fieldName = moreLikeFields[0];
            try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    skipTerms.add(new Term(fieldName, termAtt.toString()));
                }
                ts.end();
            }
        }
    }
    // handle unlike fields
    if (unlikeFields != null) {
        for (Fields fields : unlikeFields) {
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                final TermsEnum termsEnum = terms.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    skipTerms.add(new Term(fieldName, text.utf8ToString()));
                }
            }
        }
    }
    if (!skipTerms.isEmpty()) {
        mlt.setSkipTerms(skipTerms);
    }
}
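The first branch above follows the standard TokenStream consumption contract (reset, incrementToken in a loop, end, then close). That pattern is reusable on its own; a minimal helper sketch, with the name termsOf chosen here for illustration:

static Set<Term> termsOf(Analyzer analyzer, String field, String text) throws IOException {
    Set<Term> result = new HashSet<>();
    // try-with-resources guarantees the stream is closed even on failure
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            result.add(new Term(field, termAtt.toString()));
        }
        ts.end();
    }
    return result;
}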
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr-analysis-turkish by iorixxx.
From the class TestTurkishDeASCIIfyFilter, the method testDeAscii:
public void testDeAscii() throws Exception {
    TokenStream stream = whitespaceMockTokenizer("kus fadil akgunduz dogalgaz ahmet");
    stream = new TurkishDeASCIIfyFilter(stream, false);
    assertTokenStreamContents(stream, new String[] { "kuş", "fadıl", "akgündüz", "doğalgaz", "ahmet" });
}
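To use the filter outside a test, it can be wired into an analysis chain the same way as in the MockRepeatAnalyzer example above. A sketch assuming only the two-argument constructor shown in the test, where the boolean presumably toggles preserving the original ASCII token:

Analyzer deAsciiAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        // false: emit only the de-ASCIIfied form, matching the test above
        TokenStream sink = new TurkishDeASCIIfyFilter(source, false);
        return new TokenStreamComponents(source, sink);
    }
};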