Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class SpellingQueryConverter, method convert:
/**
 * Converts the original query string to a collection of Lucene Tokens.
 * @param original the original query string
 * @return a Collection of Lucene Tokens
 */
@Override
public Collection<Token> convert(String original) {
  if (original == null) {
    // this can happen with q.alt = and no query
    return Collections.emptyList();
  }
  boolean mightContainRangeQuery = (original.indexOf('[') != -1 || original.indexOf('{') != -1)
      && (original.indexOf(']') != -1 || original.indexOf('}') != -1);
  Collection<Token> result = new ArrayList<>();
  Matcher matcher = QUERY_REGEX.matcher(original);
  String nextWord = null;
  int nextStartIndex = 0;
  String lastBooleanOp = null;
  while (nextWord != null || matcher.find()) {
    String word = null;
    int startIndex = 0;
    if (nextWord != null) {
      word = nextWord;
      startIndex = nextStartIndex;
      nextWord = null;
    } else {
      word = matcher.group(0);
      startIndex = matcher.start();
    }
    if (matcher.find()) {
      nextWord = matcher.group(0);
      nextStartIndex = matcher.start();
    }
    if (mightContainRangeQuery && "TO".equals(word)) {
      continue;
    }
    if ("AND".equals(word) || "OR".equals(word) || "NOT".equals(word)) {
      lastBooleanOp = word;
      continue;
    }
    // treat "AND NOT" as "NOT"...
    if ("AND".equals(nextWord) && original.length() > nextStartIndex + 7
        && original.substring(nextStartIndex, nextStartIndex + 7).equals("AND NOT")) {
      nextWord = "NOT";
    }
    int flagValue = 0;
    if (word.charAt(0) == '-' || (startIndex > 0 && original.charAt(startIndex - 1) == '-')) {
      flagValue = PROHIBITED_TERM_FLAG;
    } else if (word.charAt(0) == '+' || (startIndex > 0 && original.charAt(startIndex - 1) == '+')) {
      flagValue = REQUIRED_TERM_FLAG;
      //we don't know the default operator so just assume the first operator isn't new.
    } else if (nextWord != null && lastBooleanOp != null && !nextWord.equals(lastBooleanOp)
        && ("AND".equals(nextWord) || "OR".equals(nextWord) || "NOT".equals(nextWord))) {
      flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
      //...unless the 1st boolean operator is a NOT, because only AND/OR can be default.
    } else if (nextWord != null && lastBooleanOp == null && !nextWord.equals(lastBooleanOp)
        && ("NOT".equals(nextWord))) {
      flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
    }
    try {
      analyze(result, word, startIndex, flagValue);
    } catch (IOException e) {
      // TODO: shouldn't we log something?
    }
  }
  if (lastBooleanOp != null) {
    for (Token t : result) {
      int f = t.getFlags();
      t.setFlags(f |= QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG);
    }
  }
  return result;
}
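As a rough illustration of how this converter might be driven on its own (not taken from the project), the sketch below feeds a query string through convert and inspects the flags set on each returned Token. The WhitespaceAnalyzer and the sample query string are assumptions; any Analyzer would do.

// Minimal sketch, assuming a plain SpellingQueryConverter with a whitespace analyzer.
SpellingQueryConverter converter = new SpellingQueryConverter();
converter.setAnalyzer(new WhitespaceAnalyzer());
Collection<Token> tokens = converter.convert("solr AND -lucene");
for (Token t : tokens) {
  boolean prohibited = (t.getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) != 0;
  boolean inBoolean = (t.getFlags() & QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) != 0;
  System.out.println(t + " prohibited=" + prohibited + " inBooleanQuery=" + inBoolean);
}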
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class WordBreakSolrSpellChecker, method getSuggestions:
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
  IndexReader ir = options.reader;
  int numSuggestions = options.count;
  StringBuilder sb = new StringBuilder();
  Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
  List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2);
  List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
  List<ResultEntry> breakSuggestionList = new ArrayList<>();
  List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
  boolean lastOneProhibited = false;
  boolean lastOneRequired = false;
  boolean lastOneprocedesNewBooleanOp = false;
  for (int i = 0; i < tokenArr.length; i++) {
    boolean prohibited = (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG)
        == QueryConverter.PROHIBITED_TERM_FLAG;
    boolean required = (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG)
        == QueryConverter.REQUIRED_TERM_FLAG;
    boolean procedesNewBooleanOp = (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG)
        == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
    if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) {
      termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
      tokenArrWithSeparators.add(null);
    }
    lastOneProhibited = prohibited;
    lastOneRequired = required;
    lastOneprocedesNewBooleanOp = procedesNewBooleanOp;
    Term thisTerm = new Term(field, tokenArr[i].toString());
    termArr.add(thisTerm);
    tokenArrWithSeparators.add(tokenArr[i]);
    if (breakWords) {
      SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod);
      if (breakSuggestions.length == 0) {
        noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0));
      }
      for (SuggestWord[] breakSuggestion : breakSuggestions) {
        sb.delete(0, sb.length());
        boolean firstOne = true;
        int freq = 0;
        for (SuggestWord word : breakSuggestion) {
          if (!firstOne) {
            sb.append(" ");
          }
          firstOne = false;
          sb.append(word.string);
          if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
            freq = Math.max(freq, word.freq);
          } else {
            freq += word.freq;
          }
        }
        breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), freq));
      }
    }
  }
  breakSuggestionList.addAll(noBreakSuggestionList);
  List<ResultEntry> combineSuggestionList = Collections.emptyList();
  CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
  if (combineWords) {
    combineSuggestionList = new ArrayList<>(combineSuggestions.length);
    for (CombineSuggestion cs : combineSuggestions) {
      int firstTermIndex = cs.originalTermIndexes[0];
      int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
      sb.delete(0, sb.length());
      for (int i = firstTermIndex; i <= lastTermIndex; i++) {
        if (i > firstTermIndex) {
          sb.append(" ");
        }
        sb.append(tokenArrWithSeparators.get(i).toString());
      }
      Token token = new Token(sb.toString(), tokenArrWithSeparators.get(firstTermIndex).startOffset(),
          tokenArrWithSeparators.get(lastTermIndex).endOffset());
      combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
    }
  }
  // Interleave the two lists of suggestions into one SpellingResult
  SpellingResult result = new SpellingResult();
  Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
  Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
  ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
  ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
  int breakCount = 0;
  int combineCount = 0;
  while (lastBreak != null || lastCombine != null) {
    if (lastBreak == null) {
      addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
      lastCombine = null;
    } else if (lastCombine == null) {
      addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
      lastBreak = null;
    } else if (lastBreak.freq < lastCombine.freq) {
      addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
      lastCombine = null;
    } else if (lastCombine.freq < lastBreak.freq) {
      addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
      lastBreak = null;
    } else if (breakCount >= combineCount) {
      //TODO: Should reverse >= to < ??
      addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
      lastCombine = null;
    } else {
      addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
      lastBreak = null;
    }
    if (lastBreak == null && breakIter.hasNext()) {
      lastBreak = breakIter.next();
      breakCount++;
    }
    if (lastCombine == null && combineIter.hasNext()) {
      lastCombine = combineIter.next();
      combineCount++;
    }
  }
  return result;
}
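For context, the word-break candidates above come from Lucene's WordBreakSpellChecker. A minimal standalone sketch of that call (not from the project; "reader" is an assumed already-open IndexReader and "text" an assumed field name) might look like this:

// Sketch only: reader and the "text" field are assumptions.
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(
    new Term("text", "spellcheck"), 5, reader,
    SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
    WordBreakSpellChecker.BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
for (SuggestWord[] parts : breakSuggestions) {
  StringBuilder candidate = new StringBuilder();
  for (SuggestWord part : parts) {
    if (candidate.length() > 0) {
      candidate.append(' ');
    }
    candidate.append(part.string);
  }
  System.out.println(candidate); // e.g. "spell check"
}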
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class Suggester, method getSuggestions:
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
  LOG.debug("getSuggestions: " + options.tokens);
  if (lookup == null) {
    LOG.info("Lookup is null - invoke spellchecker.build first");
    return EMPTY_RESULT;
  }
  SpellingResult res = new SpellingResult();
  CharsRef scratch = new CharsRef();
  for (Token t : options.tokens) {
    scratch.chars = t.buffer();
    scratch.offset = 0;
    scratch.length = t.length();
    boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
        && !(lookup instanceof WFSTCompletionLookup)
        && !(lookup instanceof AnalyzingSuggester);
    List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
    if (suggestions == null) {
      continue;
    }
    if (options.suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) {
      Collections.sort(suggestions);
    }
    for (LookupResult lr : suggestions) {
      res.add(t, lr.key.toString(), (int) lr.value);
    }
  }
  return res;
}
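Note that the scratch CharsRef above is just a zero-copy view over each Token's internal character buffer. A small illustrative sketch of that idea (not from the project):

// Hypothetical illustration of the zero-copy CharsRef view used in the loop above.
Token t = new Token();
t.setEmpty().append("solr");
CharsRef view = new CharsRef();
view.chars = t.buffer();   // points at the token's own char[] buffer
view.offset = 0;
view.length = t.length();
System.out.println(view);  // prints "solr" without copying the characters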
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class TestRemoveDuplicatesTokenFilter, method testDups:
public void testDups(final String expected, final Token... tokens) throws Exception {
  final Iterator<Token> toks = Arrays.asList(tokens).iterator();
  final TokenStream ts = new RemoveDuplicatesTokenFilter((new TokenStream() {
    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    @Override
    public boolean incrementToken() {
      if (toks.hasNext()) {
        clearAttributes();
        Token tok = toks.next();
        termAtt.setEmpty().append(tok);
        offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
        posIncAtt.setPositionIncrement(tok.getPositionIncrement());
        return true;
      } else {
        return false;
      }
    }
  }));
  assertTokenStreamContents(ts, expected.split("\\s"));
}
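A hypothetical call to this helper: two tokens carrying the same term, with the second placed at the same position (position increment 0), so the filter is expected to drop the duplicate. The token values are made up for illustration.

// Assumed example call, not part of the original test class.
Token first = new Token("foo", 0, 3);
Token duplicate = new Token("foo", 0, 3);
duplicate.setPositionIncrement(0); // same position as the previous token
testDups("foo", first, duplicate); // only a single "foo" should survive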
Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
From the class TestFixBrokenOffsetsFilter, method testBogusTermVectors:
public void testBogusTermVectors() throws IOException {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", "", ft);
  field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(
      new Token("bar", 5, 10), new Token("bar", 1, 4))));
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
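To see what the filter does to the backwards offsets outside the indexing path, a small assumed inspection sketch could consume the same stream directly and print the offsets it emits; after the filter, the second token's offsets should no longer go backwards relative to the first token's.

// Sketch only: consumes the filtered stream and prints the emitted offsets.
TokenStream ts = new FixBrokenOffsetsFilter(
    new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4)));
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
}
ts.end();
ts.close();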