Example 86 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class SpellingQueryConverter method convert.

/**
   * Converts the original query string to a collection of Lucene Tokens.
   * @param original the original query string
   * @return a Collection of Lucene Tokens
   */
@Override
public Collection<Token> convert(String original) {
    if (original == null) {
        // this can happen with an empty "q.alt=" and no query
        return Collections.emptyList();
    }
    boolean mightContainRangeQuery = (original.indexOf('[') != -1 || original.indexOf('{') != -1) && (original.indexOf(']') != -1 || original.indexOf('}') != -1);
    Collection<Token> result = new ArrayList<>();
    Matcher matcher = QUERY_REGEX.matcher(original);
    String nextWord = null;
    int nextStartIndex = 0;
    String lastBooleanOp = null;
    while (nextWord != null || matcher.find()) {
        String word = null;
        int startIndex = 0;
        if (nextWord != null) {
            word = nextWord;
            startIndex = nextStartIndex;
            nextWord = null;
        } else {
            word = matcher.group(0);
            startIndex = matcher.start();
        }
        if (matcher.find()) {
            nextWord = matcher.group(0);
            nextStartIndex = matcher.start();
        }
        if (mightContainRangeQuery && "TO".equals(word)) {
            continue;
        }
        if ("AND".equals(word) || "OR".equals(word) || "NOT".equals(word)) {
            lastBooleanOp = word;
            continue;
        }
        // treat "AND NOT" as "NOT"...
        if ("AND".equals(nextWord) && original.length() > nextStartIndex + 7 && original.substring(nextStartIndex, nextStartIndex + 7).equals("AND NOT")) {
            nextWord = "NOT";
        }
        int flagValue = 0;
        if (word.charAt(0) == '-' || (startIndex > 0 && original.charAt(startIndex - 1) == '-')) {
            flagValue = PROHIBITED_TERM_FLAG;
        } else if (word.charAt(0) == '+' || (startIndex > 0 && original.charAt(startIndex - 1) == '+')) {
            flagValue = REQUIRED_TERM_FLAG;
        // we don't know the default operator, so just assume the first operator isn't new.
        } else if (nextWord != null && lastBooleanOp != null && !nextWord.equals(lastBooleanOp) && ("AND".equals(nextWord) || "OR".equals(nextWord) || "NOT".equals(nextWord))) {
            flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
        // ...unless the first boolean operator is a NOT, because only AND/OR can be default.
        } else if (nextWord != null && lastBooleanOp == null && "NOT".equals(nextWord)) {
            flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
        }
        try {
            analyze(result, word, startIndex, flagValue);
        } catch (IOException e) {
        // TODO: shouldn't we log something?
        }
    }
    if (lastBooleanOp != null) {
        for (Token t : result) {
            int f = t.getFlags();
            t.setFlags(f | QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG);
        }
    }
    return result;
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) IOException(java.io.IOException)
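
For context, a minimal sketch of exercising this converter outside Solr. The WhitespaceAnalyzer wiring and the sample query are illustrative assumptions; in Solr the schema field's analyzer is injected instead.

SpellingQueryConverter converter = new SpellingQueryConverter();
// assumption: a simple analyzer stands in for the schema field analyzer
converter.setAnalyzer(new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
Collection<Token> tokens = converter.convert("fielda:value1 AND NOT fieldb:value2");
for (Token t : tokens) {
    // getFlags() carries the converter's flag bits: PROHIBITED_TERM_FLAG for
    // '-'-prefixed terms, REQUIRED_TERM_FLAG for '+', and
    // TERM_IN_BOOLEAN_QUERY_FLAG on every token once a boolean operator was seen
    System.out.println(t + " flags=" + t.getFlags());
}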

Example 87 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class WordBreakSolrSpellChecker method getSuggestions.

@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;
    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
    List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2);
    List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
    List<ResultEntry> breakSuggestionList = new ArrayList<>();
    List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOnePrecedesNewBooleanOp = false;
    for (int i = 0; i < tokenArr.length; i++) {
        boolean prohibited = (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) == QueryConverter.PROHIBITED_TERM_FLAG;
        boolean required = (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) == QueryConverter.REQUIRED_TERM_FLAG;
        boolean precedesNewBooleanOp = (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
        if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOnePrecedesNewBooleanOp)) {
            termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
            tokenArrWithSeparators.add(null);
        }
        lastOneProhibited = prohibited;
        lastOneRequired = required;
        lastOnePrecedesNewBooleanOp = precedesNewBooleanOp;
        Term thisTerm = new Term(field, tokenArr[i].toString());
        termArr.add(thisTerm);
        tokenArrWithSeparators.add(tokenArr[i]);
        if (breakWords) {
            SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod);
            if (breakSuggestions.length == 0) {
                noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0));
            }
            for (SuggestWord[] breakSuggestion : breakSuggestions) {
                sb.delete(0, sb.length());
                boolean firstOne = true;
                int freq = 0;
                for (SuggestWord word : breakSuggestion) {
                    if (!firstOne) {
                        sb.append(" ");
                    }
                    firstOne = false;
                    sb.append(word.string);
                    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
                        freq = Math.max(freq, word.freq);
                    } else {
                        freq += word.freq;
                    }
                }
                breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), freq));
            }
        }
    }
    breakSuggestionList.addAll(noBreakSuggestionList);
    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
    if (combineWords) {
        combineSuggestionList = new ArrayList<>(combineSuggestions.length);
        for (CombineSuggestion cs : combineSuggestions) {
            int firstTermIndex = cs.originalTermIndexes[0];
            int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
            sb.delete(0, sb.length());
            for (int i = firstTermIndex; i <= lastTermIndex; i++) {
                if (i > firstTermIndex) {
                    sb.append(" ");
                }
                sb.append(tokenArrWithSeparators.get(i).toString());
            }
            Token token = new Token(sb.toString(), tokenArrWithSeparators.get(firstTermIndex).startOffset(), tokenArrWithSeparators.get(lastTermIndex).endOffset());
            combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
        }
    }
    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
        if (lastBreak == null) {
            addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
            lastCombine = null;
        } else if (lastCombine == null) {
            addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
            lastBreak = null;
        } else if (lastBreak.freq < lastCombine.freq) {
            addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
            lastCombine = null;
        } else if (lastCombine.freq < lastBreak.freq) {
            addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
            lastBreak = null;
        } else if (breakCount >= combineCount) {
            // TODO: should we reverse >= to < ?
            addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
            lastCombine = null;
        } else {
            addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
            lastBreak = null;
        }
        if (lastBreak == null && breakIter.hasNext()) {
            lastBreak = breakIter.next();
            breakCount++;
        }
        if (lastCombine == null && combineIter.hasNext()) {
            lastCombine = combineIter.next();
            combineCount++;
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) CombineSuggestion(org.apache.lucene.search.spell.CombineSuggestion) Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term) IndexReader(org.apache.lucene.index.IndexReader) SuggestWord(org.apache.lucene.search.spell.SuggestWord)
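
For orientation, a reduced sketch of calling Lucene's WordBreakSpellChecker directly, outside Solr. The field name "text", the `reader` variable, and the parameter choices are assumptions for illustration.

// assumption: `reader` is an open IndexReader over a built index
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
Term term = new Term("text", "helloworld");
SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(term, 5, reader,
    SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
    WordBreakSpellChecker.BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
for (SuggestWord[] parts : breakSuggestions) {
    StringBuilder sb = new StringBuilder();
    for (SuggestWord w : parts) {
        if (sb.length() > 0) {
            sb.append(' ');
        }
        sb.append(w.string);
    }
    // each inner array is one way to split the term, e.g. "hello world"
    System.out.println(sb);
}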

Example 88 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class Suggester method getSuggestions.

@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    LOG.debug("getSuggestions: " + options.tokens);
    if (lookup == null) {
        LOG.info("Lookup is null - invoke spellchecker.build first");
        return EMPTY_RESULT;
    }
    SpellingResult res = new SpellingResult();
    CharsRef scratch = new CharsRef();
    for (Token t : options.tokens) {
        scratch.chars = t.buffer();
        scratch.offset = 0;
        scratch.length = t.length();
        boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) && !(lookup instanceof WFSTCompletionLookup) && !(lookup instanceof AnalyzingSuggester);
        List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
        if (suggestions == null) {
            continue;
        }
        if (options.suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) {
            Collections.sort(suggestions);
        }
        for (LookupResult lr : suggestions) {
            res.add(t, lr.key.toString(), (int) lr.value);
        }
    }
    return res;
}
Also used : WFSTCompletionLookup(org.apache.lucene.search.suggest.fst.WFSTCompletionLookup) SpellingResult(org.apache.solr.spelling.SpellingResult) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Token(org.apache.lucene.analysis.Token) AnalyzingSuggester(org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester) CharsRef(org.apache.lucene.util.CharsRef)
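
The CharsRef handling above wraps the token's term buffer without copying characters. A hedged sketch of the same trick in isolation; the `lookup` variable is assumed to be an already-built org.apache.lucene.search.suggest.Lookup implementation.

// assumption: `lookup` was built beforehand (e.g. an AnalyzingSuggester)
Token t = new Token("sol", 0, 3);
CharsRef scratch = new CharsRef();
// point the CharsRef at the token's internal char buffer; no copy is made
scratch.chars = t.buffer();
scratch.offset = 0;
scratch.length = t.length();
List<LookupResult> hits = lookup.lookup(scratch, false, 10);
if (hits != null) {
    for (LookupResult lr : hits) {
        System.out.println(lr.key + " -> " + lr.value);
    }
}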

Example 89 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestRemoveDuplicatesTokenFilter method testDups.

public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    final TokenStream ts = new RemoveDuplicatesTokenFilter((new TokenStream() {

        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

        @Override
        public boolean incrementToken() {
            if (toks.hasNext()) {
                clearAttributes();
                Token tok = toks.next();
                termAtt.setEmpty().append(tok);
                offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
                posIncAtt.setPositionIncrement(tok.getPositionIncrement());
                return true;
            } else {
                return false;
            }
        }
    }));
    assertTokenStreamContents(ts, expected.split("\\s"));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Token(org.apache.lucene.analysis.Token) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
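
A hedged usage example for this helper: two identical terms stacked at the same position (position increment 0), which RemoveDuplicatesTokenFilter collapses to a single token.

// two "foo" tokens at one position; the filter keeps only the first,
// so the expected stream contents are just "foo"
Token first = new Token("foo", 0, 3);
Token dup = new Token("foo", 0, 3);
// position increment 0 stacks `dup` on the same position as `first`
dup.setPositionIncrement(0);
testDups("foo", first, dup);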

Example 90 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestFixBrokenOffsetsFilter method testBogusTermVectors.

public void testBogusTermVectors() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType)
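
To make the filter's effect visible, a small hedged sketch that consumes the same canned stream and checks the offsets no longer go backwards; the assertTrue calls assume a JUnit/LuceneTestCase context.

// FixBrokenOffsetsFilter clamps offsets so start offsets never decrease
// and endOffset >= startOffset; verify that on the deliberately broken stream
TokenStream ts = new FixBrokenOffsetsFilter(
    new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4)));
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
int lastStart = 0;
while (ts.incrementToken()) {
    assertTrue(offsetAtt.startOffset() >= lastStart);
    assertTrue(offsetAtt.endOffset() >= offsetAtt.startOffset());
    lastStart = offsetAtt.startOffset();
}
ts.end();
ts.close();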

Aggregations

Token (org.apache.lucene.analysis.Token): 100
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 39
TokenStream (org.apache.lucene.analysis.TokenStream): 31
Directory (org.apache.lucene.store.Directory): 24
Test (org.junit.Test): 23
Document (org.apache.lucene.document.Document): 19
TextField (org.apache.lucene.document.TextField): 19
BytesRef (org.apache.lucene.util.BytesRef): 16
NamedList (org.apache.solr.common.util.NamedList): 16
StringReader (java.io.StringReader): 15
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 15
Analyzer (org.apache.lucene.analysis.Analyzer): 14
ArrayList (java.util.ArrayList): 13
Map (java.util.Map): 13
Field (org.apache.lucene.document.Field): 13
FieldType (org.apache.lucene.document.FieldType): 11
IndexReader (org.apache.lucene.index.IndexReader): 11
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 10
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9
Date (java.util.Date): 8