Search in sources :

Example 96 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class AbstractLuceneSpellChecker method getSuggestions.

/**
 * Produces spelling suggestions for each token in {@code options.tokens}.
 *
 * <p>For every token this method looks up the token's document frequency (0 when no reader
 * is available), asks {@code spellChecker} for similar words, and records the outcome in the
 * returned {@link SpellingResult}. At most {@code options.count} suggestions are recorded per
 * token; a token for which nothing is left to report gets an empty list. When
 * {@code options.extendedResults} is set and both a reader and a field are available, the
 * token's frequency and the frequency of each suggestion are recorded as well.
 *
 * @param options the tokens to check plus the tuning knobs read here ({@code reader},
 *        {@code accuracy}, {@code count}, {@code alternativeTermCount}, {@code suggestMode},
 *        {@code extendedResults})
 * @return the collected suggestions, keyed by token
 * @throws IOException if a document-frequency lookup or the spell checker fails
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    final SpellingResult spellingResult = new SpellingResult(options.tokens);
    final IndexReader indexReader = determineReader(options.reader);

    // Placeholder value only; the term is rebuilt for every token inside the loop.
    Term lookupTerm;
    if (field == null) {
        lookupTerm = null;
    } else {
        lookupTerm = new Term(field, "");
    }

    // Float.MIN_VALUE acts as the "not set" marker: fall back to the checker's own accuracy.
    final float minAccuracy;
    if (options.accuracy == Float.MIN_VALUE) {
        minAccuracy = spellChecker.getAccuracy();
    } else {
        minAccuracy = options.accuracy;
    }

    // Never request fewer than the default number of suggestions from the checker.
    final int defaultRequestCount =
        Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);

    for (Token token : options.tokens) {
        final String word = new String(token.buffer(), 0, token.length());
        lookupTerm = new Term(field, word);
        final int wordFreq = (indexReader == null) ? 0 : indexReader.docFreq(lookupTerm);

        // Use the alternative-term count only when it is configured and the word itself
        // occurs in the index; otherwise use the regular request count.
        final boolean useRegularCount = options.alternativeTermCount == 0 || wordFreq == 0;
        final int requestCount = useRegularCount ? defaultRequestCount : options.alternativeTermCount; // workaround LUCENE-1295
        final IndexReader suggestReader = (field == null) ? null : indexReader; // workaround LUCENE-1295

        String[] suggestions = spellChecker.suggestSimilar(
            word, requestCount, suggestReader, field, options.suggestMode, minAccuracy);

        final boolean onlyEchoesWord = suggestions.length == 1 && suggestions[0].equals(word);
        if (onlyEchoesWord && options.alternativeTermCount == 0) {
            // The sole suggestion is the word itself, so there is nothing to report.
            continue;
        }

        // If considering alternatives to a term that does occur in the index, make sure the
        // original word is offered as a viable suggestion: put it first unless it is
        // already somewhere in the list.
        if (options.alternativeTermCount > 0 && wordFreq > 0) {
            boolean originalListed = false;
            for (String candidate : suggestions) {
                if (candidate.equals(word)) {
                    originalListed = true;
                    break;
                }
            }
            if (!originalListed) {
                final String[] withOriginal = new String[suggestions.length + 1];
                withOriginal[0] = word;
                System.arraycopy(suggestions, 0, withOriginal, 1, suggestions.length);
                suggestions = withOriginal;
            }
        }

        final boolean recordFrequencies =
            options.extendedResults && indexReader != null && field != null;
        if (recordFrequencies) {
            spellingResult.addFrequency(token, wordFreq);
            final int limit = Math.min(options.count, suggestions.length);
            if (limit <= 0) {
                final List<String> none = Collections.emptyList();
                spellingResult.add(token, none);
            } else {
                for (int i = 0; i < limit; i++) {
                    final String suggestion = suggestions[i];
                    lookupTerm = new Term(field, suggestion);
                    spellingResult.add(token, suggestion, indexReader.docFreq(lookupTerm));
                }
            }
        } else if (suggestions.length == 0) {
            final List<String> none = Collections.emptyList();
            spellingResult.add(token, none);
        } else {
            List<String> reported = Arrays.asList(suggestions);
            if (suggestions.length > options.count) {
                // Trim to the number of suggestions the caller asked for.
                reported = reported.subList(0, options.count);
            }
            spellingResult.add(token, reported);
        }
    }
    return spellingResult;
}
Also used : IndexReader(org.apache.lucene.index.IndexReader) Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term)

Example 97 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestGraphTokenStreamFiniteStrings method token.

/**
 * Creates a {@link Token} for {@code term}, passing {@code 0} and {@code term.length()} as
 * the two int constructor arguments, and then applies the given position attributes.
 *
 * @param term the token text
 * @param posInc value handed to {@code setPositionIncrement}
 * @param posLength value handed to {@code setPositionLength}
 * @return the configured token
 */
private static Token token(String term, int posInc, int posLength) {
    final int termLength = term.length();
    final Token created = new Token(term, 0, termLength);
    // Keep this order: increment first, then length.
    created.setPositionIncrement(posInc);
    created.setPositionLength(posLength);
    return created;
}
Also used : Token(org.apache.lucene.analysis.Token)

Example 98 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class AnalyzingSuggesterTest method testTooManyExpansions.

/**
 * Builds an {@link AnalyzingSuggester} over the single input {@code "a"} using an analyzer
 * that always emits the canned tokens "a" and "b" (the latter with position increment 0),
 * and asserts that looking up "a" yields exactly {@code [a/1]}.
 */
public void testTooManyExpansions() throws Exception {
    final Analyzer cannedAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(source) {

                @Override
                public TokenStream getTokenStream() {
                    // Every call returns a fresh stream of the same two tokens.
                    final Token first = new Token("a", 0, 1);
                    first.setPositionIncrement(1);
                    final Token stacked = new Token("b", 0, 1);
                    stacked.setPositionIncrement(0);
                    final Token[] canned = new Token[] { first, stacked };
                    return new CannedTokenStream(canned);
                }

                @Override
                protected void setReader(final Reader reader) {
                    // Deliberately empty: the supplied reader is not used.
                }
            };
        }
    };

    final Directory dir = getDirectory();
    // The same analyzer is passed twice; the trailing arguments are 0, 256, 1, true.
    // NOTE(review): the "1" is presumably the graph-expansion limit this test targets;
    // confirm against the AnalyzingSuggester constructor.
    final AnalyzingSuggester suggester =
        new AnalyzingSuggester(dir, "suggest", cannedAnalyzer, cannedAnalyzer, 0, 256, 1, true);

    final Input[] entries = new Input[] { new Input("a", 1) };
    suggester.build(new InputArrayIterator(entries));

    final String found = suggester.lookup("a", false, 1).toString();
    assertEquals("[a/1]", found);

    IOUtils.close(cannedAnalyzer, dir);
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) Reader(java.io.Reader) Token(org.apache.lucene.analysis.Token) BinaryToken(org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Directory(org.apache.lucene.store.Directory)

Example 99 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class AnalyzingSuggesterTest method testInputPathRequired.

/**
 * Builds an {@link AnalyzingSuggester} from the inputs "ab xc" and "ba xd" with an analyzer
 * that hands out three pre-built token streams in order, then asserts that looking up
 * "ab x" returns exactly one result.
 */
public void testInputPathRequired() throws Exception {
    // The analyzer below mimics what a SynonymAnalyzer would produce for the map
    //
    //   SynonymMap.Builder b = new SynonymMap.Builder(false);
    //   b.add(new CharsRef("ab"), new CharsRef("ba"), true);
    //   final SynonymMap map = b.build();
    //
    // so that the suggest module does not need a dependency on the synonym module.
    final Analyzer synonymLikeAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(source) {

                // Index of the canned stream the next getTokenStream() call returns.
                int nextStream = 0;

                // NOTE(review): presumably consumed as build input 1, build input 2, then
                // the lookup query; confirm against how the suggester calls the analyzer.
                final TokenStream[] cannedStreams = new TokenStream[] {
                    new CannedTokenStream(new Token[] {
                        token("ab", 1, 1), token("ba", 0, 1), token("xc", 1, 1) }),
                    new CannedTokenStream(new Token[] {
                        token("ba", 1, 1), token("xd", 1, 1) }),
                    new CannedTokenStream(new Token[] {
                        token("ab", 1, 1), token("ba", 0, 1), token("x", 1, 1) })
                };

                @Override
                public TokenStream getTokenStream() {
                    // Advance the counter only after the array read has succeeded.
                    final TokenStream current = cannedStreams[nextStream];
                    nextStream = nextStream + 1;
                    return current;
                }

                @Override
                protected void setReader(final Reader reader) {
                    // Deliberately empty: the supplied reader is not used.
                }
            };
        }
    };

    final Input[] entries = new Input[] { new Input("ab xc", 50), new Input("ba xd", 50) };
    final Directory dir = getDirectory();
    final AnalyzingSuggester suggester = new AnalyzingSuggester(dir, "suggest", synonymLikeAnalyzer);
    suggester.build(new InputArrayIterator(entries));

    final List<LookupResult> found = suggester.lookup("ab x", false, 1);
    assertTrue(found.size() == 1);

    IOUtils.close(synonymLikeAnalyzer, dir);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) CannedBinaryTokenStream(org.apache.lucene.analysis.CannedBinaryTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Reader(java.io.Reader) Token(org.apache.lucene.analysis.Token) BinaryToken(org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Directory(org.apache.lucene.store.Directory)

Example 100 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class AnalyzingSuggesterTest method testGraphDups.

/**
 * Builds an {@link AnalyzingSuggester} from "wifi network is slow" (50) and
 * "wi fi network is fast" (10) with an analyzer that hands out three pre-built token
 * streams in order. Each stream carries a "hotspot" token with position increment 0 and
 * position length 2 or 3. The test asserts that looking up "wifi network" returns both
 * inputs, the 50-valued one first.
 */
public void testGraphDups() throws Exception {
    final Analyzer graphAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(source) {

                // Index of the canned stream the next getTokenStream() call returns.
                int nextStream = 0;

                // NOTE(review): presumably consumed as build input 1, build input 2, then
                // the lookup query; confirm against how the suggester calls the analyzer.
                final TokenStream[] cannedStreams = new TokenStream[] {
                    new CannedTokenStream(new Token[] {
                        token("wifi", 1, 1), token("hotspot", 0, 2), token("network", 1, 1),
                        token("is", 1, 1), token("slow", 1, 1) }),
                    new CannedTokenStream(new Token[] {
                        token("wi", 1, 1), token("hotspot", 0, 3), token("fi", 1, 1),
                        token("network", 1, 1), token("is", 1, 1), token("fast", 1, 1) }),
                    new CannedTokenStream(new Token[] {
                        token("wifi", 1, 1), token("hotspot", 0, 2), token("network", 1, 1) })
                };

                @Override
                public TokenStream getTokenStream() {
                    // Advance the counter only after the array read has succeeded.
                    final TokenStream current = cannedStreams[nextStream];
                    nextStream = nextStream + 1;
                    return current;
                }

                @Override
                protected void setReader(final Reader reader) {
                    // Deliberately empty: the supplied reader is not used.
                }
            };
        }
    };

    final Input[] entries = new Input[] {
        new Input("wifi network is slow", 50),
        new Input("wi fi network is fast", 10)
    };
    final Directory dir = getDirectory();
    final AnalyzingSuggester suggester = new AnalyzingSuggester(dir, "suggest", graphAnalyzer);
    suggester.build(new InputArrayIterator(entries));

    final List<LookupResult> found = suggester.lookup("wifi network", false, 10);
    if (VERBOSE) {
        System.out.println("Results: " + found);
    }

    assertEquals(2, found.size());

    final LookupResult top = found.get(0);
    assertEquals("wifi network is slow", top.key);
    assertEquals(50, top.value);

    final LookupResult runnerUp = found.get(1);
    assertEquals("wi fi network is fast", runnerUp.key);
    assertEquals(10, runnerUp.value);

    IOUtils.close(graphAnalyzer, dir);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) CannedBinaryTokenStream(org.apache.lucene.analysis.CannedBinaryTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Reader(java.io.Reader) Token(org.apache.lucene.analysis.Token) BinaryToken(org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Directory(org.apache.lucene.store.Directory)

Aggregations

Token (org.apache.lucene.analysis.Token)100 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)39 TokenStream (org.apache.lucene.analysis.TokenStream)31 Directory (org.apache.lucene.store.Directory)24 Test (org.junit.Test)23 Document (org.apache.lucene.document.Document)19 TextField (org.apache.lucene.document.TextField)19 BytesRef (org.apache.lucene.util.BytesRef)16 NamedList (org.apache.solr.common.util.NamedList)16 StringReader (java.io.StringReader)15 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)15 Analyzer (org.apache.lucene.analysis.Analyzer)14 ArrayList (java.util.ArrayList)13 Map (java.util.Map)13 Field (org.apache.lucene.document.Field)13 FieldType (org.apache.lucene.document.FieldType)11 IndexReader (org.apache.lucene.index.IndexReader)11 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)10 Tokenizer (org.apache.lucene.analysis.Tokenizer)9 Date (java.util.Date)8