
Example 51 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

From the class FuzzySuggesterTest, method testGraphDups:

public void testGraphDups() throws Exception {
    final Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer) {

                int tokenStreamCounter = 0;

                final TokenStream[] tokenStreams = new TokenStream[] {
                    // analyzed form of "wifi network is slow": "hotspot" stacks on "wifi"
                    // (position increment 0) and spans two positions
                    new CannedTokenStream(new Token[] {
                        token("wifi", 1, 1), token("hotspot", 0, 2), token("network", 1, 1),
                        token("is", 1, 1), token("slow", 1, 1) }),
                    // analyzed form of "wi fi network is fast": "hotspot" spans three positions
                    new CannedTokenStream(new Token[] {
                        token("wi", 1, 1), token("hotspot", 0, 3), token("fi", 1, 1),
                        token("network", 1, 1), token("is", 1, 1), token("fast", 1, 1) }),
                    // analyzed form of the lookup text "wifi network"
                    new CannedTokenStream(new Token[] {
                        token("wifi", 1, 1), token("hotspot", 0, 2), token("network", 1, 1) })
                };

                @Override
                public TokenStream getTokenStream() {
                    TokenStream result = tokenStreams[tokenStreamCounter];
                    tokenStreamCounter++;
                    return result;
                }

                @Override
                protected void setReader(final Reader reader) {
                }
            };
        }
    };
    Input[] keys = new Input[] { new Input("wifi network is slow", 50), new Input("wi fi network is fast", 10) };
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer);
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup("wifi network", false, 10);
    if (VERBOSE) {
        System.out.println("Results: " + results);
    }
    assertEquals(2, results.size());
    assertEquals("wifi network is slow", results.get(0).key);
    assertEquals(50, results.get(0).value);
    assertEquals("wi fi network is fast", results.get(1).key);
    assertEquals(10, results.get(1).value);
    IOUtils.close(tempDir, analyzer);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Reader(java.io.Reader) Token(org.apache.lucene.analysis.Token) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Tokenizer(org.apache.lucene.analysis.Tokenizer) Directory(org.apache.lucene.store.Directory)
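
For comparison, here is a minimal, self-contained sketch of the same build-and-lookup flow without the canned-token-stream scaffolding, assuming a plain StandardAnalyzer and a RAMDirectory; the class name FuzzySuggesterSketch and the misspelled lookup text are made up for illustration.

import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;

public class FuzzySuggesterSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Directory tempDir = new RAMDirectory();
        FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer);
        // Build from weighted inputs; higher weights rank earlier in the results.
        suggester.build(new InputArrayIterator(new Input[] {
            new Input("wifi network is slow", 50),
            new Input("wi fi network is fast", 10)
        }));
        // "wifo" is one edit from "wifi", within FuzzySuggester's default of one edit,
        // so the first entry should still be suggested despite the typo.
        List<LookupResult> results = suggester.lookup("wifo network", false, 10);
        for (LookupResult result : results) {
            System.out.println(result.key + " => " + result.value);
        }
        IOUtils.close(tempDir, analyzer);
    }
}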

Example 52 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

From the class CompletionTokenStreamTest, method testWithMultipleTokens:

@Test
public void testWithMultipleTokens() throws Exception {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword another keyword";
    tokenStream.setReader(new StringReader(input));
    BytesRef payload = new BytesRef("payload");
    CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream);
    completionTokenStream.setPayload(payload);
    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
    CharsRefBuilder builder = new CharsRefBuilder();
    builder.append("mykeyword");
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("another");
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("keyword");
    assertTokenStreamContents(stream,
        new String[] { builder.toCharsRef().toString() },  // expected terms
        null, null,                                        // start/end offsets not checked
        new String[] { payload.utf8ToString() },           // payload surfaced as type by the filter
        new int[] { 1 },                                   // position increments
        null, null);                                       // position lengths, final offset not checked
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) StringReader(java.io.StringReader) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Tokenizer(org.apache.lucene.analysis.Tokenizer) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)
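
The assertion above relies on test-framework helpers; the following sketch shows the standard reset/incrementToken/end/close loop for consuming any Tokenizer directly, assuming WhitespaceTokenizer (listed in the aggregations below) in place of the test-only MockTokenizer.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerConsumeSketch {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("mykeyword another keyword"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                          // required before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt.toString()); // mykeyword, another, keyword
        }
        tokenizer.end();                            // finalizes offset state
        tokenizer.close();
    }
}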

Example 53 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

From the class TestQueryBuilder, method testNoTermAttribute:

public void testNoTermAttribute() {
    //Can't use MockTokenizer because it adds TermAttribute and we don't want that
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new Tokenizer() {

                boolean wasReset = false;

                @Override
                public void reset() throws IOException {
                    super.reset();
                    assertFalse(wasReset);
                    wasReset = true;
                }

                @Override
                public boolean incrementToken() throws IOException {
                    assertTrue(wasReset);
                    return false;
                }
            });
        }
    };
    QueryBuilder builder = new QueryBuilder(analyzer);
    assertNull(builder.createBooleanQuery("field", "whatever"));
}
Also used : IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer)
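
For contrast with the token-less analyzer above, a minimal sketch of QueryBuilder with an ordinary analyzer, assuming StandardAnalyzer: a single analyzed term comes back as a TermQuery, several terms as a BooleanQuery of SHOULD clauses, and only an empty token stream yields null.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

public class QueryBuilderSketch {
    public static void main(String[] args) {
        Analyzer analyzer = new StandardAnalyzer();
        QueryBuilder builder = new QueryBuilder(analyzer);
        Query single = builder.createBooleanQuery("field", "whatever");
        System.out.println(single);  // field:whatever
        Query multi = builder.createBooleanQuery("field", "fast wifi");
        System.out.println(multi);   // field:fast field:wifi
        analyzer.close();
    }
}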

Example 54 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

From the class AnalysisRequestHandlerBase, method analyzeValue:

/**
   * Analyzes the given value using the given Analyzer.
   *
   * @param value   Value to analyze
   * @param context The {@link AnalysisContext analysis context}.
   *
   * @return NamedList containing the tokens produced by analyzing the given value
   */
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {
    Analyzer analyzer = context.getAnalyzer();
    if (!TokenizerChain.class.isInstance(analyzer)) {
        try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
            NamedList<List<NamedList>> namedList = new NamedList<>();
            namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
            return namedList;
        } catch (IOException e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
    }
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
    NamedList<Object> namedList = new NamedList<>();
    if (0 < cfiltfacs.length) {
        String source = value;
        for (CharFilterFactory cfiltfac : cfiltfacs) {
            Reader reader = new StringReader(source);
            reader = cfiltfac.create(reader);
            source = writeCharStream(namedList, reader);
        }
    }
    TokenStream tokenStream = tfac.create();
    ((Tokenizer) tokenStream).setReader(tokenizerChain.initReader(null, new StringReader(value)));
    List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, tokens);
    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
        for (final AttributeSource tok : tokens) {
            tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
        }
        // overwrite the vars "tokenStream", "tokens", and "listBasedTokenStream"
        tokenStream = tokenFilterFactory.create(listBasedTokenStream);
        tokens = analyzeTokenStream(tokenStream);
        namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
        listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, tokens);
    }
    return namedList;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) AttributeSource(org.apache.lucene.util.AttributeSource) NamedList(org.apache.solr.common.util.NamedList) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) Reader(java.io.Reader) StringReader(java.io.StringReader) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) TokenizerChain(org.apache.solr.analysis.TokenizerChain) ArrayList(java.util.ArrayList) List(java.util.List) Tokenizer(org.apache.lucene.analysis.Tokenizer) SolrException(org.apache.solr.common.SolrException)
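
As a companion to the unpacking logic above, here is a hypothetical sketch of constructing the kind of TokenizerChain this handler introspects, assuming WhitespaceTokenizerFactory and LowerCaseFilterFactory accept empty argument maps (as they do when no parameters are required).

import java.util.HashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;

public class TokenizerChainSketch {
    public static void main(String[] args) throws Exception {
        TokenizerChain chain = new TokenizerChain(
            new WhitespaceTokenizerFactory(new HashMap<>()),
            new TokenFilterFactory[] { new LowerCaseFilterFactory(new HashMap<>()) });
        // TokenizerChain is an Analyzer, so it can produce a TokenStream directly.
        try (TokenStream tokenStream = chain.tokenStream("field", "Some MIXED Case Text")) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(term); // some, mixed, case, text
            }
            tokenStream.end();
        }
        chain.close();
    }
}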

Example 55 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

From the class TestPayloadCheckQuery, method beforeClass:

@BeforeClass
public static void beforeClass() throws Exception {
    Analyzer simplePayloadAnalyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer, new SimplePayloadFilter(tokenizer));
        }
    };
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(simplePayloadAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));
    //writer.infoStream = System.out;
    for (int i = 0; i < 2000; i++) {
        Document doc = new Document();
        doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES));
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) Tokenizer(org.apache.lucene.analysis.Tokenizer) SimplePayloadFilter(org.apache.lucene.analysis.SimplePayloadFilter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BeforeClass(org.junit.BeforeClass)
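
The TokenStreamComponents(tokenizer, filter) pattern above is the standard way to wrap a Tokenizer with a filter chain; here is a minimal sketch of the same pattern with non-test classes, assuming WhitespaceTokenizer and LowerCaseFilter in place of MockTokenizer and the test-only SimplePayloadFilter.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class LowercasingAnalyzerSketch {
    public static void main(String[] args) {
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenStream filtered = new LowerCaseFilter(tokenizer);
                // The components pair the source tokenizer with the end of the filter chain.
                return new TokenStreamComponents(tokenizer, filtered);
            }
        };
        // The analyzer can now be handed to an IndexWriterConfig, as in beforeClass() above.
        analyzer.close();
    }
}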

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 573 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 286 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 265 usages
StringReader (java.io.StringReader): 249 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 227 usages
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216 usages
Reader (java.io.Reader): 91 usages
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 67 usages
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 63 usages
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 52 usages
StopFilter (org.apache.lucene.analysis.StopFilter): 48 usages
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47 usages
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 45 usages
CharArraySet (org.apache.lucene.analysis.CharArraySet): 43 usages
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 36 usages
ESTestCase (org.elasticsearch.test.ESTestCase): 30 usages
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 26 usages
HashMap (java.util.HashMap): 23 usages
Random (java.util.Random): 20 usages
TokenFilter (org.apache.lucene.analysis.TokenFilter): 19 usages