Search in sources:

Example 86 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

This example is taken from the class SynonymsAnalysisTests, in the method match.

/**
 * Analyzes {@code source} with the named analyzer and asserts that the
 * resulting tokens, joined with single spaces, equal {@code target}.
 *
 * @param analyzerName key used to look up the analyzer in {@code indexAnalyzers}
 * @param source       text to analyze
 * @param target       expected space-separated token string
 * @throws IOException if the token stream fails
 */
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    // try-with-resources: the original leaked the stream and skipped end(),
    // violating the TokenStream workflow contract (reset -> incrementToken* -> end -> close)
    try (TokenStream stream = AllTokenStream.allTokenStream("_all", source, 1.0f, analyzer)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        StringBuilder sb = new StringBuilder();
        while (stream.incrementToken()) {
            sb.append(termAtt.toString()).append(" ");
        }
        stream.end();
        // actual value first, expectation second, so failure messages read correctly
        MatcherAssert.assertThat(sb.toString().trim(), equalTo(target));
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) AllTokenStream(org.elasticsearch.common.lucene.all.AllTokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 87 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

This example is taken from the class AnalysisRegistryTests, in the method testConfigureCamelCaseTokenFilter.

/**
 * Verifies that a custom "word_delimiter" filter configured with
 * {@code split_on_numerics = false} keeps "j2se"/"j2ee" intact, while the
 * default "word_delimiter" filter splits them into letter/digit runs.
 */
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).put("index.analysis.filter.wordDelimiter.type", "word_delimiter").put("index.analysis.filter.wordDelimiter.split_on_numerics", false).put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace").putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter").put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace").putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        // split_on_numerics=false: alphanumeric runs stay whole
        List<String> token = collectTerms(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        // default word_delimiter: splits on letter/digit boundaries
        List<String> token = collectTerms(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}

/**
 * Runs {@code text} through {@code analyzer} and returns every produced term.
 * The stream is ended and closed, which the original inline loops omitted.
 */
private static List<String> collectTerms(NamedAnalyzer analyzer, String text) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("foo", text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        List<String> terms = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        return terms;
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IndexSettings(org.elasticsearch.index.IndexSettings) ArrayList(java.util.ArrayList) Environment(org.elasticsearch.env.Environment) AnalysisModule(org.elasticsearch.indices.analysis.AnalysisModule) Settings(org.elasticsearch.common.settings.Settings) IndexSettings(org.elasticsearch.index.IndexSettings)

Example 88 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

This example is taken from the class SimplePolishTokenFilterTests, in the method testToken.

/**
 * Runs {@code source} through the "polish_stem" token filter (keyword
 * tokenizer, so the whole input is a single token) and asserts the stemmed
 * term equals {@code expected}.
 *
 * @param source   input text to stem
 * @param expected expected stemmed form of the single token
 * @throws IOException if the token stream fails
 */
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder().put("index.analysis.filter.myStemmer.type", "polish_stem").build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    // try-with-resources + end(): the original leaked the stream and never
    // completed the TokenStream workflow
    try (TokenStream ts = filterFactory.create(tokenizer)) {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
        ts.end();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) Index(org.elasticsearch.index.Index) AnalysisStempelPlugin(org.elasticsearch.plugin.analysis.stempel.AnalysisStempelPlugin) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Settings(org.elasticsearch.common.settings.Settings)

Example 89 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

This example is taken from the class SimplePolishTokenFilterTests, in the method testAnalyzer.

/**
 * Analyzes {@code source} with the "polish" analyzer and asserts the produced
 * terms match {@code expected_terms} in order. Note: only asserts at least as
 * many tokens as expected terms are produced; extra trailing tokens are not flagged.
 *
 * @param source         input text to analyze
 * @param expected_terms expected terms, in stream order
 * @throws IOException if the token stream fails
 */
private void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisStempelPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("polish").analyzer();
    // try-with-resources + end(): the original leaked the stream, which breaks
    // analyzer reuse since Lucene analyzers recycle TokenStream components
    try (TokenStream ts = analyzer.tokenStream("test", source)) {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : expected_terms) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        ts.end();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Index(org.elasticsearch.index.Index) AnalysisStempelPlugin(org.elasticsearch.plugin.analysis.stempel.AnalysisStempelPlugin) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 90 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

This example is taken from the class XMoreLikeThis, in the method addTermFrequencies.

/**
 * Tokenizes the text from {@code r} and accumulates per-term frequencies into
 * {@code termFreqMap}, skipping noise words and explicitly excluded terms.
 * Stops once more than {@code maxNumTokensParsed} tokens have been consumed.
 *
 * @param r           a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName   Used by analyzer for any special per-field analysis
 * @throws UnsupportedOperationException if no analyzer is configured
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int tokenCount = 0;
        while (ts.incrementToken()) {
            // cap total work: the token that pushes the count past the limit
            // is itself discarded, matching the original ordering
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            String word = termAtt.toString();
            // filters are checked in the same order as before: noise first, then skip-terms
            if (isNoiseWord(word) || isSkipTerm(fieldName, word)) {
                continue;
            }
            // first sighting inserts a fresh counter; later sightings bump it
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)151 TokenStream (org.apache.lucene.analysis.TokenStream)95 StringReader (java.io.StringReader)46 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)35 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)34 IOException (java.io.IOException)27 ArrayList (java.util.ArrayList)27 Tokenizer (org.apache.lucene.analysis.Tokenizer)25 Analyzer (org.apache.lucene.analysis.Analyzer)20 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)16 BytesRef (org.apache.lucene.util.BytesRef)15 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)13 LinkedList (java.util.LinkedList)11 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)10 Term (org.apache.lucene.index.Term)10 HashMap (java.util.HashMap)9 Token (org.apache.lucene.analysis.Token)8 Document (org.apache.lucene.document.Document)8 List (java.util.List)7 HashSet (java.util.HashSet)6