Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
From the class SynonymsAnalysisTests, method match.
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    TokenStream stream = AllTokenStream.allTokenStream("_all", source, 1.0f, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    // Collect every term the analyzer emits, separated by single spaces.
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }
    // Hamcrest convention: actual value first, expected matcher second.
    MatcherAssert.assertThat(sb.toString().trim(), equalTo(target));
}
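The helper follows the standard Lucene consumption contract: obtain the attribute, reset() the stream, loop on incrementToken(), then end() and close(). A minimal self-contained sketch of the same pattern, using a plain WhitespaceAnalyzer rather than an Elasticsearch index analyzer (the class and method names below are illustrative, not from the project):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenDumper {
    // Collects the terms an analyzer produces for the given text.
    static List<String> terms(Analyzer analyzer, String text) throws IOException {
        List<String> result = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                      // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                result.add(termAtt.toString()); // copy the text: the attribute is reused per token
            }
            stream.end();                        // consume end-of-stream state
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(terms(new WhitespaceAnalyzer(), "Hello Lucene world"));
        // prints [Hello, Lucene, world]
    }
}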
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
From the class AnalysisRegistryTests, method testConfigureCamelCaseTokenFilter.
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    Settings indexSettings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
        // Custom word_delimiter filter that keeps digits attached to letters.
        .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
        .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
        .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
        .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
        // Second analyzer uses the stock word_delimiter filter with default flags.
        .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
        .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
        .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
        .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        // split_on_numerics=false: "J2SE" and "j2ee" each survive as one token.
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        // Default word_delimiter splits on letter/number transitions: six tokens.
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}
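The two analyzers differ only in the split_on_numerics flag, which is why "J2SE j2ee" yields two tokens in one case and six in the other. The same effect can be reproduced with Lucene's WordDelimiterFilter directly (Lucene 6.x-era API, matching this code base). A hedged sketch: the flag set below approximates, but is not claimed to be identical to, the defaults Elasticsearch applies to word_delimiter, and the demo class name is made up:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WordDelimiterDemo {
    static void dump(boolean splitOnNumerics) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("J2SE j2ee"));
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
        if (splitOnNumerics) {
            flags |= WordDelimiterFilter.SPLIT_ON_NUMERICS; // split at letter/digit transitions
        }
        try (TokenStream ts = new WordDelimiterFilter(new LowerCaseFilter(tokenizer), flags, null)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print(term + " ");
            }
            ts.end();
            System.out.println();
        }
    }

    public static void main(String[] args) throws IOException {
        dump(false); // j2se j2ee
        dump(true);  // j 2 se j 2 ee
    }
}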
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
From the class SimplePolishTokenFilterTests, method testToken.
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
        .put("index.analysis.filter.myStemmer.type", "polish_stem")
        .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");
    // KeywordTokenizer passes the whole input through as a single token,
    // so the stemmer's output can be checked term-for-term.
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));
    assertThat(term1.toString(), equalTo(expected));
}
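Because the filter under test receives the entire input string as one token, a single incrementToken() call yields the stemmed form. The same pattern works for any TokenFilter; here is a self-contained sketch using Lucene's PorterStemFilter, chosen only because its output is predictable without the Stempel plugin (class and method names are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SingleTokenFilterDemo {
    // Feeds one term through a filter and returns the (single) output term.
    static String stem(String source) throws IOException {
        Tokenizer tokenizer = new KeywordTokenizer();   // emits the input as one token
        tokenizer.setReader(new StringReader(source));
        try (TokenStream ts = new PorterStemFilter(tokenizer)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken() == false) {
                throw new IllegalStateException("filter produced no token");
            }
            String result = term.toString();
            ts.end();
            return result;
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(stem("running")); // prints "run"
    }
}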
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
From the class SimplePolishTokenFilterTests, method testAnalyzer.
private void testAnalyzer(String source, String... expectedTerms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisStempelPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("polish").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    // Check the stream term-by-term against the expected sequence.
    for (String expected : expectedTerms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
}
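Note that the loop only verifies that the stream starts with the expected terms: extra trailing tokens would pass unnoticed, and the stream is never ended or closed. A possible stricter tail for the helper (the added lines are a suggestion, not part of the original test):

// After the for-loop over expectedTerms:
assertThat(ts.incrementToken(), equalTo(false)); // no unexpected trailing tokens
ts.end();    // finalize end-of-stream state
ts.close();  // release the stream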
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.
From the class XMoreLikeThis, method addTermFrequencies.
/**
 * Adds term frequencies found by tokenizing text from the reader into the Map termFreqMap.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            if (isSkipTerm(fieldName, word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
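The map values use a small mutable counter instead of boxed Integers, so repeated terms are counted without allocating a new object per increment. Lucene's MoreLikeThis nests this helper roughly as follows (a sketch from memory, not copied from the file above):

// Minimal mutable counter along the lines of the Int helper nested in
// Lucene's MoreLikeThis.
private static class Int {
    int x = 1; // starts at 1: a term is only inserted on its first occurrence
}

On Java 8 and later, the same bookkeeping could be written as termFreqMap.merge(word, 1, Integer::sum) over a Map<String, Integer>, at the cost of boxing on every increment.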