Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the elasticsearch project by elastic.
From the class CompoundAnalysisTests, method analyze.
/**
 * Analyzes {@code text} with the analyzer registered under {@code analyzerName} and returns the
 * terms it emits, in emission order.
 *
 * <p>The analysis registry is built from {@code settings} with one extra token filter,
 * {@code "myfilter"}, contributed through an anonymous {@link AnalysisPlugin}.
 *
 * @param settings     settings used to build both the index settings and the environment
 * @param analyzerName name of the analyzer to look up in the built {@link IndexAnalyzers}
 * @param text         text to tokenize
 * @return the string form of every token produced for {@code text}
 * @throws IOException if building the analyzers or consuming the token stream fails
 */
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
        }
    }));
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    List<String> terms = new ArrayList<>();
    // try-with-resources guarantees the stream is closed even if tokenization throws.
    try (TokenStream stream = AllTokenStream.allTokenStream("_all", text, 1.0f, analyzer)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            terms.add(termAtt.toString());
        }
        // Complete the consumer workflow: end() follows the final incrementToken().
        stream.end();
    }
    return terms;
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the elasticsearch project by elastic.
From the class DocumentFieldMapperTests, method assertAnalyzes.
/**
 * Asserts that {@code analyzer}, applied to {@code field} with empty input, emits at least one
 * token and that the first token's text equals {@code output}.
 *
 * @param analyzer analyzer under test
 * @param field    field name passed to {@link Analyzer#tokenStream}
 * @param output   expected text of the first emitted token
 * @throws IOException if the token stream fails while being reset, advanced, or closed
 */
private void assertAnalyzes(Analyzer analyzer, String field, String output) throws IOException {
    try (TokenStream tok = analyzer.tokenStream(field, new StringReader(""))) {
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        // The TokenStream consumer contract requires reset() before the first incrementToken().
        tok.reset();
        assertTrue(tok.incrementToken());
        assertEquals(output, term.toString());
    }
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the elasticsearch project by elastic.
From the class MapperQueryParser, method getPossiblyAnalyzedPrefixQuery.
/**
 * Builds a prefix query for {@code termStr}, analyzing the term first when the parser settings
 * enable wildcard analysis.
 *
 * <p>When analysis is disabled, or the analyzer's token stream cannot be opened or reset, the raw
 * term is handed to the superclass prefix query unchanged. Otherwise the emitted tokens are
 * grouped by position: every position except the last becomes a term (or synonym) query, and the
 * last position becomes a prefix query (or a disjunction of prefix queries).
 *
 * @param field   field the query targets
 * @param termStr the user-supplied prefix term
 * @return the resulting query, or {@code null} if analysis produced no tokens
 * @throws ParseException propagated from the superclass query factories
 */
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (settings.analyzeWildcard() == false) {
        return super.getPrefixQuery(field, termStr);
    }
    // One inner list per token position; each inner list holds the terms emitted at that position.
    List<List<String>> positions;
    TokenStream stream = null;
    try {
        try {
            stream = getAnalyzer().tokenStream(field, termStr);
            stream.reset();
        } catch (IOException e) {
            // The stream could not be started: fall back to the unanalyzed term.
            return super.getPrefixQuery(field, termStr);
        }
        positions = new ArrayList<>();
        List<String> sameSlot = new ArrayList<>();
        CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute incrementAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        for (;;) {
            boolean advanced;
            try {
                advanced = stream.incrementToken();
            } catch (IOException e) {
                // A failure mid-stream simply ends tokenization; tokens read so far are kept.
                advanced = false;
            }
            if (advanced == false) {
                break;
            }
            // A positive increment starts a new position, so flush the terms gathered so far.
            if (!sameSlot.isEmpty() && incrementAttribute.getPositionIncrement() > 0) {
                positions.add(sameSlot);
                sameSlot = new ArrayList<>();
            }
            sameSlot.add(termAttribute.toString());
        }
        if (!sameSlot.isEmpty()) {
            positions.add(sameSlot);
        }
    } finally {
        if (stream != null) {
            IOUtils.closeWhileHandlingException(stream);
        }
    }

    if (positions.isEmpty()) {
        return null;
    }
    if (positions.size() == 1 && positions.get(0).size() == 1) {
        // A single token at a single position: a plain prefix query on the analyzed term.
        return super.getPrefixQuery(field, positions.get(0).get(0));
    }

    // Combine the positions into a boolean query; only the last position is treated as a prefix.
    List<BooleanClause> clauses = new ArrayList<>();
    final int lastIndex = positions.size() - 1;
    for (int index = 0; index <= lastIndex; index++) {
        List<String> tokens = positions.get(index);
        Query positionQuery;
        if (index == lastIndex) {
            if (tokens.size() == 1) {
                positionQuery = super.getPrefixQuery(field, tokens.get(0));
            } else {
                // Several terms share the last position: any of them may act as the prefix.
                List<BooleanClause> alternatives = new ArrayList<>();
                for (String token : tokens) {
                    alternatives.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
                }
                positionQuery = getBooleanQueryCoordDisabled(alternatives);
            }
        } else if (tokens.size() == 1) {
            positionQuery = newTermQuery(new Term(field, tokens.get(0)));
        } else {
            // Terms sharing a non-final position are combined into a synonym query.
            Term[] synonyms = new Term[tokens.size()];
            int slot = 0;
            for (String token : tokens) {
                synonyms[slot++] = new Term(field, token);
            }
            positionQuery = new SynonymQuery(synonyms);
        }
        BooleanClause.Occur occur;
        if (getDefaultOperator() == Operator.AND) {
            occur = BooleanClause.Occur.MUST;
        } else {
            occur = BooleanClause.Occur.SHOULD;
        }
        clauses.add(new BooleanClause(positionQuery, occur));
    }
    return getBooleanQuery(clauses);
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the elasticsearch project by elastic.
From the class MoreLikeThisQuery, method handleUnlike.
/**
 * Collects the terms found in the "unlike" inputs and, if there are any, registers them on
 * {@code mlt} as skip terms.
 *
 * @param mlt          receiver of the collected skip terms
 * @param unlikeText   free-text inputs to analyze; may be {@code null}
 * @param unlikeFields per-document field/term structures to enumerate; may be {@code null}
 * @throws IOException if analyzing text or enumerating terms fails
 */
private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    final Set<Term> excluded = new HashSet<>();

    // Unlike text: analyze each snippet and record every token it yields.
    if (unlikeText != null) {
        for (String snippet : unlikeText) {
            // Every snippet is analyzed against the first configured field, for consistency.
            final String targetField = moreLikeFields[0];
            try (TokenStream tokens = analyzer.tokenStream(targetField, snippet)) {
                CharTermAttribute termAttribute = tokens.addAttribute(CharTermAttribute.class);
                tokens.reset();
                while (tokens.incrementToken()) {
                    excluded.add(new Term(targetField, termAttribute.toString()));
                }
                tokens.end();
            }
        }
    }

    // Unlike fields: walk every term of every field and record it under its own field name.
    if (unlikeFields != null) {
        for (Fields docFields : unlikeFields) {
            for (String name : docFields) {
                final TermsEnum cursor = docFields.terms(name).iterator();
                for (BytesRef bytes = cursor.next(); bytes != null; bytes = cursor.next()) {
                    excluded.add(new Term(name, bytes.utf8ToString()));
                }
            }
        }
    }

    if (excluded.isEmpty()) {
        return;
    }
    mlt.setSkipTerms(excluded);
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the OpenGrok project by OpenGrok.
From the class Summarizer, method getTokens.
/**
 * Tokenizes {@code text} with this instance's analyzer (field name {@code "full"}) and returns
 * one {@code SToken} per emitted token, built from the term buffer and its start/end offsets.
 *
 * @param text the text to tokenize
 * @return the tokens in emission order
 * @throws IOException if the token stream fails
 */
private SToken[] getTokens(String text) throws IOException {
    // FIXME: fold this loop into getSummary to avoid the extra cloning and memory use.
    // Building Tokens this way is also suboptimal with 3.0.0; the whole class could be
    // replaced by the highlighter.
    ArrayList<SToken> collected = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("full", text)) {
        CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            collected.add(new SToken(termAttribute.buffer(), 0, termAttribute.length(),
                    offsetAttribute.startOffset(), offsetAttribute.endOffset()));
        }
        stream.end();
    }
    SToken[] tokens = new SToken[collected.size()];
    return collected.toArray(tokens);
}
Aggregations