Use of org.apache.lucene.analysis.Analyzer in the elastic/elasticsearch project.
The highlight method of the PostingsHighlighter class:
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    FieldMapper fieldMapper = highlighterContext.mapper;
    SearchContextHighlight.Field field = highlighterContext.field;
    if (canHighlight(fieldMapper) == false) {
        throw new IllegalArgumentException("the field [" + highlighterContext.fieldName + "] should be indexed with positions and offsets in the postings list to be used with postings highlighter");
    }
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
    if (mapperHighlighterEntry == null) {
        Encoder encoder = field.fieldOptions().encoder().equals("html") ? Encoders.HTML : Encoders.DEFAULT;
        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
        mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
    }
    List<Snippet> snippets = new ArrayList<>();
    int numberOfFragments;
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
        CustomPostingsHighlighter highlighter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            // we use a control char to separate values, which is the only char that the custom break iterator breaks the text on,
            // so we don't lose the distinction between the different values of a field and we get back a snippet per value
            String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
            CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
            highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
            // we are highlighting the whole content, one snippet per value
            numberOfFragments = fieldValues.size();
        } else {
            // using the paragraph separator we make sure that each field value holds a discrete passage for highlighting
            String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
            highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, fieldValue, field.fieldOptions().noMatchSize() > 0);
            numberOfFragments = field.fieldOptions().numberOfFragments();
        }
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.fieldType().name(), highlighterContext.query, searcher, hitContext.docId(), numberOfFragments);
        for (Snippet fieldSnippet : fieldSnippets) {
            if (Strings.hasText(fieldSnippet.getText())) {
                snippets.add(fieldSnippet);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
    if (field.fieldOptions().scoreOrdered()) {
        // let's sort the snippets by score if needed
        CollectionUtil.introSort(snippets, new Comparator<Snippet>() {
            @Override
            public int compare(Snippet o1, Snippet o2) {
                return (int) Math.signum(o2.getScore() - o1.getScore());
            }
        });
    }
    String[] fragments = new String[snippets.size()];
    for (int i = 0; i < fragments.length; i++) {
        fragments[i] = snippets.get(i).getText();
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    return null;
}
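The per-value behavior in the numberOfFragments == 0 branch comes from pairing a single-character separator with a break iterator that only breaks on that character. The following standalone sketch is not Elasticsearch code; it assumes a NUL separator (analogous to HighlightUtils.NULL_SEPARATOR) and uses Lucene's CustomSeparatorBreakIterator directly to show how merged values come back out as one segment per original value.

import java.text.BreakIterator;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;

public class SeparatorBreakIteratorSketch {
    public static void main(String[] args) {
        char sep = '\u0000'; // stands in for HighlightUtils.NULL_SEPARATOR (assumption for illustration)
        String merged = "first value" + sep + "second value" + sep + "third value";
        BreakIterator breakIterator = new CustomSeparatorBreakIterator(sep);
        breakIterator.setText(merged);
        int start = breakIterator.first();
        for (int end = breakIterator.next(); end != BreakIterator.DONE; start = end, end = breakIterator.next()) {
            // each segment corresponds to exactly one of the original field values
            System.out.println(merged.substring(start, end).replace(sep, ' ').trim());
        }
    }
}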
Use of org.apache.lucene.analysis.Analyzer in the elastic/elasticsearch project.
The highlight method of the UnifiedHighlighter class:
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    FieldMapper fieldMapper = highlighterContext.mapper;
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
    if (mapperHighlighterEntry == null) {
        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
        mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
    }
    List<Snippet> snippets = new ArrayList<>();
    int numberOfFragments;
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
        fieldValues = fieldValues.stream().map(obj -> {
            if (obj instanceof BytesRef) {
                return fieldMapper.fieldType().valueForDisplay(obj).toString();
            } else {
                return obj;
            }
        }).collect(Collectors.toList());
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        CustomUnifiedHighlighter highlighter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            // we use a control char to separate values, which is the only char that the custom break iterator
            // breaks the text on, so we don't lose the distinction between the different values of a field and we
            // get back a snippet per value
            String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
            org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator = new org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
            // we are highlighting the whole content, one snippet per value
            numberOfFragments = fieldValues.size();
        } else {
            // using the paragraph separator we make sure that each field value holds a discrete passage for highlighting
            String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
            BreakIterator bi = getBreakIterator(field);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize());
            numberOfFragments = field.fieldOptions().numberOfFragments();
        }
        if (field.fieldOptions().requireFieldMatch()) {
            final String fieldName = highlighterContext.fieldName;
            highlighter.setFieldMatcher((name) -> fieldName.equals(name));
        } else {
            highlighter.setFieldMatcher((name) -> true);
        }
        Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName, highlighterContext.query, hitContext.docId(), numberOfFragments);
        for (Snippet fieldSnippet : fieldSnippets) {
            if (Strings.hasText(fieldSnippet.getText())) {
                snippets.add(fieldSnippet);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
    if (field.fieldOptions().scoreOrdered()) {
        // let's sort the snippets by score if needed
        CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore()));
    }
    String[] fragments = new String[snippets.size()];
    for (int i = 0; i < fragments.length; i++) {
        fragments[i] = snippets.get(i).getText();
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    return null;
}
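CustomUnifiedHighlighter above is an Elasticsearch subclass of Lucene's unified highlighter. For reference, here is a minimal, self-contained sketch (mine, not from the project) of the plain org.apache.lucene.search.uhighlight.UnifiedHighlighter doing the same basic job: index one document in memory, run a query, and ask for highlighted fragments.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.RAMDirectory;

public class UnifiedHighlighterSketch {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        RAMDirectory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            Document doc = new Document();
            doc.add(new TextField("body", "The quick brown fox jumps over the lazy dog", Field.Store.YES));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TermQuery query = new TermQuery(new Term("body", "fox"));
            TopDocs topDocs = searcher.search(query, 10);
            UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
            String[] fragments = highlighter.highlight("body", query, topDocs);
            for (String fragment : fragments) {
                System.out.println(fragment); // e.g. "The quick brown <b>fox</b> jumps over the lazy dog"
            }
        }
    }
}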
Use of org.apache.lucene.analysis.Analyzer in the elastic/elasticsearch project.
The indexDocsWithLucene method of the AbstractTermVectorsTestCase class:
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
    Map<String, Analyzer> mapping = new HashMap<>();
    for (TestFieldSetting field : testDocs[0].fieldSettings) {
        if (field.storedPayloads) {
            mapping.put(field.name, new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new StandardTokenizer();
                    TokenFilter filter = new LowerCaseFilter(tokenizer);
                    filter = new TypeAsPayloadTokenFilter(filter);
                    return new TokenStreamComponents(tokenizer, filter);
                }
            });
        }
    }
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), mapping);
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, conf);
    for (TestDoc doc : testDocs) {
        Document d = new Document();
        d.add(new Field("id", doc.id, StringField.TYPE_STORED));
        for (int i = 0; i < doc.fieldContent.length; i++) {
            FieldType type = new FieldType(TextField.TYPE_STORED);
            TestFieldSetting fieldSetting = doc.fieldSettings[i];
            type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
            type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
            type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
            type.setStoreTermVectors(true);
            type.freeze();
            d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
        }
        writer.updateDocument(new Term("id", doc.id), d);
        writer.commit();
    }
    writer.close();
    return DirectoryReader.open(dir);
}
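The writer above stores term vectors (with positions, offsets, and payloads depending on each field's settings) so the test can compare them against Elasticsearch's term vectors API. A small companion helper, sketched here and not part of AbstractTermVectorsTestCase, shows how those vectors can be read back from the returned DirectoryReader; the field name parameter is whatever was indexed above.

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper for illustration only.
static void printTermVector(DirectoryReader reader, int docId, String fieldName) throws IOException {
    Terms terms = reader.getTermVector(docId, fieldName);
    if (terms == null) {
        return; // no term vector was stored for this field
    }
    TermsEnum termsEnum = terms.iterator();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
    }
}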
Use of org.apache.lucene.analysis.Analyzer in the elastic/elasticsearch project.
The shardOperation method of the TransportAnalyzeAction class:
@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, ShardId shardId) {
    try {
        final IndexService indexService;
        if (shardId != null) {
            indexService = indicesService.indexServiceSafe(shardId.getIndex());
        } else {
            indexService = null;
        }
        String field = null;
        Analyzer analyzer = null;
        if (request.field() != null) {
            if (indexService == null) {
                throw new IllegalArgumentException("No index provided, and trying to analyzer based on a specific field which requires the index parameter");
            }
            MappedFieldType fieldType = indexService.mapperService().fullName(request.field());
            if (fieldType != null) {
                if (fieldType.tokenized()) {
                    analyzer = fieldType.indexAnalyzer();
                } else if (fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
                    analyzer = ((KeywordFieldMapper.KeywordFieldType) fieldType).normalizer();
                    if (analyzer == null) {
                        // this will be KeywordAnalyzer
                        analyzer = fieldType.indexAnalyzer();
                    }
                } else {
                    throw new IllegalArgumentException("Can't process field [" + request.field() + "], Analysis requests are only supported on tokenized fields");
                }
                field = fieldType.name();
            }
        }
        if (field == null) {
            if (indexService != null) {
                field = indexService.getIndexSettings().getDefaultField();
            } else {
                field = AllFieldMapper.NAME;
            }
        }
        final AnalysisRegistry analysisRegistry = indicesService.getAnalysis();
        return analyze(request, field, analyzer, indexService != null ? indexService.getIndexAnalyzers() : null, analysisRegistry, environment);
    } catch (IOException e) {
        throw new ElasticsearchException("analysis failed", e);
    }
}
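Once shardOperation has resolved an Analyzer (from the field mapping, a named analyzer, or the default), the analyze(...) call essentially runs the request text through that analyzer's TokenStream and reports the emitted tokens. A minimal standalone sketch of that core step, using StandardAnalyzer purely as a stand-in for whatever analyzer was resolved:

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzeSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // stand-in for the analyzer resolved in shardOperation
        List<String> tokens = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream("body", "Quick Brown FOX")) {
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                tokens.add(termAttr.toString());
            }
            stream.end();
        }
        System.out.println(tokens); // [quick, brown, fox]
    }
}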
Use of org.apache.lucene.analysis.Analyzer in the elastic/elasticsearch project.
The doToQuery method of the MoreLikeThisQueryBuilder class:
@Override
protected Query doToQuery(QueryShardContext context) throws IOException {
    Item[] likeItems = new Item[this.likeItems.length];
    for (int i = 0; i < likeItems.length; i++) {
        likeItems[i] = new Item(this.likeItems[i]);
    }
    Item[] unlikeItems = new Item[this.unlikeItems.length];
    for (int i = 0; i < unlikeItems.length; i++) {
        unlikeItems[i] = new Item(this.unlikeItems[i]);
    }
    MoreLikeThisQuery mltQuery = new MoreLikeThisQuery();
    // set similarity
    mltQuery.setSimilarity(context.getSearchSimilarity());
    // set query parameters
    mltQuery.setMaxQueryTerms(maxQueryTerms);
    mltQuery.setMinTermFrequency(minTermFreq);
    mltQuery.setMinDocFreq(minDocFreq);
    mltQuery.setMaxDocFreq(maxDocFreq);
    mltQuery.setMinWordLen(minWordLength);
    mltQuery.setMaxWordLen(maxWordLength);
    mltQuery.setMinimumShouldMatch(minimumShouldMatch);
    if (stopWords != null) {
        mltQuery.setStopWords(new HashSet<>(Arrays.asList(stopWords)));
    }
    // set boost terms
    if (boostTerms != 0) {
        mltQuery.setBoostTerms(true);
        mltQuery.setBoostTermsFactor(boostTerms);
    }
    // set analyzer
    Analyzer analyzerObj = context.getIndexAnalyzers().get(analyzer);
    if (analyzerObj == null) {
        analyzerObj = context.getMapperService().searchAnalyzer();
    }
    mltQuery.setAnalyzer(analyzerObj);
    // set like text fields
    boolean useDefaultField = (fields == null);
    List<String> moreLikeFields = new ArrayList<>();
    if (useDefaultField) {
        moreLikeFields = Collections.singletonList(context.defaultField());
    } else {
        for (String field : fields) {
            MappedFieldType fieldType = context.fieldMapper(field);
            if (fieldType != null && SUPPORTED_FIELD_TYPES.contains(fieldType.getClass()) == false) {
                if (failOnUnsupportedField) {
                    throw new IllegalArgumentException("more_like_this only supports text/keyword fields: [" + field + "]");
                } else {
                    // skip unsupported fields
                    continue;
                }
            }
            moreLikeFields.add(fieldType == null ? field : fieldType.name());
        }
    }
    if (moreLikeFields.isEmpty()) {
        return null;
    }
    mltQuery.setMoreLikeFields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
    // handle like texts
    if (likeTexts.length > 0) {
        mltQuery.setLikeText(likeTexts);
    }
    if (unlikeTexts.length > 0) {
        mltQuery.setUnlikeText(unlikeTexts);
    }
    // handle items
    if (likeItems.length > 0) {
        return handleItems(context, mltQuery, likeItems, unlikeItems, include, moreLikeFields, useDefaultField);
    } else {
        return mltQuery;
    }
}
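The setters on MoreLikeThisQuery above mirror the knobs exposed by Lucene's own MoreLikeThis helper, which Elasticsearch's query wraps. The sketch below is plain Lucene, not the Elasticsearch implementation; the "body" field name and the parameter values are assumptions for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;

// Hypothetical helper: build a more-like-this query for some free text against an existing index.
static Query buildMoreLikeThisQuery(Directory dir, String likeText) throws java.io.IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.setAnalyzer(new StandardAnalyzer()); // stands in for the resolved index/search analyzer
        mlt.setFieldNames(new String[] { "body" }); // assumed field, analogous to moreLikeFields above
        mlt.setMaxQueryTerms(25);
        mlt.setMinTermFreq(1);
        mlt.setMinDocFreq(1);
        return mlt.like("body", new StringReader(likeText));
    }
}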