Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project elasticsearch by elastic.
The class CustomUnifiedHighlighter, method getFieldHighlighter:
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
    BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
    CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
    OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
    BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR);
    FieldOffsetStrategy strategy = getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
    return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator, getScorer(field),
        maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
}
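Here getAutomata(...) produces character-level automata for multi-term queries (wildcards, prefixes, and similar), which the offset strategy then runs against candidate terms. As a rough illustration of the kind of automaton involved (a standalone sketch, not the Elasticsearch code; the class name, field name, and pattern are mine, and it assumes a Lucene version where WildcardQuery.toAutomaton(Term) is available):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public class WildcardAutomatonSketch {
    public static void main(String[] args) {
        // Character-level automaton for the wildcard pattern "sea*"; this is the kind
        // of structure a highlighter can run against terms without rewriting the query.
        Automaton wildcard = WildcardQuery.toAutomaton(new Term("body", "sea*"));
        CharacterRunAutomaton matcher = new CharacterRunAutomaton(wildcard);

        System.out.println(matcher.run("search")); // true: matches sea*
        System.out.println(matcher.run("ocean"));  // false
    }
}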
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project elasticsearch by elastic.
The class XContentMapValues, method filter:
private static Map<String, Object> filter(Map<String, ?> map,
        CharacterRunAutomaton includeAutomaton, int initialIncludeState,
        CharacterRunAutomaton excludeAutomaton, int initialExcludeState,
        CharacterRunAutomaton matchAllAutomaton) {
    Map<String, Object> filtered = new HashMap<>();
    for (Map.Entry<String, ?> entry : map.entrySet()) {
        String key = entry.getKey();

        int includeState = step(includeAutomaton, key, initialIncludeState);
        if (includeState == -1) {
            continue;
        }

        int excludeState = step(excludeAutomaton, key, initialExcludeState);
        if (excludeState != -1 && excludeAutomaton.isAccept(excludeState)) {
            continue;
        }

        Object value = entry.getValue();

        CharacterRunAutomaton subIncludeAutomaton = includeAutomaton;
        int subIncludeState = includeState;
        if (includeAutomaton.isAccept(includeState)) {
            if (excludeState == -1 || excludeAutomaton.step(excludeState, '.') == -1) {
                // the exclude has no chance to match inner properties
                filtered.put(key, value);
                continue;
            } else {
                // the object matched, so consider that the include matches every inner property
                // we only care about excludes now
                subIncludeAutomaton = matchAllAutomaton;
                subIncludeState = 0;
            }
        }

        if (value instanceof Map) {
            subIncludeState = subIncludeAutomaton.step(subIncludeState, '.');
            if (subIncludeState == -1) {
                continue;
            }
            if (excludeState != -1) {
                excludeState = excludeAutomaton.step(excludeState, '.');
            }
            Map<String, Object> valueAsMap = (Map<String, Object>) value;
            Map<String, Object> filteredValue = filter(valueAsMap,
                    subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
            if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) {
                filtered.put(key, filteredValue);
            }
        } else if (value instanceof Iterable) {
            List<Object> filteredValue = filter((Iterable<?>) value,
                    subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
            if (filteredValue.isEmpty() == false) {
                filtered.put(key, filteredValue);
            }
        } else {
            // leaf property
            if (includeAutomaton.isAccept(includeState)
                    && (excludeState == -1 || excludeAutomaton.isAccept(excludeState) == false)) {
                filtered.put(key, value);
            }
        }
    }
    return filtered;
}
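The method relies on a private step(...) helper that advances an automaton over each character of a key and returns -1 when the automaton dead-ends; descending into a nested object is modelled by stepping over a literal '.'. The sketch below reproduces that mechanic in isolation; the step helper is an assumed re-implementation and the include pattern is purely illustrative, not the Elasticsearch source:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class StepSketch {
    // Assumed behaviour of the private step(...) helper: advance the automaton
    // over every character of the key, bailing out with -1 on a dead end.
    static int step(CharacterRunAutomaton automaton, String key, int state) {
        for (int i = 0; state != -1 && i < key.length(); i++) {
            state = automaton.step(state, key.charAt(i));
        }
        return state;
    }

    public static void main(String[] args) {
        // Include pattern roughly equivalent to "foo" plus any nested "foo.*" path.
        CharacterRunAutomaton include =
            new CharacterRunAutomaton(new RegExp("foo(\\..*)?").toAutomaton());

        int s = step(include, "foo", 0);
        System.out.println(s != -1 && include.isAccept(s));      // true: "foo" itself is included

        int nested = include.step(s, '.');                       // descend into the object
        System.out.println(step(include, "bar", nested) != -1);  // true: "foo.bar" is still reachable

        System.out.println(step(include, "baz", 0));             // -1: pruned immediately
    }
}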
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TestMockAnalyzer, method testKeep:
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
    CharacterRunAutomaton keepWords = new CharacterRunAutomaton(
        Operations.complement(
            Operations.union(Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
            DEFAULT_MAX_DETERMINIZED_STATES));
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
    assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
        new String[] { "foo", "bar", "bar", "foo" },
        new int[] { 2, 2, 1, 2 });
}
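MockAnalyzer's filter automaton removes tokens it accepts, so the test keeps "foo" and "bar" by complementing the union of those two strings: every other token is accepted by the complement and therefore dropped, mimicking KeepWordFilter. A small standalone sketch of that behaviour (the class name is mine; it assumes the Lucene core and test-framework jars are on the classpath):

import java.util.Arrays;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public class KeepWordsAutomatonSketch {
    public static void main(String[] args) {
        // Complement of ("foo" | "bar"): accepts every token except "foo" and "bar".
        // MockAnalyzer drops tokens its filter automaton accepts, so only "foo" and "bar" survive.
        CharacterRunAutomaton keepWords = new CharacterRunAutomaton(
            Operations.complement(
                Operations.union(Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
                Operations.DEFAULT_MAX_DETERMINIZED_STATES));

        System.out.println(keepWords.run("quick")); // true: accepted, therefore filtered out
        System.out.println(keepWords.run("foo"));   // false: not accepted, therefore kept
    }
}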
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class SynonymTokenizer, method testMaxSizeEndHighlight:
public void testMaxSizeEndHighlight() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
            TermQuery query = new TermQuery(new Term("text", "searchterm"));
            String text = "this is a text with searchterm in it";
            SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
            Highlighter hg = getHighlighter(query, "text", fm);
            hg.setTextFragmenter(new NullFragmenter());
            hg.setMaxDocCharsToAnalyze(36);
            String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
            assertTrue("Matched text should contain remainder of text after highlighted query ", match.endsWith("in it"));
        }
    };
    helper.start();
}
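The RegExp "i[nt]" builds a stop-word automaton that accepts exactly "in" and "it", which is what MockAnalyzer uses as its filter here. A quick standalone check of that pattern (the class name is mine, not from the test):

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class StopWordAutomatonSketch {
    public static void main(String[] args) {
        // "i[nt]" accepts exactly the two stop words used by the test: "in" and "it".
        CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
        System.out.println(stopWords.run("in")); // true: treated as a stop word
        System.out.println(stopWords.run("it")); // true: treated as a stop word
        System.out.println(stopWords.run("is")); // false: kept
    }
}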
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class FastVectorHighlighterTest, method matchedFieldsTestCase:
private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch,
        String fieldValue, String expected, Query... queryClauses) throws IOException {
    Document doc = new Document();
    FieldType stored = new FieldType(TextField.TYPE_STORED);
    stored.setStoreTermVectorOffsets(true);
    stored.setStoreTermVectorPositions(true);
    stored.setStoreTermVectors(true);
    stored.freeze();
    FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
    matched.setStoreTermVectorOffsets(true);
    matched.setStoreTermVectorPositions(true);
    matched.setStoreTermVectors(true);
    matched.freeze();
    // Whitespace tokenized with English stop words
    doc.add(new Field("field", fieldValue, stored));
    // Whitespace tokenized without stop words
    doc.add(new Field("field_exact", fieldValue, matched));
    // Whitespace tokenized without toLower
    doc.add(new Field("field_super_exact", fieldValue, matched));
    // Each letter is a token
    doc.add(new Field("field_characters", fieldValue, matched));
    // Every three letters is a token
    doc.add(new Field("field_tripples", fieldValue, matched));
    // Sliced at 10 chars then analyzed just like field
    doc.add(new Field("field_sliced", fieldValue.substring(0, Math.min(fieldValue.length() - 1, 10)), matched));
    // Hacky field containing "der" and "red" at pos = 0
    doc.add(new Field("field_der_red", new CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));

    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
    fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
    fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
    // This is required even though we provide a token stream
    fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
    Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
        }
    };

    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    writer.addDocument(doc);

    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    FragListBuilder fragListBuilder = new SimpleFragListBuilder();
    FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
    IndexReader reader = DirectoryReader.open(writer);
    String[] preTags = new String[] { "<b>" };
    String[] postTags = new String[] { "</b>" };
    Encoder encoder = new DefaultEncoder();
    int docId = 0;
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query clause : queryClauses) {
        query.add(clause, Occur.MUST);
    }
    FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
    String[] bestFragments;
    if (useMatchedFields) {
        Set<String> matchedFields = new HashSet<>();
        matchedFields.add("field");
        matchedFields.add("field_exact");
        matchedFields.add("field_super_exact");
        matchedFields.add("field_characters");
        matchedFields.add("field_tripples");
        matchedFields.add("field_sliced");
        matchedFields.add("field_der_red");
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1,
            fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    } else {
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1,
            fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    }
    assertEquals(expected, bestFragments[0]);
    reader.close();
    writer.close();
    dir.close();
}
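The CharacterRunAutomaton usage here is in the per-field analyzers: RegExp(".") makes MockTokenizer emit single-character tokens (field_characters) and RegExp("...") emits three-character tokens (field_tripples). A minimal sketch of the single-character analyzer outside the test harness (the class name is mine, it uses a fixed Random in place of the test's random(), and it assumes the Lucene test-framework jar is on the classpath):

import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class SingleCharTokenizerSketch {
    public static void main(String[] args) throws IOException {
        // The "field_characters" analyzer: the tokenizer automaton accepts any single
        // character, so every character of the input becomes its own token.
        Analyzer a = new MockAnalyzer(new Random(0),
                new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true);
        try (TokenStream ts = a.tokenStream("field_characters", "cat")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // prints "c", then "a", then "t"
            }
            ts.end();
        }
    }
}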