
Example 1 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project elasticsearch by elastic, in the class CustomUnifiedHighlighter, method getFieldHighlighter:

@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
    BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
    CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
    OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
    BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR);
    FieldOffsetStrategy strategy = getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
    return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
            getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field),
            noMatchSize, fieldValue);
}
Also used: CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), BytesRef (org.apache.lucene.util.BytesRef), BreakIterator (java.text.BreakIterator)
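
A CharacterRunAutomaton wraps a determinized Automaton so it can be executed directly over characters: run() tests a whole string, while step() and isAccept() drive it one character at a time. The highlighter above relies on such automata to match terms produced by multi-term queries. A minimal sketch of the matching behavior, with an illustrative pattern and inputs that are not taken from the Elasticsearch source:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

// Compile a regular expression into an automaton, then wrap it for char-level execution.
CharacterRunAutomaton matcher = new CharacterRunAutomaton(new RegExp("sea.*").toAutomaton());
// run() consumes a whole string and reports whether the automaton accepts it.
assert matcher.run("searchterm");
assert matcher.run("other") == false;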

Example 2 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project elasticsearch by elastic, in the class XContentMapValues, method filter:

private static Map<String, Object> filter(Map<String, ?> map,
        CharacterRunAutomaton includeAutomaton, int initialIncludeState,
        CharacterRunAutomaton excludeAutomaton, int initialExcludeState,
        CharacterRunAutomaton matchAllAutomaton) {
    Map<String, Object> filtered = new HashMap<>();
    for (Map.Entry<String, ?> entry : map.entrySet()) {
        String key = entry.getKey();
        int includeState = step(includeAutomaton, key, initialIncludeState);
        if (includeState == -1) {
            continue;
        }
        int excludeState = step(excludeAutomaton, key, initialExcludeState);
        if (excludeState != -1 && excludeAutomaton.isAccept(excludeState)) {
            continue;
        }
        Object value = entry.getValue();
        CharacterRunAutomaton subIncludeAutomaton = includeAutomaton;
        int subIncludeState = includeState;
        if (includeAutomaton.isAccept(includeState)) {
            if (excludeState == -1 || excludeAutomaton.step(excludeState, '.') == -1) {
                // the exclude has no chance of matching inner properties
                filtered.put(key, value);
                continue;
            } else {
                // the object matched, so consider that the include matches every inner property
                // we only care about excludes now
                subIncludeAutomaton = matchAllAutomaton;
                subIncludeState = 0;
            }
        }
        if (value instanceof Map) {
            subIncludeState = subIncludeAutomaton.step(subIncludeState, '.');
            if (subIncludeState == -1) {
                continue;
            }
            if (excludeState != -1) {
                excludeState = excludeAutomaton.step(excludeState, '.');
            }
            Map<String, Object> valueAsMap = (Map<String, Object>) value;
            Map<String, Object> filteredValue = filter(valueAsMap, subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
            if (includeAutomaton.isAccept(includeState) || filteredValue.isEmpty() == false) {
                filtered.put(key, filteredValue);
            }
        } else if (value instanceof Iterable) {
            List<Object> filteredValue = filter((Iterable<?>) value, subIncludeAutomaton, subIncludeState, excludeAutomaton, excludeState, matchAllAutomaton);
            if (filteredValue.isEmpty() == false) {
                filtered.put(key, filteredValue);
            }
        } else {
            // leaf property
            if (includeAutomaton.isAccept(includeState) && (excludeState == -1 || excludeAutomaton.isAccept(excludeState) == false)) {
                filtered.put(key, value);
            }
        }
    }
    return filtered;
}
Also used: HashMap (java.util.HashMap), CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), ArrayList (java.util.ArrayList), List (java.util.List), Map (java.util.Map)
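
The walk above advances each automaton one character of the key at a time: step() returns -1 once no continuation can match, isAccept() marks a completed match, and recursing into a sub-object appends a '.' to the path. XContentMapValues also exposes a public filter(Map, String[] includes, String[] excludes) entry point that compiles such automata from patterns; a hedged usage sketch, with key names invented for illustration:

import java.util.HashMap;
import java.util.Map;
import org.elasticsearch.common.xcontent.support.XContentMapValues;

Map<String, Object> user = new HashMap<>();
user.put("name", "kimchy");
user.put("password", "secret");
Map<String, Object> source = new HashMap<>();
source.put("user", user);
source.put("age", 42);

// Include the whole "user" object but strip the nested "user.password" leaf.
Map<String, Object> filtered = XContentMapValues.filter(
        source, new String[] { "user" }, new String[] { "user.password" });
// filtered -> {user={name=kimchy}}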

Example 3 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache, in the class TestMockAnalyzer, method testKeep:

/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
    CharacterRunAutomaton keepWords = new CharacterRunAutomaton(
            Operations.complement(
                    Operations.union(Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
                    DEFAULT_MAX_DETERMINIZED_STATES));
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
    assertAnalyzesTo(a, "quick foo brown bar bar fox foo", new String[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 });
}
Also used: CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)
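
MockAnalyzer treats the supplied automaton as a stop set: tokens the automaton accepts are dropped. Complementing the union of "foo" and "bar" therefore removes every other word, and the expected position increments (2, 2, 1, 2) record the gaps the dropped tokens leave behind. A hedged spot-check of that accept-means-drop relationship, reusing the keepWords automaton built above:

// keepWords accepts everything except "foo" and "bar".
assert keepWords.run("quick");        // accepted, so MockAnalyzer discards it
assert keepWords.run("foo") == false; // rejected, so the token survives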

Example 4 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache, in the class SynonymTokenizer, method testMaxSizeEndHighlight:

public void testMaxSizeEndHighlight() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
            TermQuery query = new TermQuery(new Term("text", "searchterm"));
            String text = "this is a text with searchterm in it";
            SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
            Highlighter hg = getHighlighter(query, "text", fm);
            hg.setTextFragmenter(new NullFragmenter());
            hg.setMaxDocCharsToAnalyze(36);
            String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
            assertTrue("Matched text should contain remainder of text after highlighted query ", match.endsWith("in it"));
        }
    };
    helper.start();
}
Also used: MultiTermQuery (org.apache.lucene.search.MultiTermQuery), SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery), TermQuery (org.apache.lucene.search.TermQuery), TestHighlightRunner (org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), RegExp (org.apache.lucene.util.automaton.RegExp), CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), Term (org.apache.lucene.index.Term)
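
The RegExp i[nt] compiles to an automaton accepting exactly "in" and "it", which MockAnalyzer uses as its stop set, and setMaxDocCharsToAnalyze(36) covers precisely the 36-character sample text; the assertion verifies that the trailing stop words, though absent from the token stream, still appear in the returned fragment. A quick hedged check of the stop-word automaton on its own:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
assert stopWords.run("in");
assert stopWords.run("it");
assert stopWords.run("is") == false; // only "in" and "it" are stopped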

Example 5 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache, in the class FastVectorHighlighterTest, method matchedFieldsTestCase:

private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
    Document doc = new Document();
    FieldType stored = new FieldType(TextField.TYPE_STORED);
    stored.setStoreTermVectorOffsets(true);
    stored.setStoreTermVectorPositions(true);
    stored.setStoreTermVectors(true);
    stored.freeze();
    FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
    matched.setStoreTermVectorOffsets(true);
    matched.setStoreTermVectorPositions(true);
    matched.setStoreTermVectors(true);
    matched.freeze();
    // Whitespace tokenized with English stop words
    doc.add(new Field("field", fieldValue, stored));
    // Whitespace tokenized without stop words
    doc.add(new Field("field_exact", fieldValue, matched));
    // Whitespace tokenized without toLower
    doc.add(new Field("field_super_exact", fieldValue, matched));
    // Each letter is a token
    doc.add(new Field("field_characters", fieldValue, matched));
    // Every three letters is a token
    doc.add(new Field("field_tripples", fieldValue, matched));
    doc.add(new Field("field_sliced", // Sliced at 10 chars then analyzed just like field
    fieldValue.substring(// Sliced at 10 chars then analyzed just like field
    0, Math.min(fieldValue.length() - 1, 10)), matched));
    doc.add(new Field("field_der_red", new // Hacky field containing "der" and "red" at pos = 0
    CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
    fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
    fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
    // This is required even though we provide a token stream
    fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
    Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {

        @Override
        public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
        }
    };
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    writer.addDocument(doc);
    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    FragListBuilder fragListBuilder = new SimpleFragListBuilder();
    FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
    IndexReader reader = DirectoryReader.open(writer);
    String[] preTags = new String[] { "<b>" };
    String[] postTags = new String[] { "</b>" };
    Encoder encoder = new DefaultEncoder();
    int docId = 0;
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query clause : queryClauses) {
        query.add(clause, Occur.MUST);
    }
    FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
    String[] bestFragments;
    if (useMatchedFields) {
        Set<String> matchedFields = new HashSet<>();
        matchedFields.add("field");
        matchedFields.add("field_exact");
        matchedFields.add("field_super_exact");
        matchedFields.add("field_characters");
        matchedFields.add("field_tripples");
        matchedFields.add("field_sliced");
        matchedFields.add("field_der_red");
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    } else {
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    }
    assertEquals(expected, bestFragments[0]);
    reader.close();
    writer.close();
    dir.close();
}
Also used: BooleanQuery (org.apache.lucene.search.BooleanQuery), Query (org.apache.lucene.search.Query), CommonTermsQuery (org.apache.lucene.queries.CommonTermsQuery), PhraseQuery (org.apache.lucene.search.PhraseQuery), CustomScoreQuery (org.apache.lucene.queries.CustomScoreQuery), TermQuery (org.apache.lucene.search.TermQuery), SynonymQuery (org.apache.lucene.search.SynonymQuery), BoostQuery (org.apache.lucene.search.BoostQuery), CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), Document (org.apache.lucene.document.Document), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), StoredField (org.apache.lucene.document.StoredField), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), DefaultEncoder (org.apache.lucene.search.highlight.DefaultEncoder), Encoder (org.apache.lucene.search.highlight.Encoder), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), Directory (org.apache.lucene.store.Directory), HashSet (java.util.HashSet), RegExp (org.apache.lucene.util.automaton.RegExp), TreeMap (java.util.TreeMap), FieldType (org.apache.lucene.document.FieldType), DelegatingAnalyzerWrapper (org.apache.lucene.analysis.DelegatingAnalyzerWrapper), IndexWriter (org.apache.lucene.index.IndexWriter), IndexReader (org.apache.lucene.index.IndexReader)
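
Among the analyzers wired up above, field_characters uses new RegExp(".").toAutomaton() so MockTokenizer emits one token per character, and field_tripples uses "..." so tokens come in threes: the tokenizer consumes characters roughly as long as the automaton can still accept, then starts a new token. A hedged sketch of those two run automata in isolation:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

CharacterRunAutomaton singleChar = new CharacterRunAutomaton(new RegExp(".").toAutomaton());
assert singleChar.run("a");
assert singleChar.run("ab") == false; // accepts exactly one character per token

CharacterRunAutomaton threeChars = new CharacterRunAutomaton(new RegExp("...").toAutomaton());
assert threeChars.run("abc");
assert threeChars.run("ab") == false; // accepts exactly three characters per token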

Aggregations

CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton): 36 uses
RegExp (org.apache.lucene.util.automaton.RegExp): 15 uses
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 10 uses
ArrayList (java.util.ArrayList): 7 uses
Term (org.apache.lucene.index.Term): 7 uses
PhraseQuery (org.apache.lucene.search.PhraseQuery): 6 uses
Query (org.apache.lucene.search.Query): 6 uses
TermQuery (org.apache.lucene.search.TermQuery): 6 uses
List (java.util.List): 5 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 5 uses
BooleanQuery (org.apache.lucene.search.BooleanQuery): 5 uses
BoostQuery (org.apache.lucene.search.BoostQuery): 5 uses
Automaton (org.apache.lucene.util.automaton.Automaton): 5 uses
HashMap (java.util.HashMap): 4 uses
Map (java.util.Map): 4 uses
Document (org.apache.lucene.document.Document): 4 uses
FuzzyQuery (org.apache.lucene.search.FuzzyQuery): 4 uses
PrefixQuery (org.apache.lucene.search.PrefixQuery): 4 uses
TermRangeQuery (org.apache.lucene.search.TermRangeQuery): 4 uses
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 3 uses