Search in sources:

Example 6 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class TestUnifiedHighlighterExtensibility, the method testUnifiedHighlighterExtensibility:

/**
   * This test is for maintaining the extensibility of the UnifiedHighlighter
   * for customizations out of package.
   *
   * <p>The anonymous subclass below overrides every protected/public extension point;
   * the test "passes" largely at compile time — it would fail to compile if the
   * superclass narrowed visibility or changed any of these signatures.
   */
@Test
public void testUnifiedHighlighterExtensibility() {
    final int maxLength = 1000;
    UnifiedHighlighter uh = new UnifiedHighlighter(null, new MockAnalyzer(random())) {

        @Override
        protected Map<String, Object[]> highlightFieldsAsObjects(String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
            return super.highlightFieldsAsObjects(fieldsIn, query, docIdsIn, maxPassagesIn);
        }

        @Override
        protected OffsetSource getOffsetSource(String field) {
            return super.getOffsetSource(field);
        }

        @Override
        protected BreakIterator getBreakIterator(String field) {
            return super.getBreakIterator(field);
        }

        @Override
        protected PassageScorer getScorer(String field) {
            return super.getScorer(field);
        }

        @Override
        protected PassageFormatter getFormatter(String field) {
            return super.getFormatter(field);
        }

        @Override
        public Analyzer getIndexAnalyzer() {
            return super.getIndexAnalyzer();
        }

        @Override
        public IndexSearcher getIndexSearcher() {
            return super.getIndexSearcher();
        }

        @Override
        protected int getMaxNoHighlightPassages(String field) {
            return super.getMaxNoHighlightPassages(field);
        }

        @Override
        protected Boolean requiresRewrite(SpanQuery spanQuery) {
            return super.requiresRewrite(spanQuery);
        }

        @Override
        protected LimitedStoredFieldVisitor newLimitedStoredFieldsVisitor(String[] fields) {
            return super.newLimitedStoredFieldsVisitor(fields);
        }

        @Override
        protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
            return super.loadFieldValues(fields, docIter, cacheCharsThreshold);
        }

        @Override
        protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
            // THIS IS A COPY of the superclass impl; but use CustomFieldHighlighter
            BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
            Set<HighlightFlag> highlightFlags = getFlags(field);
            PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
            CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
            OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
            return new CustomFieldHighlighter(field, getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags), new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR), getScorer(field), maxPassages, getMaxNoHighlightPassages(field), getFormatter(field));
        }

        @Override
        protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
            return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
        }

        @Override
        public int getMaxLength() {
            return maxLength;
        }
    };
    // JUnit convention: the expected value comes first, the actual value second;
    // the original had them swapped, which would produce a misleading failure message.
    assertEquals(maxLength, uh.getMaxLength());
}
Also used : Set(java.util.Set) Query(org.apache.lucene.search.Query) SpanQuery(org.apache.lucene.search.spans.SpanQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) PhraseHelper(org.apache.lucene.search.uhighlight.PhraseHelper) SpanQuery(org.apache.lucene.search.spans.SpanQuery) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) UnifiedHighlighter(org.apache.lucene.search.uhighlight.UnifiedHighlighter) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 7 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class SearchEquivalenceTestBase, the method beforeClass:

@BeforeClass
public static void beforeClass() throws Exception {
    // NOTE: the order of calls that consume randomness is deliberately unchanged,
    // so runs under a fixed test seed remain reproducible.
    Random rnd = random();
    directory = newDirectory();
    stopword = "" + randomChar();
    CharacterRunAutomaton stopAutomaton = new CharacterRunAutomaton(Automata.makeString(stopword));
    analyzer = new MockAnalyzer(rnd, MockTokenizer.WHITESPACE, false, stopAutomaton);
    RandomIndexWriter writer = new RandomIndexWriter(rnd, directory, analyzer);
    // One reusable Document; field values are overwritten for each added doc.
    Document reusableDoc = new Document();
    Field idField = new StringField("id", "", Field.Store.NO);
    Field contentField = new TextField("field", "", Field.Store.NO);
    reusableDoc.add(idField);
    reusableDoc.add(contentField);
    // index some docs
    int docCount = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
    for (int docId = 0; docId < docCount; docId++) {
        idField.setStringValue(Integer.toString(docId));
        contentField.setStringValue(randomFieldContents());
        writer.addDocument(reusableDoc);
    }
    // delete roughly 5% of the docs, randomly by Term or by TermQuery
    int deleteCount = docCount / 20;
    for (int n = 0; n < deleteCount; n++) {
        Term victim = new Term("id", Integer.toString(rnd.nextInt(docCount)));
        if (rnd.nextBoolean()) {
            writer.deleteDocuments(victim);
        } else {
            writer.deleteDocuments(new TermQuery(victim));
        }
    }
    reader = writer.getReader();
    s1 = newSearcher(reader);
    s2 = newSearcher(reader);
    writer.close();
}
Also used : StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Random(java.util.Random) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) StringField(org.apache.lucene.document.StringField) TextField(org.apache.lucene.document.TextField) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BeforeClass(org.junit.BeforeClass)

Example 8 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class FieldOffsetStrategy, the method createAutomataOffsetsFromTerms:

/**
 * Collects postings (with offsets) for every term in {@code termsIndex} that is accepted by
 * one of this strategy's {@code automata}, restricted to the given {@code doc}.
 *
 * @param termsIndex the per-field terms to scan
 * @param doc        the document whose postings we want; terms whose postings do not
 *                   contain this doc are skipped
 * @return one OffsetsEnum per automaton that matched at least one term with postings for
 *         {@code doc}; multiple matching terms are merged via CompositeOffsetsPostingsEnum
 * @throws IOException on index access error
 */
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
    List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
    for (int i = 0; i < automata.length; i++) {
        automataPostings.add(new ArrayList<>());
    }
    TermsEnum termsEnum = termsIndex.iterator();
    BytesRef term;
    CharsRefBuilder refBuilder = new CharsRefBuilder();
    while ((term = termsEnum.next()) != null) {
        // Decode the term bytes to chars ONCE per term: the conversion depends only on
        // the term, not on which automaton we test, so it was loop-invariant in the
        // inner loop below.
        refBuilder.copyUTF8Bytes(term);
        for (int i = 0; i < automata.length; i++) {
            CharacterRunAutomaton automaton = automata[i];
            if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
                PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
                // Only keep postings that actually contain the target doc.
                if (doc == postings.advance(doc)) {
                    automataPostings.get(i).add(postings);
                }
            }
        }
    }
    //will be at most this long
    List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length);
    for (int i = 0; i < automata.length; i++) {
        CharacterRunAutomaton automaton = automata[i];
        List<PostingsEnum> postingsEnums = automataPostings.get(i);
        int size = postingsEnums.size();
        if (size > 0) {
            //only add if we have offsets
            // The automaton's toString (e.g. the original wildcard pattern) stands in as
            // the "term" label for highlighting.
            BytesRef wildcardTerm = new BytesRef(automaton.toString());
            if (size == 1) {
                //don't wrap in a composite if there's only one OffsetsEnum
                offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
            } else {
                offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
            }
        }
    }
    return offsetsEnums;
}
Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) ArrayList(java.util.ArrayList) TermsEnum(org.apache.lucene.index.TermsEnum) ArrayList(java.util.ArrayList) List(java.util.List) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 9 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class TokenStreamOffsetStrategy, the method convertTermsToAutomata:

/**
 * Builds a combined automata array: one single-string automaton per extracted term,
 * followed by the automata already derived from multi-term queries (MTQs).
 * Each per-term automaton overrides {@code toString()} to report the original term text.
 */
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
    final int termCount = terms.length;
    CharacterRunAutomaton[] combined = new CharacterRunAutomaton[termCount + automata.length];
    for (int idx = 0; idx < termCount; idx++) {
        // Captured by the anonymous subclass below, so it must be (effectively) final.
        final String termText = terms[idx].utf8ToString();
        combined[idx] = new CharacterRunAutomaton(Automata.makeString(termText)) {

            @Override
            public String toString() {
                return termText;
            }
        };
    }
    // Append existing automata (that which is used for MTQs)
    System.arraycopy(automata, 0, combined, termCount, automata.length);
    return combined;
}
Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Example 10 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class TestQPHelper, the method testStopwords:

/**
 * Verifies that stopword removal by the analyzer is reflected in the parsed query:
 * all-stopword input matches nothing, partial stopwords collapse clauses, and
 * boosts survive the removal of an all-stopword conjunct.
 */
public void testStopwords() throws Exception {
    StandardQueryParser parser = new StandardQueryParser();
    CharacterRunAutomaton stopwords = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    parser.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopwords));
    // Every term is a stopword, so the whole query should match no documents.
    Query result = parser.parse("a:the OR a:foo", "a");
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a MatchNoDocsQuery", result instanceof MatchNoDocsQuery);
    // One stopword is dropped; the single surviving clause collapses to a plain TermQuery.
    result = parser.parse("a:woo OR a:the", "a");
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a TermQuery", result instanceof TermQuery);
    // The second conjunct is entirely stopwords, leaving only the boosted disjunction.
    result = parser.parse("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", "a");
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("fieldX", "xxxxx")), Occur.SHOULD);
    builder.add(new TermQuery(new Term("fieldy", "xxxxxxxx")), Occur.SHOULD);
    Query expected = new BoostQuery(builder.build(), 2f);
    assertEquals(expected, result);
}
Also used : MultiTermQuery(org.apache.lucene.search.MultiTermQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) RegexpQuery(org.apache.lucene.search.RegexpQuery) MultiTermQuery(org.apache.lucene.search.MultiTermQuery) PrefixQuery(org.apache.lucene.search.PrefixQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) WildcardQuery(org.apache.lucene.search.WildcardQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) TermRangeQuery(org.apache.lucene.search.TermRangeQuery) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) Term(org.apache.lucene.index.Term) BoostQuery(org.apache.lucene.search.BoostQuery)

Aggregations

CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)36 RegExp (org.apache.lucene.util.automaton.RegExp)15 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)10 ArrayList (java.util.ArrayList)7 Term (org.apache.lucene.index.Term)7 PhraseQuery (org.apache.lucene.search.PhraseQuery)6 Query (org.apache.lucene.search.Query)6 TermQuery (org.apache.lucene.search.TermQuery)6 List (java.util.List)5 Analyzer (org.apache.lucene.analysis.Analyzer)5 BooleanQuery (org.apache.lucene.search.BooleanQuery)5 BoostQuery (org.apache.lucene.search.BoostQuery)5 Automaton (org.apache.lucene.util.automaton.Automaton)5 HashMap (java.util.HashMap)4 Map (java.util.Map)4 Document (org.apache.lucene.document.Document)4 FuzzyQuery (org.apache.lucene.search.FuzzyQuery)4 PrefixQuery (org.apache.lucene.search.PrefixQuery)4 TermRangeQuery (org.apache.lucene.search.TermRangeQuery)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)3