Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TestUnifiedHighlighterExtensibility, method testUnifiedHighlighterExtensibility.
/**
 * This test is for maintaining the extensibility of the UnifiedHighlighter
 * for customizations out of package.
 */
@Test
public void testUnifiedHighlighterExtensibility() {
  final int maxLength = 1000;
  UnifiedHighlighter uh = new UnifiedHighlighter(null, new MockAnalyzer(random())) {
    @Override
    protected Map<String, Object[]> highlightFieldsAsObjects(String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
      return super.highlightFieldsAsObjects(fieldsIn, query, docIdsIn, maxPassagesIn);
    }
    @Override
    protected OffsetSource getOffsetSource(String field) {
      return super.getOffsetSource(field);
    }
    @Override
    protected BreakIterator getBreakIterator(String field) {
      return super.getBreakIterator(field);
    }
    @Override
    protected PassageScorer getScorer(String field) {
      return super.getScorer(field);
    }
    @Override
    protected PassageFormatter getFormatter(String field) {
      return super.getFormatter(field);
    }
    @Override
    public Analyzer getIndexAnalyzer() {
      return super.getIndexAnalyzer();
    }
    @Override
    public IndexSearcher getIndexSearcher() {
      return super.getIndexSearcher();
    }
    @Override
    protected int getMaxNoHighlightPassages(String field) {
      return super.getMaxNoHighlightPassages(field);
    }
    @Override
    protected Boolean requiresRewrite(SpanQuery spanQuery) {
      return super.requiresRewrite(spanQuery);
    }
    @Override
    protected LimitedStoredFieldVisitor newLimitedStoredFieldsVisitor(String[] fields) {
      return super.newLimitedStoredFieldsVisitor(fields);
    }
    @Override
    protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
      return super.loadFieldValues(fields, docIter, cacheCharsThreshold);
    }
    @Override
    protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
      // THIS IS A COPY of the superclass impl, but it uses CustomFieldHighlighter
      BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
      Set<HighlightFlag> highlightFlags = getFlags(field);
      PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
      CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
      OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
      return new CustomFieldHighlighter(field,
          getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags),
          new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
          getScorer(field),
          maxPassages,
          getMaxNoHighlightPassages(field),
          getFormatter(field));
    }
    @Override
    protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
      return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
    }
    @Override
    public int getMaxLength() {
      return maxLength;
    }
  };
  assertEquals(maxLength, uh.getMaxLength());
}
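For orientation, here is a minimal, self-contained sketch of the CharacterRunAutomaton behavior that getAutomata above relies on. The class name Demo and the example string "foo" are our own, not part of the test: the constructor compiles an Automaton into a table-driven matcher whose run method accepts or rejects a char sequence.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public class Demo {
  public static void main(String[] args) {
    // Compile an automaton accepting exactly the string "foo" into a run automaton
    CharacterRunAutomaton cra = new CharacterRunAutomaton(Automata.makeString("foo"));
    System.out.println(cra.run("foo"));  // true: exact match
    System.out.println(cra.run("food")); // false: the automaton accepts nothing else
  }
}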
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class SearchEquivalenceTestBase, method beforeClass.
@BeforeClass
public static void beforeClass() throws Exception {
  Random random = random();
  directory = newDirectory();
  stopword = "" + randomChar();
  CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword));
  analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
  RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
  Document doc = new Document();
  Field id = new StringField("id", "", Field.Store.NO);
  Field field = new TextField("field", "", Field.Store.NO);
  doc.add(id);
  doc.add(field);
  // index some docs
  int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  for (int i = 0; i < numDocs; i++) {
    id.setStringValue(Integer.toString(i));
    field.setStringValue(randomFieldContents());
    iw.addDocument(doc);
  }
  // delete some docs
  int numDeletes = numDocs / 20;
  for (int i = 0; i < numDeletes; i++) {
    Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs)));
    if (random.nextBoolean()) {
      iw.deleteDocuments(toDelete);
    } else {
      iw.deleteDocuments(new TermQuery(toDelete));
    }
  }
  reader = iw.getReader();
  s1 = newSearcher(reader);
  s2 = newSearcher(reader);
  iw.close();
}
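The stopset here is a single-string automaton built from one random character, and MockAnalyzer treats any token the automaton accepts as a stopword. A hedged sketch of that contract, using a fixed character "z" where the test uses randomChar():

CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString("z"));
// MockAnalyzer drops tokens the filter automaton accepts, so "z" would be filtered out
boolean isStopword = stopset.run("z");      // true
boolean keepToken = !stopset.run("zebra");  // true: only the exact string matches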
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class FieldOffsetStrategy, method createAutomataOffsetsFromTerms.
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
  List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
  for (int i = 0; i < automata.length; i++) {
    automataPostings.add(new ArrayList<>());
  }
  TermsEnum termsEnum = termsIndex.iterator();
  BytesRef term;
  CharsRefBuilder refBuilder = new CharsRefBuilder();
  while ((term = termsEnum.next()) != null) {
    for (int i = 0; i < automata.length; i++) {
      CharacterRunAutomaton automaton = automata[i];
      refBuilder.copyUTF8Bytes(term);
      if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (doc == postings.advance(doc)) {
          automataPostings.get(i).add(postings);
        }
      }
    }
  }
  // will be at most this long
  List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length);
  for (int i = 0; i < automata.length; i++) {
    CharacterRunAutomaton automaton = automata[i];
    List<PostingsEnum> postingsEnums = automataPostings.get(i);
    int size = postingsEnums.size();
    if (size > 0) {
      // only add if we have offsets
      BytesRef wildcardTerm = new BytesRef(automaton.toString());
      if (size == 1) {
        // don't wrap in a composite if there's only one OffsetsEnum
        offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
      } else {
        offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
      }
    }
  }
  return offsetsEnums;
}
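The matching idiom in the term loop is worth isolating: terms come out of the TermsEnum as UTF-8 bytes, so each one is decoded into a reusable CharsRefBuilder before the automaton is run over the char buffer. A minimal standalone sketch, with "lucene" as our own example term:

CharacterRunAutomaton automaton = new CharacterRunAutomaton(Automata.makeString("lucene"));
CharsRefBuilder refBuilder = new CharsRefBuilder();
BytesRef term = new BytesRef("lucene");
refBuilder.copyUTF8Bytes(term); // decode the UTF-8 term bytes into chars, reusing the builder
boolean matches = automaton.run(refBuilder.chars(), 0, refBuilder.length()); // true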
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TokenStreamOffsetStrategy, method convertTermsToAutomata.
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
  CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
  for (int i = 0; i < terms.length; i++) {
    String termString = terms[i].utf8ToString();
    newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
      @Override
      public String toString() {
        return termString;
      }
    };
  }
  // Append the existing automata (those are used for multi-term queries)
  System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
  return newAutomata;
}
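Overriding toString matters here because createAutomataOffsetsFromTerms (above) labels each match with new BytesRef(automaton.toString()); without the override the label would not be the original term. A hedged usage sketch, assuming a call site with access to this private method; the term "foo" and local names are ours:

BytesRef[] terms = { new BytesRef("foo") };
CharacterRunAutomaton[] existing = new CharacterRunAutomaton[0];
CharacterRunAutomaton[] all = convertTermsToAutomata(terms, existing);
System.out.println(all[0]);            // prints "foo" thanks to the toString override
System.out.println(all[0].run("foo")); // true: still matches like any run automaton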
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TestQPHelper, method testStopwords.
public void testStopwords() throws Exception {
  StandardQueryParser qp = new StandardQueryParser();
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
  qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
  Query result = qp.parse("a:the OR a:foo", "a");
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a MatchNoDocsQuery", result instanceof MatchNoDocsQuery);
  result = qp.parse("a:woo OR a:the", "a");
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a TermQuery", result instanceof TermQuery);
  result = qp.parse("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", "a");
  Query expected = new BooleanQuery.Builder()
      .add(new TermQuery(new Term("fieldX", "xxxxx")), Occur.SHOULD)
      .add(new TermQuery(new Term("fieldy", "xxxxxxxx")), Occur.SHOULD)
      .build();
  expected = new BoostQuery(expected, 2f);
  assertEquals(expected, result);
}
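Unlike the earlier single-string stopsets, this one is built from a regular expression, so the automaton accepts either alternative. A minimal sketch of that behavior, standalone and outside the parser:

CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
boolean a = stopSet.run("the"); // true: stopword, the analyzer removes it
boolean b = stopSet.run("foo"); // true: stopword as well
boolean c = stopSet.run("woo"); // false: survives analysis, hence the TermQuery above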