Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class MultiTermHighlighting, method extractAutomata.
/**
* Extracts MultiTermQueries that match the provided field predicate.
* Returns equivalent automata that will match terms.
*/
public static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher,
    boolean lookInSpan, Function<Query, Collection<Query>> preRewriteFunc) {
  // TODO Lucene needs a Query visitor API! LUCENE-3041
  List<CharacterRunAutomaton> list = new ArrayList<>();
  Collection<Query> customSubQueries = preRewriteFunc.apply(query);
  if (customSubQueries != null) {
    for (Query sub : customSubQueries) {
      list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
    }
  } else if (query instanceof BooleanQuery) {
    for (BooleanClause clause : (BooleanQuery) query) {
      if (!clause.isProhibited()) {
        list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
      }
    }
  } else if (query instanceof ConstantScoreQuery) {
    list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
  } else if (query instanceof BoostQuery) {
    list.addAll(Arrays.asList(extractAutomata(((BoostQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
  } else if (query instanceof DisjunctionMaxQuery) {
    for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
      list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
    }
  } else if (lookInSpan && query instanceof SpanOrQuery) {
    for (Query sub : ((SpanOrQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
    }
  } else if (lookInSpan && query instanceof SpanNearQuery) {
    for (Query sub : ((SpanNearQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
    }
  } else if (lookInSpan && query instanceof SpanNotQuery) {
    list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), fieldMatcher, lookInSpan, preRewriteFunc)));
  } else if (lookInSpan && query instanceof SpanPositionCheckQuery) {
    list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), fieldMatcher, lookInSpan, preRewriteFunc)));
  } else if (lookInSpan && query instanceof SpanBoostQuery) {
    list.addAll(Arrays.asList(extractAutomata(((SpanBoostQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
  } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
    list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
  } else if (query instanceof PrefixQuery) {
    final PrefixQuery pq = (PrefixQuery) query;
    Term prefix = pq.getPrefix();
    if (fieldMatcher.test(prefix.field())) {
      // prefix + any-string: accepts every term that starts with the prefix text
      list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), Automata.makeAnyString())) {
        @Override
        public String toString() {
          return pq.toString();
        }
      });
    }
  } else if (query instanceof FuzzyQuery) {
    final FuzzyQuery fq = (FuzzyQuery) query;
    if (fieldMatcher.test(fq.getField())) {
      // convert the term to code points, since LevenshteinAutomata operates on code points
      String utf16 = fq.getTerm().text();
      int[] termText = new int[utf16.codePointCount(0, utf16.length())];
      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
        termText[j++] = cp = utf16.codePointAt(i);
      }
      int termLength = termText.length;
      int prefixLength = Math.min(fq.getPrefixLength(), termLength);
      String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
      LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
      String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
      Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
      list.add(new CharacterRunAutomaton(automaton) {
        @Override
        public String toString() {
          return fq.toString();
        }
      });
    }
  } else if (query instanceof TermRangeQuery) {
    final TermRangeQuery tq = (TermRangeQuery) query;
    if (fieldMatcher.test(tq.getField())) {
      final CharsRef lowerBound;
      if (tq.getLowerTerm() == null) {
        lowerBound = null;
      } else {
        lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
      }
      final CharsRef upperBound;
      if (tq.getUpperTerm() == null) {
        upperBound = null;
      } else {
        upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
      }
      final boolean includeLower = tq.includesLower();
      final boolean includeUpper = tq.includesUpper();
      final CharsRef scratch = new CharsRef();
      @SuppressWarnings("deprecation")
      final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
      // this is *not* an automaton, but it's very simple
      list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
        @Override
        public boolean run(char[] s, int offset, int length) {
          scratch.chars = s;
          scratch.offset = offset;
          scratch.length = length;
          if (lowerBound != null) {
            int cmp = comparator.compare(scratch, lowerBound);
            if (cmp < 0 || (!includeLower && cmp == 0)) {
              return false;
            }
          }
          if (upperBound != null) {
            int cmp = comparator.compare(scratch, upperBound);
            if (cmp > 0 || (!includeUpper && cmp == 0)) {
              return false;
            }
          }
          return true;
        }

        @Override
        public String toString() {
          return tq.toString();
        }
      });
    }
  } else if (query instanceof AutomatonQuery) {
    final AutomatonQuery aq = (AutomatonQuery) query;
    if (fieldMatcher.test(aq.getField())) {
      list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
        @Override
        public String toString() {
          return aq.toString();
        }
      });
    }
  }
  return list.toArray(new CharacterRunAutomaton[list.size()]);
}
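For orientation, here is a minimal sketch of how the returned automata might be exercised. The PrefixQuery, the field name "body", and the candidate terms are illustrative assumptions; note that extractAutomata is package-private in Lucene, so callers outside the package would normally go through the UnifiedHighlighter instead.
// Hypothetical usage: extract automata from a prefix query, then test
// candidate terms against them. Passing q -> null for preRewriteFunc means
// no custom sub-query extraction is attempted.
Query query = new PrefixQuery(new Term("body", "lucen"));
CharacterRunAutomaton[] automata =
    MultiTermHighlighting.extractAutomata(query, "body"::equals, true, q -> null);
for (CharacterRunAutomaton automaton : automata) {
  automaton.run("lucene"); // true: starts with the prefix "lucen"
  automaton.run("solr");   // false: rejected by the prefix automaton
}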
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class SynonymTokenizer, method testMaxSizeHighlightTruncates.
public void testMaxSizeHighlightTruncates() throws Exception {
  TestHighlightRunner helper = new TestHighlightRunner() {

    @Override
    public void run() throws Exception {
      String goodWord = "goodtoken";
      CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken"));
      // we disable MockTokenizer checks because we will forcefully limit the
      // tokenstream and call end() before incrementToken() returns false.
      final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
      analyzer.setEnableChecks(false);
      TermQuery query = new TermQuery(new Term("data", goodWord));
      String match;
      StringBuilder sb = new StringBuilder();
      sb.append(goodWord);
      for (int i = 0; i < 10000; i++) {
        sb.append(" ");
        // only one stopword
        sb.append("stoppedtoken");
      }
      SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
      // equivalent to: new Highlighter(fm, new QueryTermScorer(query))
      Highlighter hg = getHighlighter(query, "data", fm);
      hg.setTextFragmenter(new NullFragmenter());
      hg.setMaxDocCharsToAnalyze(100);
      match = hg.getBestFragment(analyzer, "data", sb.toString());
      assertTrue("Matched text should be no more than 100 chars in length", match.length() < hg.getMaxDocCharsToAnalyze());
      // add another tokenized word to the overall length, but place it well beyond
      // the length of text under consideration (after a large slug of stop words + whitespace)
      sb.append(" ");
      sb.append(goodWord);
      match = hg.getBestFragment(analyzer, "data", sb.toString());
      assertTrue("Matched text should be no more than 100 chars in length", match.length() < hg.getMaxDocCharsToAnalyze());
    }
  };
  helper.start();
}
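A quick aside on the stop set above: CharacterRunAutomaton.run matches a candidate string against the whole automaton, so Automata.makeString yields an exact, single-token stop set. A minimal illustrative sketch (these calls are assumptions for demonstration, not part of the test):
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken"));
stopWords.run("stoppedtoken"); // true: exactly this token is filtered out
stopWords.run("goodtoken");    // false: every other token passes through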
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TestQueryBuilder, method testPhraseQueryPositionIncrements.
public void testPhraseQueryPositionIncrements() throws Exception {
  PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
  pqBuilder.add(new Term("field", "1"), 0);
  pqBuilder.add(new Term("field", "2"), 2);
  PhraseQuery expected = pqBuilder.build();
  CharacterRunAutomaton stopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopList);
  QueryBuilder builder = new QueryBuilder(analyzer);
  assertEquals(expected, builder.createPhraseQuery("field", "1 stop 2"));
}
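Note why the expected phrase puts "2" at position 2 rather than 1: the stop filter removes the middle token but leaves a position hole behind it. The bracketed RegExp makes the stop entry case-insensitive, and the automaton must match the entire token; a small illustrative sketch of what it accepts:
CharacterRunAutomaton stopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
stopList.run("stop");  // true, and so are "STOP" and "SToP"
stopList.run("stops"); // false: a partial match is not enough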
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TestPrecedenceQueryParser, method testBoost.
public void testBoost() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  PrecedenceQueryParser qp = new PrecedenceQueryParser();
  qp.setAnalyzer(oneStopAnalyzer);
  Query q = qp.parse("on^1.0", "field");
  assertNotNull(q);
  q = qp.parse("\"hello\"^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
  q = qp.parse("hello^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
  q = qp.parse("\"on\"^1.0", "field");
  assertNotNull(q);
  q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3", "field");
  assertNotNull(q);
}
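The single-entry stop set behind oneStopAnalyzer can be probed directly; a minimal sketch (illustrative only):
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
stopSet.run("on");   // true: the analyzer drops "on" before parsing
stopSet.run("only"); // false: longer tokens are analyzed normally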
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
The class TestMockAnalyzer, method testLength.
/** Test a configuration that behaves a lot like LengthFilter */
public void testLength() throws Exception {
  CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5);
  assertAnalyzesTo(a, "ok toolong fine notfine", new String[] { "ok", "fine" }, new int[] { 1, 2 });
}
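One subtlety worth spelling out: MockAnalyzer treats the automaton as a filter set, so tokens it accepts are removed. With .{5,}, every token of five or more characters is dropped, which is why only "ok" and "fine" survive and why the removed "toolong" leaves the position hole before "fine" (the expected increments { 1, 2 }). A quick illustrative check:
CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
length5.run("toolong"); // true  -> removed by the filter
length5.run("fine");    // false -> kept (only four characters)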