Use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.
Class XAnalyzingSuggester, method toAutomaton.
final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    Automaton automaton = ts2a.toAutomaton(ts);
    automaton = replaceSep(automaton);
    automaton = convertAutomaton(automaton);
    return automaton;
}
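For context, a minimal usage sketch of the same pipeline built directly on the public TokenStreamToAutomaton API; the analyzer, field name, and input text are placeholder assumptions, and the suggester-internal replaceSep/convertAutomaton post-processing steps are omitted:

static Automaton analyzeToAutomaton(Analyzer analyzer, String text) throws IOException {
    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    try (TokenStream ts = analyzer.tokenStream("field", text)) {
        // labels are the bytes of each analyzed token; byte 0 separates tokens
        return ts2a.toAutomaton(ts);
    }
}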
Use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.
Class XAnalyzingSuggester, method toFiniteStrings.
// EDIT: Adrien, needed by lookup providers
// NOTE: these XForks are unmaintainable, we need to get rid of them...
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
    final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    Automaton automaton;
    try (TokenStream ts = stream) {
        automaton = toAutomaton(ts, ts2a);
    }
    LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    Set<IntsRef> set = new HashSet<>();
    for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) {
        set.add(IntsRef.deepCopyOf(string));
    }
    return Collections.unmodifiableSet(set);
}
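A hypothetical usage sketch, assuming suggester is a configured XAnalyzingSuggester and indexAnalyzer is its analyzer; since toFiniteStrings closes the stream itself (via try-with-resources above), the stream can be passed directly:

Set<IntsRef> paths = suggester.toFiniteStrings(indexAnalyzer.tokenStream("", "new york"));
for (IntsRef path : paths) {
    // each IntsRef is one analyzed path through the token graph,
    // capped at maxGraphExpansions paths in total
}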
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
Class TestSynonymGraphFilter, method testRandomSyns.
public void testRandomSyns() throws Exception {
    int synCount = atLeast(10);
    double bias = random().nextDouble();
    boolean dedup = random().nextBoolean();
    boolean flatten = random().nextBoolean();
    SynonymMap.Builder b = new SynonymMap.Builder(dedup);
    List<OneSyn> syns = new ArrayList<>();
    // Makes random syns from random a / b tokens, mapping to random x / y tokens
    if (VERBOSE) {
        System.out.println("TEST: make " + synCount + " syns");
        System.out.println("  bias for a over b=" + bias);
        System.out.println("  dedup=" + dedup);
        System.out.println("  flatten=" + flatten);
    }
    int maxSynLength = 0;
    for (int i = 0; i < synCount; i++) {
        OneSyn syn = new OneSyn();
        syn.in = randomBinaryChars(1, 5, bias, 'a');
        syn.out = randomBinaryChars(1, 5, 0.5, 'x');
        syn.keepOrig = random().nextBoolean();
        syns.add(syn);
        maxSynLength = Math.max(maxSynLength, syn.in.length);
        if (VERBOSE) {
            System.out.println("  " + syn);
        }
        add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig);
    }
    // Compute max allowed lookahead for flatten filter:
    int maxFlattenLookahead = 0;
    if (flatten) {
        for (int i = 0; i < synCount; i++) {
            OneSyn syn1 = syns.get(i);
            int count = syn1.out.length;
            boolean keepOrig = syn1.keepOrig;
            for (int j = 0; j < synCount; j++) {
                OneSyn syn2 = syns.get(j);
                keepOrig |= syn2.keepOrig;
                if (syn1.in.equals(syn2.in)) {
                    count += syn2.out.length;
                }
            }
            if (keepOrig) {
                count += syn1.in.length;
            }
            maxFlattenLookahead = Math.max(maxFlattenLookahead, count);
        }
    }
    // Only used w/ VERBOSE:
    Analyzer aNoFlattened;
    if (VERBOSE) {
        aNoFlattened = getAnalyzer(b, true);
    } else {
        aNoFlattened = null;
    }
    Analyzer a;
    if (flatten) {
        a = getFlattenAnalyzer(b, true);
    } else {
        a = getAnalyzer(b, true);
    }
    int iters = atLeast(20);
    for (int iter = 0; iter < iters; iter++) {
        String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a'));
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter + " doc=" + doc);
        }
        Automaton expected = slowSynFilter(doc, syns, flatten);
        if (VERBOSE) {
            System.out.println("  expected:\n" + expected.toDot());
            if (flatten) {
                Automaton unflattened = toAutomaton(aNoFlattened.tokenStream("field", new StringReader(doc)));
                System.out.println("  actual unflattened:\n" + unflattened.toDot());
            }
        }
        Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc)));
        if (VERBOSE) {
            System.out.println("  actual:\n" + actual.toDot());
        }
        assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength,
                   synFilter.getMaxLookaheadUsed() <= maxSynLength);
        if (flatten) {
            assertTrue("flatten maxLookaheadUsed=" + flattenFilter.getMaxLookaheadUsed() + " maxFlattenLookahead=" + maxFlattenLookahead,
                       flattenFilter.getMaxLookaheadUsed() <= maxFlattenLookahead);
        }
        checkAnalysisConsistency(random(), a, random().nextBoolean(), doc);
        // The result can easily be non-deterministic, e.g. when a synonym maps to an
        // output token that also happens to be in the input:
        try {
            actual = Operations.determinize(actual, 50000);
        } catch (TooComplexToDeterminizeException tctde) {
            // Unfortunately the syns can easily create difficult-to-determinize graphs:
            assertTrue(approxEquals(actual, expected));
            continue;
        }
        try {
            expected = Operations.determinize(expected, 50000);
        } catch (TooComplexToDeterminizeException tctde) {
            // Unfortunately the syns can easily create difficult-to-determinize graphs:
            assertTrue(approxEquals(actual, expected));
            continue;
        }
        assertTrue(approxEquals(actual, expected));
        assertTrue(Operations.sameLanguage(actual, expected));
    }
    a.close();
}
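The determinize-then-compare pattern at the end of the loop stands on its own. A minimal sketch using only org.apache.lucene.util.automaton classes (the two-argument Operations.union shown here matches the pre-9.x Lucene API this test is from):

Automaton a1 = Operations.union(Automata.makeString("wifi"), Automata.makeString("wi-fi"));
Automaton a2 = Operations.union(Automata.makeString("wi-fi"), Automata.makeString("wifi"));
// sameLanguage requires deterministic automata, so determinize first;
// the int caps the number of determinized states, as in the test above
a1 = Operations.determinize(a1, 50000);
a2 = Operations.determinize(a2, 50000);
assert Operations.sameLanguage(a1, a2);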
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
Class GraphTokenStreamFiniteStrings, method build.
/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
    Automaton.Builder builder = new Automaton.Builder();
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    in.reset();
    int pos = -1;
    int prevIncr = 1;
    int state = -1;
    while (in.incrementToken()) {
        int currentIncr = posIncAtt.getPositionIncrement();
        if (pos == -1 && currentIncr < 1) {
            throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
        }
        // always use inc 1 while building, but save original increment
        int incr = Math.min(1, currentIncr);
        if (incr > 0) {
            pos += incr;
        }
        int endPos = pos + posLengthAtt.getPositionLength();
        while (state < endPos) {
            state = builder.createState();
        }
        BytesRef term = termBytesAtt.getBytesRef();
        int id = getTermID(currentIncr, prevIncr, term);
        builder.addTransition(pos, endPos, id);
        // only save last increment on non-zero increment in case we have multiple stacked tokens
        if (currentIncr > 0) {
            prevIncr = currentIncr;
        }
    }
    in.end();
    if (state != -1) {
        builder.setAccept(state, true);
    }
    return builder.finish();
}
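The four Automaton.Builder calls above (createState, addTransition, setAccept, finish) are the entire construction API. A minimal sketch of the same pattern on a hand-built graph where one token spans two positions, independent of any TokenStream; the transition labels are arbitrary placeholder term ids:

Automaton.Builder builder = new Automaton.Builder();
int s0 = builder.createState(); // position 0
int s1 = builder.createState(); // position 1
int s2 = builder.createState(); // position 2
// one posLength=2 token overlapping two posLength=1 tokens
builder.addTransition(s0, s2, 1); // e.g. "wifi"
builder.addTransition(s0, s1, 2); // e.g. "wi"
builder.addTransition(s1, s2, 3); // e.g. "fi"
builder.setAccept(s2, true);
Automaton a = builder.finish();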
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
Class TestGraphTokenizers, method testOverlappedTokensLattice.
public void testOverlappedTokensLattice() throws Exception {
    final TokenStream ts = new CannedTokenStream(new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 2),
        token("def", 1, 1)
    });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def");
    assertSameLanguage(Operations.union(a1, a2), ts);
}
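The s2a and join helpers are defined elsewhere in TestGraphTokenizers; a plausible sketch of what they do, on the assumption that tokens are concatenated with TokenStreamToAutomaton.POS_SEP as the position separator:

private Automaton s2a(String s) {
    return Automata.makeString(s);
}

private Automaton join(String... strings) {
    List<Automaton> as = new ArrayList<>();
    for (String s : strings) {
        as.add(s2a(s));
        as.add(Automata.makeChar(TokenStreamToAutomaton.POS_SEP));
    }
    as.remove(as.size() - 1); // drop the trailing separator
    return Operations.concatenate(as);
}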