Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache: class TokenStreamToAutomaton, method toAutomaton.
/** Pulls the graph (including {@link
 *  PositionLengthAttribute}) from the provided {@link
 *  TokenStream}, and creates the corresponding
 *  automaton where arcs are bytes (or Unicode code points
 *  if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
  final Automaton.Builder builder = new Automaton.Builder();
  builder.createState();

  final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

  in.reset();

  // Only temporarily holds states ahead of our current
  // position:
  final RollingBuffer<Position> positions = new Positions();

  int pos = -1;
  int freedPos = 0;
  Position posData = null;
  int maxOffset = 0;
  while (in.incrementToken()) {
    int posInc = posIncAtt.getPositionIncrement();
    if (preservePositionIncrements == false && posInc > 1) {
      posInc = 1;
    }
    assert pos > -1 || posInc > 0;

    if (posInc > 0) {
      // New node:
      pos += posInc;

      posData = positions.get(pos);
      assert posData.leaving == -1;

      if (posData.arriving == -1) {
        // No token ever arrived to this position
        if (pos == 0) {
          // OK: this is the first token
          posData.leaving = 0;
        } else {
          // This means there's a hole (eg, StopFilter
          // does this):
          posData.leaving = builder.createState();
          addHoles(builder, positions, pos);
        }
      } else {
        posData.leaving = builder.createState();
        builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
        if (posInc > 1) {
          // A token spanned over a hole; add holes
          // "under" it:
          addHoles(builder, positions, pos);
        }
      }

      while (freedPos <= pos) {
        Position freePosData = positions.get(freedPos);
        // don't free this position yet if we may still need to fill holes over it:
        if (freePosData.arriving == -1 || freePosData.leaving == -1) {
          break;
        }
        positions.freeBefore(freedPos);
        freedPos++;
      }
    }

    final int endPos = pos + posLengthAtt.getPositionLength();

    final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
    int[] termUnicode = null;
    final Position endPosData = positions.get(endPos);
    if (endPosData.arriving == -1) {
      endPosData.arriving = builder.createState();
    }

    int termLen;
    if (unicodeArcs) {
      final String utf16 = termUTF8.utf8ToString();
      termUnicode = new int[utf16.codePointCount(0, utf16.length())];
      termLen = termUnicode.length;
      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
        termUnicode[j++] = cp = utf16.codePointAt(i);
      }
    } else {
      termLen = termUTF8.length;
    }

    int state = posData.leaving;

    for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
      final int nextState = byteIDX == termLen - 1 ? endPosData.arriving : builder.createState();
      int c;
      if (unicodeArcs) {
        c = termUnicode[byteIDX];
      } else {
        c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
      }
      builder.addTransition(state, nextState, c);
      state = nextState;
    }

    maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
  }

  in.end();

  int endState = -1;

  int endPosInc = posIncAtt.getPositionIncrement();
  if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
    endPosInc = 1;
  }

  if (endPosInc > 0) {
    // there were hole(s) after the last token
    endState = builder.createState();

    // add trailing holes now:
    int lastState = endState;
    while (true) {
      int state1 = builder.createState();
      builder.addTransition(lastState, state1, HOLE);
      endPosInc--;
      if (endPosInc == 0) {
        builder.setAccept(state1, true);
        break;
      }
      int state2 = builder.createState();
      builder.addTransition(state1, state2, POS_SEP);
      lastState = state2;
    }
  } else {
    endState = -1;
  }

  pos++;
  while (pos <= positions.getMaxPos()) {
    posData = positions.get(pos);
    if (posData.arriving != -1) {
      if (endState != -1) {
        builder.addTransition(posData.arriving, endState, POS_SEP);
      } else {
        builder.setAccept(posData.arriving, true);
      }
    }
    pos++;
  }

  return builder.finish();
}
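For context, here is a minimal sketch of how a caller might drive this method. The analyzer choice, field name, and sample text are hypothetical (not from the Lucene sources), and the settings shown simply mirror the defaults used by the code above: byte-labeled arcs and preserved position increments.

public static Automaton tokenGraph(String text) throws IOException {
  // Hypothetical caller; assumes the usual Lucene imports plus
  // org.apache.lucene.analysis.TokenStreamToAutomaton and an analyzer on the classpath.
  Analyzer analyzer = new StandardAnalyzer();
  try (TokenStream ts = analyzer.tokenStream("body", text)) { // toAutomaton calls reset()/end() itself
    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    ts2a.setPreservePositionIncrements(true); // keep holes left by e.g. StopFilter
    ts2a.setUnicodeArcs(false);               // label arcs with UTF-8 bytes, as in the method above
    return ts2a.toAutomaton(ts);              // encodes every path through the token graph
  }
}

The resulting automaton separates token positions with POS_SEP arcs and marks deleted tokens with HOLE arcs, which is what lets suggesters built on top of it respect stopword holes.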
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache: class TestTermsEnum, method testIntersectStartTerm.
public void testIntersectStartTerm() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

  Document doc = new Document();
  doc.add(newStringField("field", "abc", Field.Store.NO));
  w.addDocument(doc);

  doc = new Document();
  doc.add(newStringField("field", "abd", Field.Store.NO));
  w.addDocument(doc);

  doc = new Document();
  doc.add(newStringField("field", "acd", Field.Store.NO));
  w.addDocument(doc);

  doc = new Document();
  doc.add(newStringField("field", "bcd", Field.Store.NO));
  w.addDocument(doc);

  w.forceMerge(1);
  DirectoryReader r = w.getReader();
  w.close();
  LeafReader sub = getOnlyLeafReader(r);
  Terms terms = sub.fields().terms("field");

  Automaton automaton = new RegExp(".*d", RegExp.NONE).toAutomaton();
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
  TermsEnum te;

  // should seek to startTerm
  te = terms.intersect(ca, new BytesRef("aad"));
  assertEquals("abd", te.next().utf8ToString());
  assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
  assertEquals("acd", te.next().utf8ToString());
  assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
  assertEquals("bcd", te.next().utf8ToString());
  assertEquals(3, te.postings(null, PostingsEnum.NONE).nextDoc());
  assertNull(te.next());

  // should fail to find ceil label on second arc, rewind
  te = terms.intersect(ca, new BytesRef("add"));
  assertEquals("bcd", te.next().utf8ToString());
  assertEquals(3, te.postings(null, PostingsEnum.NONE).nextDoc());
  assertNull(te.next());

  // should reach end
  te = terms.intersect(ca, new BytesRef("bcd"));
  assertNull(te.next());
  te = terms.intersect(ca, new BytesRef("ddd"));
  assertNull(te.next());

  r.close();
  dir.close();
}
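The same intersect pattern generalizes beyond the fixed assertions above. A hedged sketch (the helper name is invented and is not part of TestTermsEnum; imports as in the test file, plus java.util.List and java.util.ArrayList):

static List<String> matchingTerms(Terms terms, String regexp, BytesRef startTerm) throws IOException {
  // Compile the regexp once, then walk only the matching terms.
  Automaton automaton = new RegExp(regexp, RegExp.NONE).toAutomaton();
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
  // A null startTerm begins at the first matching term; a non-null startTerm
  // is exclusive, exactly as the assertions above demonstrate.
  TermsEnum te = terms.intersect(ca, startTerm);
  List<String> matches = new ArrayList<>();
  for (BytesRef term = te.next(); term != null; term = te.next()) {
    matches.add(term.utf8ToString());
  }
  return matches;
}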
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache: class TestTermsEnum, method testIntersectRandom.
// Tests Terms.intersect
public void testIntersectRandom() throws IOException {
  final Directory dir = newDirectory();
  final RandomIndexWriter w = new RandomIndexWriter(random(), dir);

  final int numTerms = atLeast(300);
  //final int numTerms = 50;

  final Set<String> terms = new HashSet<>();
  final Collection<String> pendingTerms = new ArrayList<>();
  final Map<BytesRef, Integer> termToID = new HashMap<>();
  int id = 0;
  while (terms.size() != numTerms) {
    final String s = getRandomString();
    if (!terms.contains(s)) {
      terms.add(s);
      pendingTerms.add(s);
      if (random().nextInt(20) == 7) {
        addDoc(w, pendingTerms, termToID, id++);
      }
    }
  }
  addDoc(w, pendingTerms, termToID, id++);

  final BytesRef[] termsArray = new BytesRef[terms.size()];
  final Set<BytesRef> termsSet = new HashSet<>();
  {
    int upto = 0;
    for (String s : terms) {
      final BytesRef b = new BytesRef(s);
      termsArray[upto++] = b;
      termsSet.add(b);
    }
    Arrays.sort(termsArray);
  }

  if (VERBOSE) {
    System.out.println("\nTEST: indexed terms (unicode order):");
    for (BytesRef t : termsArray) {
      System.out.println(" " + t.utf8ToString() + " -> id:" + termToID.get(t));
    }
  }

  final IndexReader r = w.getReader();
  w.close();

  int[] docIDToID = new int[r.maxDoc()];
  NumericDocValues values = MultiDocValues.getNumericValues(r, "id");
  for (int i = 0; i < r.maxDoc(); i++) {
    assertEquals(i, values.nextDoc());
    docIDToID[i] = (int) values.longValue();
  }

  for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {
    // TODO: can we also test infinite As here...?

    // From the random terms, pick some ratio and compile an
    // automaton:
    final Set<String> acceptTerms = new HashSet<>();
    final TreeSet<BytesRef> sortedAcceptTerms = new TreeSet<>();
    final double keepPct = random().nextDouble();
    Automaton a;

    if (iter == 0) {
      if (VERBOSE) {
        System.out.println("\nTEST: empty automaton");
      }
      a = Automata.makeEmpty();
    } else {
      if (VERBOSE) {
        System.out.println("\nTEST: keepPct=" + keepPct);
      }
      for (String s : terms) {
        final String s2;
        if (random().nextDouble() <= keepPct) {
          s2 = s;
        } else {
          s2 = getRandomString();
        }
        acceptTerms.add(s2);
        sortedAcceptTerms.add(new BytesRef(s2));
      }
      a = Automata.makeStringUnion(sortedAcceptTerms);
    }

    final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);

    final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
    final Set<BytesRef> acceptTermsSet = new HashSet<>();
    int upto = 0;
    for (String s : acceptTerms) {
      final BytesRef b = new BytesRef(s);
      acceptTermsArray[upto++] = b;
      acceptTermsSet.add(b);
      assertTrue(accepts(c, b));
    }
    Arrays.sort(acceptTermsArray);

    if (VERBOSE) {
      System.out.println("\nTEST: accept terms (unicode order):");
      for (BytesRef t : acceptTermsArray) {
        System.out.println(" " + t.utf8ToString() + (termsSet.contains(t) ? " (exists)" : ""));
      }
      System.out.println(a.toDot());
    }

    for (int iter2 = 0; iter2 < 100; iter2++) {
      final BytesRef startTerm = acceptTermsArray.length == 0 || random().nextBoolean() ? null : acceptTermsArray[random().nextInt(acceptTermsArray.length)];

      if (VERBOSE) {
        System.out.println("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.utf8ToString()));
        if (startTerm != null) {
          int state = 0;
          for (int idx = 0; idx < startTerm.length; idx++) {
            final int label = startTerm.bytes[startTerm.offset + idx] & 0xff;
            System.out.println(" state=" + state + " label=" + label);
            state = c.runAutomaton.step(state, label);
            assertTrue(state != -1);
          }
          System.out.println(" state=" + state);
        }
      }

      final TermsEnum te = MultiFields.getTerms(r, "f").intersect(c, startTerm);

      int loc;
      if (startTerm == null) {
        loc = 0;
      } else {
        loc = Arrays.binarySearch(termsArray, BytesRef.deepCopyOf(startTerm));
        if (loc < 0) {
          loc = -(loc + 1);
        } else {
          // startTerm exists in index
          loc++;
        }
      }

      while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc])) {
        loc++;
      }

      PostingsEnum postingsEnum = null;
      while (loc < termsArray.length) {
        final BytesRef expected = termsArray[loc];
        final BytesRef actual = te.next();
        if (VERBOSE) {
          System.out.println("TEST: next() expected=" + expected.utf8ToString() + " actual=" + (actual == null ? "null" : actual.utf8ToString()));
        }
        assertEquals(expected, actual);
        assertEquals(1, te.docFreq());
        postingsEnum = TestUtil.docs(random(), te, postingsEnum, PostingsEnum.NONE);
        final int docID = postingsEnum.nextDoc();
        assertTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(docIDToID[docID], termToID.get(expected).intValue());
        do {
          loc++;
        } while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc]));
      }
      assertNull(te.next());
    }
  }

  r.close();
  dir.close();
}
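The accepts(c, b) call above is a private helper of the test that is not shown here. A plausible reconstruction (an assumption, not copied from the source) simply runs the raw term bytes through the compiled ByteRunAutomaton:

private static boolean accepts(CompiledAutomaton c, BytesRef b) {
  // The CompiledAutomaton above was built over UTF-8 bytes, so membership is a
  // straight run of the term's bytes through the byte-level run automaton.
  return c.runAutomaton.run(b.bytes, b.offset, b.length);
}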
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache: class TestAutomatonQuery, method testRewritePrefix.
/**
 * Test that rewriting to a prefix query works as expected and preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  Automaton pfx = Automata.makeString("do");
  Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
  assertEquals(3, automatonQueryNrHits(aq));
}
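As a quick sanity sketch (not part of the test), the concatenated automaton accepts exactly the strings with the prefix "do"; CharacterRunAutomaton gives a cheap in-memory membership check:

Automaton pfx = Automata.makeString("do");
Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
// CharacterRunAutomaton determinizes internally and steps over code points:
CharacterRunAutomaton run = new CharacterRunAutomaton(prefixAutomaton);
assertTrue(run.run("do"));
assertTrue(run.run("dogs"));
assertFalse(run.run("adobe"));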
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache: class FuzzySuggesterTest, method testRandom.
public void testRandom() throws Exception {
  int numQueries = atLeast(100);

  final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
  final TreeSet<String> allPrefixes = new TreeSet<>();
  final Set<String> seen = new HashSet<>();

  Input[] keys = new Input[numQueries];

  boolean preserveSep = random().nextBoolean();
  boolean unicodeAware = random().nextBoolean();

  final int numStopChars = random().nextInt(10);
  final boolean preserveHoles = random().nextBoolean();

  if (VERBOSE) {
    System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
  }

  for (int i = 0; i < numQueries; i++) {
    int numTokens = TestUtil.nextInt(random(), 1, 4);
    String key;
    String analyzedKey;
    while (true) {
      key = "";
      analyzedKey = "";
      boolean lastRemoved = false;
      for (int token = 0; token < numTokens; token++) {
        String s;
        while (true) {
          // TODO: would be nice to fix this slowCompletor/comparator to
          // use full range, but we might lose some coverage too...
          s = TestUtil.randomSimpleString(random());
          if (s.length() > 0) {
            if (token > 0) {
              key += " ";
            }
            if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length()) - 1) != ' ' : analyzedKey.charAt(analyzedKey.length() - 1) != ' ')) {
              analyzedKey += " ";
            }
            key += s;
            if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
              if (preserveSep && preserveHoles) {
                analyzedKey += '