Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.
From class LuceneTestCase, method assertTermsEquals:
/**
 * Terms API equivalency check.
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
  if (leftTerms == null || rightTerms == null) {
    assertNull(info, leftTerms);
    assertNull(info, rightTerms);
    return;
  }
  assertTermsStatisticsEquals(info, leftTerms, rightTerms);
  assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
  assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
  assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());
  TermsEnum leftTermsEnum = leftTerms.iterator();
  TermsEnum rightTermsEnum = rightTerms.iterator();
  assertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true);
  assertTermsSeekingEquals(info, leftTerms, rightTerms);
  if (deep) {
    int numIntersections = atLeast(3);
    for (int i = 0; i < numIntersections; i++) {
      String re = AutomatonTestUtil.randomRegexp(random());
      CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
      if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // TODO: test start term too
        TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
        TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
        assertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, rarely());
      }
    }
  }
}
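The deep branch above only calls Terms.intersect when the compiled automaton's type is NORMAL, because the default Terms.intersect implementation rejects the specialized types. A minimal standalone sketch of that guard, assuming two Terms instances for the same field from independently built indexes (assertSameIntersection is a hypothetical helper, not part of LuceneTestCase):

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import static org.junit.Assert.*;

static void assertSameIntersection(Terms leftTerms, Terms rightTerms, String regex) throws IOException {
  CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
  if (automaton.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    // Non-NORMAL types (e.g. SINGLE, ALL, NONE) are answered by
    // specialized enums; Terms.intersect only accepts NORMAL.
    return;
  }
  TermsEnum left = leftTerms.intersect(automaton, null);
  TermsEnum right = rightTerms.intersect(automaton, null);
  BytesRef term;
  while ((term = left.next()) != null) {
    // Both enums must visit exactly the same accepted terms, in order:
    assertEquals(term, right.next());
  }
  assertNull(right.next());
}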
Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.
From class BaseDocValuesFormatTestCase, method testSortedSetTermsEnum:
public void testSortedSetTermsEnum() throws IOException {
  Directory directory = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
  iwconfig.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
  Document doc = new Document();
  doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
  doc.add(new SortedSetDocValuesField("field", new BytesRef("world")));
  doc.add(new SortedSetDocValuesField("field", new BytesRef("beer")));
  iwriter.addDocument(doc);
  DirectoryReader ireader = iwriter.getReader();
  iwriter.close();
  SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
  assertEquals(3, dv.getValueCount());
  TermsEnum termsEnum = dv.termsEnum();
  // next()
  assertEquals("beer", termsEnum.next().utf8ToString());
  assertEquals(0, termsEnum.ord());
  assertEquals("hello", termsEnum.next().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertEquals("world", termsEnum.next().utf8ToString());
  assertEquals(2, termsEnum.ord());
  // seekCeil()
  assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
  assertEquals("hello", termsEnum.term().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
  assertEquals("beer", termsEnum.term().utf8ToString());
  assertEquals(0, termsEnum.ord());
  assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));
  // seekExact()
  assertTrue(termsEnum.seekExact(new BytesRef("beer")));
  assertEquals("beer", termsEnum.term().utf8ToString());
  assertEquals(0, termsEnum.ord());
  assertTrue(termsEnum.seekExact(new BytesRef("hello")));
  assertEquals("hello", termsEnum.term().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertTrue(termsEnum.seekExact(new BytesRef("world")));
  assertEquals("world", termsEnum.term().utf8ToString());
  assertEquals(2, termsEnum.ord());
  assertFalse(termsEnum.seekExact(new BytesRef("bogus")));
  // seek(ord)
  termsEnum.seekExact(0);
  assertEquals("beer", termsEnum.term().utf8ToString());
  assertEquals(0, termsEnum.ord());
  termsEnum.seekExact(1);
  assertEquals("hello", termsEnum.term().utf8ToString());
  assertEquals(1, termsEnum.ord());
  termsEnum.seekExact(2);
  assertEquals("world", termsEnum.term().utf8ToString());
  assertEquals(2, termsEnum.ord());
  // NORMAL automaton
  termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
  assertEquals("hello", termsEnum.next().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertEquals("world", termsEnum.next().utf8ToString());
  assertEquals(2, termsEnum.ord());
  assertNull(termsEnum.next());
  // SINGLE automaton
  termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
  assertEquals("hello", termsEnum.next().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertNull(termsEnum.next());
  ireader.close();
  directory.close();
}
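As the test shows, SortedSetDocValues.intersect returns an enum that visits only the accepted terms, with their ordinals intact. A minimal sketch of using that to collect matching ords instead of asserting them one by one, assuming the same three-value "field" and the dv handle above (before ireader.close(), with java.util.List/ArrayList imported):

// Collect ords of all doc-values terms matching a regexp; ord() is
// valid while the enum is positioned on a term.
CompiledAutomaton ca = new CompiledAutomaton(new RegExp(".*l.*").toAutomaton());
TermsEnum matches = dv.intersect(ca);
List<Long> matchingOrds = new ArrayList<>();
while (matches.next() != null) {
  matchingOrds.add(matches.ord());
}
// For the three values indexed above, matchingOrds is [1, 2],
// i.e. "hello" and "world"; "beer" (ord 0) contains no 'l'.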
Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.
From class RandomPostingsTester, method testTermsOneThread:
private void testTermsOneThread(Random random, Fields fieldsSource, EnumSet<Option> options, IndexOptions maxTestOptions, IndexOptions maxIndexOptions, boolean alwaysTestMax) throws IOException {
  ThreadState threadState = new ThreadState();
  // Test random terms/fields:
  List<TermState> termStates = new ArrayList<>();
  List<FieldAndTerm> termStateTerms = new ArrayList<>();
  boolean supportsOrds = true;
  Collections.shuffle(allTerms, random);
  int upto = 0;
  while (upto < allTerms.size()) {
    boolean useTermState = termStates.size() != 0 && random.nextInt(5) == 1;
    boolean useTermOrd = supportsOrds && useTermState == false && random.nextInt(5) == 1;
    FieldAndTerm fieldAndTerm;
    TermsEnum termsEnum;
    TermState termState = null;
    if (!useTermState) {
      // Seek by random field+term:
      fieldAndTerm = allTerms.get(upto++);
      if (LuceneTestCase.VERBOSE) {
        if (useTermOrd) {
          System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() + " using ord=" + fieldAndTerm.ord);
        } else {
          System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
        }
      }
    } else {
      // Seek by previously saved TermState:
      int idx = random.nextInt(termStates.size());
      fieldAndTerm = termStateTerms.get(idx);
      if (LuceneTestCase.VERBOSE) {
        System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
      }
      termState = termStates.get(idx);
    }
    Terms terms = fieldsSource.terms(fieldAndTerm.field);
    assertNotNull(terms);
    termsEnum = terms.iterator();
    if (!useTermState) {
      if (useTermOrd) {
        // Try seek by ord sometimes:
        try {
          termsEnum.seekExact(fieldAndTerm.ord);
        } catch (UnsupportedOperationException uoe) {
          supportsOrds = false;
          assertTrue(termsEnum.seekExact(fieldAndTerm.term));
        }
      } else {
        assertTrue(termsEnum.seekExact(fieldAndTerm.term));
      }
    } else {
      termsEnum.seekExact(fieldAndTerm.term, termState);
    }
    // Check we really sought to the right place:
    assertEquals(fieldAndTerm.term, termsEnum.term());
    long termOrd;
    if (supportsOrds) {
      try {
        termOrd = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        termOrd = -1;
      }
    } else {
      termOrd = -1;
    }
    if (termOrd != -1) {
      // PostingsFormat supports ords
      assertEquals(fieldAndTerm.ord, termsEnum.ord());
    }
    boolean savedTermState = false;
    if (options.contains(Option.TERM_STATE) && !useTermState && random.nextInt(5) == 1) {
      // Save away this TermState:
      termStates.add(termsEnum.termState());
      termStateTerms.add(fieldAndTerm);
      savedTermState = true;
    }
    verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
    // Sometimes save term state after pulling the enum:
    if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random.nextInt(5) == 1) {
      // Save away this TermState:
      termStates.add(termsEnum.termState());
      termStateTerms.add(fieldAndTerm);
      useTermState = true;
    }
    // Sometimes make sure we can pull another enum
    // from the same term:
    if (alwaysTestMax || random.nextInt(10) == 7) {
      // Try same term again
      if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: try enum again on same term");
      }
      verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
    }
  }
  // Test Terms.intersect:
  for (String field : fields.keySet()) {
    while (true) {
      Automaton a = AutomatonTestUtil.randomAutomaton(random);
      CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
      if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // Keep retrying until we get an automaton that will really "use" the PF's intersect code:
        continue;
      }
      // System.out.println("A:\n" + a.toDot());
      BytesRef startTerm = null;
      if (random.nextBoolean()) {
        RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
        for (int iter = 0; iter < 100; iter++) {
          int[] codePoints = ras.getRandomAcceptedString(random);
          if (codePoints.length == 0) {
            continue;
          }
          startTerm = new BytesRef(UnicodeUtil.newString(codePoints, 0, codePoints.length));
          break;
        }
        // Don't allow empty string startTerm:
        if (startTerm == null) {
          continue;
        }
      }
      TermsEnum intersected = fieldsSource.terms(field).intersect(ca, startTerm);
      Set<BytesRef> intersectedTerms = new HashSet<BytesRef>();
      BytesRef term;
      while ((term = intersected.next()) != null) {
        if (startTerm != null) {
          // NOTE: not <=
          assertTrue(startTerm.compareTo(term) < 0);
        }
        intersectedTerms.add(BytesRef.deepCopyOf(term));
        verifyEnum(random, threadState, field, term, intersected, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
      }
      if (ca.runAutomaton == null) {
        assertTrue(intersectedTerms.isEmpty());
      } else {
        for (BytesRef term2 : fields.get(field).keySet()) {
          boolean expected;
          if (startTerm != null && startTerm.compareTo(term2) >= 0) {
            expected = false;
          } else {
            expected = ca.runAutomaton.run(term2.bytes, term2.offset, term2.length);
          }
          assertEquals("term=" + term2, expected, intersectedTerms.contains(term2));
        }
      }
      break;
    }
  }
}
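Two details above are worth isolating: the five-argument constructor CompiledAutomaton(automaton, finite, simplify, maxDeterminizedStates, isBinary) with simplify enabled can classify the automaton as a non-NORMAL type, which is why the loop retries, and intersect(ca, startTerm) returns accepted terms strictly after startTerm (hence the "NOTE: not <=" assertion). A sketch of the startTerm behavior under assumed data, where terms is a Terms instance whose field holds exactly "app", "apple", and "apply":

// "app.*" is neither empty, all-accepting, nor a single string, so even
// with simplify=true its type stays NORMAL and intersect() is legal.
Automaton a = new RegExp("app.*", RegExp.NONE).toAutomaton();
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
// Resume after "apple": only terms sorting strictly AFTER it are returned.
TermsEnum te = terms.intersect(ca, new BytesRef("apple"));
BytesRef t;
while ((t = te.next()) != null) {
  System.out.println(t.utf8ToString()); // prints "apply"; "app" and "apple" are skipped
}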
Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.
From class TestFSTs, method testRealTerms:
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
  final LineFileDocs docs = new LineFileDocs(random());
  final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
  final Path tempDir = createTempDir("fstlines");
  final Directory dir = newFSDirectory(tempDir);
  final IndexWriter writer = new IndexWriter(dir, conf);
  Document doc;
  int docCount = 0;
  while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
    writer.addDocument(doc);
    docCount++;
  }
  IndexReader r = DirectoryReader.open(writer);
  writer.close();
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
  boolean storeOrd = random().nextBoolean();
  if (VERBOSE) {
    if (storeOrd) {
      System.out.println("FST stores ord");
    } else {
      System.out.println("FST stores docFreq");
    }
  }
  Terms terms = MultiFields.getTerms(r, "body");
  if (terms != null) {
    final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
    final TermsEnum termsEnum = terms.iterator();
    if (VERBOSE) {
      System.out.println("TEST: got termsEnum=" + termsEnum);
    }
    BytesRef term;
    int ord = 0;
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
    while ((term = termsEnum.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertNotNull(term2);
      assertEquals(term, term2);
      assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
      assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termsEnum.docFreq();
      }
      builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
      ord++;
      if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
    }
    FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
    }
    if (ord > 0) {
      final Random random = new Random(random().nextLong());
      // Now confirm BytesRefFSTEnum and TermsEnum act the
      // same:
      final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
      int num = atLeast(1000);
      for (int iter = 0; iter < num; iter++) {
        final BytesRef randomTerm = new BytesRef(getRandomString(random));
        if (VERBOSE) {
          System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
        }
        final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
        final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
        if (seekResult == TermsEnum.SeekStatus.END) {
          assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
        } else {
          assertSame(termsEnum, fstEnum, storeOrd);
          for (int nextIter = 0; nextIter < 10; nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              if (storeOrd) {
                System.out.println("  ord=" + termsEnum.ord());
              }
            }
            if (termsEnum.next() != null) {
              if (VERBOSE) {
                System.out.println("  term=" + termsEnum.term().utf8ToString());
              }
              assertNotNull(fstEnum.next());
              assertSame(termsEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
                System.out.println("  end!");
              }
              BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
              if (nextResult != null) {
                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                fail();
              }
              break;
            }
          }
        }
      }
    }
  }
  r.close();
  dir.close();
}
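Note why this test builds its accept-everything automaton with new CompiledAutomaton(automaton, false, false): with simplification enabled, an automaton that accepts all strings is classified as AUTOMATON_TYPE.ALL, and the default Terms.intersect rejects anything that is not NORMAL. Passing simplify=false keeps the type NORMAL, so the intersected enum genuinely walks the codec's intersect path over every term. A small sketch of the difference (behavior as of the Lucene versions in this repository), assuming any non-null terms instance:

Automaton all = new RegExp(".*", RegExp.NONE).toAutomaton();
CompiledAutomaton simplified = new CompiledAutomaton(all, false, true);
// simplified.type == CompiledAutomaton.AUTOMATON_TYPE.ALL:
// terms.intersect(simplified, null) would throw IllegalArgumentException.
CompiledAutomaton unsimplified = new CompiledAutomaton(all, false, false);
// unsimplified.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL:
TermsEnum every = terms.intersect(unsimplified, null); // visits every term in order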
Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.
From class TestTermsEnum, method testIntersectEmptyString:
public void testIntersectEmptyString() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(newStringField("field", "", Field.Store.NO));
  doc.add(newStringField("field", "abc", Field.Store.NO));
  w.addDocument(doc);
  doc = new Document();
  // Add the empty string to both documents, so that singletonDocID == -1.
  // For an FST-based term dict, we expect to see the first arc
  // flagged with HAS_FINAL_OUTPUT.
  doc.add(newStringField("field", "abc", Field.Store.NO));
  doc.add(newStringField("field", "", Field.Store.NO));
  w.addDocument(doc);
  w.forceMerge(1);
  DirectoryReader r = w.getReader();
  w.close();
  LeafReader sub = getOnlyLeafReader(r);
  Terms terms = sub.fields().terms("field");
  // accept ALL
  Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
  TermsEnum te = terms.intersect(ca, null);
  PostingsEnum de;
  assertEquals("", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertEquals("abc", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertNull(te.next());
  // pass empty string
  te = terms.intersect(ca, new BytesRef(""));
  assertEquals("abc", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertNull(te.next());
  r.close();
  dir.close();
}
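For the non-NORMAL types that Terms.intersect rejects, CompiledAutomaton.getTermsEnum(Terms) is the general entry point: it dispatches to an empty enum, a single-term enum, or the full iterator depending on the compiled type. A sketch against the "field" built above (before r.close()), assuming the single-argument constructor simplifies the regexp "abc" down to a SINGLE-type automaton:

CompiledAutomaton single = new CompiledAutomaton(new RegExp("abc", RegExp.NONE).toAutomaton());
// single.type == CompiledAutomaton.AUTOMATON_TYPE.SINGLE, so
// terms.intersect(single, null) would throw; use getTermsEnum instead:
TermsEnum abc = single.getTermsEnum(terms);
assertEquals("abc", abc.next().utf8ToString());
assertNull(abc.next());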