
Example 1 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

From the class LuceneTestCase, method assertTermsEquals.

/**
 * Terms API equivalency.
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
    if (leftTerms == null || rightTerms == null) {
        assertNull(info, leftTerms);
        assertNull(info, rightTerms);
        return;
    }
    assertTermsStatisticsEquals(info, leftTerms, rightTerms);
    assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
    assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
    assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());
    TermsEnum leftTermsEnum = leftTerms.iterator();
    TermsEnum rightTermsEnum = rightTerms.iterator();
    assertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true);
    assertTermsSeekingEquals(info, leftTerms, rightTerms);
    if (deep) {
        int numIntersections = atLeast(3);
        for (int i = 0; i < numIntersections; i++) {
            String re = AutomatonTestUtil.randomRegexp(random());
            CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
            if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // TODO: test start term too
                TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
                TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
                assertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, rarely());
            }
        }
    }
}
Also used: RegExp(org.apache.lucene.util.automaton.RegExp) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton)
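
The pattern above is the generic one for CompiledAutomaton against a postings term dictionary: compile a regular expression, and only call Terms.intersect when the compiled type is NORMAL, because the default Terms.intersect rejects the simplified SINGLE/ALL/NONE types. A minimal, self-contained sketch of that pattern (the helper name and the fixed regexp are illustrative, not part of the test above):

import java.io.IOException;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

static void printMatchingTerms(Terms terms, String regexp) throws IOException {
    CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    if (automaton.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // SINGLE/ALL/NONE are the simplified types; the generic Terms.intersect only
        // accepts NORMAL automata, so fall back to iterator()/seekExact for those.
        return;
    }
    // null start term: begin at the first term the automaton accepts
    TermsEnum te = terms.intersect(automaton, null);
    BytesRef term;
    while ((term = te.next()) != null) {
        System.out.println(term.utf8ToString());
    }
}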

Example 2 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

From the class BaseDocValuesFormatTestCase, method testSortedSetTermsEnum.

public void testSortedSetTermsEnum() throws IOException {
    Directory directory = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
    iwconfig.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
    Document doc = new Document();
    doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
    doc.add(new SortedSetDocValuesField("field", new BytesRef("world")));
    doc.add(new SortedSetDocValuesField("field", new BytesRef("beer")));
    iwriter.addDocument(doc);
    DirectoryReader ireader = iwriter.getReader();
    iwriter.close();
    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
    assertEquals(3, dv.getValueCount());
    TermsEnum termsEnum = dv.termsEnum();
    // next()
    assertEquals("beer", termsEnum.next().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertEquals("hello", termsEnum.next().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals("world", termsEnum.next().utf8ToString());
    assertEquals(2, termsEnum.ord());
    // seekCeil()
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));
    // seekExact()
    assertTrue(termsEnum.seekExact(new BytesRef("beer")));
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertTrue(termsEnum.seekExact(new BytesRef("hello")));
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertTrue(termsEnum.seekExact(new BytesRef("world")));
    assertEquals("world", termsEnum.term().utf8ToString());
    assertEquals(2, termsEnum.ord());
    assertFalse(termsEnum.seekExact(new BytesRef("bogus")));
    // seek(ord)
    termsEnum.seekExact(0);
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    termsEnum.seekExact(1);
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    termsEnum.seekExact(2);
    assertEquals("world", termsEnum.term().utf8ToString());
    assertEquals(2, termsEnum.ord());
    // NORMAL automaton
    termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
    assertEquals("hello", termsEnum.next().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals("world", termsEnum.next().utf8ToString());
    assertEquals(2, termsEnum.ord());
    assertNull(termsEnum.next());
    // SINGLE automaton
    termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
    assertEquals("hello", termsEnum.next().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertNull(termsEnum.next());
    ireader.close();
    directory.close();
}
Also used: RegExp(org.apache.lucene.util.automaton.RegExp) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)
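
The intersect call here runs against the field's sorted term dictionary, so the ords it reports are ordinals into that dictionary. A short sketch, assuming the iterator-based SortedSetDocValues API used in this codebase, of collecting the matching ords and then scanning documents for them (the field name, regexp, and helper name are illustrative):

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

static void printDocsWithMatchingValues(LeafReader reader) throws IOException {
    SortedSetDocValues dv = reader.getSortedSetDocValues("field"); // assumes the field exists
    TermsEnum matching = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
    Set<Long> matchingOrds = new HashSet<>();
    while (matching.next() != null) {
        matchingOrds.add(matching.ord()); // ordinal of the accepted term in the sorted dictionary
    }
    // Walk the documents and report any doc holding at least one matching value.
    for (int doc = dv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dv.nextDoc()) {
        for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
            if (matchingOrds.contains(ord)) {
                System.out.println("doc " + doc + " has a matching value");
                break;
            }
        }
    }
}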

Example 3 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

From the class RandomPostingsTester, method testTermsOneThread.

private void testTermsOneThread(Random random, Fields fieldsSource, EnumSet<Option> options, IndexOptions maxTestOptions, IndexOptions maxIndexOptions, boolean alwaysTestMax) throws IOException {
    ThreadState threadState = new ThreadState();
    // Test random terms/fields:
    List<TermState> termStates = new ArrayList<>();
    List<FieldAndTerm> termStateTerms = new ArrayList<>();
    boolean supportsOrds = true;
    Collections.shuffle(allTerms, random);
    int upto = 0;
    while (upto < allTerms.size()) {
        boolean useTermState = termStates.size() != 0 && random.nextInt(5) == 1;
        boolean useTermOrd = supportsOrds && useTermState == false && random.nextInt(5) == 1;
        FieldAndTerm fieldAndTerm;
        TermsEnum termsEnum;
        TermState termState = null;
        if (!useTermState) {
            // Seek by random field+term:
            fieldAndTerm = allTerms.get(upto++);
            if (LuceneTestCase.VERBOSE) {
                if (useTermOrd) {
                    System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() + " using ord=" + fieldAndTerm.ord);
                } else {
                    System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
                }
            }
        } else {
            // Seek by previous saved TermState
            int idx = random.nextInt(termStates.size());
            fieldAndTerm = termStateTerms.get(idx);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
            }
            termState = termStates.get(idx);
        }
        Terms terms = fieldsSource.terms(fieldAndTerm.field);
        assertNotNull(terms);
        termsEnum = terms.iterator();
        if (!useTermState) {
            if (useTermOrd) {
                // Try seek by ord sometimes:
                try {
                    termsEnum.seekExact(fieldAndTerm.ord);
                } catch (UnsupportedOperationException uoe) {
                    supportsOrds = false;
                    assertTrue(termsEnum.seekExact(fieldAndTerm.term));
                }
            } else {
                assertTrue(termsEnum.seekExact(fieldAndTerm.term));
            }
        } else {
            termsEnum.seekExact(fieldAndTerm.term, termState);
        }
        // check we really seeked to the right place
        assertEquals(fieldAndTerm.term, termsEnum.term());
        long termOrd;
        if (supportsOrds) {
            try {
                termOrd = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                termOrd = -1;
            }
        } else {
            termOrd = -1;
        }
        if (termOrd != -1) {
            // PostingsFormat supports ords
            assertEquals(fieldAndTerm.ord, termsEnum.ord());
        }
        boolean savedTermState = false;
        if (options.contains(Option.TERM_STATE) && !useTermState && random.nextInt(5) == 1) {
            // Save away this TermState:
            termStates.add(termsEnum.termState());
            termStateTerms.add(fieldAndTerm);
            savedTermState = true;
        }
        verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
        // Sometimes save term state after pulling the enum:
        if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random.nextInt(5) == 1) {
            // Save away this TermState:
            termStates.add(termsEnum.termState());
            termStateTerms.add(fieldAndTerm);
            useTermState = true;
        }
        // Sometimes make sure another enum can be pulled and verified on the same term:
        if (alwaysTestMax || random.nextInt(10) == 7) {
            // Try same term again
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: try enum again on same term");
            }
            verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
        }
    }
    // Test Terms.intersect:
    for (String field : fields.keySet()) {
        while (true) {
            Automaton a = AutomatonTestUtil.randomAutomaton(random);
            CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
            if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // Keep retrying until we get an A that will really "use" the PF's intersect code:
                continue;
            }
            // System.out.println("A:\n" + a.toDot());
            BytesRef startTerm = null;
            if (random.nextBoolean()) {
                RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
                for (int iter = 0; iter < 100; iter++) {
                    int[] codePoints = ras.getRandomAcceptedString(random);
                    if (codePoints.length == 0) {
                        continue;
                    }
                    startTerm = new BytesRef(UnicodeUtil.newString(codePoints, 0, codePoints.length));
                    break;
                }
                // Don't allow empty string startTerm:
                if (startTerm == null) {
                    continue;
                }
            }
            TermsEnum intersected = fieldsSource.terms(field).intersect(ca, startTerm);
            Set<BytesRef> intersectedTerms = new HashSet<BytesRef>();
            BytesRef term;
            while ((term = intersected.next()) != null) {
                if (startTerm != null) {
                    // NOTE: not <=
                    assertTrue(startTerm.compareTo(term) < 0);
                }
                intersectedTerms.add(BytesRef.deepCopyOf(term));
                verifyEnum(random, threadState, field, term, intersected, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
            }
            if (ca.runAutomaton == null) {
                assertTrue(intersectedTerms.isEmpty());
            } else {
                for (BytesRef term2 : fields.get(field).keySet()) {
                    boolean expected;
                    if (startTerm != null && startTerm.compareTo(term2) >= 0) {
                        expected = false;
                    } else {
                        expected = ca.runAutomaton.run(term2.bytes, term2.offset, term2.length);
                    }
                    assertEquals("term=" + term2, expected, intersectedTerms.contains(term2));
                }
            }
            break;
        }
    }
}
Also used: CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) RandomAcceptedStrings(org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings) ArrayList(java.util.ArrayList) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)
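
Two details of this intersect test are easy to miss: the long CompiledAutomaton constructor spells out the compilation options, and the start term passed to intersect is exclusive, so only accepted terms strictly greater than it are returned (hence the "NOTE: not <=" comment on the assertion). A hedged sketch with the constructor arguments annotated (the regexp, start term, and helper name are illustrative):

import java.io.IOException;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

static void intersectFromStartTerm(Terms terms) throws IOException {
    CompiledAutomaton ca = new CompiledAutomaton(
        new RegExp("ab.*", RegExp.NONE).toAutomaton(),
        // finite hint: null lets CompiledAutomaton work it out itself
        null,
        // simplify: collapse to SINGLE/ALL/NONE when the automaton allows it
        true,
        // maxDeterminizedStates: effectively uncapped, as in the test above
        Integer.MAX_VALUE,
        // isBinary: false, the automaton is over unicode code points rather than raw bytes
        false);
    if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        return; // only NORMAL automata go through Terms.intersect
    }
    // The start term is exclusive: only accepted terms strictly greater than "abc" come back.
    TermsEnum te = terms.intersect(ca, new BytesRef("abc"));
    BytesRef term;
    while ((term = te.next()) != null) {
        System.out.println(term.utf8ToString()); // starts with "ab" and compares > "abc"
    }
}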

Example 4 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

From the class TestFSTs, method testRealTerms.

// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
    final LineFileDocs docs = new LineFileDocs(random());
    final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final Path tempDir = createTempDir("fstlines");
    final Directory dir = newFSDirectory(tempDir);
    final IndexWriter writer = new IndexWriter(dir, conf);
    Document doc;
    int docCount = 0;
    while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
        writer.addDocument(doc);
        docCount++;
    }
    IndexReader r = DirectoryReader.open(writer);
    writer.close();
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
    boolean storeOrd = random().nextBoolean();
    if (VERBOSE) {
        if (storeOrd) {
            System.out.println("FST stores ord");
        } else {
            System.out.println("FST stores docFreq");
        }
    }
    Terms terms = MultiFields.getTerms(r, "body");
    if (terms != null) {
        final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
        final TermsEnum termsEnum = terms.iterator();
        if (VERBOSE) {
            System.out.println("TEST: got termsEnum=" + termsEnum);
        }
        BytesRef term;
        int ord = 0;
        Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
        final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
        while ((term = termsEnum.next()) != null) {
            BytesRef term2 = termsEnum2.next();
            assertNotNull(term2);
            assertEquals(term, term2);
            assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
            assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
            if (ord == 0) {
                try {
                    termsEnum.ord();
                } catch (UnsupportedOperationException uoe) {
                    if (VERBOSE) {
                        System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
                    }
                    storeOrd = false;
                }
            }
            final int output;
            if (storeOrd) {
                output = ord;
            } else {
                output = termsEnum.docFreq();
            }
            builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
            ord++;
            if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
                System.out.println(ord + " terms...");
            }
        }
        FST<Long> fst = builder.finish();
        if (VERBOSE) {
            System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
        }
        if (ord > 0) {
            final Random random = new Random(random().nextLong());
            // Now confirm BytesRefFSTEnum and TermsEnum act the
            // same:
            final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
            int num = atLeast(1000);
            for (int iter = 0; iter < num; iter++) {
                final BytesRef randomTerm = new BytesRef(getRandomString(random));
                if (VERBOSE) {
                    System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
                }
                final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
                final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
                if (seekResult == TermsEnum.SeekStatus.END) {
                    assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
                } else {
                    assertSame(termsEnum, fstEnum, storeOrd);
                    for (int nextIter = 0; nextIter < 10; nextIter++) {
                        if (VERBOSE) {
                            System.out.println("TEST: next");
                            if (storeOrd) {
                                System.out.println("  ord=" + termsEnum.ord());
                            }
                        }
                        if (termsEnum.next() != null) {
                            if (VERBOSE) {
                                System.out.println("  term=" + termsEnum.term().utf8ToString());
                            }
                            assertNotNull(fstEnum.next());
                            assertSame(termsEnum, fstEnum, storeOrd);
                        } else {
                            if (VERBOSE) {
                                System.out.println("  end!");
                            }
                            BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
                            if (nextResult != null) {
                                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                                fail();
                            }
                            break;
                        }
                    }
                }
            }
        }
    }
    r.close();
    dir.close();
}
Also used: BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Document(org.apache.lucene.document.Document) TermsEnum(org.apache.lucene.index.TermsEnum) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Random(java.util.Random) BytesRef(org.apache.lucene.util.BytesRef) LineFileDocs(org.apache.lucene.util.LineFileDocs) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) Automaton(org.apache.lucene.util.automaton.Automaton) RegExp(org.apache.lucene.util.automaton.RegExp) Terms(org.apache.lucene.index.Terms) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexReader(org.apache.lucene.index.IndexReader) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
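
The FST in this test maps every term to either its ord or its docFreq. A much smaller sketch of the same Builder/Util plumbing, assuming the simpler two-argument Builder constructor available in this codebase (inputs must be added in sorted order, just as a TermsEnum yields them):

import java.io.IOException;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

static void buildTinyFst() throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    // Inputs must arrive in sorted order, exactly as a TermsEnum iterates terms.
    builder.add(Util.toIntsRef(new BytesRef("beer"), scratch), 10L);
    builder.add(Util.toIntsRef(new BytesRef("hello"), scratch), 20L);
    builder.add(Util.toIntsRef(new BytesRef("world"), scratch), 30L);
    FST<Long> fst = builder.finish();
    Long output = Util.get(fst, new BytesRef("hello")); // 20
    System.out.println("hello -> " + output);
}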

Example 5 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

From the class TestTermsEnum, method testIntersectEmptyString.

public void testIntersectEmptyString() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    doc.add(newStringField("field", "", Field.Store.NO));
    doc.add(newStringField("field", "abc", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    // Add the empty string to both documents, so that singletonDocID == -1.
    // For an FST-based term dict, we expect the first arc to be
    // flagged with HAS_FINAL_OUTPUT.
    doc.add(newStringField("field", "abc", Field.Store.NO));
    doc.add(newStringField("field", "", Field.Store.NO));
    w.addDocument(doc);
    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    LeafReader sub = getOnlyLeafReader(r);
    Terms terms = sub.fields().terms("field");
    // accept ALL
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
    TermsEnum te = terms.intersect(ca, null);
    PostingsEnum de;
    assertEquals("", te.next().utf8ToString());
    de = te.postings(null, PostingsEnum.NONE);
    assertEquals(0, de.nextDoc());
    assertEquals(1, de.nextDoc());
    assertEquals("abc", te.next().utf8ToString());
    de = te.postings(null, PostingsEnum.NONE);
    assertEquals(0, de.nextDoc());
    assertEquals(1, de.nextDoc());
    assertNull(te.next());
    // pass empty string
    te = terms.intersect(ca, new BytesRef(""));
    assertEquals("abc", te.next().utf8ToString());
    de = te.postings(null, PostingsEnum.NONE);
    assertEquals(0, de.nextDoc());
    assertEquals(1, de.nextDoc());
    assertNull(te.next());
    r.close();
    dir.close();
}
Also used: Automaton(org.apache.lucene.util.automaton.Automaton) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) RegExp(org.apache.lucene.util.automaton.RegExp) Document(org.apache.lucene.document.Document) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)
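
Note the three-argument CompiledAutomaton constructor here: with simplification disabled, the match-everything regexp stays a NORMAL automaton. Under the default simplification it would be classified as ALL, Terms.intersect would refuse it, and the test would never reach the codec's intersect code path. A small sketch of the difference, as understood from the constructors used in these examples:

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

static void showSimplification() {
    Automaton all = new RegExp(".*", RegExp.NONE).toAutomaton();
    // The single-argument constructor simplifies: an automaton accepting every
    // string is classified as AUTOMATON_TYPE.ALL.
    CompiledAutomaton simplified = new CompiledAutomaton(all);
    // finite=false, simplify=false keeps the type NORMAL, which is what
    // Terms.intersect accepts and what exercises the codec's intersect path.
    CompiledAutomaton asIs = new CompiledAutomaton(all, false, false);
    System.out.println(simplified.type + " vs " + asIs.type); // ALL vs NORMAL
}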

Aggregations

CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton): 13
BytesRef (org.apache.lucene.util.BytesRef): 10
RegExp (org.apache.lucene.util.automaton.RegExp): 9
Directory (org.apache.lucene.store.Directory): 8
Document (org.apache.lucene.document.Document): 7
Automaton (org.apache.lucene.util.automaton.Automaton): 7
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 6
Analyzer (org.apache.lucene.analysis.Analyzer): 2
TermsEnum (org.apache.lucene.index.TermsEnum): 2
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 2
IOException (java.io.IOException): 1
Path (java.nio.file.Path): 1
ArrayList (java.util.ArrayList): 1
HashSet (java.util.HashSet): 1
Random (java.util.Random): 1
SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField): 1
SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField): 1
IndexReader (org.apache.lucene.index.IndexReader): 1
IndexWriter (org.apache.lucene.index.IndexWriter): 1
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 1