
Example 6 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

In class TestTermsEnum, method testIntersectBasic:

public void testIntersectBasic() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    doc.add(newTextField("field", "aaa", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newStringField("field", "bbb", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newTextField("field", "ccc", Field.Store.NO));
    w.addDocument(doc);
    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    LeafReader sub = getOnlyLeafReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
    TermsEnum te = terms.intersect(ca, null);
    assertEquals("aaa", te.next().utf8ToString());
    assertEquals(0, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("bbb", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("ccc", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    te = terms.intersect(ca, new BytesRef("abc"));
    assertEquals("bbb", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("ccc", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    te = terms.intersect(ca, new BytesRef("aaa"));
    assertEquals("bbb", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("ccc", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    r.close();
    dir.close();
}
Also used: Automaton (org.apache.lucene.util.automaton.Automaton), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), RegExp (org.apache.lucene.util.automaton.RegExp), Document (org.apache.lucene.document.Document), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
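
The intersect calls above follow a pattern that generalizes easily. Below is a minimal sketch of a helper that collects every term accepted by a regular expression; the helper name is made up for illustration, java.util.List/ArrayList are assumed imported alongside the classes listed above, and the startTerm semantics (exclusive) are taken from the assertions in the test.

private static List<String> intersectTerms(Terms terms, String regexp, BytesRef startTerm) throws IOException {
    // Compile the regexp; the default Terms.intersect only accepts NORMAL-type compiled automata.
    Automaton a = new RegExp(regexp, RegExp.NONE).toAutomaton();
    // finite = null lets CompiledAutomaton determine itself whether the automaton is finite.
    CompiledAutomaton ca = new CompiledAutomaton(a, null, false);
    // startTerm is exclusive: enumeration begins at the first accepted term after it (or at the start if null).
    TermsEnum te = terms.intersect(ca, startTerm);
    List<String> result = new ArrayList<>();
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        result.add(term.utf8ToString());
    }
    return result;
}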

Example 7 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

In class FuzzyTermsEnum, method getAutomatonEnum:

/**
   * return an automata-based enum for matching up to editDistance from
   * lastTerm, if possible
   */
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException {
    assert editDistance < automata.length;
    final CompiledAutomaton compiled = automata[editDistance];
    BytesRef initialSeekTerm;
    if (lastTerm == null) {
        // This is the first enum we are pulling:
        initialSeekTerm = null;
    } else {
        // We are pulling this enum (e.g., ed=1) after iterating for a while already (e.g., ed=2):
        initialSeekTerm = compiled.floor(lastTerm, new BytesRefBuilder());
    }
    return terms.intersect(compiled, initialSeekTerm);
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), BytesRef (org.apache.lucene.util.BytesRef)
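
The automata array that getAutomatonEnum indexes into is built elsewhere in FuzzyTermsEnum and is not shown on this page. The following is a rough sketch of how one compiled automaton per edit distance could be produced with LevenshteinAutomata; it is an illustration only, not the actual FuzzyTermsEnum code, and the helper name and transposition flag are assumptions.

private static CompiledAutomaton[] buildEditDistanceAutomata(String term, int maxEdits) {
    // LevenshteinAutomata supports edit distances up to 2 (MAXIMUM_SUPPORTED_DISTANCE).
    LevenshteinAutomata builder = new LevenshteinAutomata(term, /*withTranspositions=*/ true);
    CompiledAutomaton[] result = new CompiledAutomaton[maxEdits + 1];
    for (int ed = 0; ed <= maxEdits; ed++) {
        // Each automaton accepts only strings within 'ed' edits of 'term', so its language is finite.
        result[ed] = new CompiledAutomaton(builder.toAutomaton(ed), true, false);
    }
    return result;
}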

Example 8 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

In class TestTermsEnum, method testIntersectStartTerm:

public void testIntersectStartTerm() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    doc.add(newStringField("field", "abc", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newStringField("field", "abd", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newStringField("field", "acd", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newStringField("field", "bcd", Field.Store.NO));
    w.addDocument(doc);
    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    LeafReader sub = getOnlyLeafReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*d", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
    TermsEnum te;
    // should seek to startTerm
    te = terms.intersect(ca, new BytesRef("aad"));
    assertEquals("abd", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("acd", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("bcd", te.next().utf8ToString());
    assertEquals(3, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    // should fail to find ceil label on second arc, rewind 
    te = terms.intersect(ca, new BytesRef("add"));
    assertEquals("bcd", te.next().utf8ToString());
    assertEquals(3, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    // should reach end
    te = terms.intersect(ca, new BytesRef("bcd"));
    assertNull(te.next());
    te = terms.intersect(ca, new BytesRef("ddd"));
    assertNull(te.next());
    r.close();
    dir.close();
}
Also used: Automaton (org.apache.lucene.util.automaton.Automaton), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), RegExp (org.apache.lucene.util.automaton.RegExp), Document (org.apache.lucene.document.Document), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
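
A brute-force reference makes the expected results of these intersect calls easy to see: scan the full TermsEnum, drop everything up to and including the (exclusive) startTerm, and keep only terms the byte-level run automaton accepts. A minimal sketch follows, with a made-up helper name and assuming a NORMAL-type automaton so that runAutomaton is non-null.

private static List<String> bruteForceIntersect(Terms terms, CompiledAutomaton ca, BytesRef startTerm) throws IOException {
    List<String> result = new ArrayList<>();
    TermsEnum te = terms.iterator();
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        // intersect's startTerm is exclusive, so skip everything up to and including it.
        if (startTerm != null && term.compareTo(startTerm) <= 0) {
            continue;
        }
        // Keep only terms accepted by the byte-level run automaton.
        if (ca.runAutomaton.run(term.bytes, term.offset, term.length)) {
            result.add(term.utf8ToString());
        }
    }
    return result;
}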

Example 9 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

In class TestTermsEnum, method testIntersectRandom:

// Tests Terms.intersect
public void testIntersectRandom() throws IOException {
    final Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    final int numTerms = atLeast(300);
    //final int numTerms = 50;
    final Set<String> terms = new HashSet<>();
    final Collection<String> pendingTerms = new ArrayList<>();
    final Map<BytesRef, Integer> termToID = new HashMap<>();
    int id = 0;
    while (terms.size() != numTerms) {
        final String s = getRandomString();
        if (!terms.contains(s)) {
            terms.add(s);
            pendingTerms.add(s);
            if (random().nextInt(20) == 7) {
                addDoc(w, pendingTerms, termToID, id++);
            }
        }
    }
    addDoc(w, pendingTerms, termToID, id++);
    final BytesRef[] termsArray = new BytesRef[terms.size()];
    final Set<BytesRef> termsSet = new HashSet<>();
    {
        int upto = 0;
        for (String s : terms) {
            final BytesRef b = new BytesRef(s);
            termsArray[upto++] = b;
            termsSet.add(b);
        }
        Arrays.sort(termsArray);
    }
    if (VERBOSE) {
        System.out.println("\nTEST: indexed terms (unicode order):");
        for (BytesRef t : termsArray) {
            System.out.println("  " + t.utf8ToString() + " -> id:" + termToID.get(t));
        }
    }
    final IndexReader r = w.getReader();
    w.close();
    int[] docIDToID = new int[r.maxDoc()];
    NumericDocValues values = MultiDocValues.getNumericValues(r, "id");
    for (int i = 0; i < r.maxDoc(); i++) {
        assertEquals(i, values.nextDoc());
        docIDToID[i] = (int) values.longValue();
    }
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {
        // TODO: can we also test infinite As here...?
        // From the random terms, pick some ratio and compile an
        // automaton:
        final Set<String> acceptTerms = new HashSet<>();
        final TreeSet<BytesRef> sortedAcceptTerms = new TreeSet<>();
        final double keepPct = random().nextDouble();
        Automaton a;
        if (iter == 0) {
            if (VERBOSE) {
                System.out.println("\nTEST: empty automaton");
            }
            a = Automata.makeEmpty();
        } else {
            if (VERBOSE) {
                System.out.println("\nTEST: keepPct=" + keepPct);
            }
            for (String s : terms) {
                final String s2;
                if (random().nextDouble() <= keepPct) {
                    s2 = s;
                } else {
                    s2 = getRandomString();
                }
                acceptTerms.add(s2);
                sortedAcceptTerms.add(new BytesRef(s2));
            }
            a = Automata.makeStringUnion(sortedAcceptTerms);
        }
        final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
        final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
        final Set<BytesRef> acceptTermsSet = new HashSet<>();
        int upto = 0;
        for (String s : acceptTerms) {
            final BytesRef b = new BytesRef(s);
            acceptTermsArray[upto++] = b;
            acceptTermsSet.add(b);
            assertTrue(accepts(c, b));
        }
        Arrays.sort(acceptTermsArray);
        if (VERBOSE) {
            System.out.println("\nTEST: accept terms (unicode order):");
            for (BytesRef t : acceptTermsArray) {
                System.out.println("  " + t.utf8ToString() + (termsSet.contains(t) ? " (exists)" : ""));
            }
            System.out.println(a.toDot());
        }
        for (int iter2 = 0; iter2 < 100; iter2++) {
            final BytesRef startTerm = acceptTermsArray.length == 0 || random().nextBoolean() ? null : acceptTermsArray[random().nextInt(acceptTermsArray.length)];
            if (VERBOSE) {
                System.out.println("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.utf8ToString()));
                if (startTerm != null) {
                    int state = 0;
                    for (int idx = 0; idx < startTerm.length; idx++) {
                        final int label = startTerm.bytes[startTerm.offset + idx] & 0xff;
                        System.out.println("  state=" + state + " label=" + label);
                        state = c.runAutomaton.step(state, label);
                        assertTrue(state != -1);
                    }
                    System.out.println("  state=" + state);
                }
            }
            final TermsEnum te = MultiFields.getTerms(r, "f").intersect(c, startTerm);
            int loc;
            if (startTerm == null) {
                loc = 0;
            } else {
                loc = Arrays.binarySearch(termsArray, BytesRef.deepCopyOf(startTerm));
                if (loc < 0) {
                    loc = -(loc + 1);
                } else {
                    // startTerm exists in index
                    loc++;
                }
            }
            while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc])) {
                loc++;
            }
            PostingsEnum postingsEnum = null;
            while (loc < termsArray.length) {
                final BytesRef expected = termsArray[loc];
                final BytesRef actual = te.next();
                if (VERBOSE) {
                    System.out.println("TEST:   next() expected=" + expected.utf8ToString() + " actual=" + (actual == null ? "null" : actual.utf8ToString()));
                }
                assertEquals(expected, actual);
                assertEquals(1, te.docFreq());
                postingsEnum = TestUtil.docs(random(), te, postingsEnum, PostingsEnum.NONE);
                final int docID = postingsEnum.nextDoc();
                assertTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(docIDToID[docID], termToID.get(expected).intValue());
                do {
                    loc++;
                } while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc]));
            }
            assertNull(te.next());
        }
    }
    r.close();
    dir.close();
}
Also used: Automaton (org.apache.lucene.util.automaton.Automaton), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
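
The accepts(c, b) helper used above is defined elsewhere in TestTermsEnum and is not shown on this page. A minimal sketch of an equivalent check, stepping the byte-level run automaton the same way the VERBOSE block does (and assuming a NORMAL-type automaton, so runAutomaton is non-null):

private static boolean accepts(CompiledAutomaton c, BytesRef b) {
    // Walk the UTF-8 bytes of the term through the byte-level run automaton.
    int state = 0;
    for (int i = 0; i < b.length; i++) {
        state = c.runAutomaton.step(state, b.bytes[b.offset + i] & 0xff);
        if (state == -1) {
            return false;
        }
    }
    return c.runAutomaton.isAccept(state);
}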

Example 10 with CompiledAutomaton

Use of org.apache.lucene.util.automaton.CompiledAutomaton in project lucene-solr by apache.

In class TestTermsEnum, method testIntersectRegexp:

// LUCENE-7576
public void testIntersectRegexp() throws Exception {
    Directory d = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), d);
    Document doc = new Document();
    doc.add(newStringField("field", "foobar", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    Fields fields = MultiFields.getFields(r);
    CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
    Terms terms = fields.terms("field");
    String message = expectThrows(IllegalArgumentException.class, () -> {
        terms.intersect(automaton, null);
    }).getMessage();
    assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
    r.close();
    w.close();
    d.close();
}
Also used: RegExp (org.apache.lucene.util.automaton.RegExp), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), Document (org.apache.lucene.document.Document), Directory (org.apache.lucene.store.Directory)
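
The IllegalArgumentException asserted here comes from the default Terms.intersect implementation, which only handles NORMAL-type compiled automata; a regexp that matches a single literal string compiles to a different type, so the message points at CompiledAutomaton.getTermsEnum. A minimal sketch of that path, continuing with the terms and automaton variables from the test above (placed before the readers are closed):

// Enumerate matching terms through the compiled automaton itself instead of Terms.intersect:
TermsEnum te = automaton.getTermsEnum(terms);
for (BytesRef term = te.next(); term != null; term = te.next()) {
    // Never reached with the index above: no indexed term matches "do_not_match_anything".
    System.out.println(term.utf8ToString());
}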

Aggregations

CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton): 12
BytesRef (org.apache.lucene.util.BytesRef): 9
RegExp (org.apache.lucene.util.automaton.RegExp): 9
Directory (org.apache.lucene.store.Directory): 8
Document (org.apache.lucene.document.Document): 7
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 6
Automaton (org.apache.lucene.util.automaton.Automaton): 6
Analyzer (org.apache.lucene.analysis.Analyzer): 2
TermsEnum (org.apache.lucene.index.TermsEnum): 2
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 2
Path (java.nio.file.Path): 1
ArrayList (java.util.ArrayList): 1
HashSet (java.util.HashSet): 1
Random (java.util.Random): 1
SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField): 1
SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField): 1
IndexReader (org.apache.lucene.index.IndexReader): 1
IndexWriter (org.apache.lucene.index.IndexWriter): 1
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 1
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 1