Search in sources :

Example 1 with RandomAcceptedStrings

Use of org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings in the lucene-solr project by Apache.

In class RandomPostingsTester, method testTermsOneThread:

/**
 * Exercises term lookup and automaton intersection against {@code fieldsSource} from one thread.
 *
 * <p>Phase one walks the shuffled {@code allTerms} list, positioning a {@link TermsEnum} by term
 * bytes, by ordinal, or via a previously saved {@link TermState}, and passes each positioned enum
 * to {@code verifyEnum}. Phase two intersects each field's terms with a random automaton and
 * checks the returned terms against the expected term set held in {@code fields}.
 *
 * <p>NOTE(review): {@code allTerms}, {@code fields} and {@code verifyEnum} are declared outside
 * this method; confirm their exact contracts against the enclosing class.
 *
 * @param random source of every random decision made here
 * @param fieldsSource the {@link Fields} instance under test
 * @param options test options; {@code Option.TERM_STATE} enables saving and reusing term states
 * @param maxTestOptions forwarded unchanged to {@code verifyEnum}
 * @param maxIndexOptions forwarded unchanged to {@code verifyEnum}
 * @param alwaysTestMax when true, every term is verified a second time; also forwarded to
 *     {@code verifyEnum}
 * @throws IOException propagated from the terms/enum calls made here
 */
private void testTermsOneThread(Random random, Fields fieldsSource, EnumSet<Option> options, IndexOptions maxTestOptions, IndexOptions maxIndexOptions, boolean alwaysTestMax) throws IOException {
    ThreadState threadState = new ThreadState();
    // Test random terms/fields:
    // termStates and termStateTerms are parallel lists: entry i of each belongs together.
    List<TermState> termStates = new ArrayList<>();
    List<FieldAndTerm> termStateTerms = new ArrayList<>();
    // Flipped to false the first time the enum throws UnsupportedOperationException for ords.
    boolean supportsOrds = true;
    Collections.shuffle(allTerms, random);
    int upto = 0;
    while (upto < allTerms.size()) {
        boolean useTermState = termStates.size() != 0 && random.nextInt(5) == 1;
        boolean useTermOrd = supportsOrds && useTermState == false && random.nextInt(5) == 1;
        FieldAndTerm fieldAndTerm;
        TermsEnum termsEnum;
        TermState termState = null;
        if (!useTermState) {
            // Seek by random field+term; only this branch advances through allTerms.
            fieldAndTerm = allTerms.get(upto++);
            if (LuceneTestCase.VERBOSE) {
                if (useTermOrd) {
                    System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() + " using ord=" + fieldAndTerm.ord);
                } else {
                    System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
                }
            }
        } else {
            // Seek by previous saved TermState
            int idx = random.nextInt(termStates.size());
            fieldAndTerm = termStateTerms.get(idx);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
            }
            termState = termStates.get(idx);
        }
        Terms terms = fieldsSource.terms(fieldAndTerm.field);
        assertNotNull(terms);
        termsEnum = terms.iterator();
        if (!useTermState) {
            if (useTermOrd) {
                // Try seek by ord sometimes; fall back to seeking by term if ords are unsupported:
                try {
                    termsEnum.seekExact(fieldAndTerm.ord);
                } catch (UnsupportedOperationException uoe) {
                    supportsOrds = false;
                    assertTrue(termsEnum.seekExact(fieldAndTerm.term));
                }
            } else {
                assertTrue(termsEnum.seekExact(fieldAndTerm.term));
            }
        } else {
            termsEnum.seekExact(fieldAndTerm.term, termState);
        }
        // check we really seeked to the right place
        assertEquals(fieldAndTerm.term, termsEnum.term());
        // -1 means "ord not available" for this enum.
        long termOrd = -1;
        if (supportsOrds) {
            try {
                termOrd = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
            }
        }
        if (termOrd != -1) {
            // PostingsFormat supports ords
            assertEquals(fieldAndTerm.ord, termsEnum.ord());
        }
        boolean savedTermState = false;
        if (options.contains(Option.TERM_STATE) && !useTermState && random.nextInt(5) == 1) {
            // Save away this TermState:
            termStates.add(termsEnum.termState());
            termStateTerms.add(fieldAndTerm);
            savedTermState = true;
        }
        verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
        // Sometimes save term state after pulling the enum:
        if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random.nextInt(5) == 1) {
            // Save away this TermState:
            termStates.add(termsEnum.termState());
            termStateTerms.add(fieldAndTerm);
        }
        // Roughly 1 time in 10 (or always, when alwaysTestMax is set), verify the enum again
        // on the same term:
        if (alwaysTestMax || random.nextInt(10) == 7) {
            // Try same term again
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: try enum again on same term");
            }
            verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
        }
    }
    // Test Terms.intersect:
    for (String field : fields.keySet()) {
        // Retry loop: each pass draws a fresh automaton until one is usable, then breaks.
        while (true) {
            Automaton a = AutomatonTestUtil.randomAutomaton(random);
            CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
            if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // Keep retrying until we get an A that will really "use" the PF's intersect code:
                continue;
            }
            // System.out.println("A:\n" + a.toDot());
            BytesRef startTerm = null;
            if (random.nextBoolean()) {
                // Up to 100 attempts to draw a non-empty accepted string to use as the start term.
                RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
                for (int iter = 0; iter < 100; iter++) {
                    int[] codePoints = ras.getRandomAcceptedString(random);
                    if (codePoints.length == 0) {
                        continue;
                    }
                    startTerm = new BytesRef(UnicodeUtil.newString(codePoints, 0, codePoints.length));
                    break;
                }
                // Don't allow empty string startTerm:
                if (startTerm == null) {
                    continue;
                }
            }
            // Same null guard as the seek phase above, so a missing field fails with an
            // assertion instead of a NullPointerException.
            Terms fieldTerms = fieldsSource.terms(field);
            assertNotNull(fieldTerms);
            TermsEnum intersected = fieldTerms.intersect(ca, startTerm);
            Set<BytesRef> intersectedTerms = new HashSet<>();
            BytesRef term;
            while ((term = intersected.next()) != null) {
                if (startTerm != null) {
                    // NOTE: not <= (every returned term must sort strictly after startTerm)
                    assertTrue(startTerm.compareTo(term) < 0);
                }
                // Deep copy before storing; presumably the enum may reuse the returned BytesRef.
                intersectedTerms.add(BytesRef.deepCopyOf(term));
                verifyEnum(random, threadState, field, term, intersected, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
            }
            if (ca.runAutomaton == null) {
                assertTrue(intersectedTerms.isEmpty());
            } else {
                // Every known term must appear in the result exactly when it sorts after
                // startTerm (if any) and the automaton accepts its bytes.
                for (BytesRef term2 : fields.get(field).keySet()) {
                    boolean expected;
                    if (startTerm != null && startTerm.compareTo(term2) >= 0) {
                        expected = false;
                    } else {
                        expected = ca.runAutomaton.run(term2.bytes, term2.offset, term2.length);
                    }
                    assertEquals("term=" + term2, expected, intersectedTerms.contains(term2));
                }
            }
            break;
        }
    }
}
Also used : CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) RandomAcceptedStrings(org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings) ArrayList(java.util.ArrayList) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 2 with RandomAcceptedStrings

Use of org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings in the lucene-solr project by Apache.

In class TestAutomaton, method testRandomFinite:

/**
 * Randomized test that applies a sequence of random operations to an automaton while maintaining,
 * in parallel, the finite set of terms the automaton is expected to accept.
 *
 * <p>After every operation the two are compared via {@code assertSame(terms, a)}, and the
 * automaton's {@code isDeterministic()} flag is checked against
 * {@code AutomatonTestUtil.isDeterministicSlow}.
 *
 * <p>NOTE(review): {@code assertSame(Set, Automaton)}, {@code unionTerms}, {@code randomNoOp},
 * {@code verifyTopoSort}, {@code getRandomString} and {@code atLeast} are declared outside this
 * method; confirm their contracts against the enclosing class.
 *
 * @throws Exception propagated from the helpers and automaton operations invoked here
 */
public void testRandomFinite() throws Exception {
    int numTerms = atLeast(10);
    int iters = atLeast(100);
    if (VERBOSE) {
        System.out.println("TEST: numTerms=" + numTerms + " iters=" + iters);
    }
    Set<BytesRef> terms = new HashSet<>();
    while (terms.size() < numTerms) {
        terms.add(new BytesRef(getRandomString()));
    }
    Automaton a = unionTerms(terms);
    assertSame(terms, a);
    for (int iter = 0; iter < iters; iter++) {
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter + " numTerms=" + terms.size() + " a.numStates=" + a.getNumStates());
        /*
        System.out.println("  terms:");
        for(BytesRef term : terms) {
          System.out.println("    " + term);
        }
        */
        }
        // Each case mutates the automaton and applies the matching change to the expected terms.
        switch(random().nextInt(15)) {
            case 0:
                // concatenate prefix
                {
                    if (VERBOSE) {
                        System.out.println("  op=concat prefix");
                    }
                    Set<BytesRef> newTerms = new HashSet<>();
                    BytesRef prefix = new BytesRef(getRandomString());
                    BytesRefBuilder newTerm = new BytesRefBuilder();
                    for (BytesRef term : terms) {
                        newTerm.copyBytes(prefix);
                        newTerm.append(term);
                        newTerms.add(newTerm.toBytesRef());
                    }
                    terms = newTerms;
                    // Prepending a single string is expected to leave determinism unchanged.
                    boolean wasDeterministic1 = a.isDeterministic();
                    a = Operations.concatenate(Automata.makeString(prefix.utf8ToString()), a);
                    assertEquals(wasDeterministic1, a.isDeterministic());
                }
                break;
            case 1:
                // concatenate suffix
                {
                    BytesRef suffix = new BytesRef(getRandomString());
                    if (VERBOSE) {
                        System.out.println("  op=concat suffix " + suffix);
                    }
                    Set<BytesRef> newTerms = new HashSet<>();
                    BytesRefBuilder newTerm = new BytesRefBuilder();
                    for (BytesRef term : terms) {
                        newTerm.copyBytes(term);
                        newTerm.append(suffix);
                        newTerms.add(newTerm.toBytesRef());
                    }
                    terms = newTerms;
                    a = Operations.concatenate(a, Automata.makeString(suffix.utf8ToString()));
                }
                break;
            case 2:
                // determinize
                if (VERBOSE) {
                    System.out.println("  op=determinize");
                }
                a = Operations.determinize(a, Integer.MAX_VALUE);
                assertTrue(a.isDeterministic());
                break;
            case 3:
                // minimize, but only while the automaton is still small
                if (a.getNumStates() < 100) {
                    if (VERBOSE) {
                        System.out.println("  op=minimize");
                    }
                    // minimize
                    a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
                } else if (VERBOSE) {
                    System.out.println("  skip op=minimize: too many states (" + a.getNumStates() + ")");
                }
                break;
            case 4:
                // union
                {
                    if (VERBOSE) {
                        System.out.println("  op=union");
                    }
                    Set<BytesRef> newTerms = new HashSet<>();
                    int numNewTerms = random().nextInt(5);
                    while (newTerms.size() < numNewTerms) {
                        newTerms.add(new BytesRef(getRandomString()));
                    }
                    terms.addAll(newTerms);
                    Automaton newA = unionTerms(newTerms);
                    a = Operations.union(a, newA);
                }
                break;
            case 5:
                // optional
                {
                    if (VERBOSE) {
                        System.out.println("  op=optional");
                    }
                    // NOTE: This can add a dead state:
                    a = Operations.optional(a);
                    // optional() makes the empty string accepted, so track it as a term too.
                    terms.add(new BytesRef());
                }
                break;
            case 6:
                // minus finite
                {
                    if (VERBOSE) {
                        System.out.println("  op=minus finite");
                    }
                    if (terms.size() > 0) {
                        RandomAcceptedStrings rasl = new RandomAcceptedStrings(Operations.removeDeadStates(a));
                        Set<BytesRef> toRemove = new HashSet<>();
                        int numToRemove = TestUtil.nextInt(random(), 1, (terms.size() + 1) / 2);
                        // Sample accepted strings until enough distinct ones are collected;
                        // Set.add ignores repeats on its own.
                        while (toRemove.size() < numToRemove) {
                            int[] ints = rasl.getRandomAcceptedString(random());
                            BytesRef term = new BytesRef(UnicodeUtil.newString(ints, 0, ints.length));
                            toRemove.add(term);
                        }
                        // Every sampled string must have been one of the tracked terms.
                        for (BytesRef term : toRemove) {
                            boolean removed = terms.remove(term);
                            assertTrue(removed);
                        }
                        Automaton a2 = unionTerms(toRemove);
                        a = Operations.minus(a, a2, Integer.MAX_VALUE);
                    }
                }
                break;
            case 7:
                {
                    // minus infinite
                    List<Automaton> as = new ArrayList<>();
                    int count = TestUtil.nextInt(random(), 1, 5);
                    Set<Integer> prefixes = new HashSet<>();
                    while (prefixes.size() < count) {
                        // prefix is a leading ascii byte; we remove <prefix>* from a
                        int prefix = random().nextInt(128);
                        prefixes.add(prefix);
                    }
                    if (VERBOSE) {
                        System.out.println("  op=minus infinite prefixes=" + prefixes);
                    }
                    for (int prefix : prefixes) {
                        // prefix is a leading ascii byte; we remove <prefix>* from a
                        Automaton a2 = new Automaton();
                        int init = a2.createState();
                        int state = a2.createState();
                        a2.addTransition(init, state, prefix);
                        a2.setAccept(state, true);
                        a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
                        a2.finishState();
                        as.add(a2);
                        // Drop every tracked term whose first byte equals this prefix.
                        Iterator<BytesRef> it = terms.iterator();
                        while (it.hasNext()) {
                            BytesRef term = it.next();
                            if (term.length > 0 && (term.bytes[term.offset] & 0xFF) == prefix) {
                                it.remove();
                            }
                        }
                    }
                    Automaton a2 = randomNoOp(Operations.union(as));
                    a = Operations.minus(a, a2, DEFAULT_MAX_DETERMINIZED_STATES);
                }
                break;
            case 8:
                {
                    int count = TestUtil.nextInt(random(), 10, 20);
                    if (VERBOSE) {
                        System.out.println("  op=intersect infinite count=" + count);
                    }
                    // intersect infinite
                    List<Automaton> as = new ArrayList<>();
                    Set<Integer> prefixes = new HashSet<>();
                    while (prefixes.size() < count) {
                        int prefix = random().nextInt(128);
                        prefixes.add(prefix);
                    }
                    if (VERBOSE) {
                        System.out.println("  prefixes=" + prefixes);
                    }
                    for (int prefix : prefixes) {
                        // prefix is a leading ascii byte; we retain <prefix>* in a
                        Automaton a2 = new Automaton();
                        int init = a2.createState();
                        int state = a2.createState();
                        a2.addTransition(init, state, prefix);
                        a2.setAccept(state, true);
                        a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
                        a2.finishState();
                        as.add(a2);
                    }
                    Automaton a2 = Operations.union(as);
                    // Randomly determinize, minimize, or leave the union as built.
                    if (random().nextBoolean()) {
                        a2 = Operations.determinize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
                    } else if (random().nextBoolean()) {
                        a2 = MinimizationOperations.minimize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
                    }
                    a = Operations.intersection(a, a2);
                    // Keep only tracked terms that are non-empty and start with a retained prefix.
                    Iterator<BytesRef> it = terms.iterator();
                    while (it.hasNext()) {
                        BytesRef term = it.next();
                        if (term.length == 0 || prefixes.contains(term.bytes[term.offset] & 0xff) == false) {
                            if (VERBOSE) {
                                System.out.println("  drop term=" + term);
                            }
                            it.remove();
                        } else {
                            if (VERBOSE) {
                                System.out.println("  keep term=" + term);
                            }
                        }
                    }
                }
                break;
            case 9:
                // reverse
                {
                    if (VERBOSE) {
                        System.out.println("  op=reverse");
                    }
                    a = Operations.reverse(a);
                    Set<BytesRef> newTerms = new HashSet<>();
                    for (BytesRef term : terms) {
                        newTerms.add(new BytesRef(new StringBuilder(term.utf8ToString()).reverse().toString()));
                    }
                    terms = newTerms;
                }
                break;
            case 10:
                if (VERBOSE) {
                    System.out.println("  op=randomNoOp");
                }
                a = randomNoOp(a);
                break;
            case 11:
                // interval
                {
                    int min = random().nextInt(1000);
                    int max = min + random().nextInt(50);
                    // digits must be non-zero else we make cycle
                    int digits = Integer.toString(max).length();
                    if (VERBOSE) {
                        System.out.println("  op=union interval min=" + min + " max=" + max + " digits=" + digits);
                    }
                    a = Operations.union(a, Automata.makeDecimalInterval(min, max, digits));
                    // Build a run of `digits` zeros used to left-pad shorter numbers below.
                    StringBuilder b = new StringBuilder();
                    for (int i = 0; i < digits; i++) {
                        b.append('0');
                    }
                    String prefix = b.toString();
                    for (int i = min; i <= max; i++) {
                        String s = Integer.toString(i);
                        if (s.length() < digits) {
                            // Left-fill with 0s
                            s = prefix.substring(s.length()) + s;
                        }
                        terms.add(new BytesRef(s));
                    }
                }
                break;
            case 12:
                if (VERBOSE) {
                    System.out.println("  op=remove the empty string");
                }
                a = Operations.minus(a, Automata.makeEmptyString(), DEFAULT_MAX_DETERMINIZED_STATES);
                terms.remove(new BytesRef());
                break;
            case 13:
                if (VERBOSE) {
                    System.out.println("  op=add the empty string");
                }
                a = Operations.union(a, Automata.makeEmptyString());
                terms.add(new BytesRef());
                break;
            case 14:
                // Safety in case we are really unlucky w/ the dice:
                // this op multiplies the term count, so skip it once the set has grown large.
                if (terms.size() <= numTerms * 3) {
                    if (VERBOSE) {
                        System.out.println("  op=concat finite automaton");
                    }
                    int count = random().nextBoolean() ? 2 : 3;
                    Set<BytesRef> addTerms = new HashSet<>();
                    while (addTerms.size() < count) {
                        addTerms.add(new BytesRef(getRandomString()));
                    }
                    if (VERBOSE) {
                        for (BytesRef term : addTerms) {
                            System.out.println("    term=" + term);
                        }
                    }
                    Automaton a2 = unionTerms(addTerms);
                    Set<BytesRef> newTerms = new HashSet<>();
                    if (random().nextBoolean()) {
                        // suffix
                        if (VERBOSE) {
                            System.out.println("  do suffix");
                        }
                        a = Operations.concatenate(a, randomNoOp(a2));
                        // Expected terms become the cross product: each term + each suffix.
                        BytesRefBuilder newTerm = new BytesRefBuilder();
                        for (BytesRef term : terms) {
                            for (BytesRef suffix : addTerms) {
                                newTerm.copyBytes(term);
                                newTerm.append(suffix);
                                newTerms.add(newTerm.toBytesRef());
                            }
                        }
                    } else {
                        // prefix
                        if (VERBOSE) {
                            System.out.println("  do prefix");
                        }
                        a = Operations.concatenate(randomNoOp(a2), a);
                        // Expected terms become the cross product: each prefix + each term.
                        BytesRefBuilder newTerm = new BytesRefBuilder();
                        for (BytesRef term : terms) {
                            for (BytesRef prefix : addTerms) {
                                newTerm.copyBytes(prefix);
                                newTerm.append(term);
                                newTerms.add(newTerm.toBytesRef());
                            }
                        }
                    }
                    terms = newTerms;
                }
                break;
            default:
                throw new AssertionError();
        }
        assertSame(terms, a);
        assertEquals(AutomatonTestUtil.isDeterministicSlow(a), a.isDeterministic());
        if (random().nextInt(10) == 7) {
            a = verifyTopoSort(a);
        }
    }
    assertSame(terms, a);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) Set(java.util.Set) HashSet(java.util.HashSet) RandomAcceptedStrings(org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 3 with RandomAcceptedStrings

Use of org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings in the lucene-solr project by Apache.

In class TestAutomaton, method testReverseRandom2:

/**
 * For a series of random automata, checks that the determinized reversal accepts the reversed
 * form of strings sampled from the original automaton.
 *
 * <p>When the (possibly dead-state-pruned) automaton is empty, the reversal must be empty too and
 * no strings are sampled for that round.
 *
 * @throws Exception propagated from the automaton utilities invoked here
 */
public void testReverseRandom2() throws Exception {
    final int rounds = atLeast(100);
    for (int round = 0; round < rounds; round++) {
        Automaton source = AutomatonTestUtil.randomAutomaton(random());
        // Half of the time, prune dead states before reversing.
        if (random().nextBoolean()) {
            source = Operations.removeDeadStates(source);
        }
        final Automaton reversedDet = Operations.determinize(Operations.reverse(source), Integer.MAX_VALUE);
        if (Operations.isEmpty(source)) {
            assertTrue(Operations.isEmpty(reversedDet));
            continue;
        }
        final RandomAcceptedStrings sampler = new RandomAcceptedStrings(source);
        for (int sample = 0; sample < 20; sample++) {
            // Draw a string the original automaton accepts ...
            final int[] codePoints = sampler.getRandomAcceptedString(random());
            // ... reverse it in place by swapping inward from both ends ...
            for (int lo = 0, hi = codePoints.length - 1; lo < hi; lo++, hi--) {
                final int tmp = codePoints[lo];
                codePoints[lo] = codePoints[hi];
                codePoints[hi] = tmp;
            }
            // ... and require the reversed automaton to accept the result.
            assertTrue(Operations.run(reversedDet, new IntsRef(codePoints, 0, codePoints.length)));
        }
    }
}
Also used : RandomAcceptedStrings(org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings) IntsRef(org.apache.lucene.util.IntsRef)

Aggregations

RandomAcceptedStrings (org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings)3 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 BytesRef (org.apache.lucene.util.BytesRef)2 Iterator (java.util.Iterator)1 List (java.util.List)1 Set (java.util.Set)1 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)1 IntsRef (org.apache.lucene.util.IntsRef)1 Automaton (org.apache.lucene.util.automaton.Automaton)1 CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton)1