Search in sources :

Example 21 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FiniteStringsIteratorTest method testShortAccept.

/**
 * Checks that an automaton accepting both "x" and its extension "xy" yields
 * exactly those two strings from {@link FiniteStringsIterator}.
 */
public void testShortAccept() {
    Automaton union = Operations.union(Automata.makeString("x"), Automata.makeString("xy"));
    Automaton minimized = MinimizationOperations.minimize(union, DEFAULT_MAX_DETERMINIZED_STATES);
    List<IntsRef> accepted = getFiniteStrings(new FiniteStringsIterator(minimized));
    assertEquals(2, accepted.size());
    // Each expected string is converted to an IntsRef and looked up in the result.
    for (String expected : new String[] { "x", "xy" }) {
        IntsRefBuilder builder = new IntsRefBuilder();
        Util.toIntsRef(new BytesRef(expected), builder);
        assertTrue(accepted.contains(builder.get()));
    }
}
Also used : IntsRef(org.apache.lucene.util.IntsRef) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 22 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FiniteStringsIteratorTest method testSingletonNoLimit.

/**
 * Checks that an automaton built from a single string yields exactly that
 * string from {@link FiniteStringsIterator}.
 */
public void testSingletonNoLimit() {
    final String word = "foobar";
    List<IntsRef> accepted = getFiniteStrings(new FiniteStringsIterator(Automata.makeString(word)));
    assertEquals(1, accepted.size());
    // Build the expected UTF-32 form of the whole word for the membership check.
    char[] chars = word.toCharArray();
    IntsRefBuilder expected = new IntsRefBuilder();
    Util.toUTF32(chars, 0, chars.length, expected);
    assertTrue(accepted.contains(expected.get()));
}
Also used : IntsRef(org.apache.lucene.util.IntsRef) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder)

Example 23 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FiniteStringsIteratorTest method testRandomFiniteStrings1.

/**
 * Builds the union of automata for a random set of distinct simple strings,
 * optionally minimizes / determinizes / removes dead states, and checks that
 * {@link FiniteStringsIterator} returns exactly the original set of strings.
 */
public void testRandomFiniteStrings1() {
    final int count = atLeast(100);
    if (VERBOSE) {
        System.out.println("TEST: numStrings=" + count);
    }
    final Set<IntsRef> expected = new HashSet<>();
    final List<Automaton> perString = new ArrayList<>();
    final IntsRefBuilder utf32 = new IntsRefBuilder();
    for (int remaining = count; remaining > 0; remaining--) {
        final String candidate = TestUtil.randomSimpleString(random(), 1, 200);
        Util.toUTF32(candidate.toCharArray(), 0, candidate.length(), utf32);
        if (!expected.add(utf32.toIntsRef())) {
            // Duplicate string: already represented, so no automaton is added for it.
            continue;
        }
        perString.add(Automata.makeString(candidate));
        if (VERBOSE) {
            System.out.println("  add string=" + candidate);
        }
    }
    // TODO: we could sometimes use
    // DaciukMihovAutomatonBuilder here
    // TODO: what other random things can we do here...
    Automaton a = Operations.union(perString);
    // Randomly pick at most one post-processing step; the coin flips are
    // evaluated in this exact order.
    if (random().nextBoolean()) {
        a = MinimizationOperations.minimize(a, 1000000);
        if (VERBOSE) {
            System.out.println("TEST: a.minimize numStates=" + a.getNumStates());
        }
    } else if (random().nextBoolean()) {
        if (VERBOSE) {
            System.out.println("TEST: a.determinize");
        }
        a = Operations.determinize(a, 1000000);
    } else if (random().nextBoolean()) {
        if (VERBOSE) {
            System.out.println("TEST: a.removeDeadStates");
        }
        a = Operations.removeDeadStates(a);
    }
    final List<IntsRef> found = getFiniteStrings(new FiniteStringsIterator(a));
    assertFiniteStringsRecursive(a, found);
    if (expected.equals(new HashSet<>(found))) {
        return;
    }
    // Mismatch: print both sides in sorted order, pairwise, then fail.
    System.out.println("strings.size()=" + expected.size() + " actual.size=" + found.size());
    final List<IntsRef> sortedExpected = new ArrayList<>(expected);
    Collections.sort(sortedExpected);
    final List<IntsRef> sortedFound = new ArrayList<>(found);
    Collections.sort(sortedFound);
    final int limit = Math.min(sortedExpected.size(), sortedFound.size());
    int row = 0;
    while (row < limit) {
        System.out.println("  i=" + row + " string=" + toString(sortedExpected.get(row)) + " actual=" + toString(sortedFound.get(row)));
        row++;
    }
    fail("wrong strings found");
}
Also used : ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) HashSet(java.util.HashSet)

Example 24 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class BaseTokenStreamTestCase method assertGraphStrings.

/**
   * Enumerates all accepted strings in the token graph created by the already initialized {@link TokenStream}
   * and asserts that the resulting set of strings is exactly {@code expectedStrings}: every actual path must
   * be expected, and every expected path must have been produced.
   *
   * @param tokenStream the token stream to convert to an automaton via {@link TokenStreamToAutomaton}
   * @param expectedStrings the string paths the token graph is expected to accept, with positions separated by
   *        a single space
   * @throws IOException declared by this method; presumably raised while consuming the token stream
   */
public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException {
    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
    Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> actualStrings = new HashSet<>();
    for (IntsRef ir : actualStringPaths) {
        // Paths use POS_SEP between positions; show it as a space so it matches the expected strings.
        actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    // Neither set changes below, so the expected/actual dump is built once for both loops.
    final String details = "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings);
    for (String s : actualStrings) {
        assertTrue("Analyzer created unexpected string path: " + s + details, expectedStringsSet.contains(s));
    }
    for (String s : expectedStrings) {
        // This direction reports an expected path that is missing, not an unexpected one.
        assertTrue("Analyzer did not create expected string path: " + s + details, actualStrings.contains(s));
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) Automaton(org.apache.lucene.util.automaton.Automaton) IntsRef(org.apache.lucene.util.IntsRef)

Example 25 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FSTTester method verifyPruned.

// FST is pruned
/**
 * Verifies a pruned FST against a brute-force model. Every prefix of every input in
 * {@code pairs} is tallied (count plus common output), the prune thresholds are applied
 * to that tally, and the FST is then checked in both directions: everything it enumerates
 * must be a surviving prefix, and every surviving non-empty prefix must be runnable
 * through the FST with the expected output.
 *
 * @param inputMode passed through to {@code inputToString} for verbose logging only
 * @param fst the FST under test; asserted to be {@code null} when at most one prefix survives pruning
 * @param prune1 when positive, a prefix is kept only if its count is at least this value
 * @param prune2 consulted only when {@code prune1} is not positive (then asserted positive)
 * @throws IOException declared by this method; presumably raised by FST traversal
 */
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
        for (InputOutput<T> pair : pairs) {
            System.out.println("  " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
        }
    }
    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.
    // NOTE: Crazy RAM intensive!!
    //System.out.println("TEST: tally prefixes");
    // build all prefixes
    final Map<IntsRef, CountMinOutput<T>> prefixes = new HashMap<>();
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (InputOutput<T> pair : pairs) {
        scratch.copyInts(pair.input);
        // idx runs from 0 (empty prefix) through the full input length, inclusive.
        for (int idx = 0; idx <= pair.input.length; idx++) {
            scratch.setLength(idx);
            CountMinOutput<T> cmo = prefixes.get(scratch.get());
            if (cmo == null) {
                // First time this prefix is seen: start its tally with this pair's output.
                cmo = new CountMinOutput<>();
                cmo.count = 1;
                cmo.output = pair.output;
                // Lookups use scratch.get() but the stored key uses toIntsRef() -- presumably a
                // copy, so later changes to scratch do not affect the key; TODO confirm.
                prefixes.put(scratch.toIntsRef(), cmo);
            } else {
                cmo.count++;
                // Replace any output that equals the no-output value with the instance returned
                // by getNoOutput() -- presumably because common() relies on identity; TODO confirm.
                T output1 = cmo.output;
                if (output1.equals(outputs.getNoOutput())) {
                    output1 = outputs.getNoOutput();
                }
                T output2 = pair.output;
                if (output2.equals(outputs.getNoOutput())) {
                    output2 = outputs.getNoOutput();
                }
                cmo.output = outputs.common(output1, output2);
            }
            if (idx == pair.input.length) {
                // The prefix is the whole input: mark it final and record its output at this point.
                cmo.isFinal = true;
                cmo.finalOutput = cmo.output;
            }
        }
    }
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: now prune");
    }
    // prune 'em
    // An explicit iterator is used so entries can be removed while iterating.
    final Iterator<Map.Entry<IntsRef, CountMinOutput<T>>> it = prefixes.entrySet().iterator();
    while (it.hasNext()) {
        Map.Entry<IntsRef, CountMinOutput<T>> ent = it.next();
        final IntsRef prefix = ent.getKey();
        final CountMinOutput<T> cmo = ent.getValue();
        if (LuceneTestCase.VERBOSE) {
            System.out.println("  term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
        }
        final boolean keep;
        if (prune1 > 0) {
            // prune1 mode: keep purely on this prefix's own count.
            keep = cmo.count >= prune1;
        } else {
            assert prune2 > 0;
            if (prune2 > 1 && cmo.count >= prune2) {
                keep = true;
            } else if (prefix.length > 0) {
                // consult our parent
                // Build the parent prefix (all ints but the last) in scratch and look it up.
                scratch.setLength(prefix.length - 1);
                System.arraycopy(prefix.ints, prefix.offset, scratch.ints(), 0, scratch.length());
                final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
                //System.out.println("    parent count = " + (cmo2 == null ? -1 : cmo2.count));
                // The parent must still be present (not already removed) and meet the threshold;
                // for prune2 == 1 a parent count of 2 or a prefix of length <= 1 suffices.
                keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
            } else if (cmo.count >= prune2) {
                keep = true;
            } else {
                keep = false;
            }
        }
        if (!keep) {
            it.remove();
        //System.out.println("    remove");
        } else {
            // clear isLeaf for all ancestors
            //System.out.println("    keep");
            scratch.copyInts(prefix);
            scratch.setLength(scratch.length() - 1);
            // Walk ever-shorter prefixes down to the empty one; the loop ends once the
            // length has been decremented below zero.
            while (scratch.length() >= 0) {
                final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
                if (cmo2 != null) {
                    //System.out.println("    clear isLeaf " + inputToString(inputMode, scratch));
                    cmo2.isLeaf = false;
                }
                scratch.setLength(scratch.length() - 1);
            }
        }
    }
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: after prune");
        for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
            System.out.println("  " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
            if (ent.getValue().isFinal) {
                System.out.println("    finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
            }
        }
    }
    // With at most one surviving prefix the FST is expected to be null; nothing more to check.
    if (prefixes.size() <= 1) {
        assertNull(fst);
        return;
    }
    assertNotNull(fst);
    // make sure FST only enums valid prefixes
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: check pruned enum");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
    IntsRefFSTEnum.InputOutput<T> current;
    while ((current = fstEnum.next()) != null) {
        if (LuceneTestCase.VERBOSE) {
            System.out.println("  fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
        }
        // Every enumerated input must be a surviving prefix that is a leaf or final.
        final CountMinOutput<T> cmo = prefixes.get(current.input);
        assertNotNull(cmo);
        assertTrue(cmo.isLeaf || cmo.isFinal);
        //if (cmo.isFinal && !cmo.isLeaf) {
        if (cmo.isFinal) {
            assertEquals(cmo.finalOutput, current.output);
        } else {
            assertEquals(cmo.output, current.output);
        }
    }
    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: verify all prefixes");
    }
    // Single-element array used as an out-parameter for run(); compared to the prefix length below.
    final int[] stopNode = new int[1];
    for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
        // The empty prefix is skipped.
        if (ent.getKey().length > 0) {
            final CountMinOutput<T> cmo = ent.getValue();
            final T output = run(fst, ent.getKey(), stopNode);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
            }
            // if (cmo.isFinal && !cmo.isLeaf) {
            if (cmo.isFinal) {
                assertEquals(cmo.finalOutput, output);
            } else {
                assertEquals(cmo.output, output);
            }
            assertEquals(ent.getKey().length, stopNode[0]);
        }
    }
}
Also used : HashMap(java.util.HashMap) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) Map(java.util.Map)

Aggregations

IntsRef (org.apache.lucene.util.IntsRef)63 BytesRef (org.apache.lucene.util.BytesRef)19 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)19 HashSet (java.util.HashSet)16 ArrayList (java.util.ArrayList)13 Automaton (org.apache.lucene.util.automaton.Automaton)13 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)12 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)9 IOException (java.io.IOException)7 Directory (org.apache.lucene.store.Directory)7 HashMap (java.util.HashMap)5 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)5 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)5 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 FilterInputStream (java.io.FilterInputStream)4 InputStream (java.io.InputStream)4 Map (java.util.Map)4 Random (java.util.Random)4 TokenStream (org.apache.lucene.analysis.TokenStream)4