Search in sources :

Example 6 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project lucene-solr by apache.

From the class TestFSTs, method testShortestPathsWFSTRandom.

/**
 * Like testShortestPathsRandom, but uses PairOutputs so that every entry carries both a
 * weight and an output.
 *
 * <p>Random, unique words are each given a random (weight, output) pair and added to an FST.
 * For every collected prefix, the arcs for that prefix are followed, {@code Util.shortestPaths}
 * is asked for the top-N completions from the reached arc, and the answer is compared with a
 * brute-force scan over the sorted map of all words.
 */
public void testShortestPathsWFSTRandom() throws Exception {
    final int wordCount = atLeast(1000);
    final TreeMap<String, TwoLongs> expectedByWord = new TreeMap<>();
    final TreeSet<String> prefixes = new TreeSet<>();
    final PairOutputs<Long, Long> pairOutputs = new PairOutputs<>(
        PositiveIntOutputs.getSingleton(),   // weight
        PositiveIntOutputs.getSingleton());  // output
    final Builder<Pair<Long, Long>> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, pairOutputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    final Random rnd = random();

    for (int n = 0; n < wordCount; n++) {
        // Keep drawing until we get a word that has not been generated before.
        String word;
        do {
            word = TestUtil.randomSimpleString(rnd);
        } while (expectedByWord.containsKey(word));

        // Record every non-empty prefix that is strictly shorter than the word itself.
        for (int end = 1; end < word.length(); end++) {
            prefixes.add(word.substring(0, end));
        }

        // weights 1..100
        final int weight = TestUtil.nextInt(rnd, 1, 100);
        // outputs 0..500
        final int output = TestUtil.nextInt(rnd, 0, 500);
        expectedByWord.put(word, new TwoLongs(weight, output));
    }

    // The TreeMap iterates in sorted key order, which is the order the words are fed to the builder.
    for (Map.Entry<String, TwoLongs> entry : expectedByWord.entrySet()) {
        final TwoLongs value = entry.getValue();
        final long weight = value.a;
        final long output = value.b;
        fstBuilder.add(Util.toIntsRef(new BytesRef(entry.getKey()), scratch), pairOutputs.newPair(weight, output));
    }
    final FST<Pair<Long, Long>> fst = fstBuilder.finish();
    final BytesReader reader = fst.getBytesReader();

    for (String prefix : prefixes) {
        // 1. Follow the prefix through the FST, accumulating the pair output seen along the way.
        Pair<Long, Long> prefixOutput = pairOutputs.getNoOutput();
        final FST.Arc<Pair<Long, Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long, Long>>());
        for (int pos = 0; pos < prefix.length(); pos++) {
            final int label = (int) prefix.charAt(pos);
            if (fst.findTargetArc(label, arc, arc, reader) == null) {
                fail();
            }
            prefixOutput = pairOutputs.add(prefixOutput, arc.output);
        }

        final int topN = TestUtil.nextInt(rnd, 1, 10);
        final Util.TopResults<Pair<Long, Long>> found = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
        assertTrue(found.isComplete);

        // 2. Brute force: scan the whole map for words starting with the prefix. Each candidate is
        //    the remaining suffix paired with the word's weight/output minus what the prefix
        //    already contributed.
        final List<Result<Pair<Long, Long>>> candidates = new ArrayList<>();
        for (Map.Entry<String, TwoLongs> entry : expectedByWord.entrySet()) {
            final String word = entry.getKey();
            if (!word.startsWith(prefix)) {
                continue;
            }
            final TwoLongs value = entry.getValue();
            final String suffix = word.substring(prefix.length());
            final long remainingWeight = value.a - prefixOutput.output1;
            final long remainingOutput = value.b - prefixOutput.output2;
            candidates.add(new Result<>(Util.toIntsRef(new BytesRef(suffix), new IntsRefBuilder()), pairOutputs.newPair(remainingWeight, remainingOutput)));
        }
        assertTrue(candidates.size() > 0);

        // Order the candidates with the same weight comparator (ties broken by input) and keep
        // at most topN of them.
        Collections.sort(candidates, new TieBreakByInputComparator<>(minPairWeightComparator));
        final int candidateCount = candidates.size();
        if (candidateCount > topN) {
            candidates.subList(topN, candidateCount).clear();
        }

        assertEquals(candidates.size(), found.topN.size());
        for (int rank = 0; rank < found.topN.size(); rank++) {
            assertEquals(candidates.get(rank).input, found.topN.get(rank).input);
            assertEquals(candidates.get(rank).output, found.topN.get(rank).output);
        }
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ArrayList(java.util.ArrayList) TestUtil(org.apache.lucene.util.TestUtil) FSTTester.simpleRandomString(org.apache.lucene.util.fst.FSTTester.simpleRandomString) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) Result(org.apache.lucene.util.fst.Util.Result) Random(java.util.Random) TreeSet(java.util.TreeSet) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) TreeMap(java.util.TreeMap) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Map(java.util.Map) TreeMap(java.util.TreeMap)

Example 7 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project lucene-solr by apache.

From the class TestFSTs, method testShortestPathsWFST.

/**
 * Like testShortestPaths, but uses PairOutputs so that every entry carries both a weight and
 * an output.
 *
 * <p>Three fixed inputs are added to an FST, the top three paths are requested from the first
 * arc, and the results are expected back ordered by ascending weight: "aac" (7), "ax" (17),
 * "aab" (22), each with its matching output.
 */
public void testShortestPathsWFST() throws Exception {
    final PairOutputs<Long, Long> pairOutputs = new PairOutputs<>(
        PositiveIntOutputs.getSingleton(),   // weight
        PositiveIntOutputs.getSingleton());  // output
    final Builder<Pair<Long, Long>> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, pairOutputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();

    // Each input is added together with its (weight, output) pair.
    fstBuilder.add(Util.toIntsRef(new BytesRef("aab"), scratch), pairOutputs.newPair(22L, 57L));
    fstBuilder.add(Util.toIntsRef(new BytesRef("aac"), scratch), pairOutputs.newPair(7L, 36L));
    fstBuilder.add(Util.toIntsRef(new BytesRef("ax"), scratch), pairOutputs.newPair(17L, 85L));
    final FST<Pair<Long, Long>> fst = fstBuilder.finish();

    final FST.Arc<Pair<Long, Long>> firstArc = fst.getFirstArc(new FST.Arc<Pair<Long, Long>>());
    final Util.TopResults<Pair<Long, Long>> res = Util.shortestPaths(fst, firstArc, pairOutputs.getNoOutput(), minPairWeightComparator, 3, true);
    assertTrue(res.isComplete);
    assertEquals(3, res.topN.size());

    // Expected hits, listed in the order they should be returned.
    final String[] expectedInputs = {"aac", "ax", "aab"};
    final long[] expectedWeights = {7L, 17L, 22L};
    final long[] expectedOutputs = {36L, 85L, 57L};
    for (int rank = 0; rank < expectedInputs.length; rank++) {
        assertEquals(Util.toIntsRef(new BytesRef(expectedInputs[rank]), scratch), res.topN.get(rank).input);
        // weight
        assertEquals(expectedWeights[rank], res.topN.get(rank).output.output1.longValue());
        // output
        assertEquals(expectedOutputs[rank], res.topN.get(rank).output.output2.longValue());
    }
}
Also used : Arc(org.apache.lucene.util.fst.FST.Arc) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) TestUtil(org.apache.lucene.util.TestUtil) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair)

Aggregations

BytesRef (org.apache.lucene.util.BytesRef)7 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)7 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)5 HashSet (java.util.HashSet)4 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)4 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)4 IntsRef (org.apache.lucene.util.IntsRef)4 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)3 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)3 Util (org.apache.lucene.util.fst.Util)3 ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput)2 IndexOutput (org.apache.lucene.store.IndexOutput)2 ArrayUtil (org.apache.lucene.util.ArrayUtil)2 OfflineSorter (org.apache.lucene.util.OfflineSorter)2 TestUtil (org.apache.lucene.util.TestUtil)2 Automaton (org.apache.lucene.util.automaton.Automaton)2 LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator)2