Search in sources :

Example 11 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class WFSTCompletionLookup method lookupPrefix.

/**
 * Follows the bytes of {@code scratch} through the FST and accumulates the outputs of the arcs
 * that were matched along the way.
 *
 * @param scratch the prefix to look up; only the {@code length} bytes starting at {@code offset}
 *     are consumed
 * @param arc scratch arc that is overwritten: it is reset with {@code fst.getFirstArc} and then
 *     passed as both the source and the destination arc of every {@code findTargetArc} call
 * @return the sum of the outputs of the matched arcs, or {@code null} as soon as one byte of the
 *     prefix has no matching arc
 * @throws IOException declared for the underlying FST reads
 */
private Long lookupPrefix(BytesRef scratch, Arc<Long> arc) throws IOException {
    // The running sum starts at 0, which must be the FST's "no output" value.
    assert 0 == fst.outputs.getNoOutput().longValue();
    final BytesReader reader = fst.getBytesReader();
    fst.getFirstArc(arc);
    final byte[] data = scratch.bytes;
    final int limit = scratch.offset + scratch.length;
    long total = 0;
    for (int i = scratch.offset; i < limit; i++) {
        // Bytes are matched as unsigned labels.
        if (fst.findTargetArc(data[i] & 0xff, arc, arc, reader) == null) {
            return null;
        }
        total += arc.output.longValue();
    }
    return total;
}
Also used : BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 12 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class Util method toDot.

/**
   * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
   * for visualization. Example of use:
   * 
   * <pre class="prettyprint">
   * PrintWriter pw = new PrintWriter(&quot;out.dot&quot;);
   * Util.toDot(fst, pw, true, true);
   * pw.close();
   * </pre>
   * 
   * and then, from command line:
   * 
   * <pre>
   * dot -Tpng -o out.png out.dot
   * </pre>
   * 
   * <p>
   * Note: larger FSTs (a few thousand nodes) won't even
   * render, don't bother.  If the FST is &gt; 2.1 GB in size
   * then this method will throw strange exceptions.
   * 
   * <p>
   * States are visited level by level starting from the target of the first arc;
   * every state is emitted once, every arc is emitted as an edge. The writer is
   * flushed at the end but is not closed by this method.
   * 
   * @param fst
   *          The FST to dump.
   * 
   * @param out
   *          The writer receiving the <code>dot</code> text. It is flushed, not closed.
   * 
   * @param sameRank
   *          If <code>true</code>, the resulting <code>dot</code> file will try
   *          to order states in layers of breadth-first traversal. This may
   *          mess up arcs, but makes the output FST's structure a bit clearer.
   * 
   * @param labelStates
   *          If <code>true</code> states will have labels equal to their offsets in their
   *          binary format. Expands the graph considerably.
   * 
   * @throws IOException
   *          Declared for the writes to <code>out</code> and the reads from the FST.
   * 
   * @see <a href="http://www.graphviz.org/">graphviz project</a>
   */
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) throws IOException {
    // Fill color used for states for which fst.isExpandedTarget(...) returns true.
    final String expandedNodeColor = "blue";
    // This is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions).
    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
    // A queue of transitions to consider for the next level.
    final List<FST.Arc<T>> thisLevelQueue = new ArrayList<>();
    // A queue of transitions to consider when processing the next level.
    final List<FST.Arc<T>> nextLevelQueue = new ArrayList<>();
    nextLevelQueue.add(startArc);
    //System.out.println("toDot: startArc: " + startArc);
    // A list of states on the same level (for ranking).
    final List<Integer> sameLevelStates = new ArrayList<>();
    // A bitset of already seen states (target offset).
    final BitSet seen = new BitSet();
    // NOTE(review): target offsets are narrowed to int for the bitset (here and below);
    // presumably this is the source of the > 2.1 GB limitation mentioned in the Javadoc.
    seen.set((int) startArc.target);
    // Shape for states.
    final String stateShape = "circle";
    final String finalStateShape = "doublecircle";
    // Emit DOT prologue.
    out.write("digraph FST {\n");
    out.write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
    if (!labelStates) {
        // Without state labels, draw every node as a small filled circle.
        out.write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
    }
    emitDotState(out, "initial", "point", "white", "");
    final T NO_OUTPUT = fst.outputs.getNoOutput();
    final BytesReader r = fst.getBytesReader();
    // final FST.Arc<T> scratchArc = new FST.Arc<>();
    // Emit the state the start arc points to. It is the only state that can be drawn
    // with the final-state shape; states found during the traversal always use stateShape.
    {
        final String stateColor;
        if (fst.isExpandedTarget(startArc, r)) {
            stateColor = expandedNodeColor;
        } else {
            stateColor = null;
        }
        final boolean isFinal;
        final T finalOutput;
        if (startArc.isFinal()) {
            isFinal = true;
            // NO_OUTPUT is compared by identity and mapped to "no label".
            finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput;
        } else {
            isFinal = false;
            finalOutput = null;
        }
        emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
    }
    // Edge from the synthetic "initial" point to the first real state.
    out.write("  initial -> " + startArc.target + "\n");
    int level = 0;
    while (!nextLevelQueue.isEmpty()) {
        // we could double buffer here, but it doesn't matter probably.
        //System.out.println("next level=" + level);
        thisLevelQueue.addAll(nextLevelQueue);
        nextLevelQueue.clear();
        level++;
        out.write("\n  // Transitions and states at level: " + level + "\n");
        while (!thisLevelQueue.isEmpty()) {
            // Arcs are taken from the end of the list, i.e. in reverse insertion order.
            final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
            //System.out.println("  pop: " + arc);
            if (FST.targetHasArcs(arc)) {
                // scan all target arcs
                //System.out.println("  readFirstTarget...");
                // Remember the source state now: 'arc' is reused below to iterate over the
                // outgoing arcs of this state, so arc.target will then denote each destination.
                final long node = arc.target;
                fst.readFirstRealTargetArc(arc.target, arc, r);
                while (true) {
                    // Emit the unseen state and add it to the queue for the next level.
                    // (Negative targets are skipped; the terminating state -1 is emitted
                    // separately in the epilogue.)
                    if (arc.target >= 0 && !seen.get((int) arc.target)) {
                        /*
              boolean isFinal = false;
              T finalOutput = null;
              fst.readFirstTargetArc(arc, scratchArc);
              if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
                // target is final
                isFinal = true;
                finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
                System.out.println("dot hit final label=" + (char) scratchArc.label);
              }
              */
                        final String stateColor;
                        if (fst.isExpandedTarget(arc, r)) {
                            stateColor = expandedNodeColor;
                        } else {
                            stateColor = null;
                        }
                        // The state's label is the arc's next-final output, if there is one.
                        final String finalOutput;
                        if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) {
                            finalOutput = fst.outputs.outputToString(arc.nextFinalOutput);
                        } else {
                            finalOutput = "";
                        }
                        emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
                        // To see the node address, use this instead:
                        //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
                        seen.set((int) arc.target);
                        // Queue a copy: 'arc' itself keeps being reused while iterating this state.
                        nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
                        sameLevelStates.add((int) arc.target);
                    }
                    // Build the "/output" suffix of the edge label (empty when the arc has no output).
                    String outs;
                    if (arc.output != NO_OUTPUT) {
                        outs = "/" + fst.outputs.outputToString(arc.output);
                    } else {
                        outs = "";
                    }
                    if (!FST.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) {
                        // Tricky special case: sometimes, due to
                        // pruning, the builder can [sillily] produce
                        // an FST with an arc into the final end state
                        // (-1) but also with a next final output; in
                        // this case we pull that output up onto this
                        // arc
                        outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
                    }
                    // Arcs carrying the BIT_TARGET_NEXT flag are drawn in red, all others in black.
                    final String arcColor;
                    if (arc.flag(FST.BIT_TARGET_NEXT)) {
                        arcColor = "red";
                    } else {
                        arcColor = "black";
                    }
                    assert arc.label != FST.END_LABEL;
                    // Emit the edge; final arcs are additionally drawn bold.
                    out.write("  " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");
                    // Break the loop if we're on the last arc of this state.
                    if (arc.isLast()) {
                        //System.out.println("    break");
                        break;
                    }
                    fst.readNextRealArc(arc, r);
                }
            }
        }
        // Emit state ranking information.
        // Only states first discovered while processing this level are listed.
        if (sameRank && sameLevelStates.size() > 1) {
            out.write("  {rank=same; ");
            for (int state : sameLevelStates) {
                out.write(state + "; ");
            }
            out.write(" }\n");
        }
        sameLevelStates.clear();
    }
    // Emit terminating state (always there anyway).
    out.write("  -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
    out.write("  {rank=sink; -1 }\n");
    out.write("}\n");
    out.flush();
}
Also used : ArrayList(java.util.ArrayList) BitSet(java.util.BitSet) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc)

Example 13 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class TestFSTs method testShortestPathsWFSTRandom.

/**
 * Like testShortestPathsRandom, but uses pair outputs so that every input carries both a weight
 * and an output. The results of {@code Util.shortestPaths} for every proper prefix are checked
 * against a brute-force scan of a sorted map holding the same data.
 */
public void testShortestPathsWFSTRandom() throws Exception {
    final int wordCount = atLeast(1000);
    // Brute-force reference: word -> (weight, output).
    final TreeMap<String, TwoLongs> expected = new TreeMap<>();
    final TreeSet<String> prefixes = new TreeSet<>();
    // First component is the weight, second component is the output.
    final PairOutputs<Long, Long> outputs =
        new PairOutputs<>(PositiveIntOutputs.getSingleton(), PositiveIntOutputs.getSingleton());
    final Builder<Pair<Long, Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    final Random rnd = random();

    for (int n = 0; n < wordCount; n++) {
        // Draw strings until we hit one that is not in the reference map yet.
        String word;
        do {
            word = TestUtil.randomSimpleString(rnd);
        } while (expected.containsKey(word));

        // Record every non-empty proper prefix of the word.
        for (int len = 1; len < word.length(); len++) {
            prefixes.add(word.substring(0, len));
        }

        // weights 1..100
        final int weight = TestUtil.nextInt(rnd, 1, 100);
        // outputs 0..500
        final int output = TestUtil.nextInt(rnd, 0, 500);
        expected.put(word, new TwoLongs(weight, output));
    }

    // The TreeMap iterates in sorted key order, which is the order the inputs are added in.
    for (Map.Entry<String, TwoLongs> entry : expected.entrySet()) {
        final TwoLongs value = entry.getValue();
        final long weight = value.a;
        final long output = value.b;
        builder.add(Util.toIntsRef(new BytesRef(entry.getKey()), scratch), outputs.newPair(weight, output));
    }

    final FST<Pair<Long, Long>> fst = builder.finish();
    final BytesReader reader = fst.getBytesReader();

    for (String prefix : prefixes) {
        // 1. run prefix against fst, then complete by value
        Pair<Long, Long> prefixOutput = outputs.getNoOutput();
        final FST.Arc<Pair<Long, Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long, Long>>());
        for (int pos = 0; pos < prefix.length(); pos++) {
            final int label = (int) prefix.charAt(pos);
            if (fst.findTargetArc(label, arc, arc, reader) == null) {
                fail();
            }
            prefixOutput = outputs.add(prefixOutput, arc.output);
        }

        final int topN = TestUtil.nextInt(rnd, 1, 10);
        final Util.TopResults<Pair<Long, Long>> actual =
            Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
        assertTrue(actual.isComplete);

        // 2. go thru whole treemap (the brute-force reference) and check it's actually the best suggestion
        final List<Result<Pair<Long, Long>>> matches = new ArrayList<>();
        // TODO: could be faster... but it's the slow reference for a reason
        for (Map.Entry<String, TwoLongs> entry : expected.entrySet()) {
            final String word = entry.getKey();
            if (!word.startsWith(prefix)) {
                continue;
            }
            // Expected suffix and expected outputs, both relative to what the prefix already consumed.
            final TwoLongs value = entry.getValue();
            matches.add(new Result<>(Util.toIntsRef(new BytesRef(word.substring(prefix.length())), new IntsRefBuilder()), outputs.newPair(value.a - prefixOutput.output1, value.b - prefixOutput.output2)));
        }
        assertTrue(matches.size() > 0);

        Collections.sort(matches, new TieBreakByInputComparator<>(minPairWeightComparator));
        // Keep only the best topN expected matches.
        final int matchCount = matches.size();
        if (matchCount > topN) {
            matches.subList(topN, matchCount).clear();
        }

        assertEquals(matches.size(), actual.topN.size());
        for (int hit = 0; hit < actual.topN.size(); hit++) {
            final Result<Pair<Long, Long>> want = matches.get(hit);
            assertEquals(want.input, actual.topN.get(hit).input);
            assertEquals(want.output, actual.topN.get(hit).output);
        }
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ArrayList(java.util.ArrayList) TestUtil(org.apache.lucene.util.TestUtil) FSTTester.simpleRandomString(org.apache.lucene.util.fst.FSTTester.simpleRandomString) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) Result(org.apache.lucene.util.fst.Util.Result) Random(java.util.Random) TreeSet(java.util.TreeSet) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) TreeMap(java.util.TreeMap) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Map(java.util.Map) TreeMap(java.util.TreeMap)

Aggregations

BytesReader (org.apache.lucene.util.fst.FST.BytesReader)13 ArrayList (java.util.ArrayList)7 BytesRef (org.apache.lucene.util.BytesRef)7 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)7 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)6 Arc (org.apache.lucene.util.fst.FST.Arc)6 IOException (java.io.IOException)5 IntsRef (org.apache.lucene.util.IntsRef)5 HashSet (java.util.HashSet)3 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)3 FSTTester.getRandomString (org.apache.lucene.util.fst.FSTTester.getRandomString)3 FSTTester.simpleRandomString (org.apache.lucene.util.fst.FSTTester.simpleRandomString)3 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)3 Util (org.apache.lucene.util.fst.Util)3 Result (org.apache.lucene.util.fst.Util.Result)3 Map (java.util.Map)2 Random (java.util.Random)2 TreeMap (java.util.TreeMap)2 TreeSet (java.util.TreeSet)2 AtomicLong (java.util.concurrent.atomic.AtomicLong)2