Search in sources :

Example 6 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class Util method get.

// TODO: maybe a CharsRef version for BYTE2
/**
 * Follows {@code input} byte-by-byte through {@code fst}, combining the
 * output of every arc taken along the way.
 *
 * @param fst   the FST to search; asserted to use {@code BYTE1} input labels
 * @param input the key; only the {@code length} bytes starting at
 *              {@code offset} are consulted
 * @return the accumulated output combined with the final output of the last
 *         arc, or {@code null} if some byte has no matching arc or the last
 *         arc reached is not final
 * @throws IOException if the FST reports an I/O error during traversal
 */
public static <T> T get(FST<T> fst, BytesRef input) throws IOException {
    assert fst.inputType == FST.INPUT_TYPE.BYTE1;
    final BytesReader reader = fst.getBytesReader();
    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());
    // Running combination of the outputs seen so far.
    T accumulated = fst.outputs.getNoOutput();
    final int end = input.offset + input.length;
    for (int pos = input.offset; pos < end; pos++) {
        // Bytes are signed in Java; mask to get the unsigned label.
        final int label = input.bytes[pos] & 0xFF;
        // The arc is advanced in place: it is both the source and the destination.
        if (fst.findTargetArc(label, current, current, reader) == null) {
            return null;
        }
        accumulated = fst.outputs.add(accumulated, current.output);
    }
    return current.isFinal() ? fst.outputs.add(accumulated, current.nextFinalOutput) : null;
}
Also used : BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 7 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class Util method getByOutput.

/**
 * Reverse lookup: finds the input whose output equals {@code targetOutput}
 * instead of finding the output for a given input. Returns {@code null}
 * if no input/output pair carries that output.
 *
 * <p>NOTE: this only works with {@code FST<Long>}, and only when the
 * outputs are strictly ascending in the same order as the inputs.
 * Simple ordinals (0, 1, 2, ...) or file offsets (when appending to a
 * file) satisfy this.
 *
 * <p>This convenience overload allocates the reader, arcs and result
 * builder itself and hands them to the six-argument overload.
 *
 * @param fst          the FST to search
 * @param targetOutput the output value to locate
 * @return whatever the six-argument overload returns for these arguments
 * @throws IOException if the underlying lookup fails
 */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
    final BytesReader reader = fst.getBytesReader();
    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<Long> rootArc = fst.getFirstArc(new FST.Arc<Long>());
    return getByOutput(fst, targetOutput, reader, rootArc, new FST.Arc<Long>(), new IntsRefBuilder());
}
Also used : BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder)

Example 8 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class MemoryDocValuesProducer method getSortedSet.

/**
 * Returns the sorted-set doc values for {@code field}.
 *
 * <p>Fields flagged as singleton are served by wrapping {@code getSorted(field)}.
 * Fields whose FST entry has zero ordinals get the shared empty instance.
 * Otherwise the field's FST is fetched from (or loaded into) {@code fstInstances}
 * while holding this producer's monitor, and a legacy sorted-set view backed by
 * that FST and the field's binary doc values is returned.
 *
 * @param field the field whose values are requested
 * @return a {@link SortedSetDocValues} view for the field
 * @throws IOException if loading the FST or the binary doc values fails
 */
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    SortedSetEntry sortedSetEntry = sortedSets.get(field.name);
    if (sortedSetEntry.singleton) {
        return DocValues.singleton(getSorted(field));
    }
    final FSTEntry entry = fsts.get(field.name);
    if (entry.numOrds == 0) {
        // empty FST!
        return DocValues.emptySortedSet();
    }
    FST<Long> instance;
    synchronized (this) {
        instance = fstInstances.get(field.name);
        if (instance == null) {
            // Read the FST from a private clone so the shared input's position is untouched.
            IndexInput data = this.data.clone();
            data.seek(entry.offset);
            instance = new FST<>(data, PositiveIntOutputs.getSingleton());
            // While merging, the instance is neither cached nor counted towards RAM usage.
            if (!merging) {
                ramBytesUsed.addAndGet(instance.ramBytesUsed());
                fstInstances.put(field.name, instance);
            }
        }
    }
    final LegacyBinaryDocValues docToOrds = getLegacyBinary(field);
    final FST<Long> fst = instance;
    // per-thread resources
    final BytesReader in = fst.getBytesReader();
    final Arc<Long> firstArc = new Arc<>();
    final Arc<Long> scratchArc = new Arc<>();
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
    final ByteArrayDataInput input = new ByteArrayDataInput();
    return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() {

        // Reused buffer that receives the term bytes in lookupOrd.
        final BytesRefBuilder term = new BytesRefBuilder();

        // Binary value of the current document, as returned by docToOrds.
        BytesRef ref;

        // Last ordinal returned for the current document (0 right after setDocument).
        long currentOrd;

        @Override
        public long nextOrd() {
            if (input.eof()) {
                return NO_MORE_ORDS;
            } else {
                // Each vLong read is added to the running ordinal (delta decoding).
                currentOrd += input.readVLong();
                return currentOrd;
            }
        }

        @Override
        public void setDocument(int docID) {
            ref = docToOrds.get(docID);
            input.reset(ref.bytes, ref.offset, ref.length);
            currentOrd = 0;
        }

        @Override
        public BytesRef lookupOrd(long ord) {
            try {
                // Rewind the shared reader and root arc before each reverse lookup.
                in.setPosition(0);
                fst.getFirstArc(firstArc);
                IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
                return Util.toBytesRef(output, term);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public long lookupTerm(BytesRef key) {
            try {
                InputOutput<Long> o = fstEnum.seekCeil(key);
                if (o == null) {
                    // Nothing at or after key: encode the insertion point past the last ordinal.
                    return -getValueCount() - 1;
                } else if (o.input.equals(key)) {
                    // Return the full long ordinal; narrowing to int would corrupt
                    // ordinals above Integer.MAX_VALUE.
                    return o.output.longValue();
                } else {
                    // Key is absent: encode the ordinal of the term seekCeil landed on.
                    return -o.output - 1;
                }
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
        }

        @Override
        public long getValueCount() {
            return entry.numOrds;
        }

        @Override
        public TermsEnum termsEnum() {
            return new FSTTermsEnum(fst);
        }
    }, maxDoc);
}
Also used : ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) IndexInput(org.apache.lucene.store.IndexInput) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) InputOutput(org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRefFSTEnum(org.apache.lucene.util.fst.BytesRefFSTEnum) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) AtomicLong(java.util.concurrent.atomic.AtomicLong)

Example 9 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project elasticsearch by elastic.

the class XAnalyzingSuggester method lookup.

/**
 * Looks up at most {@code num} suggestions for {@code key}.
 *
 * <p>The key is turned into a lookup automaton that is intersected with the
 * suggester's FST. When {@code exactFirst} is set, a first search looks for a
 * completion whose surface form matches the key (per {@code sameSurfaceForm})
 * and puts it at the head of the results. A second search then fills the
 * remaining slots, skipping surface forms it has already seen.
 *
 * @param key             the text to complete; must not contain
 *                        {@code holeCharacter} or {@code sepLabel}
 * @param contexts        not read by this method
 * @param onlyMorePopular must be {@code false}
 * @param num             maximum number of results; asserted to be positive
 * @return the suggestions found, or an empty list when {@code fst} is {@code null}
 * @throws IllegalArgumentException if {@code onlyMorePopular} is {@code true}
 *         or {@code key} contains a reserved character
 * @throws RuntimeException wrapping any {@link IOException} raised during the lookup
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    // No FST available: nothing can be suggested.
    if (fst == null) {
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    // Reject keys that contain either of the two reserved characters.
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == holeCharacter) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == sepLabel) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    // UTF-8 form of the key, compared against surface forms below.
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        // Scratch buffer handed to getLookupResult for every result built below.
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        // Reused as the destination arc for every findTargetArc call below.
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            // First pass: count the prefix paths that have an outgoing endByte arc.
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present. Both size arguments are
            // count * maxSurfaceFormsPerAnalyzedForm:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // Second pass: seed the searcher with every endByte arc found,
            // adding the arc's output to the output of the path leading to it:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            // Linear scan over the completions; keep only the first one
            // whose surface form is the same as the key:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        // Main search: asks for the slots still unfilled (num - results.size()).
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            // Surface forms already accepted by this searcher.
            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // The search can yield the same surface form more than once;
                // accept each surface form only the first time it appears:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // With exactFirst, accepting the exact match here would
                    // create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // We must have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // Stop as soon as num results have been collected.
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 10 with BytesReader

use of org.apache.lucene.util.fst.FST.BytesReader in project lucene-solr by apache.

the class Util method toDot.

/**
 * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
 * for visualization. Example of use:
 *
 * <pre class="prettyprint">
 * PrintWriter pw = new PrintWriter(&quot;out.dot&quot;);
 * Util.toDot(fst, pw, true, true);
 * pw.close();
 * </pre>
 *
 * and then, from command line:
 *
 * <pre>
 * dot -Tpng -o out.png out.dot
 * </pre>
 *
 * <p>
 * Note: larger FSTs (a few thousand nodes) won't even
 * render, don't bother.  If the FST is &gt; 2.1 GB in size
 * then this method will throw strange exceptions (node
 * addresses are narrowed to <code>int</code> when tracking
 * visited states).
 *
 * @param fst
 *          The FST to dump.
 *
 * @param out
 *          Destination of the <code>dot</code> text; it is flushed but not
 *          closed by this method.
 *
 * @param sameRank
 *          If <code>true</code>, the resulting <code>dot</code> file will try
 *          to order states in layers of breadth-first traversal. This may
 *          mess up arcs, but makes the output FST's structure a bit clearer.
 *
 * @param labelStates
 *          If <code>true</code> states will have labels equal to their offsets in their
 *          binary format. Expands the graph considerably.
 *
 * @throws IOException
 *          If writing to <code>out</code> or reading the FST fails.
 *
 * @see <a href="http://www.graphviz.org/">graphviz project</a>
 */
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) throws IOException {
    final String expandedNodeColor = "blue";
    // This is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions).
    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
    // A queue of transitions to consider for the current level.
    final List<FST.Arc<T>> thisLevelQueue = new ArrayList<>();
    // A queue of transitions to consider when processing the next level.
    final List<FST.Arc<T>> nextLevelQueue = new ArrayList<>();
    nextLevelQueue.add(startArc);
    //System.out.println("toDot: startArc: " + startArc);
    // A list of states on the same level (for ranking).
    final List<Integer> sameLevelStates = new ArrayList<>();
    // A bitset of already seen states (target offset).
    final BitSet seen = new BitSet();
    seen.set((int) startArc.target);
    // Shape for states.
    final String stateShape = "circle";
    final String finalStateShape = "doublecircle";
    // Emit DOT prologue.
    out.write("digraph FST {\n");
    out.write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
    if (!labelStates) {
        out.write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
    }
    emitDotState(out, "initial", "point", "white", "");
    // Compared by reference below to detect "no output".
    final T NO_OUTPUT = fst.outputs.getNoOutput();
    final BytesReader r = fst.getBytesReader();
    // final FST.Arc<T> scratchArc = new FST.Arc<>();
    // Emit the state the start arc points to, then the edge from "initial" to it.
    {
        final String stateColor;
        if (fst.isExpandedTarget(startArc, r)) {
            stateColor = expandedNodeColor;
        } else {
            stateColor = null;
        }
        final boolean isFinal;
        final T finalOutput;
        if (startArc.isFinal()) {
            isFinal = true;
            finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput;
        } else {
            isFinal = false;
            finalOutput = null;
        }
        emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
    }
    out.write("  initial -> " + startArc.target + "\n");
    int level = 0;
    // Level-by-level traversal: each outer iteration handles the arcs queued by the previous one.
    while (!nextLevelQueue.isEmpty()) {
        // we could double buffer here, but it doesn't matter probably.
        //System.out.println("next level=" + level);
        thisLevelQueue.addAll(nextLevelQueue);
        nextLevelQueue.clear();
        level++;
        out.write("\n  // Transitions and states at level: " + level + "\n");
        while (!thisLevelQueue.isEmpty()) {
            // Arcs are taken from the end of the list.
            final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
            //System.out.println("  pop: " + arc);
            if (FST.targetHasArcs(arc)) {
                // scan all target arcs
                //System.out.println("  readFirstTarget...");
                // Remember the source state: 'arc' is overwritten in place with each outgoing arc.
                final long node = arc.target;
                fst.readFirstRealTargetArc(arc.target, arc, r);
                while (true) {
                    // Emit the unseen state and add it to the queue for the next level.
                    if (arc.target >= 0 && !seen.get((int) arc.target)) {
                        /*
              boolean isFinal = false;
              T finalOutput = null;
              fst.readFirstTargetArc(arc, scratchArc);
              if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
                // target is final
                isFinal = true;
                finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
                System.out.println("dot hit final label=" + (char) scratchArc.label);
              }
              */
                        final String stateColor;
                        if (fst.isExpandedTarget(arc, r)) {
                            stateColor = expandedNodeColor;
                        } else {
                            stateColor = null;
                        }
                        final String finalOutput;
                        if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) {
                            finalOutput = fst.outputs.outputToString(arc.nextFinalOutput);
                        } else {
                            finalOutput = "";
                        }
                        emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
                        // To see the node address, use this instead:
                        //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
                        seen.set((int) arc.target);
                        // Queue a copy, because 'arc' keeps being reused by this scan.
                        nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
                        sameLevelStates.add((int) arc.target);
                    }
                    // Suffix of the edge label: "/<output>" when the arc carries an output.
                    String outs;
                    if (arc.output != NO_OUTPUT) {
                        outs = "/" + fst.outputs.outputToString(arc.output);
                    } else {
                        outs = "";
                    }
                    if (!FST.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) {
                        // Tricky special case: sometimes, due to
                        // pruning, the builder can [sillily] produce
                        // an FST with an arc into the final end state
                        // (-1) but also with a next final output; in
                        // this case we pull that output up onto this
                        // arc
                        outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
                    }
                    // Arcs with BIT_TARGET_NEXT set are drawn in red, all others in black.
                    final String arcColor;
                    if (arc.flag(FST.BIT_TARGET_NEXT)) {
                        arcColor = "red";
                    } else {
                        arcColor = "black";
                    }
                    assert arc.label != FST.END_LABEL;
                    out.write("  " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");
                    // Break the loop if we're on the last arc of this state.
                    if (arc.isLast()) {
                        //System.out.println("    break");
                        break;
                    }
                    fst.readNextRealArc(arc, r);
                }
            }
        }
        // Emit state ranking information.
        if (sameRank && sameLevelStates.size() > 1) {
            out.write("  {rank=same; ");
            for (int state : sameLevelStates) {
                out.write(state + "; ");
            }
            out.write(" }\n");
        }
        sameLevelStates.clear();
    }
    // Emit terminating state (always there anyway).
    out.write("  -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
    out.write("  {rank=sink; -1 }\n");
    out.write("}\n");
    out.flush();
}
Also used : ArrayList(java.util.ArrayList) BitSet(java.util.BitSet) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc)

Aggregations

BytesReader (org.apache.lucene.util.fst.FST.BytesReader): 13; ArrayList (java.util.ArrayList): 7; BytesRef (org.apache.lucene.util.BytesRef): 7; IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 7; BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 6; Arc (org.apache.lucene.util.fst.FST.Arc): 6; IOException (java.io.IOException): 5; IntsRef (org.apache.lucene.util.IntsRef): 5; HashSet (java.util.HashSet): 3; CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 3; FSTTester.getRandomString (org.apache.lucene.util.fst.FSTTester.getRandomString): 3; FSTTester.simpleRandomString (org.apache.lucene.util.fst.FSTTester.simpleRandomString): 3; Pair (org.apache.lucene.util.fst.PairOutputs.Pair): 3; Util (org.apache.lucene.util.fst.Util): 3; Result (org.apache.lucene.util.fst.Util.Result): 3; Map (java.util.Map): 2; Random (java.util.Random): 2; TreeMap (java.util.TreeMap): 2; TreeSet (java.util.TreeSet): 2; AtomicLong (java.util.concurrent.atomic.AtomicLong): 2