Search in sources :

Example 1 with Arc

use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.

the class FreeTextSuggester method lookup.

/**
 * Retrieve suggestions.
 *
 * <p>The key is run through {@code queryAnalyzer}; for every gram size the most
 * recent token of that size is remembered. The models stored in {@code fst} are
 * then queried from the highest order ({@code grams}) down to the unigram model.
 * Each time a model has been tried, the {@code backoff} factor applied to
 * lower-order models is multiplied by {@code ALPHA}. A predicted last token is
 * only reported once, from the highest-order model that produced it.
 *
 * @param key the text typed so far
 * @param contexts must be {@code null}; contexts are rejected by this method
 * @param num maximum number of results to return
 * @return at most {@code num} results, sorted by descending value with ties
 *         broken by comparing the keys as strings
 * @throws IllegalArgumentException if {@code contexts} is non-null, if a token's
 *         position length disagrees with {@code countGrams}, or if the analyzer
 *         produced no non-empty token
 * @throws IllegalStateException if {@code fst} is {@code null}
 * @throws IOException propagated from the token stream; IOExceptions raised by
 *         the FST prefix lookup and top-N search are instead rethrown wrapped in
 *         a {@link RuntimeException}
 */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        throw new IllegalStateException("Lookup not supported at this time");
    }
    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        // lastTokens[g - 1] holds a copy of the most recent token whose gram count is g:
        BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
        //System.out.println("lookup: key='" + key + "'");
        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        // Largest end offset reported by any token seen so far:
        int maxEndOffset = -1;
        // Becomes true once at least one token with non-zero length was produced:
        boolean sawRealToken = false;
        while (ts.incrementToken()) {
            BytesRef tokenBytes = termBytesAtt.getBytesRef();
            sawRealToken |= tokenBytes.length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.getPositionLength();
            assert gramCount <= grams;
            // Safety: make sure the recalculated count "agrees":
            if (countGrams(tokenBytes) != gramCount) {
                throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
            }
            maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
            // Copy the bytes, since the attribute's BytesRef is read again on the next token:
            BytesRefBuilder b = new BytesRefBuilder();
            b.append(tokenBytes);
            lastTokens[gramCount - 1] = b;
        }
        ts.end();
        if (!sawRealToken) {
            throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }
        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        // Position increment reported after ts.end(), i.e. trailing holes:
        int endPosInc = posIncAtt.getPositionIncrement();
        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        // NOTE(review): the sawRealToken check above throws when no non-empty token
        // was produced, so the empty-input case described here looks unreachable now
        // -- confirm before relying on it.
        boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
        if (lastTokenEnded) {
            // starting with "foo":
            // The last token is complete: promote every saved n-gram to the (n+1)-gram
            // slot with a trailing separator, so completion starts a brand-new token.
            // Iterating downwards ensures each slot is read before it is overwritten.
            for (int i = grams - 1; i > 0; i--) {
                BytesRefBuilder token = lastTokens[i - 1];
                if (token == null) {
                    continue;
                }
                token.append(separator);
                lastTokens[i] = token;
            }
            // The unigram prefix becomes the empty string:
            lastTokens[0] = new BytesRefBuilder();
        }
        // Scratch arc, positioned by lookupPrefix and then used as the start of the top-N search:
        Arc<Long> arc = new Arc<>();
        BytesReader bytesReader = fst.getBytesReader();
        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        // Multiplied by ALPHA after each model that was actually tried:
        double backoff = 1.0;
        List<LookupResult> results = new ArrayList<>(num);
        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        final Set<BytesRef> seen = new HashSet<>();
        for (int gram = grams - 1; gram >= 0; gram--) {
            BytesRefBuilder token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.length() == 0 && key.length() > 0)) {
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }
            if (endPosInc > 0 && gram <= endPosInc) {
                //System.out.println("  break: only holes now");
                break;
            }
            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            Long prefixOutput = null;
            try {
                prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            if (prefixOutput == null) {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }
            // TODO: we could do this division at build time, and
            // bake it into the FST?
            // Denominator for computing scores from current
            // model's predictions:
            // Defaults to totTokens when the token contains no separator:
            long contextCount = totTokens;
            BytesRef lastTokenFragment = null;
            // Scan backwards for the last separator: the bytes before it are the
            // context (whose count becomes the denominator), the bytes after it are
            // the partially typed last token.
            for (int i = token.length() - 1; i >= 0; i--) {
                if (token.byteAt(i) == separator) {
                    BytesRef context = new BytesRef(token.bytes(), 0, i);
                    Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
                    assert output != null;
                    contextCount = decodeWeight(output);
                    lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
                    break;
                }
            }
            // Private copy of the last-token prefix; acceptResult() temporarily
            // appends each candidate suffix to it to test membership in "seen":
            final BytesRefBuilder finalLastToken = new BytesRefBuilder();
            if (lastTokenFragment == null) {
                finalLastToken.copyBytes(token.get());
            } else {
                finalLastToken.copyBytes(lastTokenFragment);
            }
            CharsRefBuilder spare = new CharsRefBuilder();
            // complete top-N
            TopResults<Long> completions = null;
            try {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:
                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {

                    BytesRefBuilder scratchBytes = new BytesRefBuilder();

                    // Drops any path whose arc label is the separator, so the search
                    // never crosses into a higher-order model.
                    @Override
                    protected void addIfCompetitive(Util.FSTPath<Long> path) {
                        if (path.arc.label != separator) {
                            //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                            super.addIfCompetitive(path);
                        } else {
                        //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                        }
                    }

                    // Rejects a completion whose full last token (prefix + suffix) is
                    // already in "seen"; finalLastToken is restored to its prior length
                    // before returning.
                    @Override
                    protected boolean acceptResult(IntsRef input, Long output) {
                        Util.toBytesRef(input, scratchBytes);
                        finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
                        int lenSav = finalLastToken.length();
                        finalLastToken.append(scratchBytes);
                        //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
                        boolean ret = seen.contains(finalLastToken.get()) == false;
                        finalLastToken.setLength(lenSav);
                        return ret;
                    }
                };
                // since this search is initialized with a single start node 
                // it is okay to start with an empty input path here
                searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
                completions = searcher.search();
                assert completions.isComplete;
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            // Remember the prefix length so "token" can be truncated back to the
            // prefix before each completion's suffix is appended:
            int prefixLength = token.length();
            BytesRefBuilder suffix = new BytesRefBuilder();
            nextCompletion: for (Result<Long> completion : completions) {
                token.setLength(prefixLength);
                // append suffix
                Util.toBytesRef(completion.input, suffix);
                token.append(suffix);
                //System.out.println("    completion " + token.utf8ToString());
                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                // lastToken is the part after the final separator, or the whole
                // token when it contains no separator:
                BytesRef lastToken = token.get();
                for (int i = token.length() - 1; i >= 0; i--) {
                    if (token.byteAt(i) == separator) {
                        assert token.length() - i - 1 > 0;
                        lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
                        break;
                    }
                }
                if (seen.contains(lastToken)) {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    continue nextCompletion;
                }
                // Deep copy: lastToken references token's buffer, which is reused on
                // the next iteration:
                seen.add(BytesRef.deepCopyOf(lastToken));
                spare.copyUTF8Bytes(token.get());
                // Score = Long.MAX_VALUE * backoff * (decoded completion weight / contextCount):
                LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
                results.add(result);
                assert results.size() == seen.size();
            //System.out.println("  add result=" + result);
            }
            backoff *= ALPHA;
        }
        // Highest value first; equal values are ordered by key:
        Collections.sort(results, new Comparator<LookupResult>() {

            @Override
            public int compare(LookupResult a, LookupResult b) {
                if (a.value > b.value) {
                    return -1;
                } else if (a.value < b.value) {
                    return 1;
                } else {
                    // Tie break by UTF16 sort order:
                    return ((String) a.key).compareTo((String) b.key);
                }
            }
        });
        // Several models may each have contributed results; trim to the requested size:
        if (results.size() > num) {
            results.subList(num, results.size()).clear();
        }
        return results;
    }
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) CodecUtil(org.apache.lucene.codecs.CodecUtil) Result(org.apache.lucene.util.fst.Util.Result) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Example 2 with Arc

use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.

the class WFSTCompletionLookup method lookup.

/**
 * Looks up the completions of {@code key} in the FST.
 *
 * <p>The key must match a path in the FST exactly; its suffixes are then
 * collected with {@code Util.shortestPaths}. When {@code exactFirst} is set and
 * the key itself is a complete entry, that entry is reported first and counts
 * towards {@code num}.
 *
 * @param key prefix to complete
 * @param contexts must be {@code null}
 * @param onlyMorePopular must be {@code false}
 * @param num maximum number of results; asserted to be positive
 * @return the completions found, or an empty list when no FST is available or
 *         the prefix is not present
 * @throws IllegalArgumentException if {@code contexts} is non-null or
 *         {@code onlyMorePopular} is {@code true}
 */
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null) {
        return Collections.emptyList();
    }

    // UTF-8 form of the key; later reused as the buffer for prefix + suffix.
    final BytesRefBuilder keyBytes = new BytesRefBuilder();
    keyBytes.copyChars(key);
    final int keyLength = keyBytes.length();

    // Walk the FST along the key; the arc ends up positioned at the end of the prefix.
    final Arc<Long> prefixArc = new Arc<>();
    final Long prefixWeight;
    try {
        prefixWeight = lookupPrefix(keyBytes.get(), prefixArc);
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
    if (prefixWeight == null) {
        return Collections.emptyList();
    }

    final List<LookupResult> found = new ArrayList<>(num);
    final CharsRefBuilder chars = new CharsRefBuilder();
    int remaining = num;

    // The key itself is an entry: report it ahead of the longer completions.
    if (exactFirst && prefixArc.isFinal()) {
        chars.copyUTF8Bytes(keyBytes.get());
        found.add(new LookupResult(chars.toString(), decodeWeight(prefixWeight + prefixArc.nextFinalOutput)));
        remaining -= 1;
        if (remaining == 0) {
            return found;
        }
    }

    // Collect the best remaining completions below the prefix arc.
    final TopResults<Long> topPaths;
    try {
        topPaths = Util.shortestPaths(fst, prefixArc, prefixWeight, weightComparator, remaining, !exactFirst);
        assert topPaths.isComplete;
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }

    final BytesRefBuilder tail = new BytesRefBuilder();
    for (Result<Long> path : topPaths) {
        // Reset to the bare prefix, then attach this completion's suffix.
        keyBytes.setLength(keyLength);
        Util.toBytesRef(path.input, tail);
        keyBytes.append(tail);
        chars.copyUTF8Bytes(keyBytes.get());
        found.add(new LookupResult(chars.toString(), decodeWeight(path.output)));
    }
    return found;
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Arc(org.apache.lucene.util.fst.FST.Arc) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder)

Example 3 with Arc

use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.

the class TestFSTs method testExpandedCloseToRoot.

/**
   * Checks that states near the root are written in the expanded (array)
   * format when expected. The input is synthetic and built so that every
   * level contains one expanded state.
   *
   * @see <a href="https://issues.apache.org/jira/browse/LUCENE-2933">LUCENE-2933</a>
   */
public void testExpandedCloseToRoot() throws Exception {
    class SyntheticData {

        /** Builds a no-output BYTE1 FST from the given words, stopping at the first null entry. */
        FST<Object> compile(String[] lines) throws IOException {
            final NoOutputs noOutputs = NoOutputs.getSingleton();
            final Object emptyOutput = noOutputs.getNoOutput();
            final Builder<Object> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, noOutputs);
            final BytesRefBuilder termBytes = new BytesRefBuilder();
            final IntsRefBuilder termInts = new IntsRefBuilder();
            for (String word : lines) {
                if (word == null) {
                    break;
                }
                termBytes.copyChars(word);
                fstBuilder.add(Util.toIntsRef(termBytes.get(), termInts), emptyOutput);
            }
            return fstBuilder.finish();
        }

        /** Recursively emits the synthetic words; each leaf gets a unique "_<index>_end" suffix. */
        void generate(ArrayList<String> out, StringBuilder b, char from, char to, int depth) {
            if (depth == 0 || from == to) {
                out.add(b.toString() + "_" + out.size() + "_end");
                return;
            }
            for (char c = from; c <= to; c++) {
                b.append(c);
                // Only the last character of the range keeps the full fan-out one level down.
                final char nextTo = (c == to) ? to : from;
                generate(out, b, from, nextTo, depth - 1);
                b.deleteCharAt(b.length() - 1);
            }
        }

        /**
         * Verifies the expanded flag of every arc target below {@code arc} and
         * returns the number of arc advances made at this state, which is one
         * less than the number of arcs because the last arc is not counted.
         */
        public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) throws IOException {
            if (!FST.targetHasArcs(arc)) {
                return 0;
            }
            final BytesReader reader = fst.getBytesReader();
            int advances = 0;
            arc = fst.readFirstTargetArc(arc, arc, reader);
            while (true) {
                final boolean isExpanded = fst.isExpandedTarget(arc, reader);
                final int below = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
                final boolean shallowAndWide = depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && below >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW;
                assertEquals(isExpanded, shallowAndWide || below >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
                if (arc.isLast()) {
                    break;
                }
                arc = fst.readNextArc(arc, reader);
                advances++;
            }
            return advances;
        }
    }

    // Sanity check.
    assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
    assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);

    final SyntheticData data = new SyntheticData();
    final ArrayList<String> words = new ArrayList<>();
    data.generate(words, new StringBuilder(), 'a', 'i', 10);

    // The FST builder needs its input in sorted order.
    final String[] sortedWords = words.toArray(new String[words.size()]);
    Arrays.sort(sortedWords);

    final FST<Object> fst = data.compile(sortedWords);
    final FST.Arc<Object> rootArc = fst.getFirstArc(new FST.Arc<>());
    data.verifyStateAndBelow(fst, rootArc, 1);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ArrayList(java.util.ArrayList) FSTTester.simpleRandomString(org.apache.lucene.util.fst.FSTTester.simpleRandomString) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc)

Example 4 with Arc

use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.

the class TestFSTs method testIllegallyModifyRootArc.

/**
 * Builds an FST, illegally mutates the output of the arc reached via 'm' from
 * the root, and then repeats the same arc lookup. An {@link AssertionError} on
 * the second lookup is tolerated as the expected outcome.
 */
public void testIllegallyModifyRootArc() throws Exception {
    assumeTrue("test relies on assertions", assertsAreEnabled);

    final Set<BytesRef> uniqueTerms = new HashSet<>();
    for (int i = 0; i < 100; i++) {
        final String head = String.valueOf((char) ('a' + i));
        uniqueTerms.add(new BytesRef(head));
        if (head.equals("m")) {
            // "m" stays a lone single-character term.
            continue;
        }
        for (int j = 0; j < 20; j++) {
            // Make a big enough FST that the root cache will be created:
            final String tail = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
            uniqueTerms.add(new BytesRef(head + tail));
        }
    }

    // Terms are added to the builder in sorted order.
    final List<BytesRef> sortedTerms = new ArrayList<>(uniqueTerms);
    Collections.sort(sortedTerms);

    final ByteSequenceOutputs byteOutputs = ByteSequenceOutputs.getSingleton();
    final Builder<BytesRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, byteOutputs);
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (BytesRef each : sortedTerms) {
        Util.toIntsRef(each, scratchInts);
        // Every term is stored with itself as its output.
        fstBuilder.add(scratchInts.get(), each);
    }
    final FST<BytesRef> fst = fstBuilder.finish();

    Arc<BytesRef> arc = new FST.Arc<>();
    fst.getFirstArc(arc);
    final FST.BytesReader reader = fst.getBytesReader();
    arc = fst.findTargetArc((int) 'm', arc, arc, reader);
    assertNotNull(arc);
    assertEquals(new BytesRef("m"), arc.output);

    // NOTE: illegal:
    arc.output.length = 0;

    fst.getFirstArc(arc);
    try {
        arc = fst.findTargetArc((int) 'm', arc, arc, reader);
    } catch (AssertionError expected) {
    // expected
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ArrayList(java.util.ArrayList) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) FSTTester.simpleRandomString(org.apache.lucene.util.fst.FSTTester.simpleRandomString) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Arc(org.apache.lucene.util.fst.FST.Arc) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 5 with Arc

use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.

the class Util method getByOutput.

/** Reverse lookup: finds an input by its output rather than
   *  the other way around.  This locates the input/output
   *  pair whose output equals the target, and returns null
   *  if no such output exists.
   *
   *  <p>NOTE: this only works with {@code FST<Long>}, and
   *  only when the outputs are strictly ascending in the
   *  same order as the inputs.
   *  Simple ordinals (0, 1, 2, ...) or file offsets (when
   *  appending to a file) satisfy this.
   *
   *  <p>This convenience overload allocates the reader, arcs
   *  and result builder itself and delegates to the
   *  six-argument {@code getByOutput}. */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
    final BytesReader reader = fst.getBytesReader();
    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<Long> rootArc = fst.getFirstArc(new FST.Arc<Long>());
    final FST.Arc<Long> spareArc = new FST.Arc<Long>();
    final IntsRefBuilder inputBuilder = new IntsRefBuilder();
    return getByOutput(fst, targetOutput, reader, rootArc, spareArc, inputBuilder);
}
Also used : BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder)

Aggregations

Arc (org.apache.lucene.util.fst.FST.Arc)8 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)7 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)6 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)6 ArrayList (java.util.ArrayList)5 IOException (java.io.IOException)4 BytesRef (org.apache.lucene.util.BytesRef)4 IntsRef (org.apache.lucene.util.IntsRef)3 HashSet (java.util.HashSet)2 AtomicLong (java.util.concurrent.atomic.AtomicLong)2 ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput)2 IndexInput (org.apache.lucene.store.IndexInput)2 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)2 BytesRefFSTEnum (org.apache.lucene.util.fst.BytesRefFSTEnum)2 FSTTester.getRandomString (org.apache.lucene.util.fst.FSTTester.getRandomString)2 FSTTester.simpleRandomString (org.apache.lucene.util.fst.FSTTester.simpleRandomString)2 BitSet (java.util.BitSet)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)1 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)1