Search in sources :

Example 1 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project elasticsearch by elastic.

the class XAnalyzingSuggester method build.

/**
 * Builds the suggester FST from the entries supplied by {@code iterator}.
 * <p>
 * Every surface form is expanded into its analyzed finite strings (bounded by
 * {@code maxGraphExpansions}). One record per analyzed string is written to a temporary
 * file with the layout
 * {@code analyzedLength (short) | analyzed bytes | weight (int) | [surfaceLength (short)] | surface bytes | [payload bytes]},
 * where the bracketed parts are only present when the iterator has payloads. The file is
 * sorted offline and the sorted records are added to the FST builder; at most
 * {@code maxSurfaceFormsPerAnalyzedForm} distinct surface forms are kept per analyzed form.
 *
 * @param iterator source of surface forms, weights and (optionally) payloads
 * @throws IOException if creating, writing, sorting or reading the temporary files fails
 * @throws IllegalArgumentException if an analyzed form is longer than
 *         {@code Short.MAX_VALUE - 2} bytes, or (with payloads) a surface form is longer than
 *         that or contains the reserved payload separator byte
 */
@Override
public void build(InputIterator iterator) throws IOException {
    // Must happen before the comparator is created: the flag is passed to
    // AnalyzingComparator by value, so assigning it later would leave the sorter
    // comparing records with the flag left over from a previous state.
    hasPayloads = iterator.hasPayloads();
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input); it is stored as a short below
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    // the surface length is also stored as a short when payloads are present
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    // Scan exactly the bytes of this surface form: the BytesRef may be a
                    // slice of a larger array, so the scan has to start at its offset.
                    final int surfaceEnd = surfaceForm.offset + surfaceForm.length;
                    for (int i = surfaceForm.offset; i < surfaceEnd; i++) {
                        if (surfaceForm.bytes[i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        // NOTE(review): no codec footer is written to tempInput before closing, yet the sorted
        // file is opened as a checksum input below - confirm the OfflineSorter version in use
        // does not expect a footer on its input.
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (entries are only added while
        // dedup < maxSurfaceFormsPerAnalyzedForm):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            // input positions are absolute indexes into bytes.bytes, so the end of the
            // record has to include bytes.offset as well
            final int recordEnd = bytes.offset + bytes.length;
            short analyzedLength = input.readShort();
            // + 2 leaves room for the two trailing bytes appended further down
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            // surface is a view into the record buffer; it is deep-copied wherever it is retained
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = recordEnd - surface.offset;
            }
            if (previousAnalyzed == null) {
                // very first record
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                // same analyzed form as the previous record
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                // new analyzed form: restart the duplicate bookkeeping
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                // FST output is: surface bytes | payloadSep | payload bytes
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = recordEnd - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    } finally {
        // best-effort cleanup of the reader/writer and both temporary files
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 2 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project lucene-solr by apache.

the class AnalyzingSuggester method lookup.

/**
 * Looks up at most {@code num} completions for {@code key} in the suggester FST.
 * <p>
 * The key is turned into a lookup automaton which is intersected with the FST to obtain
 * the prefix paths. When {@code exactFirst} is set, a first search restricted to paths
 * that have an {@code END_BYTE} arc looks for a completion whose surface form equals the
 * key and puts it at the head of the result list. A second top-N search then fills the
 * remaining slots, skipping duplicate outputs and the exact match already reported.
 *
 * @param key             the text to complete; must not contain U+001E or U+001F
 * @param contexts        must be {@code null}
 * @param onlyMorePopular must be {@code false}
 * @param num             maximum number of results; asserted to be positive
 * @return the collected results, or an empty list when no FST has been built
 * @throws IllegalArgumentException if {@code onlyMorePopular} is set, {@code contexts} is
 *         non-null, or the key contains a reserved character
 * @throws RuntimeException wrapping any {@link IOException} raised while searching
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        return Collections.emptyList();
    }
    // Reject keys that contain one of the two reserved characters.
    for (int pos = 0; pos < key.length(); pos++) {
        switch (key.charAt(pos)) {
            case 0x1E:
                throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
            case 0x1F:
                throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
            default:
                break;
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        final Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        final BytesReader bytesReader = fst.getBytesReader();
        final FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        // Intersect the automaton with the suggest FST to get all prefix start nodes and their outputs.
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

        if (exactFirst) {
            // Pass 1: count the prefix paths that have an END_BYTE arc, i.e. the "exact" matches;
            // the count is needed to size the searcher before any start path can be added.
            int exactMatches = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> candidate : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, candidate.fstNode, scratchArc, bytesReader) != null) {
                    exactMatches++;
                }
            }
            final int exactTopN = exactMatches * maxSurfaceFormsPerAnalyzedForm;
            final Util.TopNSearcher<Pair<Long, BytesRef>> exactSearcher = new Util.TopNSearcher<>(fst, exactTopN, exactTopN, weightComparator);
            // Pass 2: seed the searcher from the END_BYTE arc of every exact-match path.
            for (FSTUtil.Path<Pair<Long, BytesRef>> candidate : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, candidate.fstNode, scratchArc, bytesReader) != null) {
                    exactSearcher.addStartPaths(scratchArc, fst.outputs.add(candidate.output, scratchArc.output), false, candidate.input);
                }
            }
            final TopResults<Pair<Long, BytesRef>> exactCompletions = exactSearcher.search();
            assert exactCompletions.isComplete;
            // Keep only the first completion whose surface form equals the key.
            for (Result<Pair<Long, BytesRef>> exact : exactCompletions) {
                final BytesRef exactOutput = exact.output.output2;
                if (sameSurfaceForm(utf8Key, exactOutput)) {
                    results.add(getLookupResult(exact.output.output1, exactOutput, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // The exact match already satisfies the request.
                return results;
            }
        }

        final Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            // outputs already offered to this searcher
            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Different analyzed paths can lead to the same output: accept each only once.
                if (!seen.add(output.output2)) {
                    return false;
                }
                if (exactFirst && sameSurfaceForm(utf8Key, output.output2)) {
                    // This surface form was already reported by the exact-match search above.
                    assert results.size() == 1;
                    return false;
                }
                return true;
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> start : prefixPaths) {
            searcher.addStartPaths(start.fstNode, start.output, true, start.input);
        }
        final TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        assert completions.isComplete;
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            results.add(getLookupResult(completion.output.output1, completion.output.output2, spare));
            if (results.size() == num) {
                // Stop as soon as the requested number of results has been gathered.
                break;
            }
        }
        return results;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) CodecUtil(org.apache.lucene.codecs.CodecUtil) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 3 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project lucene-solr by apache.

the class AnalyzingSuggester method build.

/**
 * Builds the suggester FST from the entries supplied by {@code iterator}.
 * <p>
 * Every surface form is expanded into its analyzed finite strings (bounded by
 * {@code maxGraphExpansions}). One record per analyzed string is written to a temporary
 * file with the layout
 * {@code analyzedLength (short) | analyzed bytes | weight (int) | [surfaceLength (short)] | surface bytes | [payload bytes]},
 * where the bracketed parts are only present when the iterator has payloads. The file is
 * sorted offline and the sorted records are added to the FST builder; at most
 * {@code maxSurfaceFormsPerAnalyzedForm} distinct surface forms are kept per analyzed form.
 *
 * @param iterator source of surface forms, weights and (optionally) payloads; must not
 *                 carry contexts
 * @throws IOException if creating, writing, sorting or reading the temporary files fails
 * @throws IllegalArgumentException if the iterator has contexts, an analyzed form is longer
 *         than {@code Short.MAX_VALUE - 2} bytes, or (with payloads) a surface form is longer
 *         than that or contains the reserved payload separator byte
 */
@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    // set before the comparator is created, which receives the flag by value
    hasPayloads = iterator.hasPayloads();
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input); it is stored as a short below
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    // the surface length is also stored as a short when payloads are present
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    // Scan exactly the bytes of this surface form: the BytesRef may be a
                    // slice of a larger array, so the scan has to start at its offset.
                    final int surfaceEnd = surfaceForm.offset + surfaceForm.length;
                    for (int i = surfaceForm.offset; i < surfaceEnd; i++) {
                        if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        // the footer goes onto the raw output before the writer wrapping it is closed
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (entries are only added while
        // dedup < maxSurfaceFormsPerAnalyzedForm):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            // input positions are absolute indexes into bytes.bytes, so the end of the
            // record has to include bytes.offset as well
            final int recordEnd = bytes.offset + bytes.length;
            short analyzedLength = input.readShort();
            // + 2 leaves room for the two trailing bytes appended further down
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            // surface is a view into the record buffer; it is deep-copied wherever it is retained
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = recordEnd - surface.offset;
            }
            if (previousAnalyzed == null) {
                // very first record
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed.get());
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                // same analyzed form as the previous record
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                // new analyzed form: restart the duplicate bookkeeping
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                // FST output is: surface bytes | PAYLOAD_SEP | payload bytes
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = recordEnd - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = PAYLOAD_SEP;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    } finally {
        // best-effort cleanup of the reader/writer and both temporary files
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 4 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project lucene-solr by apache.

the class NRTSuggester method lookup.

/**
   * Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions
   * matching the given {@link CompletionScorer}.
   * <p>
   * The {@link CompletionScorer#automaton} is intersected with the {@link #fst}; each
   * matched partial path seeds a top N search over the {@link #fst}, with
   * {@link CompletionScorer#weight} supplying the boost and context of the seed.
   * Whenever the search reaches a completed path, the document id is filtered through
   * {@link CompletionScorer#accept(int, Bits)}, the entry is scored with
   * {@link CompletionScorer#score(float, float)} from the index weight and query boost,
   * and it is handed to
   * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
   * Nothing is collected when the live-docs ratio is reported as {@code -1}.
   */
public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException {
    final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc());
    if (liveDocsRatio == -1) {
        return;
    }
    final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst);
    // topN is scaled by the number of intersected paths to keep the search admissible:
    // a suggestion with several contexts occupies one FST path per context, and all of
    // them have to be evaluated (e.g. for a match-all context query). Collectors still
    // terminate early once enough suggestions were collected, so topN is only the upper
    // bound on what can be collected.
    final int topN = collector.getCountToCollect() * prefixPaths.size();
    final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered);
    final CharsRefBuilder spare = new CharsRefBuilder();
    final Comparator<Pair<Long, BytesRef>> weightOrder = getComparator();
    final Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, topN, queueSize, weightOrder, new ScoringPathComparator(scorer)) {

        // reused to decode the bytes that follow the payload separator
        private final ByteArrayDataInput scratchInput = new ByteArrayDataInput();

        @Override
        protected boolean acceptPartialPath(Util.FSTPath<Pair<Long, BytesRef>> path) {
            if (!collector.doSkipDuplicates()) {
                return true;
            }
            if (path.payload == -1) {
                // The separator was not seen on this path so far; check whether the arc
                // output that was just appended contains it.
                final BytesRef lastArcOutput = path.arc.output.output2;
                final BytesRef accumulated = path.output.output2;
                for (int k = 0; k < lastArcOutput.length; k++) {
                    if (lastArcOutput.bytes[lastArcOutput.offset + k] != payloadSep) {
                        continue;
                    }
                    // The path now holds a full surface form: remember where it ends.
                    path.payload = accumulated.length - lastArcOutput.length + k;
                    assert accumulated.bytes[accumulated.offset + path.payload] == payloadSep;
                    break;
                }
            }
            if (path.payload == -1) {
                // surface form still incomplete, nothing to compare yet
                return true;
            }
            final BytesRef surfaceSource = path.output.output2;
            spare.copyUTF8Bytes(surfaceSource.bytes, surfaceSource.offset, path.payload);
            // prune paths whose surface form was already collected
            return !collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length());
        }

        @Override
        protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
            final BytesRef output = path.output.output2;
            final int sepIndex;
            if (path.payload == -1) {
                assert collector.doSkipDuplicates() == false;
                sepIndex = parseSurfaceForm(output, payloadSep, spare);
            } else {
                // separator position was cached while the path was being extended
                sepIndex = path.payload;
                spare.copyUTF8Bytes(output.bytes, output.offset, sepIndex);
            }
            // the document id is the first value after the separator
            scratchInput.reset(output.bytes, output.offset + sepIndex + 1, output.length - sepIndex - 1);
            final int docID = scratchInput.readVInt();
            if (!scorer.accept(docID, acceptDocs)) {
                return false;
            }
            if (collector.doSkipDuplicates()) {
                // snapshot the surface form, since spare is reused for later paths
                final int surfaceLength = spare.length();
                final char[] surfaceKey = new char[surfaceLength];
                System.arraycopy(spare.chars(), 0, surfaceKey, 0, surfaceLength);
                if (collector.seenSurfaceForms.contains(surfaceKey)) {
                    // a higher scoring document with this key was already collected in this segment
                    return false;
                }
                collector.seenSurfaceForms.add(surfaceKey);
            }
            try {
                final float score = scorer.score(decode(path.output.output1), path.boost);
                collector.collect(docID, spare.toCharsRef(), path.context, score);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    };
    for (FSTUtil.Path<Pair<Long, BytesRef>> prefixPath : prefixPaths) {
        scorer.weight.setNextMatch(prefixPath.input.get());
        final BytesRef prefixOutput = prefixPath.output.output2;
        int sepPosition = -1;
        if (collector.doSkipDuplicates()) {
            // Cache the separator position up front; otherwise finding it later can cost
            // O(N^2) where N is the length of suggestions.
            int j = 0;
            while (j < prefixOutput.length && sepPosition == -1) {
                if (prefixOutput.bytes[prefixOutput.offset + j] == payloadSep) {
                    sepPosition = j;
                }
                j++;
            }
        }
        searcher.addStartPaths(prefixPath.fstNode, prefixPath.output, false, prefixPath.input, scorer.weight.boost(), scorer.weight.context(), sepPosition);
    }
    // search() also returns the hits, but they are ignored here because collection
    // already happened in acceptResult. Search admissibility is not guaranteed (see the
    // comment on getMaxTopNSearcherQueueSize), so completeness is not asserted.
    searcher.search();
}
Also used : FSTUtil(org.apache.lucene.search.suggest.analyzing.FSTUtil) Util(org.apache.lucene.util.fst.Util) IOException(java.io.IOException) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) FSTUtil(org.apache.lucene.search.suggest.analyzing.FSTUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair)

Example 5 with Pair

use of org.apache.lucene.util.fst.PairOutputs.Pair in project elasticsearch by elastic.

the class XAnalyzingSuggester method lookup.

/**
 * Retrieves completions for {@code key} from the suggest FST.
 *
 * <p>The key is turned into a lookup automaton which is intersected with the FST to obtain
 * the prefix start paths. When {@code exactFirst} is set, a preliminary search restricted to
 * paths that have an {@code endByte} arc looks for a completion whose surface form is the
 * same as the key; at most one such completion is placed at the head of the result list.
 * A second top-N search then fills the remainder, skipping duplicate surface forms and (in
 * the {@code exactFirst} case) any surface form equal to the key.
 *
 * @param key             the text to complete; must not contain the reserved hole or
 *                        separator characters
 * @param contexts        not referenced by this method
 * @param onlyMorePopular must be {@code false}
 * @param num             number of results wanted; asserted to be positive
 * @return the collected results (collection stops once the list holds {@code num} entries),
 *         or an empty list when no FST has been built
 * @throws IllegalArgumentException if {@code onlyMorePopular} is {@code true} or
 *                                  {@code key} contains a reserved character
 * @throws RuntimeException         wrapping any {@link IOException} raised while searching
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null) {
        return Collections.emptyList();
    }
    // Reserved characters may not appear anywhere in the key.
    for (int pos = 0; pos < key.length(); pos++) {
        final char ch = key.charAt(pos);
        if (ch == holeCharacter) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (ch == sepLabel) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        final Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        final BytesReader bytesReader = fst.getBytesReader();
        final FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> suggestions = new ArrayList<>();

        // Intersect the lookup automaton with the suggest FST to get every
        // prefix start node together with its accumulated output.
        final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

        if (exactFirst) {
            // A start node with an endByte arc leaving it is an "exact" match; count them
            // so the exact-match searcher can be sized.
            int exactPathCount = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> candidate : prefixPaths) {
                if (fst.findTargetArc(endByte, candidate.fstNode, scratchArc, bytesReader) != null) {
                    exactPathCount++;
                }
            }

            // This searcher exists only to locate the single exact match, if there is one.
            final int exactQueueSize = exactPathCount * maxSurfaceFormsPerAnalyzedForm;
            final Util.TopNSearcher<Pair<Long, BytesRef>> exactSearcher = new Util.TopNSearcher<>(fst, exactQueueSize, exactQueueSize, weightComparator);
            for (FSTUtil.Path<Pair<Long, BytesRef>> candidate : prefixPaths) {
                if (fst.findTargetArc(endByte, candidate.fstNode, scratchArc, bytesReader) == null) {
                    continue;
                }
                // scratchArc now holds the endByte arc; start from it with the combined output.
                exactSearcher.addStartPaths(scratchArc, fst.outputs.add(candidate.output, scratchArc.output), false, candidate.input);
            }

            // Linear scan over the exact-analyzed-form completions for the one whose
            // surface form equals the key; only the first such hit is kept.
            final Util.TopResults<Pair<Long, BytesRef>> exactCompletions = exactSearcher.search();
            for (Result<Pair<Long, BytesRef>> exact : exactCompletions) {
                final BytesRef surface = exact.output.output2;
                if (!sameSurfaceForm(utf8Key, surface)) {
                    continue;
                }
                suggestions.add(getLookupResult(exact.output.output1, surface, spare));
                break;
            }

            if (suggestions.size() == num) {
                // The exact match alone satisfies the request.
                return suggestions;
            }
        }

        final Util.TopNSearcher<Pair<Long, BytesRef>> topSearcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - suggestions.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Different analyzed paths can yield the same surface form; keep only the first.
                if (!seen.add(output.output2)) {
                    return false;
                }
                // With exactFirst the key's own surface form was already handled by the
                // first search, so it must not be reported a second time.
                if (exactFirst && sameSurfaceForm(utf8Key, output.output2)) {
                    assert suggestions.size() == 1;
                    return false;
                }
                return true;
            }
        };

        final List<FSTUtil.Path<Pair<Long, BytesRef>>> fullPrefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> start : fullPrefixPaths) {
            topSearcher.addStartPaths(start.fstNode, start.output, true, start.input);
        }

        final TopResults<Pair<Long, BytesRef>> completions = topSearcher.search();
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            suggestions.add(getLookupResult(completion.output.output1, completion.output.output2, spare));
            if (suggestions.size() == num) {
                // The search may have produced one path more than requested; stop here.
                break;
            }
        }
        return suggestions;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Aggregations

BytesRef (org.apache.lucene.util.BytesRef)7 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)7 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)5 HashSet (java.util.HashSet)4 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)4 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)4 IntsRef (org.apache.lucene.util.IntsRef)4 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)3 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)3 Util (org.apache.lucene.util.fst.Util)3 ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput)2 IndexOutput (org.apache.lucene.store.IndexOutput)2 ArrayUtil (org.apache.lucene.util.ArrayUtil)2 OfflineSorter (org.apache.lucene.util.OfflineSorter)2 TestUtil (org.apache.lucene.util.TestUtil)2 Automaton (org.apache.lucene.util.automaton.Automaton)2 LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator)2