Search in sources :

Example 26 with CharsRefBuilder

use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

the class TestSuggestField method testReservedChars.

/**
 * Verifies that {@code SuggestField} rejects a value containing any of the
 * reserved characters, and that the exception message names the offending
 * character in hexadecimal form.
 */
@Test
public void testReservedChars() throws Exception {
    // Base value; the character at index 2 is overwritten with each reserved character in turn.
    final CharsRefBuilder value = new CharsRefBuilder();
    value.append("sugg");

    // Reserved characters and the fragment each one must produce in the error message.
    final char[] reservedChars = {
        (char) CompletionAnalyzer.SEP_LABEL,
        (char) CompletionAnalyzer.HOLE_CHARACTER,
        (char) NRTSuggesterBuilder.END_BYTE
    };
    final String[] messageFragments = { "[0x1f]", "[0x1e]", "[0x0]" };

    for (int i = 0; i < reservedChars.length; i++) {
        value.setCharAt(2, reservedChars[i]);
        final IllegalArgumentException failure = expectThrows(IllegalArgumentException.class,
            () -> new SuggestField("name", value.toString(), 1));
        assertTrue(failure.getMessage().contains(messageFragments[i]));
    }
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Test(org.junit.Test)

Example 27 with CharsRefBuilder

use of org.apache.lucene.util.CharsRefBuilder in project elasticsearch by elastic.

the class TermVectorsResponse method toXContent.

/**
 * Serializes this response into {@code builder} as a single object.
 *
 * <p>The index, type, version, found flag and took time are always written;
 * the id is written only when the response is not artificial. When the
 * document exists, a nested term-vectors object is written with one entry
 * per field produced by {@code buildField}.
 *
 * @param builder the builder to write into
 * @param params  serialization parameters (not read by this method)
 * @return the same {@code builder} instance that was passed in
 * @throws IOException if writing to the builder fails
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    assert index != null;
    assert type != null;
    assert id != null;

    builder.startObject();

    // Document coordinates and response metadata.
    builder.field(FieldStrings._INDEX, index);
    builder.field(FieldStrings._TYPE, type);
    if (!isArtificial()) {
        builder.field(FieldStrings._ID, id);
    }
    builder.field(FieldStrings._VERSION, docVersion);
    builder.field(FieldStrings.FOUND, isExists());
    builder.field(FieldStrings.TOOK, tookInMillis);

    // Term vectors are only emitted for documents that exist.
    if (isExists()) {
        builder.startObject(FieldStrings.TERM_VECTORS);
        // One scratch buffer is shared across all fields.
        final CharsRefBuilder scratch = new CharsRefBuilder();
        final Fields fields = getFields();
        // The iterator is handed to buildField, which is expected to advance it.
        for (Iterator<String> names = fields.iterator(); names.hasNext(); ) {
            buildField(builder, scratch, fields, names);
        }
        builder.endObject();
    }

    builder.endObject();
    return builder;
}
Also used : Fields(org.apache.lucene.index.Fields) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder)

Example 28 with CharsRefBuilder

use of org.apache.lucene.util.CharsRefBuilder in project elasticsearch by elastic.

the class XAnalyzingSuggester method lookup.

/**
 * Looks up at most {@code num} suggestions for {@code key} in the suggester's FST.
 *
 * <p>The key is turned into a lookup automaton which is intersected with the FST
 * to find the prefix paths. When {@code exactFirst} is set, a first search looks
 * for a completion whose surface form equals the key and, if found, places it at
 * the head of the result list. A second search then collects the remaining
 * completions, skipping surface forms it has already seen.
 *
 * @param key the text to complete; must not contain the reserved hole or separator character
 * @param contexts not read by this method
 * @param onlyMorePopular must be {@code false}
 * @param num maximum number of results to return; asserted to be positive
 * @return the collected results, or an empty list when no FST has been built
 * @throws IllegalArgumentException if {@code onlyMorePopular} is {@code true} or
 *         {@code key} contains a reserved character
 * @throws RuntimeException wrapping any {@link IOException} raised during the lookup
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    // Nothing has been built yet, so there is nothing to suggest.
    if (fst == null) {
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    // Reject keys containing either of the two reserved characters.
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == holeCharacter) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == sepLabel) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    // UTF-8 form of the key, used below to compare against surface forms.
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        // Scratch buffer passed to getLookupResult for every result.
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            // First pass: count the prefix paths that end in an exact match,
            // so the exact-match searcher below can be sized accordingly.
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // Second pass: seed the searcher from every exact-match node.
            // NOTE(review): the original explanatory comment here appears truncated
            // in this copy; confirm why all exact nodes (not just the first) are used.
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            // Keep only the first completion whose surface form equals the key;
            // at most one exact result is added.
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        // Main search for the remaining (num - results.size()) completions.
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            // Surface forms already accepted by this searcher.
            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Different paths can yield the same surface form;
                // accept each surface form only once:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // Reject the exact match here, since accepting it again
                    // would create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // Stop as soon as num results have been collected, even if the
                // searcher returned more completions than requested.
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        // Rethrown unchecked with the original exception preserved as the cause.
        throw new RuntimeException(bogus);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 29 with CharsRefBuilder

use of org.apache.lucene.util.CharsRefBuilder in project elasticsearch by elastic.

the class ContextMappings method toContextQuery.

/**
 * Decorates a {@link CompletionQuery} with the supplied query contexts.
 *
 * <p>Every context value is registered on the resulting query prefixed with a
 * single character holding the index of its context mapping, so values from
 * different mappings do not collide.
 *
 * @param query base completion query to wrap
 * @param queryContexts collected query contexts, keyed by context mapping name
 * @return a {@link ContextQuery} around {@code query}; no contexts are added
 *         when {@code queryContexts} is empty
 */
public ContextQuery toContextQuery(CompletionQuery query, Map<String, List<ContextMapping.InternalQueryContext>> queryContexts) {
    final ContextQuery wrapped = new ContextQuery(query);
    if (queryContexts.isEmpty()) {
        return wrapped;
    }

    // Reusable buffer: slot 0 carries the mapping index, the context value follows.
    final CharsRefBuilder buffer = new CharsRefBuilder();
    buffer.grow(1);
    for (int typeId = 0; typeId < contextMappings.size(); typeId++) {
        buffer.setCharAt(0, (char) typeId);
        buffer.setLength(1);

        final List<ContextMapping.InternalQueryContext> contextsForMapping =
            queryContexts.get(contextMappings.get(typeId).name());
        if (contextsForMapping == null) {
            // No query contexts were supplied for this mapping.
            continue;
        }
        for (ContextMapping.InternalQueryContext ctx : contextsForMapping) {
            buffer.append(ctx.context);
            wrapped.addContext(buffer.toCharsRef(), ctx.boost, ctx.isPrefix == false);
            // Drop the appended value again, keeping only the type prefix.
            buffer.setLength(1);
        }
    }
    return wrapped;
}
Also used : ContextQuery(org.apache.lucene.search.suggest.document.ContextQuery) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder)

Example 30 with CharsRefBuilder

use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

the class DirectSpellChecker method suggestSimilar.

/**
 * Suggests words similar to the given term.
 *
 * <p>Unlike {@link SpellChecker}, candidates are ranked by edit distance, so a
 * small {@code numSug} usually gives good results.
 *
 * @param term the term to spell check
 * @param numSug the maximum number of suggestions to return
 * @param ir the IndexReader whose terms are searched
 * @param suggestMode controls when suggestions are returned
 * @param accuracy minimum similarity a suggestion must reach
 * @return the suggestions, ordered according to the comparator and trimmed to
 *         at most {@code numSug} entries
 * @throws IOException if there is a low-level I/O error
 */
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    final String text = term.text();

    // Queries shorter than the configured minimum (in code points) are not checked.
    if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength) {
        return new SuggestWord[0];
    }

    if (lowerCaseTerms) {
        term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
    }

    int docfreq = ir.docFreq(term);
    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
        return new SuggestWord[0];
    }

    final int maxDoc = ir.maxDoc();

    // maxQueryFrequency is compared as an absolute count when >= 1, and in
    // every case as a fraction of maxDoc; exceeding either yields no suggestions.
    final boolean queryTooFrequent =
        (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
            || docfreq > (int) Math.ceil(maxQueryFrequency * (float) maxDoc);
    if (queryTooFrequent) {
        return new SuggestWord[0];
    }

    // The query term's own frequency only acts as a floor in SUGGEST_MORE_POPULAR mode.
    if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) {
        docfreq = 0;
    }

    // Raise the floor to thresholdFrequency, again either absolute or relative to maxDoc.
    if (thresholdFrequency >= 1f) {
        docfreq = Math.max(docfreq, (int) thresholdFrequency);
    } else if (thresholdFrequency > 0f) {
        docfreq = Math.max(docfreq, (int) (thresholdFrequency * (float) maxDoc) - 1);
    }

    final int inspections = numSug * maxInspections;

    // try ed=1 first, in case we get lucky
    Collection<ScoreTerm> terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.size() < inspections) {
        // Not enough candidates: merge in the results of a search with the full edit budget.
        HashSet<ScoreTerm> merged = new HashSet<>();
        merged.addAll(terms);
        merged.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
        terms = merged;
    }

    // Build the response back to front, sort it, and trim it to size.
    SuggestWord[] suggestions = new SuggestWord[terms.size()];
    int slot = suggestions.length;
    for (ScoreTerm candidate : terms) {
        if (candidate.termAsString == null) {
            // Decode the term bytes lazily and cache the string on the candidate.
            spare.copyUTF8Bytes(candidate.term);
            candidate.termAsString = spare.toString();
        }
        SuggestWord word = new SuggestWord();
        word.string = candidate.termAsString;
        word.score = candidate.score;
        word.freq = candidate.docfreq;
        suggestions[--slot] = word;
    }

    ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));

    if (numSug >= suggestions.length) {
        return suggestions;
    }
    SuggestWord[] trimmed = new SuggestWord[numSug];
    System.arraycopy(suggestions, 0, trimmed, 0, numSug);
    return trimmed;
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Term(org.apache.lucene.index.Term) HashSet(java.util.HashSet)

Aggregations

CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder) 52, BytesRef (org.apache.lucene.util.BytesRef) 30, ArrayList (java.util.ArrayList) 11, IOException (java.io.IOException) 10, NamedList (org.apache.solr.common.util.NamedList) 10, FieldType (org.apache.solr.schema.FieldType) 10, TermsEnum (org.apache.lucene.index.TermsEnum) 9, SchemaField (org.apache.solr.schema.SchemaField) 7, BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder) 6, HashSet (java.util.HashSet) 5, Test (org.junit.Test) 5, TokenStream (org.apache.lucene.analysis.TokenStream) 4, PostingsEnum (org.apache.lucene.index.PostingsEnum) 4, Terms (org.apache.lucene.index.Terms) 4, SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap) 4, LeafReader (org.apache.lucene.index.LeafReader) 3, LeafReaderContext (org.apache.lucene.index.LeafReaderContext) 3, CharsRef (org.apache.lucene.util.CharsRef) 3, Util (org.apache.lucene.util.fst.Util) 3, SolrException (org.apache.solr.common.SolrException) 3