
Example 16 with CharsRef

Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class SuggestComponent, method toSuggesterResult.

/** Convert NamedList (suggester response) to {@link SuggesterResult} */
private SuggesterResult toSuggesterResult(Map<String, SimpleOrderedMap<NamedList<Object>>> suggestionsMap) {
    SuggesterResult result = new SuggesterResult();
    if (suggestionsMap == null) {
        return result;
    }
    // for each token
    for (Map.Entry<String, SimpleOrderedMap<NamedList<Object>>> entry : suggestionsMap.entrySet()) {
        String suggesterName = entry.getKey();
        for (Iterator<Map.Entry<String, NamedList<Object>>> suggestionsIter = entry.getValue().iterator(); suggestionsIter.hasNext(); ) {
            Map.Entry<String, NamedList<Object>> suggestions = suggestionsIter.next();
            String tokenString = suggestions.getKey();
            List<LookupResult> lookupResults = new ArrayList<>();
            NamedList<Object> suggestion = suggestions.getValue();
            // for each suggestion
            for (int j = 0; j < suggestion.size(); j++) {
                String property = suggestion.getName(j);
                if (property.equals(SuggesterResultLabels.SUGGESTIONS)) {
                    @SuppressWarnings("unchecked") List<NamedList<Object>> suggestionEntries = (List<NamedList<Object>>) suggestion.getVal(j);
                    for (NamedList<Object> suggestionEntry : suggestionEntries) {
                        String term = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_TERM);
                        Long weight = (Long) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_WEIGHT);
                        String payload = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_PAYLOAD);
                        LookupResult res = new LookupResult(new CharsRef(term), weight, new BytesRef(payload));
                        lookupResults.add(res);
                    }
                }
                result.add(suggesterName, tokenString, lookupResults);
            }
        }
    }
    return result;
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) CharsRef(org.apache.lucene.util.CharsRef) SuggesterResult(org.apache.solr.spelling.suggest.SuggesterResult) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) AtomicLong(java.util.concurrent.atomic.AtomicLong) LinkedList(java.util.LinkedList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) MetricsMap(org.apache.solr.metrics.MetricsMap) BytesRef(org.apache.lucene.util.BytesRef)
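
To make the conversion above concrete, here is a minimal, hypothetical sketch (not taken from the listing; the class name LookupResultSketch and all values are invented) showing how a single LookupResult is assembled from a term, weight, and payload using CharsRef and BytesRef, the same way toSuggesterResult does for each suggestion entry.

import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;

public class LookupResultSketch {
    public static void main(String[] args) {
        // A suggested term, its weight, and an opaque payload, mirroring what
        // toSuggesterResult extracts from each suggestion entry (values invented).
        LookupResult res = new LookupResult(new CharsRef("solr"), 42L, new BytesRef("payload"));
        System.out.println(res.key + " weight=" + res.value);
    }
}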

Example 17 with CharsRef

Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class Dictionary, method parseConversions.

private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
    Map<String, String> mappings = new TreeMap<>();
    for (int i = 0; i < num; i++) {
        String line = reader.readLine();
        String[] parts = line.split("\\s+");
        if (parts.length != 3) {
            throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
        }
        if (mappings.put(parts[1], parts[2]) != null) {
            throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
        }
    }
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String, String> entry : mappings.entrySet()) {
        Util.toUTF16(entry.getKey(), scratchInts);
        builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
    }
    return builder.finish();
}
Also used : Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) TreeMap(java.util.TreeMap) CharsRef(org.apache.lucene.util.CharsRef) ParseException(java.text.ParseException) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)
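
A hedged standalone sketch of the same FST-building pattern: keys are added in sorted order as UTF-16 IntsRefs, CharSequenceOutputs supplies CharsRef outputs, and an entry can be read back with Util.get. The class name ConversionFstSketch and the sample mappings are invented for illustration; they are not Hunspell ICONV rules from any real dictionary.

import java.io.IOException;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

public class ConversionFstSketch {
    public static void main(String[] args) throws IOException {
        Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
        Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        // Inputs must be added in sorted order, just like the TreeMap iteration above.
        builder.add(Util.toUTF16("ph", scratch), new CharsRef("f"));
        builder.add(Util.toUTF16("qu", scratch), new CharsRef("k"));
        FST<CharsRef> fst = builder.finish();
        // Look a conversion back up by its UTF-16 key.
        CharsRef converted = Util.get(fst, Util.toUTF16("ph", scratch));
        System.out.println(converted);  // prints "f"
    }
}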

Example 18 with CharsRef

Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class CharSequenceOutputs, method read.

@Override
public CharsRef read(DataInput in) throws IOException {
    final int len = in.readVInt();
    if (len == 0) {
        return NO_OUTPUT;
    } else {
        final CharsRef output = new CharsRef(len);
        for (int idx = 0; idx < len; idx++) {
            output.chars[idx] = (char) in.readVInt();
        }
        output.length = len;
        return output;
    }
}
Also used : CharsRef(org.apache.lucene.util.CharsRef)
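
A small, hypothetical round trip that exercises the read method shown above together with its counterpart write: the CharsRef is serialized into a byte buffer and decoded back. The class name, buffer size, and string are arbitrary choices for this sketch.

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.fst.CharSequenceOutputs;

public class CharSequenceOutputsRoundTrip {
    public static void main(String[] args) throws IOException {
        CharSequenceOutputs outputs = CharSequenceOutputs.getSingleton();
        byte[] buffer = new byte[32];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        // write stores the length as a VInt, then each char as a VInt;
        // read (shown above) reverses exactly that encoding.
        outputs.write(new CharsRef("abc"), out);
        ByteArrayDataInput in = new ByteArrayDataInput(buffer);
        CharsRef roundTripped = outputs.read(in);
        System.out.println(roundTripped);  // prints "abc"
    }
}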

Example 19 with CharsRef

Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class DaciukMihovAutomatonBuilder, method build.

/**
   * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
   * strings in UTF-8. These strings must be binary-sorted.
   */
public static Automaton build(Collection<BytesRef> input) {
    final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
    char[] chars = new char[0];
    CharsRef ref = new CharsRef();
    for (BytesRef b : input) {
        chars = ArrayUtil.grow(chars, b.length);
        final int len = UnicodeUtil.UTF8toUTF16(b, chars);
        ref.chars = chars;
        ref.length = len;
        builder.add(ref);
    }
    Automaton.Builder a = new Automaton.Builder();
    convert(a, builder.complete(), new IdentityHashMap<State, Integer>());
    return a.finish();
}
Also used : CharsRef(org.apache.lucene.util.CharsRef) BytesRef(org.apache.lucene.util.BytesRef)
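
A hypothetical call to the build method above: the input terms must already be binary-sorted, and the resulting minimal automaton can be wrapped in a CharacterRunAutomaton for membership tests. The class name MinimalAutomatonSketch and the term list are invented for illustration.

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;

public class MinimalAutomatonSketch {
    public static void main(String[] args) {
        // Terms must be binary-sorted (UTF-8 order), as the javadoc above requires.
        List<BytesRef> sortedTerms = Arrays.asList(new BytesRef("lucene"), new BytesRef("solr"));
        Automaton a = DaciukMihovAutomatonBuilder.build(sortedTerms);
        CharacterRunAutomaton run = new CharacterRunAutomaton(a);
        System.out.println(run.run("solr"));   // true: term is in the set
        System.out.println(run.run("nutch"));  // false: term is not in the set
    }
}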

Example 20 with CharsRef

Use of org.apache.lucene.util.CharsRef in project lucene-solr by apache, from the class MultiTermHighlighting, method extractAutomata.

/**
   * Extracts MultiTermQueries that match the provided field predicate.
   * Returns equivalent automata that will match terms.
   */
public static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan, Function<Query, Collection<Query>> preRewriteFunc) {
    // TODO Lucene needs a Query visitor API!  LUCENE-3041
    List<CharacterRunAutomaton> list = new ArrayList<>();
    Collection<Query> customSubQueries = preRewriteFunc.apply(query);
    if (customSubQueries != null) {
        for (Query sub : customSubQueries) {
            list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
        }
    } else if (query instanceof BooleanQuery) {
        for (BooleanClause clause : (BooleanQuery) query) {
            if (!clause.isProhibited()) {
                list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
            }
        }
    } else if (query instanceof ConstantScoreQuery) {
        list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (query instanceof BoostQuery) {
        list.addAll(Arrays.asList(extractAutomata(((BoostQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (query instanceof DisjunctionMaxQuery) {
        for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
            list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
        }
    } else if (lookInSpan && query instanceof SpanOrQuery) {
        for (Query sub : ((SpanOrQuery) query).getClauses()) {
            list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
        }
    } else if (lookInSpan && query instanceof SpanNearQuery) {
        for (Query sub : ((SpanNearQuery) query).getClauses()) {
            list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
        }
    } else if (lookInSpan && query instanceof SpanNotQuery) {
        list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (lookInSpan && query instanceof SpanPositionCheckQuery) {
        list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (lookInSpan && query instanceof SpanBoostQuery) {
        list.addAll(Arrays.asList(extractAutomata(((SpanBoostQuery) query).getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
        list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (query instanceof PrefixQuery) {
        final PrefixQuery pq = (PrefixQuery) query;
        Term prefix = pq.getPrefix();
        if (fieldMatcher.test(prefix.field())) {
            list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), Automata.makeAnyString())) {

                @Override
                public String toString() {
                    return pq.toString();
                }
            });
        }
    } else if (query instanceof FuzzyQuery) {
        final FuzzyQuery fq = (FuzzyQuery) query;
        if (fieldMatcher.test(fq.getField())) {
            String utf16 = fq.getTerm().text();
            int[] termText = new int[utf16.codePointCount(0, utf16.length())];
            for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
                termText[j++] = cp = utf16.codePointAt(i);
            }
            int termLength = termText.length;
            int prefixLength = Math.min(fq.getPrefixLength(), termLength);
            String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
            LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
            String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
            Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
            list.add(new CharacterRunAutomaton(automaton) {

                @Override
                public String toString() {
                    return fq.toString();
                }
            });
        }
    } else if (query instanceof TermRangeQuery) {
        final TermRangeQuery tq = (TermRangeQuery) query;
        if (fieldMatcher.test(tq.getField())) {
            final CharsRef lowerBound;
            if (tq.getLowerTerm() == null) {
                lowerBound = null;
            } else {
                lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
            }
            final CharsRef upperBound;
            if (tq.getUpperTerm() == null) {
                upperBound = null;
            } else {
                upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
            }
            final boolean includeLower = tq.includesLower();
            final boolean includeUpper = tq.includesUpper();
            final CharsRef scratch = new CharsRef();
            @SuppressWarnings("deprecation") final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
            // this is *not* an automaton, but it's very simple
            list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {

                @Override
                public boolean run(char[] s, int offset, int length) {
                    scratch.chars = s;
                    scratch.offset = offset;
                    scratch.length = length;
                    if (lowerBound != null) {
                        int cmp = comparator.compare(scratch, lowerBound);
                        if (cmp < 0 || (!includeLower && cmp == 0)) {
                            return false;
                        }
                    }
                    if (upperBound != null) {
                        int cmp = comparator.compare(scratch, upperBound);
                        if (cmp > 0 || (!includeUpper && cmp == 0)) {
                            return false;
                        }
                    }
                    return true;
                }

                @Override
                public String toString() {
                    return tq.toString();
                }
            });
        }
    } else if (query instanceof AutomatonQuery) {
        final AutomatonQuery aq = (AutomatonQuery) query;
        if (fieldMatcher.test(aq.getField())) {
            list.add(new CharacterRunAutomaton(aq.getAutomaton()) {

                @Override
                public String toString() {
                    return aq.toString();
                }
            });
        }
    }
    return list.toArray(new CharacterRunAutomaton[list.size()]);
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) AutomatonQuery(org.apache.lucene.search.AutomatonQuery) SpanPositionCheckQuery(org.apache.lucene.search.spans.SpanPositionCheckQuery) SpanBoostQuery(org.apache.lucene.search.spans.SpanBoostQuery) SpanNotQuery(org.apache.lucene.search.spans.SpanNotQuery) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) PrefixQuery(org.apache.lucene.search.PrefixQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) BoostQuery(org.apache.lucene.search.BoostQuery) TermRangeQuery(org.apache.lucene.search.TermRangeQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) ArrayList(java.util.ArrayList) Comparator(java.util.Comparator) SpanMultiTermQueryWrapper(org.apache.lucene.search.spans.SpanMultiTermQueryWrapper) Automaton(org.apache.lucene.util.automaton.Automaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) Term(org.apache.lucene.index.Term) CharsRef(org.apache.lucene.util.CharsRef) BooleanClause(org.apache.lucene.search.BooleanClause)
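
As a hedged illustration of the PrefixQuery branch above, this standalone snippet builds the equivalent prefix automaton directly with Automata and Operations and runs it over candidate terms. The class name PrefixAutomatonSketch, the prefix, and the terms are invented; MultiTermHighlighting itself is not called here.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public class PrefixAutomatonSketch {
    public static void main(String[] args) {
        // prefix followed by "any string" = all terms starting with the prefix,
        // the same construction used for PrefixQuery in extractAutomata above.
        Automaton prefixAutomaton = Operations.concatenate(
                Automata.makeString("luc"), Automata.makeAnyString());
        CharacterRunAutomaton run = new CharacterRunAutomaton(prefixAutomaton);
        System.out.println(run.run("lucene"));  // true
        System.out.println(run.run("solr"));    // false
    }
}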

Aggregations

CharsRef (org.apache.lucene.util.CharsRef): 27
BytesRef (org.apache.lucene.util.BytesRef): 8
ArrayList (java.util.ArrayList): 6
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 6
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 6
StringReader (java.io.StringReader): 4
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 4
SynonymFilter (org.apache.lucene.analysis.synonym.SynonymFilter): 4
SynonymMap (org.apache.lucene.analysis.synonym.SynonymMap): 4
HashMap (java.util.HashMap): 3
TokenStream (org.apache.lucene.analysis.TokenStream): 3
IntsRef (org.apache.lucene.util.IntsRef): 3
Test (org.junit.Test): 3
ParseException (java.text.ParseException): 2
HashSet (java.util.HashSet): 2
Map (java.util.Map): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2
Analyzer (org.apache.lucene.analysis.Analyzer): 2
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2
Tokenizer (org.apache.lucene.analysis.Tokenizer): 2