Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
Class TestSuggestField, method testReservedChars.
/**
 * Checks that {@link SuggestField} refuses values containing any of the reserved
 * characters, and that the exception message names the offending code point.
 *
 * <p>Each reserved character is written into position 2 of the value "sugg" in turn:
 * the separator label, the hole character and the end byte.
 */
@Test
public void testReservedChars() throws Exception {
    final CharsRefBuilder suggestion = new CharsRefBuilder();
    suggestion.append("sugg");

    // Reserved characters paired (by index) with the fragment the error message must contain.
    final char[] reservedChars = {
        (char) CompletionAnalyzer.SEP_LABEL,
        (char) CompletionAnalyzer.HOLE_CHARACTER,
        (char) NRTSuggesterBuilder.END_BYTE
    };
    final String[] expectedFragments = {"[0x1f]", "[0x1e]", "[0x0]"};

    for (int i = 0; i < reservedChars.length; i++) {
        suggestion.setCharAt(2, reservedChars[i]);
        final IllegalArgumentException thrown = expectThrows(IllegalArgumentException.class, () -> {
            new SuggestField("name", suggestion.toString(), 1);
        });
        final String message = thrown.getMessage();
        assertTrue(message.contains(expectedFragments[i]));
    }
}
Use of org.apache.lucene.util.CharsRefBuilder in project elasticsearch by elastic.
Class TermVectorsResponse, method toXContent.
/**
 * Serializes this response into the given builder as a single object.
 *
 * <p>The index, type, version, found flag and took time are always written; the id is
 * skipped when {@code isArtificial()} is true. When {@code isExists()} is true, a nested
 * term-vectors object is written with one {@code buildField} call per loop iteration.
 *
 * @param builder the builder to write into
 * @param params  serialization parameters (not read by this method itself)
 * @return the same {@code builder} instance that was passed in
 * @throws IOException if writing to the builder or reading the fields fails
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    assert index != null;
    assert type != null;
    assert id != null;

    builder.startObject();
    builder.field(FieldStrings._INDEX, index);
    builder.field(FieldStrings._TYPE, type);
    if (isArtificial() == false) {
        builder.field(FieldStrings._ID, id);
    }
    builder.field(FieldStrings._VERSION, docVersion);
    builder.field(FieldStrings.FOUND, isExists());
    builder.field(FieldStrings.TOOK, tookInMillis);

    if (isExists()) {
        builder.startObject(FieldStrings.TERM_VECTORS);
        final Fields fields = getFields();
        final CharsRefBuilder scratch = new CharsRefBuilder();
        // The iterator itself is handed to buildField; presumably buildField advances it.
        for (Iterator<String> names = fields.iterator(); names.hasNext(); ) {
            buildField(builder, scratch, fields, names);
        }
        builder.endObject();
    }

    builder.endObject();
    return builder;
}
Use of org.apache.lucene.util.CharsRefBuilder in project elasticsearch by elastic.
Class XAnalyzingSuggester, method lookup.
/**
 * Looks up at most {@code num} suggestions for {@code key} in the suggester's FST.
 *
 * <p>When {@code exactFirst} is set, a first search looks for a completion whose surface
 * form equals the key and, if found, puts it at the head of the result list. A second
 * search then collects the remaining completions, skipping duplicate surface forms.
 *
 * @param key             the text to complete; must not contain the reserved hole or
 *                        separator characters
 * @param contexts        not read by this method
 * @param onlyMorePopular must be {@code false}
 * @param num             maximum number of results; asserted to be positive
 * @return the collected results, or an empty list when no FST has been built
 * @throws IllegalArgumentException if {@code onlyMorePopular} is {@code true} or the key
 *                                  contains a reserved character
 * @throws RuntimeException         wrapping any {@link IOException} raised during the search
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
// No FST built: nothing to suggest.
if (fst == null) {
return Collections.emptyList();
}
//System.out.println("lookup key=" + key + " num=" + num);
// Reject keys that contain either reserved character.
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == holeCharacter) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == sepLabel) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
// Bytes of the raw key, used below to recognize an exact surface-form match.
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
// Scratch buffer passed to every getLookupResult call.
final CharsRefBuilder spare = new CharsRefBuilder();
//System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
final List<LookupResult> results = new ArrayList<>();
List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
// Count the prefix paths that have an END_BYTE arc; the count sizes the searcher below.
int count = 0;
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// Seed the searcher from every prefix path that has an END_BYTE arc,
// not just the first one:
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
}
}
Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
// Linear scan for the completion whose surface form matches the key; only the
// first match is kept. The searcher above was asked for at most
// count * maxSurfaceFormsPerAnalyzedForm results:
for (Result<Pair<Long, BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
// Second search: asks for the results still missing (num minus whatever the exact pass found).
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
// Surface forms (output2) seen so far by acceptResult.
private final Set<BytesRef> seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
// Dedup: the search can produce duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// In exactFirst mode, reject a result whose surface form matches the key,
// since accepting it would create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// The exact-match pass above is expected to
// have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
for (Result<Pair<Long, BytesRef>> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
//System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// Stop once num results are collected. NOTE(review): presumably in the
// exactFirst case the search may produce one extra path - confirm.
break;
}
}
return results;
} catch (IOException bogus) {
// Rethrown unchecked, with the IOException kept as the cause.
throw new RuntimeException(bogus);
}
}
Use of org.apache.lucene.util.CharsRefBuilder in project elasticsearch by elastic.
Class ContextMappings, method toContextQuery.
/**
 * Builds a {@link ContextQuery} around a {@link CompletionQuery} and registers the
 * supplied query contexts on it.
 *
 * <p>Every context value is added with a one-character prefix: the index of its context
 * mapping cast to {@code char}. Mappings without an entry in {@code queryContexts} add
 * nothing.
 *
 * @param query base completion query to wrap
 * @param queryContexts query contexts collected per context mapping name
 * @return the wrapping query, with no contexts added when {@code queryContexts} is empty
 */
public ContextQuery toContextQuery(CompletionQuery query, Map<String, List<ContextMapping.InternalQueryContext>> queryContexts) {
    final ContextQuery wrapped = new ContextQuery(query);
    if (queryContexts.isEmpty()) {
        return wrapped;
    }

    // One reusable buffer: slot 0 holds the type id, the context value follows it.
    final CharsRefBuilder buffer = new CharsRefBuilder();
    buffer.grow(1);
    final int mappingCount = contextMappings.size();
    for (int typeId = 0; typeId < mappingCount; typeId++) {
        buffer.setCharAt(0, (char) typeId);
        buffer.setLength(1);
        final ContextMapping mapping = contextMappings.get(typeId);
        final List<ContextMapping.InternalQueryContext> contextsForType = queryContexts.get(mapping.name());
        if (contextsForType == null) {
            continue;
        }
        for (ContextMapping.InternalQueryContext queryContext : contextsForType) {
            buffer.append(queryContext.context);
            wrapped.addContext(buffer.toCharsRef(), queryContext.boost, !queryContext.isPrefix);
            // Drop the appended value again, keeping only the type-id prefix.
            buffer.setLength(1);
        }
    }
    return wrapped;
}
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
Class DirectSpellChecker, method suggestSimilar.
/**
 * Suggests words similar to the given term.
 *
 * <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
 * relevant terms is an edit distance, so a low value for numSug
 * typically works very well.
 *
 * <p>An empty array is returned without searching when the term is shorter than
 * {@code minQueryLength} code points, when it already occurs in the index and
 * {@code suggestMode} is {@code SUGGEST_WHEN_NOT_IN_INDEX}, or when its document
 * frequency exceeds the {@code maxQueryFrequency} limit.
 *
 * @param term Term you want to spell check on
 * @param numSug the maximum number of suggested words
 * @param ir IndexReader to find terms from
 * @param suggestMode specifies when to return suggested words
 * @param accuracy return only suggested words that match with this similarity
 * @return sorted list of the suggested words according to the comparator
 * @throws IOException If there is a low-level I/O error.
 */
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy) throws IOException {
    final CharsRefBuilder scratch = new CharsRefBuilder();
    final String queryText = term.text();
    if (minQueryLength > 0 && queryText.codePointCount(0, queryText.length()) < minQueryLength) {
        return new SuggestWord[0];
    }

    if (lowerCaseTerms) {
        term = new Term(term.field(), queryText.toLowerCase(Locale.ROOT));
    }

    int freqFloor = ir.docFreq(term);
    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freqFloor > 0) {
        return new SuggestWord[0];
    }

    final int maxDoc = ir.maxDoc();
    // The limit is checked as an absolute count when it is >= 1, and in either case
    // against maxQueryFrequency * maxDoc rounded up.
    final boolean queryTooFrequent = (maxQueryFrequency >= 1f && freqFloor > maxQueryFrequency)
        || freqFloor > (int) Math.ceil(maxQueryFrequency * (float) maxDoc);
    if (queryTooFrequent) {
        return new SuggestWord[0];
    }

    // Only SUGGEST_MORE_POPULAR keeps the query term's own frequency as the floor.
    if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) {
        freqFloor = 0;
    }

    if (thresholdFrequency >= 1f) {
        freqFloor = Math.max(freqFloor, (int) thresholdFrequency);
    } else if (thresholdFrequency > 0f) {
        freqFloor = Math.max(freqFloor, (int) (thresholdFrequency * (float) maxDoc) - 1);
    }

    final int inspections = numSug * maxInspections;
    // Start with edit distance 1; widen to maxEdits only if that did not fill the quota.
    Collection<ScoreTerm> candidates = suggestSimilar(term, inspections, ir, freqFloor, 1, accuracy, scratch);
    if (maxEdits > 1 && candidates.size() < inspections) {
        HashSet<ScoreTerm> combined = new HashSet<>();
        combined.addAll(candidates);
        combined.addAll(suggestSimilar(term, inspections, ir, freqFloor, maxEdits, accuracy, scratch));
        candidates = combined;
    }

    // Build the response array (filled back to front), sort it, then trim to numSug.
    SuggestWord[] suggestions = new SuggestWord[candidates.size()];
    int slot = suggestions.length;
    for (ScoreTerm candidate : candidates) {
        if (candidate.termAsString == null) {
            scratch.copyUTF8Bytes(candidate.term);
            candidate.termAsString = scratch.toString();
        }
        final SuggestWord word = new SuggestWord();
        word.string = candidate.termAsString;
        word.score = candidate.score;
        word.freq = candidate.docfreq;
        suggestions[--slot] = word;
    }

    ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));
    if (numSug >= suggestions.length) {
        return suggestions;
    }
    final SuggestWord[] trimmed = new SuggestWord[numSug];
    System.arraycopy(suggestions, 0, trimmed, 0, numSug);
    return trimmed;
}
Aggregations