use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class AnalysisRequestHandlerBase method convertTokensToNamedLists.
/**
 * Converts the list of Tokens to a list of NamedLists representing the tokens.
 * <p>
 * The tokens are first sorted by the position arrays exposed through their
 * {@link TokenTrackingAttribute} (element by element; a shorter array that is a prefix of a longer
 * one sorts first). For every token a {@link SimpleOrderedMap} is then built containing:
 * <ul>
 *   <li>{@code text} - the term bytes converted with {@code FieldType.indexedToReadable}</li>
 *   <li>{@code raw_text} - the {@link CharTermAttribute} text, only when it differs from {@code text}</li>
 *   <li>{@code raw_bytes} - the string form of the term {@link BytesRef}</li>
 *   <li>{@code match} - {@code true}, only when the context's terms-to-match contain the term bytes</li>
 *   <li>every other reflected attribute value, excluding the term and position-increment attributes</li>
 * </ul>
 *
 * @param tokenList Tokens to convert
 * @param context   The analysis context
 *
 * @return List of NamedLists containing the relevant information taken from the tokens
 */
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
  final List<NamedList> tokensNamedLists = new ArrayList<>();
  final FieldType fieldType = context.getFieldType();
  final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);

  // sort the tokens by absolute position
  ArrayUtil.timSort(tokens, new Comparator<AttributeSource>() {
    @Override
    public int compare(AttributeSource a, AttributeSource b) {
      return arrayCompare(
          a.getAttribute(TokenTrackingAttribute.class).getPositions(),
          b.getAttribute(TokenTrackingAttribute.class).getPositions());
    }

    private int arrayCompare(int[] a, int[] b) {
      final int stop = Math.min(a.length, b.length);
      for (int p = 0; p < stop; p++) {
        // Integer.compare rather than subtraction: a difference of two ints can overflow
        // and flip sign, which would break the comparator contract.
        final int cmp = Integer.compare(a[p], b[p]);
        if (cmp != 0) {
          return cmp;
        }
      }
      // One is a prefix of the other, or, they are equal:
      return Integer.compare(a.length, b.length);
    }
  });

  for (final AttributeSource token : tokens) {
    final NamedList<Object> tokenNamedList = new SimpleOrderedMap<>();
    final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
    final BytesRef rawBytes = termAtt.getBytesRef();
    final String text = fieldType.indexedToReadable(rawBytes, new CharsRefBuilder()).toString();
    tokenNamedList.add("text", text);

    // Only report the raw char term when it adds information over the readable text.
    if (token.hasAttribute(CharTermAttribute.class)) {
      final String rawText = token.getAttribute(CharTermAttribute.class).toString();
      if (!rawText.equals(text)) {
        tokenNamedList.add("raw_text", rawText);
      }
    }

    tokenNamedList.add("raw_bytes", rawBytes.toString());

    if (context.getTermsToMatch().contains(rawBytes)) {
      tokenNamedList.add("match", true);
    }

    token.reflectWith(new AttributeReflector() {
      @Override
      public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
        // leave out position and bytes term
        if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)
            || CharTermAttribute.class.isAssignableFrom(attClass)
            || PositionIncrementAttribute.class.isAssignableFrom(attClass)) {
          return;
        }

        String k = attClass.getName() + '#' + key;

        // map keys for "standard attributes":
        if (ATTRIBUTE_MAPPING.containsKey(k)) {
          k = ATTRIBUTE_MAPPING.get(k);
        }

        // BytesRef values are reported through their string form.
        if (value instanceof BytesRef) {
          value = ((BytesRef) value).toString();
        }

        tokenNamedList.add(k, value);
      }
    });

    tokensNamedLists.add(tokenNamedList);
  }

  return tokensNamedLists;
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class SimpleFacets method getFacetTermEnumCounts.
/**
* Returns a list of terms in the specified field along with the
* corresponding count of documents in the set that match that constraint.
* This method uses the FilterCache to get the intersection count between <code>docs</code>
* and the DocSet for each term in the filter.
* <p>
* Terms whose docFreq is below the per-field <code>FacetParams.FACET_ENUM_CACHE_MINDF</code> value
* are counted by iterating their postings directly and probing <code>docs</code>, instead of going
* through the searcher.
*
* @param searcher supplies the schema, the leaf reader whose terms are enumerated, and the
*                 <code>numDocs</code>/<code>intersects</code> calls
* @param docs the base document set the counts are computed against
* @param field the field whose terms are enumerated
* @param offset number of qualifying terms to skip before results are emitted
* @param limit maximum number of terms to emit; a negative value means no limit
* @param mincount minimum count a term needs in order to be emitted
* @param missing if true, a final entry with a <code>null</code> key is appended holding the result of
*                <code>getFieldMissingCount</code>
* @param sort <code>"count"</code> or <code>"true"</code> orders results through a bounded queue of
*             term/count pairs; any other value emits terms in enumeration order
* @param prefix if non-null, only terms starting with the internal form of this prefix are considered
* @param termFilter optional predicate on the term bytes; <code>null</code> accepts every term
* @param intersectsCheck if true, each count is only 0 or 1 (whether the term intersects
*                        <code>docs</code> at all) and counting stops at the first hit
* @return the readable term values mapped to their counts
* @throws IOException if reading terms or postings from the index fails
*
* @see FacetParams#FACET_LIMIT
* @see FacetParams#FACET_ZEROS
* @see FacetParams#FACET_MISSING
*/
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck) throws IOException {
/* :TODO: potential optimization...
* cache the Terms with the highest docFreq and try them first
* don't enum if we get our max from them
*/
// Minimum term docFreq in order to use the filterCache for that term.
int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
// make sure we have a set that is fast for random access, if we will use it for that
DocSet fastForRandomSet = docs;
if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
SortedIntDocSet sset = (SortedIntDocSet) docs;
fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
}
IndexSchema schema = searcher.getSchema();
FieldType ft = schema.getFieldType(field);
assert !ft.isPointField() : "Point Fields don't support enum method";
LeafReader r = searcher.getSlowAtomicReader();
boolean sortByCount = sort.equals("count") || sort.equals("true");
// Capacity of the count-sorted queue: enough entries to cover the skipped offset plus the limit.
// NOTE(review): the reason for MAX_VALUE - 1 (rather than MAX_VALUE) is not visible here.
final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
// Only allocated when sorting by count; the index-order path writes straight into res.
final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
final NamedList<Integer> res = new NamedList<>();
// the smallest value in the top 'N' values
// (starts at mincount - 1 so that "> min" initially means ">= mincount")
int min = mincount - 1;
// Remaining number of results to skip / to emit; both are counted down as results are produced.
int off = offset;
int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
// The prefix is converted to its internal (indexed) form before being compared with term bytes.
BytesRef prefixTermBytes = null;
if (prefix != null) {
String indexedPrefix = ft.toInternal(prefix);
prefixTermBytes = new BytesRef(indexedPrefix);
}
Fields fields = r.fields();
Terms terms = fields == null ? null : fields.terms(field);
TermsEnum termsEnum = null;
SolrIndexSearcher.DocsEnumState deState = null;
// Current term; stays null when the field has no terms or the prefix seek ran off the end,
// in which case the enumeration loop below is never entered.
BytesRef term = null;
if (terms != null) {
termsEnum = terms.iterator();
if (prefixTermBytes != null) {
// position termsEnum on the first term at or after the prefix
if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
termsEnum = null;
} else {
term = termsEnum.term();
}
} else {
// position termsEnum on first term
term = termsEnum.next();
}
}
PostingsEnum postingsEnum = null;
// Scratch buffer reused for every indexed-to-readable conversion below.
CharsRefBuilder charsRef = new CharsRefBuilder();
// Terms are only enumerated when the base set holds at least mincount documents.
if (docs.size() >= mincount) {
while (term != null) {
// Stop at the first term that no longer carries the requested prefix.
if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
break;
if (termFilter == null || termFilter.test(term)) {
int df = termsEnum.docFreq();
// Only terms with df > min are counted at all. When sorting by count, min rises to the
// smallest count in the full queue, so low-df terms are skipped; this can
// make a large difference (for example, many terms with df=1).
if (df > 0 && df > min) {
int c;
if (df >= minDfFilterCache) {
// Frequent enough: let the searcher compute the intersection.
// deState is created once on first use and then reused for later terms.
if (deState == null) {
deState = new SolrIndexSearcher.DocsEnumState();
deState.fieldName = field;
deState.liveDocs = r.getLiveDocs();
deState.termsEnum = termsEnum;
deState.postingsEnum = postingsEnum;
}
if (intersectsCheck) {
c = searcher.intersects(docs, deState) ? 1 : 0;
} else {
c = searcher.numDocs(docs, deState);
}
// Read the enum back so the direct-iteration branch can reuse it.
postingsEnum = deState.postingsEnum;
} else {
// iterate over TermDocs to calculate the intersection
// TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
// TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
// TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
c = 0;
if (postingsEnum instanceof MultiPostingsEnum) {
// Walk each sub-enum separately; its doc ids are offset by the slice start before
// being looked up in the base set.
MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
SEGMENTS_LOOP: for (int subindex = 0; subindex < numSubs; subindex++) {
MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
if (sub.postingsEnum == null)
continue;
int base = sub.slice.start;
int docid;
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid + base)) {
c++;
// For an intersects check a single hit is enough: leave all sub-enums at once.
if (intersectsCheck) {
assert c == 1;
break SEGMENTS_LOOP;
}
}
}
}
} else {
int docid;
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid)) {
c++;
// For an intersects check a single hit is enough.
if (intersectsCheck) {
assert c == 1;
break;
}
}
}
}
}
if (sortByCount) {
if (c > min) {
// The term is deep-copied before being stored in the queue.
// NOTE(review): presumably because termsEnum reuses the BytesRef it returns - confirm.
BytesRef termCopy = BytesRef.deepCopyOf(term);
queue.add(new CountPair<>(termCopy, c));
// Once the queue is full, raise the threshold to its current last (smallest) count.
if (queue.size() >= maxsize)
min = queue.last().val;
}
} else {
// Enumeration order: skip the first `offset` qualifying terms, then emit up to `limit`.
if (c >= mincount && --off < 0) {
if (--lim < 0)
break;
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
}
}
}
term = termsEnum.next();
}
}
if (sortByCount) {
// Drain the queue in its iteration order, applying offset and limit here.
for (CountPair<BytesRef, Integer> p : queue) {
if (--off >= 0)
continue;
if (--lim < 0)
break;
ft.indexedToReadable(p.key, charsRef);
res.add(charsRef.toString(), p.val);
}
}
if (missing) {
// The missing count is reported under a null key.
res.add(null, getFieldMissingCount(searcher, docs, field));
}
return res;
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class FieldOffsetStrategy method createAutomataOffsetsFromTerms.
/**
 * Creates the {@link OffsetsEnum}s for the automata of this strategy by running every term of
 * {@code termsIndex} through each automaton.
 * <p>
 * For each automaton, the postings (with offsets) of all matching terms that can be advanced
 * exactly to {@code doc} are gathered. An automaton with a single such postings enum yields an
 * {@link OffsetsEnum} wrapping it directly; several are combined through a
 * {@link CompositeOffsetsPostingsEnum}. Automata without any match contribute nothing.
 *
 * @param termsIndex the terms to test against the automata
 * @param doc        the document id the postings must be positioned on
 * @return one {@link OffsetsEnum} per automaton that matched at least one term in {@code doc};
 *         never more than {@code automata.length} entries
 * @throws IOException if reading terms or postings fails
 */
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
  List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
  for (int i = 0; i < automata.length; i++) {
    automataPostings.add(new ArrayList<>());
  }

  TermsEnum termsEnum = termsIndex.iterator();
  BytesRef term;
  CharsRefBuilder refBuilder = new CharsRefBuilder();
  while ((term = termsEnum.next()) != null) {
    // Decode the term to chars once; the result is the same for every automaton.
    refBuilder.copyUTF8Bytes(term);
    for (int i = 0; i < automata.length; i++) {
      CharacterRunAutomaton automaton = automata[i];
      if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
        // A fresh enum per match: each one is retained in the per-automaton list.
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (doc == postings.advance(doc)) {
          automataPostings.get(i).add(postings);
        }
      }
    }
  }

  //will be at most this long
  List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length);
  for (int i = 0; i < automata.length; i++) {
    CharacterRunAutomaton automaton = automata[i];
    List<PostingsEnum> postingsEnums = automataPostings.get(i);
    int size = postingsEnums.size();
    if (size > 0) {
      //only add if we have offsets
      // The automaton's string form stands in as the "term" of the resulting OffsetsEnum.
      BytesRef wildcardTerm = new BytesRef(automaton.toString());
      if (size == 1) {
        //don't wrap in a composite if there's only one OffsetsEnum
        offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
      } else {
        offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
      }
    }
  }
  return offsetsEnums;
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class NRTSuggester method lookup.
/**
* Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions that
* match the provided {@link CompletionScorer}.
* <p>
* The {@link CompletionScorer#automaton} is intersected with the {@link #fst}.
* {@link CompletionScorer#weight} is used to compute boosts and/or extract context
* for each matched partial paths. A top N search is executed on {@link #fst} seeded with
* the matched partial paths. Upon reaching a completed path, {@link CompletionScorer#accept(int, Bits)}
* and {@link CompletionScorer#score(float, float)} is used on the document id, index weight
* and query boost to filter and score the entry, before being collected via
* {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}
*
* @param scorer supplies the automaton, the reader, the weight and the accept/score callbacks
* @param acceptDocs handed to {@link CompletionScorer#accept(int, Bits)} for every candidate document
* @param collector receives the accepted suggestions; also decides whether duplicate surface forms are skipped
* @throws IOException if an I/O error occurs while intersecting or searching the FST
*/
public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException {
final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc());
// A ratio of -1 means nothing is looked up.
// NOTE(review): presumably the "no live documents" case - confirm against calculateLiveDocRatio.
if (liveDocsRatio == -1) {
return;
}
final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst);
// The topN is increased by a factor of # of intersected path
// to ensure search admissibility. For example, one suggestion can
// have multiple contexts, resulting in num_context paths for the
// suggestion instead of 1 in the FST. When queried for the suggestion,
// the topN value ensures that all paths to the suggestion are evaluated
// (in case of a match all context query).
// Note that collectors will early terminate as soon as enough suggestions
// have been collected, regardless of the set topN value. This value is the
// maximum number of suggestions that can be collected.
final int topN = collector.getCountToCollect() * prefixPaths.size();
final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered);
// Scratch buffer for the decoded surface form; shared by both callbacks of the searcher below.
final CharsRefBuilder spare = new CharsRefBuilder();
Comparator<Pair<Long, BytesRef>> comparator = getComparator();
Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, topN, queueSize, comparator, new ScoringPathComparator(scorer)) {
// Reused reader over the bytes that follow the payload separator in a path's output.
private final ByteArrayDataInput scratchInput = new ByteArrayDataInput();
// Rejects a partial path when duplicates are being skipped and its complete surface form
// has already been recorded in collector.seenSurfaceForms; accepts it otherwise.
@Override
protected boolean acceptPartialPath(Util.FSTPath<Pair<Long, BytesRef>> path) {
if (collector.doSkipDuplicates()) {
// We are removing dups
if (path.payload == -1) {
// This path didn't yet see the complete surface form; let's see if it just did with the arc output we just added:
BytesRef arcOutput = path.arc.output.output2;
BytesRef output = path.output.output2;
for (int i = 0; i < arcOutput.length; i++) {
if (arcOutput.bytes[arcOutput.offset + i] == payloadSep) {
// OK this arc that the path was just extended by contains the payloadSep, so we now have a full surface form in this path
// (the arc output forms the tail of the path output, so translate i into an index into the path output)
path.payload = output.length - arcOutput.length + i;
assert output.bytes[output.offset + path.payload] == payloadSep;
break;
}
}
}
if (path.payload != -1) {
// The surface form is everything before the separator; prune if it was already collected.
BytesRef output = path.output.output2;
spare.copyUTF8Bytes(output.bytes, output.offset, path.payload);
if (collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length())) {
return false;
}
}
}
return true;
}
// Decodes the surface form and document id of a completed path, filters it through the
// scorer (and the duplicate set, if enabled), and hands accepted entries to the collector.
@Override
protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
BytesRef output = path.output.output2;
int payloadSepIndex;
if (path.payload != -1) {
// Separator position was already cached on the path.
payloadSepIndex = path.payload;
spare.copyUTF8Bytes(output.bytes, output.offset, payloadSepIndex);
} else {
// The position is only cached when duplicates are skipped; otherwise parse the surface form now.
assert collector.doSkipDuplicates() == false;
payloadSepIndex = parseSurfaceForm(output, payloadSep, spare);
}
// The bytes after the separator start with the document id as a vInt.
scratchInput.reset(output.bytes, output.offset + payloadSepIndex + 1, output.length - payloadSepIndex - 1);
int docID = scratchInput.readVInt();
if (!scorer.accept(docID, acceptDocs)) {
return false;
}
if (collector.doSkipDuplicates()) {
// now record that we've seen this surface form:
// (copied into its own array because spare is reused for later paths)
char[] key = new char[spare.length()];
System.arraycopy(spare.chars(), 0, key, 0, spare.length());
if (collector.seenSurfaceForms.contains(key)) {
// we already collected a higher scoring document with this key, in this segment:
return false;
}
collector.seenSurfaceForms.add(key);
}
try {
float score = scorer.score(decode(path.output.output1), path.boost);
collector.collect(docID, spare.toCharsRef(), path.context, score);
return true;
} catch (IOException e) {
// This override declares no checked exceptions, so the IOException is rethrown unchecked.
throw new RuntimeException(e);
}
}
};
// Seed the searcher with every prefix path produced by the automaton/FST intersection.
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
scorer.weight.setNextMatch(path.input.get());
BytesRef output = path.output.output2;
// Index of the payload separator within the prefix output, or -1 if not (yet) present.
int payload = -1;
if (collector.doSkipDuplicates()) {
for (int j = 0; j < output.length; j++) {
if (output.bytes[output.offset + j] == payloadSep) {
// Important to cache this, else we have a possibly O(N^2) cost where N is the length of suggestions
payload = j;
break;
}
}
}
searcher.addStartPaths(path.fstNode, path.output, false, path.input, scorer.weight.boost(), scorer.weight.context(), payload);
}
// hits are also returned by search()
// we do not use it, instead collect at acceptResult
searcher.search();
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert search.isComplete;
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class WFSTCompletionLookup method lookup.
/**
 * Looks up the top completions for {@code key} in the FST.
 * <p>
 * The key is matched exactly as a prefix first. When {@code exactFirst} is set and the prefix
 * itself is a complete entry, it becomes the first result. The remaining slots are filled with
 * the completions returned by {@code Util.shortestPaths} below the prefix.
 *
 * @param key             the prefix to complete
 * @param contexts        must be {@code null}; contexts are not supported by this suggester
 * @param onlyMorePopular must be {@code false}
 * @param num             maximum number of results; asserted to be positive
 * @return the matching results, or an empty list when no FST has been built or the prefix does not match
 * @throws IllegalArgumentException if {@code contexts} is non-null or {@code onlyMorePopular} is true
 */
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  assert num > 0;
  if (onlyMorePopular) {
    throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
  }
  if (fst == null) {
    return Collections.emptyList();
  }

  // UTF-8 form of the key; later reused as the buffer each completion is assembled in.
  final BytesRefBuilder keyBytes = new BytesRefBuilder();
  keyBytes.copyChars(key);
  final int keyLength = keyBytes.length();
  final Arc<Long> prefixArc = new Arc<>();

  // match the prefix portion exactly
  Long outputSoFar;
  try {
    outputSoFar = lookupPrefix(keyBytes.get(), prefixArc);
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
  if (outputSoFar == null) {
    return Collections.emptyList();
  }

  final List<LookupResult> hits = new ArrayList<>(num);
  final CharsRefBuilder utf16 = new CharsRefBuilder();
  int remaining = num;

  if (exactFirst && prefixArc.isFinal()) {
    // The key itself is an entry: report it ahead of the completions.
    utf16.copyUTF8Bytes(keyBytes.get());
    hits.add(new LookupResult(utf16.toString(), decodeWeight(outputSoFar + prefixArc.nextFinalOutput)));
    remaining--;
    if (remaining == 0) {
      // that was quick
      return hits;
    }
  }

  // complete top-N
  TopResults<Long> topPaths;
  try {
    topPaths = Util.shortestPaths(fst, prefixArc, outputSoFar, weightComparator, remaining, !exactFirst);
    assert topPaths.isComplete;
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }

  final BytesRefBuilder tail = new BytesRefBuilder();
  for (Result<Long> path : topPaths) {
    // Rewind to the bare key, then append this completion's suffix.
    keyBytes.setLength(keyLength);
    Util.toBytesRef(path.input, tail);
    keyBytes.append(tail);
    utf16.copyUTF8Bytes(keyBytes.get());
    hits.add(new LookupResult(utf16.toString(), decodeWeight(path.output)));
  }
  return hits;
}
Aggregations