Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
Class TokenStreamFromTermVector, method init().
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
  assert !initialized;
  short dpEnumFlags = PostingsEnum.POSITIONS;
  if (vector.hasOffsets()) {
    dpEnumFlags |= PostingsEnum.OFFSETS;
    offsetAttribute = addAttribute(OffsetAttribute.class);
  }
  if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
    //must ask for offsets too
    dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
    payloadAttribute = getAttribute(PayloadAttribute.class);
    payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
    spareBytesRefBuilder = new BytesRefBuilder();
  }
  // We put term data here
  termCharsBuilder = new CharsRefBuilder();
  //7 is over-estimate of average term len
  termCharsBuilder.grow((int) (vector.size() * 7));
  // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
  TokenLL[] positionedTokens = initTokensArray();
  int lastPosition = -1;
  final TermsEnum termsEnum = vector.iterator();
  BytesRef termBytesRef;
  PostingsEnum dpEnum = null;
  //only for UTF8->UTF16 call
  CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
  //int sumFreq = 0;
  while ((termBytesRef = termsEnum.next()) != null) {
    //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
    // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
    tempCharsRefBuilder.grow(termBytesRef.length);
    final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
    final int termCharsOff = termCharsBuilder.length();
    termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
    // presumably checked by TokenSources.hasPositions earlier
    assert dpEnum != null;
    dpEnum.nextDoc();
    final int freq = dpEnum.freq();
    //sumFreq += freq;
    for (int j = 0; j < freq; j++) {
      int pos = dpEnum.nextPosition();
      TokenLL token = new TokenLL();
      token.termCharsOff = termCharsOff;
      token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
      if (offsetAttribute != null) {
        token.startOffset = dpEnum.startOffset();
        if (token.startOffset > maxStartOffset) {
          //filter this token out; exceeds threshold
          continue;
        }
        token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
        if (pos == -1) {
          //divide by 8
          pos = token.startOffset >> 3;
        }
      }
      if (payloadAttribute != null) {
        final BytesRef payload = dpEnum.getPayload();
        token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
      }
      //Add token to an array indexed by position
      if (positionedTokens.length <= pos) {
        //grow, but not 2x since we think our original length estimate is close
        TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
        System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
        positionedTokens = newPositionedTokens;
      }
      positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
      lastPosition = Math.max(lastPosition, pos);
    }
  }
  // System.out.println(String.format(
  //     "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
  //     sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
  //     (originalPositionEstimate/(lastPosition + 1.0f))));
  // Step 2: Link all Tokens into a linked-list and set position increments as we go
  int prevTokenPos = -1;
  TokenLL prevToken = null;
  for (int pos = 0; pos <= lastPosition; pos++) {
    TokenLL token = positionedTokens[pos];
    if (token == null) {
      continue;
    }
    //link
    if (prevToken != null) {
      assert prevToken.next == null;
      //concatenate linked-list
      prevToken.next = token;
    } else {
      assert firstToken == null;
      firstToken = token;
    }
    //set increments
    if (vector.hasPositions()) {
      token.positionIncrement = pos - prevTokenPos;
      while (token.next != null) {
        token = token.next;
        token.positionIncrement = 0;
      }
    } else {
      token.positionIncrement = 1;
      while (token.next != null) {
        prevToken = token;
        token = token.next;
        if (prevToken.startOffset == token.startOffset) {
          token.positionIncrement = 0;
        } else {
          token.positionIncrement = 1;
        }
      }
    }
    prevTokenPos = pos;
    prevToken = token;
  }
  initialized = true;
}
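For context, below is a minimal sketch (not taken from the project source) of how a term-vector-backed TokenStream like the one initialized above is typically obtained for highlighting. The reader, document id, and field name are placeholders, and TokenSources.getTermVectorTokenStreamOrNull is assumed to be the public entry point that constructs TokenStreamFromTermVector internally.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.TokenSources;

// Hypothetical helper: print the tokens recovered from a document's term vector.
static void printTermVectorTokens(IndexReader reader, int docId, String field) throws IOException {
  Fields tvFields = reader.getTermVectors(docId);  // term vectors stored for this document
  // -1 = no limit on start offset; TokenStreamFromTermVector is created internally when possible
  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull(field, tvFields, -1);
  if (ts == null) {
    return;  // the field has no usable term vector
  }
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(termAtt.toString());
  }
  ts.end();
  ts.close();
}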
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
Class DistributedUpdateProcessor, method processAdd().
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
  assert TestInjection.injectFailUpdateRequests();
  updateCommand = cmd;
  if (zkEnabled) {
    zkCheck();
    nodes = setupRequest(cmd.getHashableId(), cmd.getSolrInputDocument());
  } else {
    isLeader = getNonZkLeaderAssumption(req);
  }
  // check if client has requested minimum replication factor information
  // disabled by default
  int minRf = -1;
  if (replicationTracker != null) {
    // for subsequent requests in the same batch
    minRf = replicationTracker.minRf;
  } else {
    SolrParams rp = cmd.getReq().getParams();
    String distribUpdate = rp.get(DISTRIB_UPDATE_PARAM);
    // a leader or this is the top-level request processor
    if (distribUpdate == null || distribUpdate.equals(DistribPhase.TOLEADER.toString())) {
      String minRepFact = rp.get(UpdateRequest.MIN_REPFACT);
      if (minRepFact != null) {
        try {
          minRf = Integer.parseInt(minRepFact);
        } catch (NumberFormatException nfe) {
          minRf = -1;
        }
        if (minRf <= 0)
          throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid value " + minRepFact + " for " + UpdateRequest.MIN_REPFACT + "; must be >0 and less than or equal to the collection replication factor.");
      }
      if (minRf > 1) {
        String myShardId = forwardToLeader ? null : cloudDesc.getShardId();
        replicationTracker = new RequestReplicationTracker(myShardId, minRf);
      }
    }
  }
  // If we were sent a previous version, set this to the AddUpdateCommand (if not already set)
  if (!cmd.isInPlaceUpdate()) {
    cmd.prevVersion = cmd.getReq().getParams().getLong(DistributedUpdateProcessor.DISTRIB_INPLACE_PREVVERSION, -1);
  }
  // TODO: if minRf > 1 and we know the leader is the only active replica, we could fail
  // the request right here but for now I think it is better to just return the status
  // to the client that the minRf wasn't reached and let them handle it
  boolean dropCmd = false;
  if (!forwardToLeader) {
    dropCmd = versionAdd(cmd);
  }
  if (dropCmd) {
    // TODO: do we need to add anything to the response?
    return;
  }
  if (zkEnabled && isLeader && !isSubShardLeader) {
    DocCollection coll = zkController.getClusterState().getCollection(collection);
    List<Node> subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getHashableId(), cmd.getSolrInputDocument());
    // the list<node> will actually have only one element for an add request
    if (subShardLeaders != null && !subShardLeaders.isEmpty()) {
      ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
      params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
      params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
      params.set(DISTRIB_FROM_PARENT, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
      for (Node subShardLeader : subShardLeaders) {
        cmdDistrib.distribAdd(cmd, Collections.singletonList(subShardLeader), params, true);
      }
    }
    final List<Node> nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getHashableId(), cmd.getSolrInputDocument());
    if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) {
      ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
      params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
      params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
      params.set(DISTRIB_FROM_COLLECTION, req.getCore().getCoreDescriptor().getCloudDescriptor().getCollectionName());
      params.set(DISTRIB_FROM_SHARD, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
      for (Node nodesByRoutingRule : nodesByRoutingRules) {
        cmdDistrib.distribAdd(cmd, Collections.singletonList(nodesByRoutingRule), params, true);
      }
    }
  }
  ModifiableSolrParams params = null;
  if (nodes != null) {
    params = new ModifiableSolrParams(filterParams(req.getParams()));
    params.set(DISTRIB_UPDATE_PARAM, (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() : DistribPhase.TOLEADER.toString()));
    params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
    if (replicationTracker != null && minRf > 1)
      params.set(UpdateRequest.MIN_REPFACT, String.valueOf(minRf));
    if (cmd.isInPlaceUpdate()) {
      params.set(DISTRIB_INPLACE_PREVVERSION, String.valueOf(cmd.prevVersion));
      // Use synchronous=true so that a new connection is used, instead
      // of the update being streamed through an existing streaming client.
      // When using a streaming client, the previous update
      // and the current in-place update (that depends on the previous update), if reordered
      // in the stream, can result in the current update being bottled up behind the previous
      // update in the stream and can lead to degraded performance.
      cmdDistrib.distribAdd(cmd, nodes, params, true, replicationTracker);
    } else {
      cmdDistrib.distribAdd(cmd, nodes, params, false, replicationTracker);
    }
  }
  // TODO: what to do when no idField?
  if (returnVersions && rsp != null && idField != null) {
    if (addsResponse == null) {
      addsResponse = new NamedList<String>(1);
      rsp.add("adds", addsResponse);
    }
    if (scratch == null)
      scratch = new CharsRefBuilder();
    idField.getType().indexedToReadable(cmd.getIndexedId(), scratch);
    addsResponse.add(scratch.toString(), cmd.getVersion());
  }
  // TODO: keep track of errors? needs to be done at a higher level though since
  // an id may fail before it gets to this processor.
  // Given that, it may also make sense to move the version reporting out of this
  // processor too.
}
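A hedged client-side sketch of how the min_rf handling above is exercised: a SolrJ update that sets UpdateRequest.MIN_REPFACT and reads the achieved factor back. The collection name, document values, and the "rf" response-header key are assumptions for illustration, not taken from this class.

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;

// Hypothetical helper: add one document while requesting a minimum replication factor of 2.
static int addWithMinRf(CloudSolrClient cloudClient, String collection) throws Exception {
  UpdateRequest req = new UpdateRequest();
  req.setParam(UpdateRequest.MIN_REPFACT, "2");  // ask the leader to track/report replication
  SolrInputDocument doc = new SolrInputDocument();
  doc.addField("id", "doc-1");
  req.add(doc);
  UpdateResponse rsp = req.process(cloudClient, collection);
  // when min_rf was requested, the achieved factor is typically reported under "rf"
  Object rf = rsp.getResponseHeader().get("rf");
  return rf instanceof Integer ? (Integer) rf : -1;
}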
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
Class DirectSpellChecker, method suggestSimilar().
/**
* Suggest similar words.
*
* <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
* relevant terms is an edit distance, therefore typically a low value
* for numSug will work very well.
*
* @param term Term you want to spell check on
* @param numSug the maximum number of suggested words
* @param ir IndexReader to find terms from
* @param suggestMode specifies when to return suggested words
* @param accuracy return only suggested words that match with this similarity
* @return sorted list of the suggested words according to the comparator
* @throws IOException If there is a low-level I/O error.
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy) throws IOException {
  final CharsRefBuilder spare = new CharsRefBuilder();
  String text = term.text();
  if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
    return new SuggestWord[0];
  if (lowerCaseTerms) {
    term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
  }
  int docfreq = ir.docFreq(term);
  if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
    return new SuggestWord[0];
  }
  int maxDoc = ir.maxDoc();
  if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
    return new SuggestWord[0];
  } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float) maxDoc)) {
    return new SuggestWord[0];
  }
  if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
    docfreq = 0;
  if (thresholdFrequency >= 1f) {
    docfreq = Math.max(docfreq, (int) thresholdFrequency);
  } else if (thresholdFrequency > 0f) {
    docfreq = Math.max(docfreq, (int) (thresholdFrequency * (float) maxDoc) - 1);
  }
  Collection<ScoreTerm> terms = null;
  int inspections = numSug * maxInspections;
  // try ed=1 first, in case we get lucky
  terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
  if (maxEdits > 1 && terms.size() < inspections) {
    HashSet<ScoreTerm> moreTerms = new HashSet<>();
    moreTerms.addAll(terms);
    moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
    terms = moreTerms;
  }
  // create the suggestword response, sort it, and trim it to size.
  SuggestWord[] suggestions = new SuggestWord[terms.size()];
  int index = suggestions.length - 1;
  for (ScoreTerm s : terms) {
    SuggestWord suggestion = new SuggestWord();
    if (s.termAsString == null) {
      spare.copyUTF8Bytes(s.term);
      s.termAsString = spare.toString();
    }
    suggestion.string = s.termAsString;
    suggestion.score = s.score;
    suggestion.freq = s.docfreq;
    suggestions[index--] = suggestion;
  }
  ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));
  if (numSug < suggestions.length) {
    SuggestWord[] trimmed = new SuggestWord[numSug];
    System.arraycopy(suggestions, 0, trimmed, 0, numSug);
    suggestions = trimmed;
  }
  return suggestions;
}
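A short usage sketch for the method above, assuming an already-open IndexReader; the field name and misspelled query word are placeholders.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;

// Hypothetical helper: suggest corrections for a misspelled term from the index itself.
static void suggest(IndexReader reader) throws IOException {
  DirectSpellChecker checker = new DirectSpellChecker();
  checker.setMaxEdits(2);     // allow up to two edits (the supported maximum)
  checker.setAccuracy(0.5f);  // minimum similarity for a candidate
  SuggestWord[] suggestions = checker.suggestSimilar(
      new Term("body", "lucenne"), 5, reader, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, 0.5f);
  for (SuggestWord w : suggestions) {
    System.out.println(w.string + " (freq=" + w.freq + ", score=" + w.score + ")");
  }
}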
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
Class FSTCompletionLookup, method lookup().
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  final List<Completion> completions;
  if (higherWeightsFirst) {
    completions = higherWeightsCompletion.lookup(key, num);
  } else {
    completions = normalCompletion.lookup(key, num);
  }
  final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
  CharsRefBuilder spare = new CharsRefBuilder();
  for (Completion c : completions) {
    spare.copyUTF8Bytes(c.utf8);
    results.add(new LookupResult(spare.toString(), c.bucket));
  }
  return results;
}
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
Class QueryParsing, method writeFieldVal().
static void writeFieldVal(BytesRef val, FieldType ft, Appendable out, int flags) throws IOException {
  if (ft != null) {
    try {
      CharsRefBuilder readable = new CharsRefBuilder();
      ft.indexedToReadable(val, readable);
      out.append(readable.get());
    } catch (Exception e) {
      out.append("EXCEPTION(val=");
      out.append(val.utf8ToString());
      out.append(")");
    }
  } else {
    out.append(val.utf8ToString());
  }
}
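The same indexedToReadable pattern recurs throughout these examples: a CharsRefBuilder receives the UTF-16 form of an indexed BytesRef. Below is a small hedged sketch of that pattern in isolation; the schema field name is hypothetical.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;

// Hypothetical helper: turn an indexed (binary) field value back into readable text.
static String readable(IndexSchema schema, String fieldName, BytesRef indexed) {
  FieldType ft = schema.getFieldType(fieldName);
  CharsRefBuilder out = new CharsRefBuilder();
  ft.indexedToReadable(indexed, out);  // decodes the indexed form into UTF-16 chars
  return out.toString();
}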