Example 41 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From class TokenStreamFromTermVector, method init().

//We delay initialization so that we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
        dpEnumFlags |= PostingsEnum.OFFSETS;
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
        //must ask for offsets too
        dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        payloadAttribute = getAttribute(PayloadAttribute.class);
        payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
        spareBytesRefBuilder = new BytesRefBuilder();
    }
    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
//7 is an over-estimate of the average term length
    termCharsBuilder.grow((int) (vector.size() * 7));
    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
    TokenLL[] positionedTokens = initTokensArray();
    int lastPosition = -1;
    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
//scratch builder used only for the UTF8->UTF16 conversion
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term chars (same as BytesRef.utf8ToString(), but without allocating a String object)
        // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
        tempCharsRefBuilder.grow(termBytesRef.length);
        final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
        final int termCharsOff = termCharsBuilder.length();
        termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        // presumably checked by TokenSources.hasPositions earlier
        assert dpEnum != null;
        dpEnum.nextDoc();
        final int freq = dpEnum.freq();
        //sumFreq += freq;
        for (int j = 0; j < freq; j++) {
            int pos = dpEnum.nextPosition();
            TokenLL token = new TokenLL();
            token.termCharsOff = termCharsOff;
            token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
            if (offsetAttribute != null) {
                token.startOffset = dpEnum.startOffset();
                if (token.startOffset > maxStartOffset) {
                    //filter this token out; exceeds threshold
                    continue;
                }
                token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
                if (pos == -1) {
//no position information; approximate a position from the start offset (divide by 8)
                    pos = token.startOffset >> 3;
                }
            }
            if (payloadAttribute != null) {
                final BytesRef payload = dpEnum.getPayload();
                token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
            }
            //Add token to an array indexed by position
            if (positionedTokens.length <= pos) {
                //grow, but not 2x since we think our original length estimate is close
                TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
                System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
                positionedTokens = newPositionedTokens;
            }
            positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
            lastPosition = Math.max(lastPosition, pos);
        }
    }
    //    System.out.println(String.format(
    //        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
    //        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
    //        (originalPositionEstimate/(lastPosition + 1.0f))));
    // Step 2:  Link all Tokens into a linked-list and set position increments as we go
    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
        TokenLL token = positionedTokens[pos];
        if (token == null) {
            continue;
        }
        //link
        if (prevToken != null) {
            assert prevToken.next == null;
            //concatenate linked-list
            prevToken.next = token;
        } else {
            assert firstToken == null;
            firstToken = token;
        }
        //set increments
        if (vector.hasPositions()) {
            token.positionIncrement = pos - prevTokenPos;
            while (token.next != null) {
                token = token.next;
                token.positionIncrement = 0;
            }
        } else {
            token.positionIncrement = 1;
            while (token.next != null) {
                prevToken = token;
                token = token.next;
                if (prevToken.startOffset == token.startOffset) {
                    token.positionIncrement = 0;
                } else {
                    token.positionIncrement = 1;
                }
            }
        }
        prevTokenPos = pos;
        prevToken = token;
    }
    initialized = true;
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
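
The central CharsRefBuilder technique in init() is decoding each term's UTF-8 bytes into UTF-16 chars without allocating a String per term, then appending everything into one shared builder that tokens later address by (offset, length). Below is a minimal sketch of that conversion pattern, separated from the token bookkeeping; the class and method names are made up for illustration.

import java.io.IOException;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;

class TermCharsSketch {
    // Concatenates the UTF-16 form of every term in the vector into one builder,
    // reusing a single scratch buffer for the UTF-8 -> UTF-16 conversion.
    static CharsRefBuilder concatTermChars(Terms vector) throws IOException {
        CharsRefBuilder all = new CharsRefBuilder();
        CharsRefBuilder scratch = new CharsRefBuilder();
        TermsEnum termsEnum = vector.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // UTF-16 length never exceeds the UTF-8 byte length, so this is enough
            scratch.grow(term.length);
            int len = UnicodeUtil.UTF8toUTF16(term, scratch.chars());
            // init() records all.length() at this point as the term's char offset
            all.append(scratch.chars(), 0, len);
        }
        return all;
    }
}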

Example 42 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From class DistributedUpdateProcessor, method processAdd().

@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
    assert TestInjection.injectFailUpdateRequests();
    updateCommand = cmd;
    if (zkEnabled) {
        zkCheck();
        nodes = setupRequest(cmd.getHashableId(), cmd.getSolrInputDocument());
    } else {
        isLeader = getNonZkLeaderAssumption(req);
    }
    // check if client has requested minimum replication factor information
    // disabled by default
    int minRf = -1;
    if (replicationTracker != null) {
        // for subsequent requests in the same batch
        minRf = replicationTracker.minRf;
    } else {
        SolrParams rp = cmd.getReq().getParams();
        String distribUpdate = rp.get(DISTRIB_UPDATE_PARAM);
// request was sent to a leader, or this is the top-level request processor (no distrib phase set)
        if (distribUpdate == null || distribUpdate.equals(DistribPhase.TOLEADER.toString())) {
            String minRepFact = rp.get(UpdateRequest.MIN_REPFACT);
            if (minRepFact != null) {
                try {
                    minRf = Integer.parseInt(minRepFact);
                } catch (NumberFormatException nfe) {
                    minRf = -1;
                }
                if (minRf <= 0)
                    throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid value " + minRepFact + " for " + UpdateRequest.MIN_REPFACT + "; must be >0 and less than or equal to the collection replication factor.");
            }
            if (minRf > 1) {
                String myShardId = forwardToLeader ? null : cloudDesc.getShardId();
                replicationTracker = new RequestReplicationTracker(myShardId, minRf);
            }
        }
    }
    // If we were sent a previous version, set this to the AddUpdateCommand (if not already set)
    if (!cmd.isInPlaceUpdate()) {
        cmd.prevVersion = cmd.getReq().getParams().getLong(DistributedUpdateProcessor.DISTRIB_INPLACE_PREVVERSION, -1);
    }
    // TODO: if minRf > 1 and we know the leader is the only active replica, we could fail
    // the request right here but for now I think it is better to just return the status
    // to the client that the minRf wasn't reached and let them handle it    
    boolean dropCmd = false;
    if (!forwardToLeader) {
        dropCmd = versionAdd(cmd);
    }
    if (dropCmd) {
        // TODO: do we need to add anything to the response?
        return;
    }
    if (zkEnabled && isLeader && !isSubShardLeader) {
        DocCollection coll = zkController.getClusterState().getCollection(collection);
        List<Node> subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), cmd.getHashableId(), cmd.getSolrInputDocument());
// the List<Node> will actually have only one element for an add request
        if (subShardLeaders != null && !subShardLeaders.isEmpty()) {
            ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
            params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
            params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
            params.set(DISTRIB_FROM_PARENT, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
            for (Node subShardLeader : subShardLeaders) {
                cmdDistrib.distribAdd(cmd, Collections.singletonList(subShardLeader), params, true);
            }
        }
        final List<Node> nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, cmd.getHashableId(), cmd.getSolrInputDocument());
        if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) {
            ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
            params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
            params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
            params.set(DISTRIB_FROM_COLLECTION, req.getCore().getCoreDescriptor().getCloudDescriptor().getCollectionName());
            params.set(DISTRIB_FROM_SHARD, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
            for (Node nodesByRoutingRule : nodesByRoutingRules) {
                cmdDistrib.distribAdd(cmd, Collections.singletonList(nodesByRoutingRule), params, true);
            }
        }
    }
    ModifiableSolrParams params = null;
    if (nodes != null) {
        params = new ModifiableSolrParams(filterParams(req.getParams()));
        params.set(DISTRIB_UPDATE_PARAM, (isLeader || isSubShardLeader ? DistribPhase.FROMLEADER.toString() : DistribPhase.TOLEADER.toString()));
        params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        if (replicationTracker != null && minRf > 1)
            params.set(UpdateRequest.MIN_REPFACT, String.valueOf(minRf));
        if (cmd.isInPlaceUpdate()) {
            params.set(DISTRIB_INPLACE_PREVVERSION, String.valueOf(cmd.prevVersion));
            // Use synchronous=true so that a new connection is used, instead
            // of the update being streamed through an existing streaming client.
            // When using a streaming client, the previous update
            // and the current in-place update (that depends on the previous update), if reordered
            // in the stream, can result in the current update being bottled up behind the previous
            // update in the stream and can lead to degraded performance.
            cmdDistrib.distribAdd(cmd, nodes, params, true, replicationTracker);
        } else {
            cmdDistrib.distribAdd(cmd, nodes, params, false, replicationTracker);
        }
    }
    // TODO: what to do when no idField?
    if (returnVersions && rsp != null && idField != null) {
        if (addsResponse == null) {
            addsResponse = new NamedList<String>(1);
            rsp.add("adds", addsResponse);
        }
        if (scratch == null)
            scratch = new CharsRefBuilder();
        idField.getType().indexedToReadable(cmd.getIndexedId(), scratch);
        addsResponse.add(scratch.toString(), cmd.getVersion());
    }
// TODO: keep track of errors?  needs to be done at a higher level though since
// an id may fail before it gets to this processor.
// Given that, it may also make sense to move the version reporting out of this
// processor too.
}
Also used: RetryNode (org.apache.solr.update.SolrCmdDistributor.RetryNode), Node (org.apache.solr.update.SolrCmdDistributor.Node), StdNode (org.apache.solr.update.SolrCmdDistributor.StdNode), ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams), SolrParams (org.apache.solr.common.params.SolrParams), DocCollection (org.apache.solr.common.cloud.DocCollection), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), SolrException (org.apache.solr.common.SolrException)
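
The CharsRefBuilder at the end of processAdd (the scratch field) is a reuse pattern: one builder is allocated lazily and overwritten for each document id in the batch, which is safe because toString() copies the builder's contents into a new String. A hedged sketch of that pattern in isolation; the helper class, method, and parameters here are hypothetical.

import java.util.List;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.schema.SchemaField;

class AddsResponseSketch {
    // Builds an "adds" response section mapping readable id -> version.
    static NamedList<Long> buildAddsResponse(SchemaField idField, List<BytesRef> indexedIds, List<Long> versions) {
        NamedList<Long> adds = new NamedList<>();
        CharsRefBuilder scratch = new CharsRefBuilder();
        for (int i = 0; i < indexedIds.size(); i++) {
            // indexedToReadable overwrites scratch in place
            idField.getType().indexedToReadable(indexedIds.get(i), scratch);
            // toString() copies, so the builder can be reused on the next iteration
            adds.add(scratch.toString(), versions.get(i));
        }
        return adds;
    }
}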

Example 43 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From class DirectSpellChecker, method suggestSimilar().

/**
   * Suggest similar words.
   * 
   * <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
   * relevant terms is an edit distance, therefore typically a low value
   * for numSug will work very well.
   * 
   * @param term Term you want to spell check on
   * @param numSug the maximum number of suggested words
   * @param ir IndexReader to find terms from
   * @param suggestMode specifies when to return suggested words
   * @param accuracy return only suggested words that match with this similarity
   * @return sorted list of the suggested words according to the comparator
   * @throws IOException If there is a low-level I/O error.
   */
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    String text = term.text();
    if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
        return new SuggestWord[0];
    if (lowerCaseTerms) {
        term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
    }
    int docfreq = ir.docFreq(term);
    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
        return new SuggestWord[0];
    }
    int maxDoc = ir.maxDoc();
    if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
        return new SuggestWord[0];
    } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float) maxDoc)) {
        return new SuggestWord[0];
    }
    if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
        docfreq = 0;
    if (thresholdFrequency >= 1f) {
        docfreq = Math.max(docfreq, (int) thresholdFrequency);
    } else if (thresholdFrequency > 0f) {
        docfreq = Math.max(docfreq, (int) (thresholdFrequency * (float) maxDoc) - 1);
    }
    Collection<ScoreTerm> terms = null;
    int inspections = numSug * maxInspections;
    // try ed=1 first, in case we get lucky
    terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.size() < inspections) {
        HashSet<ScoreTerm> moreTerms = new HashSet<>();
        moreTerms.addAll(terms);
        moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
        terms = moreTerms;
    }
    // create the suggestword response, sort it, and trim it to size.
    SuggestWord[] suggestions = new SuggestWord[terms.size()];
    int index = suggestions.length - 1;
    for (ScoreTerm s : terms) {
        SuggestWord suggestion = new SuggestWord();
        if (s.termAsString == null) {
            spare.copyUTF8Bytes(s.term);
            s.termAsString = spare.toString();
        }
        suggestion.string = s.termAsString;
        suggestion.score = s.score;
        suggestion.freq = s.docfreq;
        suggestions[index--] = suggestion;
    }
    ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));
    if (numSug < suggestions.length) {
        SuggestWord[] trimmed = new SuggestWord[numSug];
        System.arraycopy(suggestions, 0, trimmed, 0, numSug);
        suggestions = trimmed;
    }
    return suggestions;
}
Also used: CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), Term (org.apache.lucene.index.Term), HashSet (java.util.HashSet)
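
A minimal caller for the method above; the index path, field name, query term, and accuracy value are placeholders.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.store.FSDirectory;

class SpellCheckSketch {
    public static void main(String[] args) throws IOException {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            DirectSpellChecker checker = new DirectSpellChecker();
            // only suggest for terms absent from the index; keep matches with similarity >= 0.6
            SuggestWord[] suggestions = checker.suggestSimilar(
                new Term("body", "lucen"), 5, reader,
                SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, 0.6f);
            for (SuggestWord word : suggestions) {
                System.out.println(word.string + " score=" + word.score + " freq=" + word.freq);
            }
        }
    }
}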

Example 44 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From class FSTCompletionLookup, method lookup().

@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    final List<Completion> completions;
    if (higherWeightsFirst) {
        completions = higherWeightsCompletion.lookup(key, num);
    } else {
        completions = normalCompletion.lookup(key, num);
    }
    final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
    CharsRefBuilder spare = new CharsRefBuilder();
    for (Completion c : completions) {
        spare.copyUTF8Bytes(c.utf8);
        results.add(new LookupResult(spare.toString(), c.bucket));
    }
    return results;
}
Also used: Completion (org.apache.lucene.search.suggest.fst.FSTCompletion.Completion), ArrayList (java.util.ArrayList), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)
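
The spare-builder pattern in lookup(), where copyUTF8Bytes() overwrites the builder and toString() takes a snapshot, works for any batch of UTF-8 BytesRefs. A small self-contained sketch; the class and method names are hypothetical.

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

class Utf8DecodeSketch {
    // Decodes a batch of UTF-8 byte refs with a single reused builder.
    static List<String> decodeAll(List<BytesRef> utf8Terms) {
        List<String> out = new ArrayList<>(utf8Terms.size());
        CharsRefBuilder spare = new CharsRefBuilder();
        for (BytesRef bytes : utf8Terms) {
            spare.copyUTF8Bytes(bytes); // replaces the builder's contents
            out.add(spare.toString());  // copies, so reuse on the next iteration is safe
        }
        return out;
    }
}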

Example 45 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From class QueryParsing, method writeFieldVal().

static void writeFieldVal(BytesRef val, FieldType ft, Appendable out, int flags) throws IOException {
    if (ft != null) {
        try {
            CharsRefBuilder readable = new CharsRefBuilder();
            ft.indexedToReadable(val, readable);
            out.append(readable.get());
        } catch (Exception e) {
            out.append("EXCEPTION(val=");
            out.append(val.utf8ToString());
            out.append(")");
        }
    } else {
        out.append(val.utf8ToString());
    }
}
Also used: CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), IOException (java.io.IOException)
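
The out.append(readable.get()) call works because CharsRefBuilder.get() returns a CharsRef, which implements CharSequence and can be handed straight to an Appendable with no intermediate String. A one-method sketch of just that step; the class name is hypothetical.

import java.io.IOException;

import org.apache.lucene.util.CharsRefBuilder;

class AppendCharsSketch {
    // Appends the builder's current contents without allocating a String.
    static void appendReadable(CharsRefBuilder readable, Appendable out) throws IOException {
        out.append(readable.get()); // CharsRef implements CharSequence
    }
}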

Aggregations

CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 52
BytesRef (org.apache.lucene.util.BytesRef): 30
ArrayList (java.util.ArrayList): 11
IOException (java.io.IOException): 10
NamedList (org.apache.solr.common.util.NamedList): 10
FieldType (org.apache.solr.schema.FieldType): 10
TermsEnum (org.apache.lucene.index.TermsEnum): 9
SchemaField (org.apache.solr.schema.SchemaField): 7
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 6
HashSet (java.util.HashSet): 5
Test (org.junit.Test): 5
TokenStream (org.apache.lucene.analysis.TokenStream): 4
PostingsEnum (org.apache.lucene.index.PostingsEnum): 4
Terms (org.apache.lucene.index.Terms): 4
SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap): 4
LeafReader (org.apache.lucene.index.LeafReader): 3
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 3
CharsRef (org.apache.lucene.util.CharsRef): 3
Util (org.apache.lucene.util.fst.Util): 3
SolrException (org.apache.solr.common.SolrException): 3