
Example 1 with BytesRefArray

Use of org.apache.lucene.util.BytesRefArray in the elasticsearch project by elastic.

From class CollectionUtilsTests, method testSortByteRefArray:

public void testSortByteRefArray() {
    List<BytesRef> values = new ArrayList<>();
    final int numValues = scaledRandomIntBetween(0, 10000);
    BytesRefArray array = new BytesRefArray(Counter.newCounter());
    for (int i = 0; i < numValues; i++) {
        String s = randomRealisticUnicodeOfCodepointLengthBetween(1, 100);
        values.add(new BytesRef(s));
        array.append(new BytesRef(s));
    }
    if (randomBoolean()) {
        Collections.shuffle(values, random());
    }
    int[] indices = new int[array.size()];
    for (int i = 0; i < indices.length; i++) {
        indices[i] = i;
    }
    CollectionUtils.sort(array, indices);
    Collections.sort(values);
    Iterator<BytesRef> iterator = values.iterator();
    BytesRefBuilder spare = new BytesRefBuilder();
    for (int i = 0; i < values.size(); i++) {
        assertThat(iterator.hasNext(), is(true));
        assertThat(array.get(spare, indices[i]), equalTo(iterator.next()));
    }
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), ArrayList (java.util.ArrayList), BytesRef (org.apache.lucene.util.BytesRef)
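
For orientation, here is a minimal, self-contained sketch of the BytesRefArray calls the test exercises: append stores a copy of the value and returns the index it was stored at, and get copies the value at a given index into a reusable BytesRefBuilder. The class name BytesRefArrayDemo is invented for illustration; the Lucene calls themselves are the same ones used in the test above.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;

public class BytesRefArrayDemo {
    public static void main(String[] args) {
        // BytesRefArray tracks its memory usage via the supplied Counter.
        BytesRefArray array = new BytesRefArray(Counter.newCounter());
        int first = array.append(new BytesRef("banana")); // append returns the new entry's index (0 here)
        array.append(new BytesRef("apple"));              // stored at index 1
        // get() fills the spare builder and returns a BytesRef view of the entry.
        BytesRefBuilder spare = new BytesRefBuilder();
        BytesRef value = array.get(spare, first);
        System.out.println(value.utf8ToString()); // prints "banana"
    }
}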

Example 2 with BytesRefArray

Use of org.apache.lucene.util.BytesRefArray in the elasticsearch project by elastic.

From class CollectionUtilsTests, method testSortAndDedupByteRefArray:

public void testSortAndDedupByteRefArray() {
    SortedSet<BytesRef> set = new TreeSet<>();
    final int numValues = scaledRandomIntBetween(0, 10000);
    List<BytesRef> tmpList = new ArrayList<>();
    BytesRefArray array = new BytesRefArray(Counter.newCounter());
    for (int i = 0; i < numValues; i++) {
        String s = randomRealisticUnicodeOfCodepointLengthBetween(1, 100);
        set.add(new BytesRef(s));
        tmpList.add(new BytesRef(s));
        array.append(new BytesRef(s));
    }
    if (randomBoolean()) {
        Collections.shuffle(tmpList, random());
        for (BytesRef ref : tmpList) {
            array.append(ref);
        }
    }
    int[] indices = new int[array.size()];
    for (int i = 0; i < indices.length; i++) {
        indices[i] = i;
    }
    int numUnique = CollectionUtils.sortAndDedup(array, indices);
    assertThat(numUnique, equalTo(set.size()));
    Iterator<BytesRef> iterator = set.iterator();
    BytesRefBuilder spare = new BytesRefBuilder();
    for (int i = 0; i < numUnique; i++) {
        assertThat(iterator.hasNext(), is(true));
        assertThat(array.get(spare, indices[i]), equalTo(iterator.next()));
    }
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), TreeSet (java.util.TreeSet), ArrayList (java.util.ArrayList), BytesRef (org.apache.lucene.util.BytesRef)
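
The contract the test relies on: sortAndDedup reorders the indices array so the values it references come back in sorted order, and returns the number of unique values; only the first numUnique entries of indices are meaningful afterwards. Below is a hedged sketch of that idea over a plain List, not the elasticsearch implementation; the class and method names are invented for illustration.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

final class SortAndDedupSketch {
    // Sorts indices by the values they reference, then compacts so the
    // first n entries point at the unique values in sorted order.
    static int sortAndDedup(List<String> values, Integer[] indices) {
        Arrays.sort(indices, Comparator.comparing(values::get));
        int unique = 0;
        for (int i = 1; i < indices.length; i++) {
            if (!values.get(indices[i]).equals(values.get(indices[unique]))) {
                indices[++unique] = indices[i];
            }
        }
        return indices.length == 0 ? 0 : unique + 1;
    }
}

For example, with values = ["b", "a", "b"] and indices = [0, 1, 2], the sketch returns 2 and leaves indices starting with [1, 0], i.e. the positions of "a" and "b".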

Example 3 with BytesRefArray

Use of org.apache.lucene.util.BytesRefArray in the lucene-solr project by apache.

From class TokenStreamFromTermVector, method init:

// We delay initialization so that we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
        dpEnumFlags |= PostingsEnum.OFFSETS;
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
        //must ask for offsets too
        dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        payloadAttribute = getAttribute(PayloadAttribute.class);
        payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
        spareBytesRefBuilder = new BytesRefBuilder();
    }
    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    //7 is over-estimate of average term len
    termCharsBuilder.grow((int) (vector.size() * 7));
    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
    TokenLL[] positionedTokens = initTokensArray();
    int lastPosition = -1;
    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    //only for UTF8->UTF16 call
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {
        //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
        // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
        tempCharsRefBuilder.grow(termBytesRef.length);
        final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
        final int termCharsOff = termCharsBuilder.length();
        termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        // presumably checked by TokenSources.hasPositions earlier
        assert dpEnum != null;
        dpEnum.nextDoc();
        final int freq = dpEnum.freq();
        //sumFreq += freq;
        for (int j = 0; j < freq; j++) {
            int pos = dpEnum.nextPosition();
            TokenLL token = new TokenLL();
            token.termCharsOff = termCharsOff;
            token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
            if (offsetAttribute != null) {
                token.startOffset = dpEnum.startOffset();
                if (token.startOffset > maxStartOffset) {
                    //filter this token out; exceeds threshold
                    continue;
                }
                token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
                if (pos == -1) {
                    //divide by 8
                    pos = token.startOffset >> 3;
                }
            }
            if (payloadAttribute != null) {
                final BytesRef payload = dpEnum.getPayload();
                token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
            }
            //Add token to an array indexed by position
            if (positionedTokens.length <= pos) {
                //grow, but not 2x since we think our original length estimate is close
                TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
                System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
                positionedTokens = newPositionedTokens;
            }
            positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
            lastPosition = Math.max(lastPosition, pos);
        }
    }
    //    System.out.println(String.format(
    //        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
    //        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
    //        (originalPositionEstimate/(lastPosition + 1.0f))));
    // Step 2:  Link all Tokens into a linked-list and set position increments as we go
    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
        TokenLL token = positionedTokens[pos];
        if (token == null) {
            continue;
        }
        //link
        if (prevToken != null) {
            assert prevToken.next == null;
            //concatenate linked-list
            prevToken.next = token;
        } else {
            assert firstToken == null;
            firstToken = token;
        }
        //set increments
        if (vector.hasPositions()) {
            token.positionIncrement = pos - prevTokenPos;
            while (token.next != null) {
                token = token.next;
                token.positionIncrement = 0;
            }
        } else {
            token.positionIncrement = 1;
            while (token.next != null) {
                prevToken = token;
                token = token.next;
                if (prevToken.startOffset == token.startOffset) {
                    token.positionIncrement = 0;
                } else {
                    token.positionIncrement = 1;
                }
            }
        }
        prevTokenPos = pos;
        prevToken = token;
    }
    initialized = true;
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
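
To see why init() checks hasAttribute(PayloadAttribute.class) before materializing payloads, here is a hedged sketch of a typical consumer loop: attributes are requested from the stream before reset(), so a lazily initialized stream like the one above can tailor what it builds. The class and the consume helper are invented for illustration; the lifecycle calls (reset, incrementToken, end, close) are the standard Lucene TokenStream API.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamConsumerDemo {
    // Drains a TokenStream, printing each term with its character offsets.
    static void consume(TokenStream stream) throws IOException {
        // Request attributes up front; TokenStreamFromTermVector.init() keys off these.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
        try {
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offsets.startOffset() + ", " + offsets.endOffset() + ")");
            }
            stream.end();
        } finally {
            stream.close();
        }
    }
}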

Aggregations

BytesRef (org.apache.lucene.util.BytesRef): 3
BytesRefArray (org.apache.lucene.util.BytesRefArray): 3
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 3
ArrayList (java.util.ArrayList): 2
TreeSet (java.util.TreeSet): 1
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 1
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 1
PostingsEnum (org.apache.lucene.index.PostingsEnum): 1
TermsEnum (org.apache.lucene.index.TermsEnum): 1
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 1