Use of org.apache.lucene.util.BytesRefArray in project elasticsearch by elastic.
From the class CollectionUtilsTests, method testSortByteRefArray:
public void testSortByteRefArray() {
    List<BytesRef> values = new ArrayList<>();
    final int numValues = scaledRandomIntBetween(0, 10000);
    BytesRefArray array = new BytesRefArray(Counter.newCounter());
    for (int i = 0; i < numValues; i++) {
        String s = randomRealisticUnicodeOfCodepointLengthBetween(1, 100);
        values.add(new BytesRef(s));
        array.append(new BytesRef(s));
    }
    if (randomBoolean()) {
        Collections.shuffle(values, random());
    }
    int[] indices = new int[array.size()];
    for (int i = 0; i < indices.length; i++) {
        indices[i] = i;
    }
    CollectionUtils.sort(array, indices);
    Collections.sort(values);
    Iterator<BytesRef> iterator = values.iterator();
    BytesRefBuilder spare = new BytesRefBuilder();
    for (int i = 0; i < values.size(); i++) {
        assertThat(iterator.hasNext(), is(true));
        assertThat(array.get(spare, indices[i]), equalTo(iterator.next()));
    }
}
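For context, the test above exercises a small slice of the BytesRefArray API: the Counter passed to the constructor tracks bytes used, append copies a value in and returns the index of the new entry, and get fills a caller-supplied BytesRefBuilder and returns the entry at the given index. Below is a minimal standalone sketch of just those calls; the class name and values are illustrative, not part of the test.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;

public class BytesRefArrayBasics {
    public static void main(String[] args) {
        // the Counter tracks how many bytes the array has used
        BytesRefArray array = new BytesRefArray(Counter.newCounter());
        int first = array.append(new BytesRef("beta"));   // append returns the entry's index
        int second = array.append(new BytesRef("alpha"));

        // get copies the entry at the index into the spare builder and returns it
        BytesRefBuilder spare = new BytesRefBuilder();
        System.out.println(array.get(spare, first).utf8ToString());  // beta
        System.out.println(array.get(spare, second).utf8ToString()); // alpha
        System.out.println(array.size());                            // 2
    }
}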
Use of org.apache.lucene.util.BytesRefArray in project elasticsearch by elastic.
From the class CollectionUtilsTests, method testSortAndDedupByteRefArray:
public void testSortAndDedupByteRefArray() {
    SortedSet<BytesRef> set = new TreeSet<>();
    final int numValues = scaledRandomIntBetween(0, 10000);
    List<BytesRef> tmpList = new ArrayList<>();
    BytesRefArray array = new BytesRefArray(Counter.newCounter());
    for (int i = 0; i < numValues; i++) {
        String s = randomRealisticUnicodeOfCodepointLengthBetween(1, 100);
        set.add(new BytesRef(s));
        tmpList.add(new BytesRef(s));
        array.append(new BytesRef(s));
    }
    if (randomBoolean()) {
        Collections.shuffle(tmpList, random());
        for (BytesRef ref : tmpList) {
            array.append(ref);
        }
    }
    int[] indices = new int[array.size()];
    for (int i = 0; i < indices.length; i++) {
        indices[i] = i;
    }
    int numUnique = CollectionUtils.sortAndDedup(array, indices);
    assertThat(numUnique, equalTo(set.size()));
    Iterator<BytesRef> iterator = set.iterator();
    BytesRefBuilder spare = new BytesRefBuilder();
    for (int i = 0; i < numUnique; i++) {
        assertThat(iterator.hasNext(), is(true));
        assertThat(array.get(spare, indices[i]), equalTo(iterator.next()));
    }
}
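The assertions above pin down the contract this test expects from CollectionUtils.sortAndDedup: given an identity index array, it reorders the indices so they reference the stored values in sorted order, collapses duplicates, and returns the number of leading indices that point at distinct values. Here is a short sketch of that contract, assuming the same org.elasticsearch.common.util.CollectionUtils class the test targets; the inputs and expected output are illustrative.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;
import org.elasticsearch.common.util.CollectionUtils;

public class SortAndDedupSketch {
    public static void main(String[] args) {
        BytesRefArray array = new BytesRefArray(Counter.newCounter());
        for (String s : new String[] {"pear", "apple", "pear", "banana", "apple"}) {
            array.append(new BytesRef(s));
        }

        // identity mapping: indices[i] initially refers to entry i
        int[] indices = new int[array.size()];
        for (int i = 0; i < indices.length; i++) {
            indices[i] = i;
        }

        // after the call, the first numUnique indices reference the distinct values in sorted order
        int numUnique = CollectionUtils.sortAndDedup(array, indices);
        BytesRefBuilder spare = new BytesRefBuilder();
        for (int i = 0; i < numUnique; i++) {
            System.out.println(array.get(spare, indices[i]).utf8ToString());
        }
        // expected: apple, banana, pear (numUnique == 3)
    }
}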
Use of org.apache.lucene.util.BytesRefArray in project lucene-solr by apache.
From the class TokenStreamFromTermVector, method init:
// We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
        dpEnumFlags |= PostingsEnum.OFFSETS;
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
        dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS); // must ask for offsets too
        payloadAttribute = getAttribute(PayloadAttribute.class);
        payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
        spareBytesRefBuilder = new BytesRefBuilder();
    }

    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    termCharsBuilder.grow((int) (vector.size() * 7)); // 7 is over-estimate of average term len

    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
    TokenLL[] positionedTokens = initTokensArray();
    int lastPosition = -1;

    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder(); // only for UTF8->UTF16 call
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {
        // Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
        // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
        tempCharsRefBuilder.grow(termBytesRef.length);
        final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
        final int termCharsOff = termCharsBuilder.length();
        termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);

        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
        dpEnum.nextDoc();
        final int freq = dpEnum.freq();
        //sumFreq += freq;
        for (int j = 0; j < freq; j++) {
            int pos = dpEnum.nextPosition();
            TokenLL token = new TokenLL();
            token.termCharsOff = termCharsOff;
            token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
            if (offsetAttribute != null) {
                token.startOffset = dpEnum.startOffset();
                if (token.startOffset > maxStartOffset) {
                    continue; // filter this token out; exceeds threshold
                }
                token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
                if (pos == -1) {
                    // the term vector has no positions (nextPosition() returned -1), so
                    // approximate one from the character start offset; the >> 3 assumes
                    // roughly 8 chars per term
                    pos = token.startOffset >> 3;
                }
            }
            if (payloadAttribute != null) {
                final BytesRef payload = dpEnum.getPayload();
                token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
            }

            // Add token to an array indexed by position
            if (positionedTokens.length <= pos) {
                // grow, but not 2x since we think our original length estimate is close
                TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
                System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
                positionedTokens = newPositionedTokens;
            }
            positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);

            lastPosition = Math.max(lastPosition, pos);
        }
    }

    // System.out.println(String.format(
    //     "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
    //     sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
    //     (originalPositionEstimate/(lastPosition + 1.0f))));

    // Step 2: Link all Tokens into a linked-list and set position increments as we go
    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
        TokenLL token = positionedTokens[pos];
        if (token == null) {
            continue;
        }
        // link
        if (prevToken != null) {
            assert prevToken.next == null;
            prevToken.next = token; // concatenate linked-list
        } else {
            assert firstToken == null;
            firstToken = token;
        }
        // set increments
        if (vector.hasPositions()) {
            token.positionIncrement = pos - prevTokenPos;
            while (token.next != null) {
                token = token.next;
                token.positionIncrement = 0;
            }
        } else {
            token.positionIncrement = 1;
            while (token.next != null) {
                prevToken = token;
                token = token.next;
                if (prevToken.startOffset == token.startOffset) {
                    token.positionIncrement = 0;
                } else {
                    token.positionIncrement = 1;
                }
            }
        }
        prevTokenPos = pos;
        prevToken = token;
    }

    initialized = true;
}
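One detail of the payload handling above deserves a note: payloadsBytesRefArray.append(payload) returns an int index, so each token stores a small integer rather than the payload bytes themselves. The retrieval side is not shown in this snippet, but the stored index together with the shared spareBytesRefBuilder implies a round trip like the sketch below; the emit-time half is an assumption about how the class uses the index, and the class name and values are illustrative.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;

public class PayloadRoundTripSketch {
    public static void main(String[] args) {
        BytesRefArray payloads = new BytesRefArray(Counter.newCounter());

        // build time: store the payload bytes once and keep only the returned index
        // (tokens without a payload keep -1, as in init() above)
        int payloadIndex = payloads.append(new BytesRef(new byte[] {0x1, 0x2, 0x3}));

        // emit time: recover the payload from the index, reusing one spare builder
        BytesRefBuilder spare = new BytesRefBuilder();
        BytesRef payload = payloadIndex == -1 ? null : payloads.get(spare, payloadIndex);
        System.out.println(payload.length); // 3
    }
}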