Use of org.apache.lucene.util.BytesRefBuilder in the project lucene-solr by Apache: class CheckIndex, method checkTermRanges.
/** Make an effort to visit "fake" (e.g. auto-prefix) terms. We do this by running term range intersections across an initially wide
 * interval of terms, at different boundaries, and then gradually decrease the interval. This is not guaranteed to hit all non-real
 * terms (doing that in general is non-trivial), but it should hit many of them, and validate their postings against the postings for the
 * real terms. */
private static void checkTermRanges(String field, int maxDoc, Terms terms, long numTerms) throws IOException {
  // We'll target this many terms in our interval for the current level:
  double currentInterval = numTerms;

  // Scratch bitsets, reused by checkSingleTermRange across every range we test:
  FixedBitSet normalDocs = new FixedBitSet(maxDoc);
  FixedBitSet intersectDocs = new FixedBitSet(maxDoc);

  while (currentInterval >= 10.0) {
    //System.out.println("  cycle interval=" + currentInterval);

    // We iterate this terms enum to locate min/max term for each sliding/overlapping interval we test at the current level:
    TermsEnum termsEnum = terms.iterator();

    long termCount = 0;

    // Window of up to 5 evenly spaced boundary terms. ArrayDeque instead of LinkedList: same Deque
    // semantics for the operations used here (add/removeFirst/getLast), without per-node allocation.
    // Fully qualified because the file's import block may only import LinkedList:
    Deque<BytesRef> termBounds = new java.util.ArrayDeque<>();

    long lastTermAdded = Long.MIN_VALUE;

    BytesRefBuilder lastTerm = null;

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      //System.out.println("  top: term=" + term.utf8ToString());

      // Record a boundary term every currentInterval/4 terms, so consecutive test ranges overlap by ~3/4:
      if (termCount >= lastTermAdded + currentInterval / 4) {
        termBounds.add(BytesRef.deepCopyOf(term));
        lastTermAdded = termCount;
        if (termBounds.size() == 5) {
          // Full window: check the range [oldest boundary, newest boundary) and slide forward:
          BytesRef minTerm = termBounds.removeFirst();
          BytesRef maxTerm = termBounds.getLast();
          checkSingleTermRange(field, maxDoc, terms, minTerm, maxTerm, normalDocs, intersectDocs);
        }
      }
      termCount++;

      // Verify the enum returns terms in strictly increasing order:
      if (lastTerm == null) {
        lastTerm = new BytesRefBuilder();
        lastTerm.copyBytes(term);
      } else {
        if (lastTerm.get().compareTo(term) >= 0) {
          throw new RuntimeException("terms out of order: lastTerm=" + lastTerm.get() + " term=" + term);
        }
        lastTerm.copyBytes(term);
      }
    }

    // Flush the final partial window, using the very last term seen as the upper bound:
    if (lastTerm != null && termBounds.isEmpty() == false) {
      BytesRef minTerm = termBounds.removeFirst();
      BytesRef maxTerm = lastTerm.get();
      checkSingleTermRange(field, maxDoc, terms, minTerm, maxTerm, normalDocs, intersectDocs);
    }

    // Shrink the interval for the next, finer-grained pass:
    currentInterval *= .75;
  }
}
Use of org.apache.lucene.util.BytesRefBuilder in the project lucene-solr by Apache: class CheckIndex, method getDocsFromTermRange.
/** Visits all terms in the range minTerm (inclusive) to maxTerm (exclusive), marking all doc IDs encountered into docsSeen, and
 * returning the total number of terms visited. For a "normal" enum the caller must have already positioned it on the first term
 * at or after minTerm; an intersect enum is consumed purely via next(). */
private static long getDocsFromTermRange(String field, int maxDoc, TermsEnum termsEnum, FixedBitSet docsSeen, BytesRef minTerm, BytesRef maxTerm, boolean isIntersect) throws IOException {
// Reset the caller-provided bitset so it only reflects docs seen during this range:
docsSeen.clear(0, docsSeen.length());
long termCount = 0;
// Reused across terms to avoid re-allocating the postings enum each iteration:
PostingsEnum postingsEnum = null;
// Tracks the previously returned term so we can verify strictly-increasing order:
BytesRefBuilder lastTerm = null;
while (true) {
BytesRef term;
// Kinda messy: for intersect, we must first next(), but for "normal", we are already on our first term:
if (isIntersect || termCount != 0) {
term = termsEnum.next();
} else {
term = termsEnum.term();
}
if (term == null) {
// An intersect enum may legitimately exhaust before maxTerm; a normal enum must land on maxTerm exactly:
if (isIntersect == false) {
throw new RuntimeException("didn't see max term field=" + field + " term=" + maxTerm);
}
//System.out.println(" terms=" + termCount);
return termCount;
}
assert term.isValid();
// Verify terms arrive in strictly increasing order:
if (lastTerm == null) {
lastTerm = new BytesRefBuilder();
lastTerm.copyBytes(term);
} else {
if (lastTerm.get().compareTo(term) >= 0) {
throw new RuntimeException("terms out of order: lastTerm=" + lastTerm.get() + " term=" + term);
}
lastTerm.copyBytes(term);
}
// Caller already ensured terms enum positioned >= minTerm:
if (term.compareTo(minTerm) < 0) {
throw new RuntimeException("saw term before min term field=" + field + " term=" + minTerm);
}
if (isIntersect == false) {
// Normal enum: stop (exclusive) when we reach maxTerm; overshooting it means the enum skipped it:
int cmp = term.compareTo(maxTerm);
if (cmp == 0) {
//System.out.println(" terms=" + termCount);
return termCount;
} else if (cmp > 0) {
throw new RuntimeException("didn't see end term field=" + field + " term=" + maxTerm);
}
}
// Walk this term's postings, validating doc IDs are strictly increasing and within [0, maxDoc):
postingsEnum = termsEnum.postings(postingsEnum, 0);
int lastDoc = -1;
while (true) {
int doc = postingsEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
}
if (doc >= maxDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
}
//System.out.println(" doc=" + doc);
docsSeen.set(doc);
lastDoc = doc;
}
termCount++;
}
}
Use of org.apache.lucene.util.BytesRefBuilder in the project lucene-solr by Apache: class TestLucene70DocValuesFormat, method testSortedSetAroundBlockSize.
@Slow
// Tests SORTED_SET doc values with doc counts just below, at, and above the direct-monotonic
// block-size boundary, where block-transition encoding bugs would show up.
public void testSortedSetAroundBlockSize() throws IOException {
final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
final Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
// Side buffer recording the expected per-doc value sets, replayed after indexing to verify:
RAMFile buffer = new RAMFile();
RAMOutputStream out = new RAMOutputStream(buffer, false);
Document doc = new Document();
// Two values per doc in the same SORTED_SET field:
SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
doc.add(field1);
SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
doc.add(field2);
for (int i = 0; i < maxDoc; ++i) {
BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
field1.setBytesValue(s1);
field2.setBytesValue(s2);
w.addDocument(doc);
// TreeSet dedupes and sorts, mirroring SORTED_SET semantics; write size then each value:
Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
out.writeVInt(set.size());
for (BytesRef ref : set) {
out.writeVInt(ref.length);
out.writeBytes(ref.bytes, ref.offset, ref.length);
}
}
out.close();
// Force a single segment so we read exactly one leaf below:
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w);
w.close();
LeafReader sr = getOnlyLeafReader(r);
assertEquals(maxDoc, sr.maxDoc());
SortedSetDocValues values = sr.getSortedSetDocValues("sset");
assertNotNull(values);
// Replay the recorded expected values and compare against what the codec returns:
RAMInputStream in = new RAMInputStream("", buffer);
BytesRefBuilder b = new BytesRefBuilder();
for (int i = 0; i < maxDoc; ++i) {
assertEquals(i, values.nextDoc());
final int numValues = in.readVInt();
for (int j = 0; j < numValues; ++j) {
b.setLength(in.readVInt());
b.grow(b.length());
in.readBytes(b.bytes(), 0, b.length());
// Ords must come back in the same (sorted) order the TreeSet wrote the values:
assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
}
assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
}
r.close();
dir.close();
}
}
Use of org.apache.lucene.util.BytesRefBuilder in the project lucene-solr by Apache: class SimpleTextDocValuesReader, method checkIntegrity.
@Override
public void checkIntegrity() throws IOException {
  // Read every line of the data file through a checksumming wrapper, then verify the footer.
  BytesRefBuilder lineScratch = new BytesRefBuilder();
  IndexInput cloned = data.clone();
  cloned.seek(0);

  // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included in SimpleTextUtil.CHECKSUM):
  long expectedFooterStart = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
  ChecksumIndexInput checksumInput = new BufferedChecksumIndexInput(cloned);

  // Consume lines until the file pointer reaches (or passes) where the footer should begin:
  long pos;
  do {
    SimpleTextUtil.readLine(checksumInput, lineScratch);
    pos = checksumInput.getFilePointer();
  } while (pos < expectedFooterStart);

  // Make sure we landed at precisely the right location:
  if (pos != expectedFooterStart) {
    throw new CorruptIndexException("SimpleText failure: footer does not start at expected position current=" + pos + " vs expected=" + expectedFooterStart, checksumInput);
  }
  SimpleTextUtil.checkFooter(checksumInput);
}
Use of org.apache.lucene.util.BytesRefBuilder in the project lucene-solr by Apache: class SimpleTextDocValuesReader, method getSorted.
@Override
// Returns a SortedDocValues view over this field's plain-text data: a values section
// (numValues fixed-width entries) followed by one fixed-width ord entry per document.
public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
// Decoders for the fixed-width text encodings of value lengths and ords:
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
final DecimalFormat ordDecoder = new DecimalFormat(field.ordPattern, new DecimalFormatSymbols(Locale.ROOT));
return new SortedDocValues() {
// Current docID, or -1 before first advance:
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
// Ord of the current doc; -1 in the file's encoding means "no value" (stored as ord+1):
int ord;
@Override
public int advance(int target) throws IOException {
// Linear scan from target: read each doc's ord entry until we find a doc with a value.
for (int i = target; i < maxDoc; ++i) {
// Skip past the values section (numValues entries; 9 presumably covers the per-entry
// prefix/newline overhead — TODO confirm against the writer), then index into the
// per-doc ord entries of (1 + ordPattern.length()) bytes each:
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + i * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
try {
// File stores ord+1 so that 0 can mean "missing"; decode and shift back:
ord = (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue() - 1;
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse ord", in, pe);
}
if (ord >= 0) {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
public boolean advanceExact(int target) throws IOException {
this.doc = target;
// Same seek arithmetic as advance(), but positioned exactly on target's ord entry:
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + target * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
try {
ord = (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue() - 1;
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse ord", in, pe);
}
// True only if this doc actually has a value:
return ord >= 0;
}
@Override
public int ordValue() {
return ord;
}
// Reused buffer for lookupOrd results; callers must not hold the returned BytesRef across calls:
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public BytesRef lookupOrd(int ord) throws IOException {
if (ord < 0 || ord >= field.numValues) {
throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues - 1) + "; got " + ord);
}
// Seek to this ord's entry in the values section; each entry is a LENGTH line plus the value bytes:
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH) : "got " + scratch.get().utf8ToString() + " in=" + in;
int len;
try {
// Parse the fixed-width length that follows the LENGTH prefix:
len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
}
@Override
public int getValueCount() {
return (int) field.numValues;
}
};
}
Aggregations