Use of org.apache.lucene.util.bkd.HeapPointWriter in project lucene-solr by apache.
The class SimpleTextBKDWriter, method build.
/** The array (sized numDims) of PathSlice describes the cell we have currently recursed to. */
private void build(int nodeID, int leafNodeOffset,
                   PathSlice[] slices,
                   LongBitSet ordBitSet,
                   IndexOutput out,
                   byte[] minPackedValue, byte[] maxPackedValue,
                   byte[] splitPackedValues,
                   long[] leafBlockFPs,
                   List<Closeable> toCloseHeroically) throws IOException {

  for (PathSlice slice : slices) {
    assert slice.count == slices[0].count;
  }

  if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
    // Special case for 1D, to cutover to heap once we recurse deeply enough:
    slices[0] = switchToHeap(slices[0], toCloseHeroically);
  }

  if (nodeID >= leafNodeOffset) {

    // Leaf node: write block
    // We can write the block in any order so by default we write it sorted by the dimension that has the
    // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient
    int sortedDim = 0;
    int sortedDimCardinality = Integer.MAX_VALUE;

    for (int dim = 0; dim < numDims; dim++) {
      if (slices[dim].writer instanceof HeapPointWriter == false) {
        // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
        // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
        slices[dim] = switchToHeap(slices[dim], toCloseHeroically);
      }

      PathSlice source = slices[dim];

      HeapPointWriter heapSource = (HeapPointWriter) source.writer;

      // Find common prefix by comparing first and last values, already sorted in this dimension:
      heapSource.readPackedValue(Math.toIntExact(source.start), scratch1);
      heapSource.readPackedValue(Math.toIntExact(source.start + source.count - 1), scratch2);

      int offset = dim * bytesPerDim;
      commonPrefixLengths[dim] = bytesPerDim;
      for (int j = 0; j < bytesPerDim; j++) {
        if (scratch1[offset + j] != scratch2[offset + j]) {
          commonPrefixLengths[dim] = j;
          break;
        }
      }

      int prefix = commonPrefixLengths[dim];
      if (prefix < bytesPerDim) {
        int cardinality = 1;
        byte previous = scratch1[offset + prefix];
        for (long i = 1; i < source.count; ++i) {
          heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2);
          byte b = scratch2[offset + prefix];
          assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
          if (b != previous) {
            cardinality++;
            previous = b;
          }
        }
        assert cardinality <= 256;
        if (cardinality < sortedDimCardinality) {
          sortedDim = dim;
          sortedDimCardinality = cardinality;
        }
      }
    }

    PathSlice source = slices[sortedDim];

    // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
    HeapPointWriter heapSource = (HeapPointWriter) source.writer;

    // Save the block file pointer:
    leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
    //System.out.println("  write leaf block @ fp=" + out.getFilePointer());

    // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
    // loading the values:
    int count = Math.toIntExact(source.count);
    assert count > 0 : "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
    writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count);

    // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us
    // from the index, much like how terms dict does so from the FST:

    // Write the full values:
    IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
      final BytesRef scratch = new BytesRef();

      {
        scratch.length = packedBytesLength;
      }

      @Override
      public BytesRef apply(int i) {
        heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch);
        return scratch;
      }
    };
    assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
        heapSource.docIDs, Math.toIntExact(source.start));
    writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);

  } else {
    // Inner node: partition/recurse

    int splitDim;
    if (numDims > 1) {
      splitDim = split(minPackedValue, maxPackedValue);
    } else {
      splitDim = 0;
    }

    PathSlice source = slices[splitDim];

    assert nodeID < splitPackedValues.length : "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;

    // How many points will be in the left tree:
    long rightCount = source.count / 2;
    long leftCount = source.count - rightCount;

    byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet);

    int address = nodeID * (1 + bytesPerDim);
    splitPackedValues[address] = (byte) splitDim;
    System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);

    // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:

    PathSlice[] leftSlices = new PathSlice[numDims];
    PathSlice[] rightSlices = new PathSlice[numDims];

    byte[] minSplitPackedValue = new byte[packedBytesLength];
    System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedBytesLength);

    byte[] maxSplitPackedValue = new byte[packedBytesLength];
    System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedBytesLength);

    // When we are on this dim, below, we clear the ordBitSet:
    int dimToClear;
    if (numDims - 1 == splitDim) {
      dimToClear = numDims - 2;
    } else {
      dimToClear = numDims - 1;
    }

    for (int dim = 0; dim < numDims; dim++) {

      if (dim == splitDim) {
        // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we
        // will re-use its shared reader when visiting it as we recurse:
        leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount);
        rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount);
        System.arraycopy(splitValue, 0, minSplitPackedValue, dim * bytesPerDim, bytesPerDim);
        System.arraycopy(splitValue, 0, maxSplitPackedValue, dim * bytesPerDim, bytesPerDim);
        continue;
      }
      // Not inside the try because we don't want to close this one now, so that after recursion is done,
      // we will have done a single full sweep of the file:
      PointReader reader = slices[dim].writer.getSharedReader(slices[dim].start, slices[dim].count, toCloseHeroically);

      try (PointWriter leftPointWriter = getPointWriter(leftCount, "left" + dim);
           PointWriter rightPointWriter = getPointWriter(source.count - leftCount, "right" + dim)) {

        long nextRightCount = reader.split(source.count, ordBitSet, leftPointWriter, rightPointWriter, dim == dimToClear);
        if (rightCount != nextRightCount) {
          throw new IllegalStateException("wrong number of points in split: expected=" + rightCount + " but actual=" + nextRightCount);
        }

        leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
        rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
      } catch (Throwable t) {
        throw verifyChecksum(t, slices[dim].writer);
      }
    }

    // Recurse on left tree:
    build(2 * nodeID, leafNodeOffset, leftSlices, ordBitSet, out,
        minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, toCloseHeroically);
    for (int dim = 0; dim < numDims; dim++) {
      // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
      if (dim != splitDim) {
        leftSlices[dim].writer.destroy();
      }
    }

    // TODO: we could "tail recurse" here? have our parent discard its refs as we recurse right?

    // Recurse on right tree:
    build(2 * nodeID + 1, leafNodeOffset, rightSlices, ordBitSet, out,
        minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, toCloseHeroically);
    for (int dim = 0; dim < numDims; dim++) {
      // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
      if (dim != splitDim) {
        rightSlices[dim].writer.destroy();
      }
    }
  }
}
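The leaf branch of build picks sortedDim by computing, per dimension, the common prefix of the first and last (already sorted) values and then counting the distinct bytes at the first position past that prefix. A minimal, self-contained sketch of that selection logic for a single dimension (class and variable names are invented for illustration, not part of lucene-solr):

// Sketch only: mirrors the common-prefix and cardinality loops in build().
class SortedDimDemo {
  public static void main(String[] args) {
    int bytesPerDim = 4;
    // One dimension's values for four points, already sorted, one row per point:
    byte[][] values = {
        {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 3}, {0, 0, 0, 7},
    };

    // Common prefix of the first and last value (sorted, so they bound the rest):
    int prefix = bytesPerDim;
    for (int j = 0; j < bytesPerDim; j++) {
      if (values[0][j] != values[values.length - 1][j]) {
        prefix = j;
        break;
      }
    }

    // Count distinct bytes at the first position past the prefix; since the
    // values are sorted, counting transitions counts distinct bytes. The real
    // code guards the prefix == bytesPerDim case (all values identical) too:
    int cardinality = 1;
    if (prefix < bytesPerDim) {
      for (int i = 1; i < values.length; i++) {
        if (values[i][prefix] != values[i - 1][prefix]) {
          cardinality++;
        }
      }
    }
    System.out.println("prefix=" + prefix + " cardinality=" + cardinality);
    // prints prefix=3 cardinality=3  (distinct bytes 1, 3, 7)
  }
}

The dimension with the smallest such cardinality has the longest runs of equal bytes in the sorted leaf block, which is why the comment in build says writing the block in that order makes compression more efficient.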
Use of org.apache.lucene.util.bkd.HeapPointWriter in project lucene-solr by apache.
The class SimpleTextBKDWriter, method switchToHeap.
/** Pull a partition back into heap once the point count is low enough while recursing. */
private PathSlice switchToHeap(PathSlice source, List<Closeable> toCloseHeroically) throws IOException {
  int count = Math.toIntExact(source.count);
  // Not inside the try because we don't want to close it here:
  PointReader reader = source.writer.getSharedReader(source.start, source.count, toCloseHeroically);
  try (PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc)) {
    for (int i = 0; i < count; i++) {
      boolean hasNext = reader.next();
      assert hasNext;
      writer.append(reader.packedValue(), reader.ord(), reader.docID());
    }
    return new PathSlice(writer, 0, count);
  } catch (Throwable t) {
    throw verifyChecksum(t, source.writer);
  }
}
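Both build and switchToHeap hand their shared readers to the toCloseHeroically list instead of closing them in a try block, so each temp file is swept once across the whole recursion and everything is closed in bulk afterwards. A small sketch of that deferred-close pattern, assuming only java.io; every name here is invented for the demo and is not the lucene-solr API:

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class DeferredCloseDemo {
  public static void main(String[] args) throws IOException {
    List<Closeable> toCloseHeroically = new ArrayList<>();
    try {
      // Instead of try-with-resources per reader, register it for later,
      // so left and right subtrees can reuse the same open reader:
      Closeable sharedReader = () -> System.out.println("reader closed");
      toCloseHeroically.add(sharedReader);
      // ... recurse over the tree, reading from sharedReader as needed ...
    } finally {
      // One close sweep at the very end, keeping the first failure:
      IOException first = null;
      for (Closeable c : toCloseHeroically) {
        try {
          c.close();
        } catch (IOException e) {
          if (first == null) first = e;
        }
      }
      if (first != null) throw first;
    }
  }
}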
Use of org.apache.lucene.util.bkd.HeapPointWriter in project lucene-solr by apache.
The class SimpleTextBKDWriter, method sort.
private PointWriter sort(int dim) throws IOException {
  assert dim >= 0 && dim < numDims;

  if (heapPointWriter != null) {

    assert tempInput == null;

    // We never spilled the incoming points to disk, so now we sort in heap:
    HeapPointWriter sorted;
    if (dim == 0) {
      // First dim can re-use the current heap writer
      sorted = heapPointWriter;
    } else {
      // Subsequent dims need a private copy
      sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc);
      sorted.copyFrom(heapPointWriter);
    }

    //long t0 = System.nanoTime();
    sortHeapPointWriter(sorted, dim);
    //long t1 = System.nanoTime();
    //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec");

    sorted.close();
    return sorted;
  } else {

    // Offline sort:
    assert tempInput != null;

    final int offset = bytesPerDim * dim;

    Comparator<BytesRef> cmp;
    if (dim == numDims - 1) {
      // in that case the bytes for the dimension and for the doc id are contiguous,
      // so we don't need a branch
      cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {
        @Override
        protected int byteAt(BytesRef ref, int i) {
          return ref.bytes[ref.offset + offset + i] & 0xff;
        }
      };
    } else {
      cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {
        @Override
        protected int byteAt(BytesRef ref, int i) {
          if (i < bytesPerDim) {
            return ref.bytes[ref.offset + offset + i] & 0xff;
          } else {
            return ref.bytes[ref.offset + packedBytesLength + i - bytesPerDim] & 0xff;
          }
        }
      };
    }

    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {

      /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
      @Override
      protected ByteSequencesWriter getWriter(IndexOutput out, long count) {
        return new ByteSequencesWriter(out) {
          @Override
          public void write(byte[] bytes, int off, int len) throws IOException {
            assert len == bytesPerDoc : "len=" + len + " bytesPerDoc=" + bytesPerDoc;
            out.writeBytes(bytes, off, len);
          }
        };
      }

      /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
      @Override
      protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
        return new ByteSequencesReader(in, name) {
          final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);
          @Override
          public BytesRef next() throws IOException {
            if (in.getFilePointer() >= end) {
              return null;
            }
            in.readBytes(scratch.bytes, 0, bytesPerDoc);
            return scratch;
          }
        };
      }
    };

    String name = sorter.sort(tempInput.getName());

    return new OfflinePointWriter(tempDir, name, packedBytesLength, pointCount, longOrds, singleValuePerDoc);
  }
}
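The two anonymous BytesRefComparators above index into a fixed-width record: the packed value comes first, and the bytes immediately after it (the docID when no ord is stored) serve as a tiebreaker. A self-contained sketch of that byteAt index mapping, using invented constants and plain byte arrays in place of BytesRef:

import java.util.Arrays;
import java.util.Comparator;

// Sketch only: demonstrates the index arithmetic of byteAt() in sort(int dim).
// Record layout assumed here: packedValue (numDims * bytesPerDim bytes)
// followed by a 4-byte big-endian docID.
class OfflineSortLayoutDemo {
  static final int numDims = 2, bytesPerDim = 2;
  static final int packedBytesLength = numDims * bytesPerDim;

  static Comparator<byte[]> byDim(final int dim) {
    final int offset = dim * bytesPerDim;
    return (a, b) -> {
      for (int i = 0; i < bytesPerDim + Integer.BYTES; i++) {
        // Same mapping as byteAt() above: the dimension's bytes first, then
        // the bytes that immediately follow the packed value (the docID here):
        int idx = i < bytesPerDim ? offset + i : packedBytesLength + i - bytesPerDim;
        int cmp = Integer.compare(a[idx] & 0xff, b[idx] & 0xff);
        if (cmp != 0) return cmp;
      }
      return 0;
    };
  }

  public static void main(String[] args) {
    byte[] r1 = {0, 5, 0, 9, 0, 0, 0, 2}; // dim0=5, dim1=9, docID=2
    byte[] r2 = {0, 7, 0, 9, 0, 0, 0, 1}; // dim0=7, dim1=9, docID=1
    byte[][] records = {r1, r2};
    Arrays.sort(records, byDim(1)); // equal on dim1, so docID breaks the tie
    System.out.println("first docID after sort: " + records[0][7]); // prints 1
  }
}

Sorting on the last dimension can skip the branch entirely because that dimension's bytes and the tiebreaker bytes are contiguous, exactly as the comment in sort() notes.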