
Example 1 with HeapPointWriter

use of org.apache.lucene.util.bkd.HeapPointWriter in project lucene-solr by apache.

the class SimpleTextBKDWriter method build.

/** The array (sized numDims) of PathSlice describes the cell we have currently recursed to. */
private void build(int nodeID, int leafNodeOffset, PathSlice[] slices, LongBitSet ordBitSet, IndexOutput out, byte[] minPackedValue, byte[] maxPackedValue, byte[] splitPackedValues, long[] leafBlockFPs, List<Closeable> toCloseHeroically) throws IOException {
    for (PathSlice slice : slices) {
        assert slice.count == slices[0].count;
    }
    if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
        // Special case for 1D, to cutover to heap once we recurse deeply enough:
        slices[0] = switchToHeap(slices[0], toCloseHeroically);
    }
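    // nodeIDs encode an implicit balanced binary tree: node n has children
    // 2*n and 2*n+1 (see the recursive calls below), and nodeIDs >= leafNodeOffset are leaves.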
    if (nodeID >= leafNodeOffset) {
        // Leaf node: write block
        // We can write the block in any order so by default we write it sorted by the dimension that has the
        // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient
        int sortedDim = 0;
        int sortedDimCardinality = Integer.MAX_VALUE;
        for (int dim = 0; dim < numDims; dim++) {
            if (slices[dim].writer instanceof HeapPointWriter == false) {
                // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
                // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
                slices[dim] = switchToHeap(slices[dim], toCloseHeroically);
            }
            PathSlice source = slices[dim];
            HeapPointWriter heapSource = (HeapPointWriter) source.writer;
            // Find common prefix by comparing first and last values, already sorted in this dimension:
            heapSource.readPackedValue(Math.toIntExact(source.start), scratch1);
            heapSource.readPackedValue(Math.toIntExact(source.start + source.count - 1), scratch2);
            int offset = dim * bytesPerDim;
            commonPrefixLengths[dim] = bytesPerDim;
            for (int j = 0; j < bytesPerDim; j++) {
                if (scratch1[offset + j] != scratch2[offset + j]) {
                    commonPrefixLengths[dim] = j;
                    break;
                }
            }
            int prefix = commonPrefixLengths[dim];
            if (prefix < bytesPerDim) {
                int cardinality = 1;
                byte previous = scratch1[offset + prefix];
                for (long i = 1; i < source.count; ++i) {
                    heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2);
                    byte b = scratch2[offset + prefix];
                    assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
                    if (b != previous) {
                        cardinality++;
                        previous = b;
                    }
                }
                assert cardinality <= 256;
                if (cardinality < sortedDimCardinality) {
                    sortedDim = dim;
                    sortedDimCardinality = cardinality;
                }
            }
        }
        PathSlice source = slices[sortedDim];
        // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
        HeapPointWriter heapSource = (HeapPointWriter) source.writer;
        // Save the block file pointer:
        leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
        //System.out.println("  write leaf block @ fp=" + out.getFilePointer());
        // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
        // loading the values:
        int count = Math.toIntExact(source.count);
        assert count > 0 : "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
        writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count);
        // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us
        // from the index, much like how terms dict does so from the FST:
        // Write the full values:
        IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {

            final BytesRef scratch = new BytesRef();

            {
                scratch.length = packedBytesLength;
            }

            @Override
            public BytesRef apply(int i) {
                heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch);
                return scratch;
            }
        };
        assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues, heapSource.docIDs, Math.toIntExact(source.start));
        writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
    } else {
        // Inner node: partition/recurse
        int splitDim;
        if (numDims > 1) {
            splitDim = split(minPackedValue, maxPackedValue);
        } else {
            splitDim = 0;
        }
        PathSlice source = slices[splitDim];
        assert nodeID < splitPackedValues.length : "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
        // How many points will be in the left tree:
        long rightCount = source.count / 2;
        long leftCount = source.count - rightCount;
        byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet);
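        // Each inner node's entry in splitPackedValues is one byte for the split
        // dimension followed by bytesPerDim bytes for the split value: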
        int address = nodeID * (1 + bytesPerDim);
        splitPackedValues[address] = (byte) splitDim;
        System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);
        // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:
        PathSlice[] leftSlices = new PathSlice[numDims];
        PathSlice[] rightSlices = new PathSlice[numDims];
        byte[] minSplitPackedValue = new byte[packedBytesLength];
        System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedBytesLength);
        byte[] maxSplitPackedValue = new byte[packedBytesLength];
        System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedBytesLength);
        // When we are on this dim, below, we clear the ordBitSet:
        int dimToClear;
        if (numDims - 1 == splitDim) {
            dimToClear = numDims - 2;
        } else {
            dimToClear = numDims - 1;
        }
        for (int dim = 0; dim < numDims; dim++) {
            if (dim == splitDim) {
                // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we
                // will re-use its shared reader when visiting it as we recurse:
                leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount);
                rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount);
                System.arraycopy(splitValue, 0, minSplitPackedValue, dim * bytesPerDim, bytesPerDim);
                System.arraycopy(splitValue, 0, maxSplitPackedValue, dim * bytesPerDim, bytesPerDim);
                continue;
            }
            // Not inside the try because we don't want to close this one now, so that after recursion is done,
            // we will have done a single full sweep of the file:
            PointReader reader = slices[dim].writer.getSharedReader(slices[dim].start, slices[dim].count, toCloseHeroically);
            try (PointWriter leftPointWriter = getPointWriter(leftCount, "left" + dim);
                PointWriter rightPointWriter = getPointWriter(source.count - leftCount, "right" + dim)) {
                long nextRightCount = reader.split(source.count, ordBitSet, leftPointWriter, rightPointWriter, dim == dimToClear);
                if (rightCount != nextRightCount) {
                    throw new IllegalStateException("wrong number of points in split: expected=" + rightCount + " but actual=" + nextRightCount);
                }
                leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
                rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
            } catch (Throwable t) {
                throw verifyChecksum(t, slices[dim].writer);
            }
        }
        // Recurse on left tree:
        build(2 * nodeID, leafNodeOffset, leftSlices, ordBitSet, out, minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, toCloseHeroically);
        for (int dim = 0; dim < numDims; dim++) {
            // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
            if (dim != splitDim) {
                leftSlices[dim].writer.destroy();
            }
        }
        // TODO: we could "tail recurse" here?  have our parent discard its refs as we recurse right?
        // Recurse on right tree:
        build(2 * nodeID + 1, leafNodeOffset, rightSlices, ordBitSet, out, minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, toCloseHeroically);
        for (int dim = 0; dim < numDims; dim++) {
            // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
            if (dim != splitDim) {
                rightSlices[dim].writer.destroy();
            }
        }
    }
}
Also used : OfflinePointWriter(org.apache.lucene.util.bkd.OfflinePointWriter) HeapPointWriter(org.apache.lucene.util.bkd.HeapPointWriter) PointWriter(org.apache.lucene.util.bkd.PointWriter) PointReader(org.apache.lucene.util.bkd.PointReader) OfflinePointReader(org.apache.lucene.util.bkd.OfflinePointReader) IntFunction(java.util.function.IntFunction) BytesRef(org.apache.lucene.util.BytesRef)
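
In the leaf case above, the dimension used to order the block is the one whose sorted values share the longest common prefix and then show the fewest distinct bytes at the first position past it. A minimal standalone sketch of that heuristic (the helper names are hypothetical, not Lucene API; rows are assumed pre-sorted in the dimension being examined):

// Hypothetical helper mirroring the sortedDim selection above. `first` and
// `last` are the first and last packed values of the (sorted) leaf block.
static int commonPrefixLength(byte[] first, byte[] last, int offset, int bytesPerDim) {
    for (int j = 0; j < bytesPerDim; j++) {
        if (first[offset + j] != last[offset + j]) {
            return j;
        }
    }
    return bytesPerDim;
}

// Counts distinct bytes at position offset+prefix across sorted rows; because
// the rows are sorted, every change of byte value marks a new distinct value.
static int cardinalityAfterPrefix(byte[][] sortedRows, int offset, int prefix) {
    int cardinality = 1;
    byte previous = sortedRows[0][offset + prefix];
    for (int i = 1; i < sortedRows.length; i++) {
        byte b = sortedRows[i][offset + prefix];
        if (b != previous) {
            cardinality++;
            previous = b;
        }
    }
    // At most 256 distinct byte values are possible:
    return cardinality;
}

The dimension with the lowest such cardinality yields the longest runs of equal leading bytes, which is what makes writeLeafBlockPackedValues compress well.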

Example 2 with HeapPointWriter

use of org.apache.lucene.util.bkd.HeapPointWriter in project lucene-solr by apache.

the class SimpleTextBKDWriter method switchToHeap.

/** Pull a partition back into heap once the point count is low enough while recursing. */
private PathSlice switchToHeap(PathSlice source, List<Closeable> toCloseHeroically) throws IOException {
    int count = Math.toIntExact(source.count);
    // Not inside the try because we don't want to close it here:
    PointReader reader = source.writer.getSharedReader(source.start, source.count, toCloseHeroically);
    try (PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc)) {
        for (int i = 0; i < count; i++) {
            boolean hasNext = reader.next();
            assert hasNext;
            writer.append(reader.packedValue(), reader.ord(), reader.docID());
        }
        return new PathSlice(writer, 0, count);
    } catch (Throwable t) {
        throw verifyChecksum(t, source.writer);
    }
}
Also used : PointWriter(org.apache.lucene.util.bkd.PointWriter) OfflinePointWriter(org.apache.lucene.util.bkd.OfflinePointWriter) HeapPointWriter(org.apache.lucene.util.bkd.HeapPointWriter) PointReader(org.apache.lucene.util.bkd.PointReader) OfflinePointReader(org.apache.lucene.util.bkd.OfflinePointReader)
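
Both examples pass around PathSlice values, whose definition is not shown here. Reconstructed from its usage (a three-argument constructor and writer/start/count fields), it is a plain holder describing the window [start, start + count) of points within a writer; a sketch under that assumption:

// Sketch of PathSlice as implied by the examples above: a writer plus the
// window of `count` points beginning at `start`. Consult the Lucene source
// for the authoritative definition.
private static final class PathSlice {
    final PointWriter writer;
    final long start;
    final long count;

    PathSlice(PointWriter writer, long start, long count) {
        this.writer = writer;
        this.start = start;
        this.count = count;
    }
}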

Example 3 with HeapPointWriter

use of org.apache.lucene.util.bkd.HeapPointWriter in project lucene-solr by apache.

the class SimpleTextBKDWriter method sort.

private PointWriter sort(int dim) throws IOException {
    assert dim >= 0 && dim < numDims;
    if (heapPointWriter != null) {
        assert tempInput == null;
        // We never spilled the incoming points to disk, so now we sort in heap:
        HeapPointWriter sorted;
        if (dim == 0) {
            // First dim can re-use the current heap writer
            sorted = heapPointWriter;
        } else {
            // Subsequent dims need a private copy
            sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc);
            sorted.copyFrom(heapPointWriter);
        }
        //long t0 = System.nanoTime();
        sortHeapPointWriter(sorted, dim);
        //long t1 = System.nanoTime();
        //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec");
        sorted.close();
        return sorted;
    } else {
        // Offline sort:
        assert tempInput != null;
        final int offset = bytesPerDim * dim;
        Comparator<BytesRef> cmp;
        if (dim == numDims - 1) {
            // in that case the bytes for the dimension and for the doc id are contiguous,
            // so we don't need a branch
            cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {

                @Override
                protected int byteAt(BytesRef ref, int i) {
                    return ref.bytes[ref.offset + offset + i] & 0xff;
                }
            };
        } else {
            cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {

                @Override
                protected int byteAt(BytesRef ref, int i) {
                    if (i < bytesPerDim) {
                        return ref.bytes[ref.offset + offset + i] & 0xff;
                    } else {
                        return ref.bytes[ref.offset + packedBytesLength + i - bytesPerDim] & 0xff;
                    }
                }
            };
        }
        OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {

            /** We write/read a fixed-byte-width file that {@link OfflinePointReader} can read. */
            @Override
            protected ByteSequencesWriter getWriter(IndexOutput out, long count) {
                return new ByteSequencesWriter(out) {

                    @Override
                    public void write(byte[] bytes, int off, int len) throws IOException {
                        assert len == bytesPerDoc : "len=" + len + " bytesPerDoc=" + bytesPerDoc;
                        out.writeBytes(bytes, off, len);
                    }
                };
            }

            /** We write/read a fixed-byte-width file that {@link OfflinePointReader} can read. */
            @Override
            protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
                return new ByteSequencesReader(in, name) {

                    final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);

                    @Override
                    public BytesRef next() throws IOException {
                        if (in.getFilePointer() >= end) {
                            return null;
                        }
                        in.readBytes(scratch.bytes, 0, bytesPerDoc);
                        return scratch;
                    }
                };
            }
        };
        String name = sorter.sort(tempInput.getName());
        return new OfflinePointWriter(tempDir, name, packedBytesLength, pointCount, longOrds, singleValuePerDoc);
    }
}
Also used : HeapPointWriter(org.apache.lucene.util.bkd.HeapPointWriter) OfflineSorter(org.apache.lucene.util.OfflineSorter) OfflinePointWriter(org.apache.lucene.util.bkd.OfflinePointWriter) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) BytesRefComparator(org.apache.lucene.util.BytesRefComparator) IndexOutput(org.apache.lucene.store.IndexOutput) BytesRef(org.apache.lucene.util.BytesRef)
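
Both comparators above assume a fixed-width record: the packed values for all dimensions back to back, with the tie-break bytes starting at packedBytesLength (the docID when no ord is stored between them). A hedged sketch of that comparison over two raw records, inferred from the byteAt implementations rather than taken from the Lucene source:

// Illustrative comparison of two fixed-width records laid out as the
// comparators above assume: numDims * bytesPerDim packed value bytes,
// then tie-break bytes (docID when no ord is stored). Bytes compare
// unsigned, most significant first.
static int compareRecords(byte[] a, byte[] b, int dim, int bytesPerDim, int packedBytesLength) {
    int offset = dim * bytesPerDim;
    // Compare this dimension's value bytes:
    for (int i = 0; i < bytesPerDim; i++) {
        int cmp = (a[offset + i] & 0xff) - (b[offset + i] & 0xff);
        if (cmp != 0) {
            return cmp;
        }
    }
    // Equal in this dimension: fall back to the bytes right after the packed values,
    // which keeps the sort stable for equal values:
    for (int i = 0; i < Integer.BYTES; i++) {
        int cmp = (a[packedBytesLength + i] & 0xff) - (b[packedBytesLength + i] & 0xff);
        if (cmp != 0) {
            return cmp;
        }
    }
    return 0;
}

When dim == numDims - 1 the dimension's bytes end exactly where the tie-break bytes begin, so the two loops above scan one contiguous range; that is why the first comparator in the example needs no branch in byteAt.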

Aggregations

HeapPointWriter (org.apache.lucene.util.bkd.HeapPointWriter): 3
OfflinePointWriter (org.apache.lucene.util.bkd.OfflinePointWriter): 3
BytesRef (org.apache.lucene.util.BytesRef): 2
OfflinePointReader (org.apache.lucene.util.bkd.OfflinePointReader): 2
PointReader (org.apache.lucene.util.bkd.PointReader): 2
PointWriter (org.apache.lucene.util.bkd.PointWriter): 2
IntFunction (java.util.function.IntFunction): 1
ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 1
IndexOutput (org.apache.lucene.store.IndexOutput): 1
BytesRefComparator (org.apache.lucene.util.BytesRefComparator): 1
OfflineSorter (org.apache.lucene.util.OfflineSorter): 1