Search in sources :

Example 6 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class BytesRefSortersTest method testExternalRefSorter.

/**
 * Runs the shared {@code check} routine against an {@link ExternalRefSorter} that is backed by
 * an {@link OfflineSorter} writing into a freshly created test directory.
 *
 * @throws Exception if creating the resources, running {@code check}, or closing fails
 */
@Test
public void testExternalRefSorter() throws Exception {
    // try-with-resources releases both resources even when check(s) throws. Resources are
    // closed in reverse declaration order, i.e. the sorter first and then the directory.
    try (Directory tempDir = newDirectory();
            ExternalRefSorter s = new ExternalRefSorter(new OfflineSorter(tempDir, "temp"))) {
        check(s);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) Directory(org.apache.lucene.store.Directory) Test(org.junit.Test)

Example 7 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class BKDWriter method sort.

// useful for debugging:
/*
  private void printPathSlice(String desc, PathSlice slice, int dim) throws IOException {
    System.out.println("    " + desc + " dim=" + dim + " count=" + slice.count + ":");    
    try(PointReader r = slice.writer.getReader(slice.start, slice.count)) {
      int count = 0;
      while (r.next()) {
        byte[] v = r.packedValue();
        System.out.println("      " + count + ": " + new BytesRef(v, dim*bytesPerDim, bytesPerDim));
        count++;
        if (count == slice.count) {
          break;
        }
      }
    }
  }
  */
/**
 * Produces a {@link PointWriter} whose points are ordered by the given dimension.
 *
 * <p>If {@code heapPointWriter} is non-null the points are sorted on heap (dimension 0 sorts
 * the existing heap writer in place, other dimensions sort a copy). Otherwise the file behind
 * {@code tempInput} is sorted with an {@link OfflineSorter} and wrapped in an
 * {@link OfflinePointWriter}.
 *
 * @param dim dimension to sort by; asserted to lie in {@code [0, numDims)}
 * @return the writer holding the points sorted by {@code dim}
 * @throws IOException propagated from the heap sort helpers or from the offline sorter
 */
private PointWriter sort(int dim) throws IOException {
    assert dim >= 0 && dim < numDims;

    if (heapPointWriter == null) {
        // Points were spilled to disk, so sort the temp file offline.
        assert tempInput != null;
        final int dimStart = bytesPerDim * dim;
        final int comparedBytes = bytesPerDim + Integer.BYTES;
        final Comparator<BytesRef> comparator;
        if (dim != numDims - 1) {
            // The dimension bytes and the trailing bytes (doc id) are not adjacent: the first
            // bytesPerDim positions map into the dimension, the rest map past the packed value.
            comparator = new BytesRefComparator(comparedBytes) {

                @Override
                protected int byteAt(BytesRef ref, int i) {
                    final int position = i < bytesPerDim ? dimStart + i : packedBytesLength + i - bytesPerDim;
                    return ref.bytes[ref.offset + position] & 0xff;
                }
            };
        } else {
            // Last dimension: its bytes are immediately followed by the doc id bytes,
            // so a single contiguous read needs no branch.
            comparator = new BytesRefComparator(comparedBytes) {

                @Override
                protected int byteAt(BytesRef ref, int i) {
                    return ref.bytes[ref.offset + dimStart + i] & 0xff;
                }
            };
        }

        final String tempPrefix = tempFileNamePrefix + "_bkd" + dim;
        final OfflineSorter offlineSorter = new OfflineSorter(tempDir, tempPrefix, comparator, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {

            /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
            @Override
            protected ByteSequencesWriter getWriter(IndexOutput out, long count) {
                return new ByteSequencesWriter(out) {

                    @Override
                    public void write(byte[] bytes, int off, int len) throws IOException {
                        // Every record must be exactly one fixed-width document entry.
                        assert len == bytesPerDoc : "len=" + len + " bytesPerDoc=" + bytesPerDoc;
                        out.writeBytes(bytes, off, len);
                    }
                };
            }

            /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
            @Override
            protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
                return new ByteSequencesReader(in, name) {

                    // Reused for every record; callers see the same instance on each call.
                    final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);

                    @Override
                    public BytesRef next() throws IOException {
                        if (in.getFilePointer() < end) {
                            in.readBytes(scratch.bytes, 0, bytesPerDoc);
                            return scratch;
                        }
                        return null;
                    }
                };
            }
        };

        final String sortedFileName = offlineSorter.sort(tempInput.getName());
        return new OfflinePointWriter(tempDir, sortedFileName, packedBytesLength, pointCount, longOrds, singleValuePerDoc);
    }

    // The incoming points never spilled to disk, so sort them on heap.
    assert tempInput == null;
    final HeapPointWriter heapSorted;
    if (dim != 0) {
        // Later dimensions work on their own copy of the points.
        heapSorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc);
        heapSorted.copyFrom(heapPointWriter);
    } else {
        // The first dimension sorts the existing heap writer directly.
        heapSorted = heapPointWriter;
    }
    sortHeapPointWriter(heapSorted, dim);
    heapSorted.close();
    return heapSorted;
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) BytesRefComparator(org.apache.lucene.util.BytesRefComparator) IndexOutput(org.apache.lucene.store.IndexOutput) BytesRef(org.apache.lucene.util.BytesRef)

Example 8 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class SimpleTextBKDWriter method sort.

/**
 * Sorts the collected points by one dimension and returns the writer that holds them.
 *
 * <p>Two paths exist: an on-heap sort when {@code heapPointWriter} is set, and an
 * {@link OfflineSorter}-based sort of {@code tempInput} otherwise.
 *
 * @param dim index of the dimension to order by; asserted to be {@code >= 0} and
 *     {@code < numDims}
 * @return a {@link PointWriter} containing the points ordered by {@code dim}
 * @throws IOException propagated from the heap sort helpers or from the offline sorter
 */
private PointWriter sort(int dim) throws IOException {
    assert dim >= 0 && dim < numDims;

    if (heapPointWriter != null) {
        assert tempInput == null;
        // Nothing was spilled to disk: sort in heap. Only dimension 0 may reuse the
        // current heap writer; every other dimension sorts a private copy.
        final boolean reuseCurrent = dim == 0;
        final HeapPointWriter inHeap = reuseCurrent
            ? heapPointWriter
            : new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc);
        if (!reuseCurrent) {
            inHeap.copyFrom(heapPointWriter);
        }
        sortHeapPointWriter(inHeap, dim);
        inHeap.close();
        return inHeap;
    }

    // Offline sort of the spilled points.
    assert tempInput != null;
    final int dimOffset = bytesPerDim * dim;
    final boolean lastDim = dim == numDims - 1;
    Comparator<BytesRef> order;
    if (lastDim) {
        // For the last dimension the dimension bytes and the doc id bytes sit next to
        // each other, so the lookup does not need to branch.
        order = new BytesRefComparator(bytesPerDim + Integer.BYTES) {

            @Override
            protected int byteAt(BytesRef ref, int i) {
                final int index = ref.offset + dimOffset + i;
                return ref.bytes[index] & 0xff;
            }
        };
    } else {
        order = new BytesRefComparator(bytesPerDim + Integer.BYTES) {

            @Override
            protected int byteAt(BytesRef ref, int i) {
                // Positions beyond the dimension's own bytes are read from just past
                // the packed value instead.
                final int index;
                if (i >= bytesPerDim) {
                    index = ref.offset + packedBytesLength + i - bytesPerDim;
                } else {
                    index = ref.offset + dimOffset + i;
                }
                return ref.bytes[index] & 0xff;
            }
        };
    }

    OfflineSorter fixedWidthSorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, order, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {

        /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
        @Override
        protected ByteSequencesWriter getWriter(IndexOutput out, long count) {
            return new ByteSequencesWriter(out) {

                @Override
                public void write(byte[] bytes, int off, int len) throws IOException {
                    // Records are written raw; each one must span exactly bytesPerDoc bytes.
                    assert len == bytesPerDoc : "len=" + len + " bytesPerDoc=" + bytesPerDoc;
                    out.writeBytes(bytes, off, len);
                }
            };
        }

        /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
        @Override
        protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
            return new ByteSequencesReader(in, name) {

                // One buffer of bytesPerDoc bytes, refilled and returned on every call.
                final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);

                @Override
                public BytesRef next() throws IOException {
                    final boolean exhausted = in.getFilePointer() >= end;
                    if (exhausted) {
                        return null;
                    }
                    in.readBytes(scratch.bytes, 0, bytesPerDoc);
                    return scratch;
                }
            };
        }
    };

    String sortedName = fixedWidthSorter.sort(tempInput.getName());
    return new OfflinePointWriter(tempDir, sortedName, packedBytesLength, pointCount, longOrds, singleValuePerDoc);
}
Also used : HeapPointWriter(org.apache.lucene.util.bkd.HeapPointWriter) OfflineSorter(org.apache.lucene.util.OfflineSorter) OfflinePointWriter(org.apache.lucene.util.bkd.OfflinePointWriter) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) BytesRefComparator(org.apache.lucene.util.BytesRefComparator) IndexOutput(org.apache.lucene.store.IndexOutput) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

OfflineSorter (org.apache.lucene.util.OfflineSorter): 8, BytesRef (org.apache.lucene.util.BytesRef): 7, IndexOutput (org.apache.lucene.store.IndexOutput): 6, ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput): 4, ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput): 3, BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 3, IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 3, HashSet (java.util.HashSet): 2, TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 2, ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 2, Directory (org.apache.lucene.store.Directory): 2, BytesRefComparator (org.apache.lucene.util.BytesRefComparator): 2, CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 2, IntsRef (org.apache.lucene.util.IntsRef): 2, ByteSequencesReader (org.apache.lucene.util.OfflineSorter.ByteSequencesReader): 2, ByteSequencesWriter (org.apache.lucene.util.OfflineSorter.ByteSequencesWriter): 2, LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator): 2, Builder (org.apache.lucene.util.fst.Builder): 2, PairOutputs (org.apache.lucene.util.fst.PairOutputs): 2, Pair (org.apache.lucene.util.fst.PairOutputs.Pair): 2