
Example 1 with MonotonicBlockPackedWriter

Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.

From the class MemoryDocValuesConsumer, the method addBinaryField:

private void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // write the byte[] data
    meta.writeVInt(field.number);
    meta.writeByte(BYTES);
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    final long startFP = data.getFilePointer();
    boolean missing = false;
    int upto = 0;
    for (BytesRef v : values) {
        final int length;
        if (v == null) {
            length = 0;
            missing = true;
        } else {
            length = v.length;
        }
        if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
            throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH + " but got length=" + length + " v=" + v + "; upto=" + upto + " values=" + values);
        }
        upto++;
        minLength = Math.min(minLength, length);
        maxLength = Math.max(maxLength, length);
        if (v != null) {
            data.writeBytes(v.bytes, v.offset, v.length);
        }
    }
    meta.writeLong(startFP);
    meta.writeLong(data.getFilePointer() - startFP);
    if (missing) {
        long start = data.getFilePointer();
        writeMissingBitset(values);
        meta.writeLong(start);
        meta.writeLong(data.getFilePointer() - start);
    } else {
        meta.writeLong(-1L);
    }
    meta.writeVInt(minLength);
    meta.writeVInt(maxLength);
    // if minLength == maxLength, it's a fixed-length byte[] and we are done (the addresses are implicit);
    // otherwise, we need to record the length fields...
    if (minLength != maxLength) {
        meta.writeVInt(PackedInts.VERSION_CURRENT);
        meta.writeVInt(BLOCK_SIZE);
        final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
        long addr = 0;
        for (BytesRef v : values) {
            if (v != null) {
                addr += v.length;
            }
            writer.add(addr);
        }
        writer.finish();
    }
}
Also used: MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter), BytesRef (org.apache.lucene.util.BytesRef)
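
The writer above records one monotonically increasing end offset per value. The sketch below is not part of the Lucene sources: it is a minimal round trip, assuming an in-memory RAMDirectory, a made-up file name "addresses", and hypothetical sample lengths, that writes the offsets with MonotonicBlockPackedWriter and reads them back through its companion reader, MonotonicBlockPackedReader.of.

import java.io.IOException;

import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

public class MonotonicAddressRoundTrip {
    public static void main(String[] args) throws IOException {
        // block size must be a power of two; the codecs above use their own BLOCK_SIZE constants
        final int blockSize = 16384;
        // per-value byte lengths, as the addBinaryField loop would see them (hypothetical sample data)
        long[] lengths = { 3, 0, 7, 2, 5 };
        try (RAMDirectory dir = new RAMDirectory()) {
            long valueCount;
            try (IndexOutput out = dir.createOutput("addresses", IOContext.DEFAULT)) {
                MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(out, blockSize);
                long addr = 0;
                for (long len : lengths) {
                    // running total, so the sequence handed to the writer is monotonically increasing
                    addr += len;
                    writer.add(addr);
                }
                // flush the final, possibly partial block
                writer.finish();
                valueCount = writer.ord();
            }
            try (IndexInput in = dir.openInput("addresses", IOContext.DEFAULT)) {
                MonotonicBlockPackedReader addresses =
                        MonotonicBlockPackedReader.of(in, PackedInts.VERSION_CURRENT, blockSize, valueCount, false);
                for (long i = 0; i < valueCount; i++) {
                    System.out.println("end offset of value " + i + " = " + addresses.get(i));
                }
            }
        }
    }
}

For the sample lengths 3, 0, 7, 2, 5 this prints the cumulative offsets 3, 3, 10, 12, 17, which is the shape of the address block addBinaryField stores for variable-length values.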

Example 2 with MonotonicBlockPackedWriter

Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.

From the class MemoryDocValuesConsumer, the method addSortedNumericField:

@Override
public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {
    final Iterable<Number> docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc);
    final Iterable<Number> values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field);
    meta.writeVInt(field.number);
    if (isSingleValued(docToValueCount)) {
        meta.writeByte(SORTED_NUMERIC_SINGLETON);
        addNumericField(field, singletonView(docToValueCount, values, null), true);
    } else {
        meta.writeByte(SORTED_NUMERIC);
        // write the addresses:
        meta.writeVInt(PackedInts.VERSION_CURRENT);
        meta.writeVInt(BLOCK_SIZE);
        meta.writeLong(data.getFilePointer());
        final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
        long addr = 0;
        writer.add(addr);
        for (Number v : docToValueCount) {
            addr += v.longValue();
            writer.add(addr);
        }
        writer.finish();
        long valueCount = writer.ord();
        meta.writeLong(valueCount);
        // write the values
        addNumericField(field, values, true);
    }
}
Also used: MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter)
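
Because addSortedNumericField writes a leading 0 before the per-document running totals, a consumer that has loaded that address block can recover each document's value count from two adjacent entries. The helper below is hypothetical (it is not part of the codec); it only assumes a MonotonicBlockPackedReader loaded over the block written above.

import org.apache.lucene.util.packed.MonotonicBlockPackedReader;

class SortedNumericAddressHelper {
    // entry 0 is the leading 0; entry doc + 1 is the running total after that document,
    // so the document's values occupy [get(doc), get(doc + 1)) in the flat value stream
    static long valueCountForDoc(MonotonicBlockPackedReader addresses, int doc) {
        return addresses.get(doc + 1) - addresses.get(doc);
    }
}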

Example 3 with MonotonicBlockPackedWriter

Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.

From the class Lucene54DocValuesConsumer, the method addTermsDict:

/** expert: writes a value dictionary for a sorted/sortedset field */
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // first check if it's a "fixed-length" terms dict, and compressibility if so
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    long numValues = 0;
    BytesRefBuilder previousValue = new BytesRefBuilder();
    // only valid for fixed-width data, as we have a choice there
    long prefixSum = 0;
    for (BytesRef v : values) {
        minLength = Math.min(minLength, v.length);
        maxLength = Math.max(maxLength, v.length);
        if (minLength == maxLength) {
            int termPosition = (int) (numValues & INTERVAL_MASK);
            if (termPosition == 0) {
                // first term in block, save it away to compare against the last term later
                previousValue.copyBytes(v);
            } else if (termPosition == INTERVAL_COUNT - 1) {
                // last term in block, accumulate shared prefix against first term
                prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
            }
        }
        numValues++;
    }
    // for fixed-width data, look at the average shared prefix before deciding how to encode:
    // prefix compression "costs" at worst 2 bytes per term (we must store suffix lengths),
    // so if we share at least 3 bytes on average, always compress.
    if (minLength == maxLength && prefixSum <= 3 * (numValues >> INTERVAL_SHIFT)) {
        // no index needed: not very compressible, direct addressing by mult
        addBinaryField(field, values);
    } else if (numValues < REVERSE_INTERVAL_COUNT) {
        // low cardinality: waste a few KB of ram, but can't really use fancy index etc
        addBinaryField(field, values);
    } else {
        // we don't have to handle the empty case
        assert numValues > 0;
        // header
        meta.writeVInt(field.number);
        meta.writeByte(Lucene54DocValuesFormat.BINARY);
        meta.writeVInt(BINARY_PREFIX_COMPRESSED);
        meta.writeLong(-1L);
        // now write the bytes: sharing prefixes within a block
        final long startFP = data.getFilePointer();
        // currently, we have to store the delta from expected for every 1/nth term
        // we could avoid this, but it's not much and less overall RAM than the previous approach!
        RAMOutputStream addressBuffer = new RAMOutputStream();
        MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE);
        // buffers up 16 terms
        RAMOutputStream bytesBuffer = new RAMOutputStream();
        // buffers up block header
        RAMOutputStream headerBuffer = new RAMOutputStream();
        BytesRefBuilder lastTerm = new BytesRefBuilder();
        lastTerm.grow(maxLength);
        long count = 0;
        int[] suffixDeltas = new int[INTERVAL_COUNT];
        for (BytesRef v : values) {
            int termPosition = (int) (count & INTERVAL_MASK);
            if (termPosition == 0) {
                termAddresses.add(data.getFilePointer() - startFP);
                // abs-encode first term
                headerBuffer.writeVInt(v.length);
                headerBuffer.writeBytes(v.bytes, v.offset, v.length);
                lastTerm.copyBytes(v);
            } else {
                // prefix-code: we only share at most 255 characters, to encode the length as a single
                // byte and have random access. Larger terms just get less compression.
                int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v));
                bytesBuffer.writeByte((byte) sharedPrefix);
                bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
                // we can encode one smaller, because terms are unique.
                suffixDeltas[termPosition] = v.length - sharedPrefix - 1;
            }
            count++;
            // flush block
            if ((count & INTERVAL_MASK) == 0) {
                flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
            }
        }
        // flush trailing crap
        int leftover = (int) (count & INTERVAL_MASK);
        if (leftover > 0) {
            Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0);
            flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
        }
        final long indexStartFP = data.getFilePointer();
        // write addresses of indexed terms
        termAddresses.finish();
        addressBuffer.writeTo(data);
        addressBuffer = null;
        termAddresses = null;
        meta.writeVInt(minLength);
        meta.writeVInt(maxLength);
        meta.writeVLong(count);
        meta.writeLong(startFP);
        meta.writeLong(indexStartFP);
        meta.writeVInt(PackedInts.VERSION_CURRENT);
        meta.writeVInt(MONOTONIC_BLOCK_SIZE);
        addReverseTermIndex(field, values, maxLength);
    }
}
Also used: MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), RAMOutputStream (org.apache.lucene.store.RAMOutputStream), BytesRef (org.apache.lucene.util.BytesRef)
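
The core of addTermsDict is the prefix coding: within each 16-term block, every term after the first stores only the number of bytes it shares with the previous term (capped at 255 so the count fits in one byte) followed by its remaining suffix. A small standalone illustration of that computation, using two made-up terms:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

public class SharedPrefixDemo {
    public static void main(String[] args) {
        // hypothetical adjacent terms from a sorted terms dictionary
        BytesRef lastTerm = new BytesRef("lucene-core");
        BytesRef term = new BytesRef("lucene-solr");
        // bytesDifference returns the length of the common byte prefix ("lucene-" here, 7 bytes)
        int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm, term));
        BytesRef suffix = new BytesRef(term.bytes, term.offset + sharedPrefix, term.length - sharedPrefix);
        System.out.println(sharedPrefix + " shared bytes, suffix=" + suffix.utf8ToString());
        // the block stores the single prefix-length byte and then the suffix bytes;
        // suffixDeltas records term.length - sharedPrefix - 1, exploiting that terms are unique
    }
}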

Example 4 with MonotonicBlockPackedWriter

Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.

From the class Lucene54DocValuesConsumer, the method addReverseTermIndex:

// writes reverse term index: used for binary searching a term into a range of 64 blocks
// for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison
// terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries.
private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
    long count = 0;
    BytesRefBuilder priorTerm = new BytesRefBuilder();
    priorTerm.grow(maxLength);
    BytesRef indexTerm = new BytesRef();
    long startFP = data.getFilePointer();
    PagedBytes pagedBytes = new PagedBytes(15);
    MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
    for (BytesRef b : values) {
        int termPosition = (int) (count & REVERSE_INTERVAL_MASK);
        if (termPosition == 0) {
            int len = StringHelper.sortKeyLength(priorTerm.get(), b);
            indexTerm.bytes = b.bytes;
            indexTerm.offset = b.offset;
            indexTerm.length = len;
            addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm));
        } else if (termPosition == REVERSE_INTERVAL_MASK) {
            priorTerm.copyBytes(b);
        }
        count++;
    }
    addresses.finish();
    long numBytes = pagedBytes.getPointer();
    pagedBytes.freeze(true);
    PagedBytesDataInput in = pagedBytes.getDataInput();
    meta.writeLong(startFP);
    data.writeVLong(numBytes);
    data.copyBytes(in, numBytes);
}
Also used: MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), PagedBytes (org.apache.lucene.util.PagedBytes), PagedBytesDataInput (org.apache.lucene.util.PagedBytes.PagedBytesDataInput), BytesRef (org.apache.lucene.util.BytesRef)

Aggregations

MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter): 4 uses
BytesRef (org.apache.lucene.util.BytesRef): 3 uses
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 2 uses
RAMOutputStream (org.apache.lucene.store.RAMOutputStream): 1 use
PagedBytes (org.apache.lucene.util.PagedBytes): 1 use
PagedBytesDataInput (org.apache.lucene.util.PagedBytes.PagedBytesDataInput): 1 use