Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.
In class MemoryDocValuesConsumer, method addBinaryField:
private void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
  // write the byte[] data
  meta.writeVInt(field.number);
  meta.writeByte(BYTES);
  int minLength = Integer.MAX_VALUE;
  int maxLength = Integer.MIN_VALUE;
  final long startFP = data.getFilePointer();
  boolean missing = false;
  int upto = 0;
  for (BytesRef v : values) {
    final int length;
    if (v == null) {
      length = 0;
      missing = true;
    } else {
      length = v.length;
    }
    if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
      throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH + " but got length=" + length + " v=" + v + "; upto=" + upto + " values=" + values);
    }
    upto++;
    minLength = Math.min(minLength, length);
    maxLength = Math.max(maxLength, length);
    if (v != null) {
      data.writeBytes(v.bytes, v.offset, v.length);
    }
  }
  meta.writeLong(startFP);
  meta.writeLong(data.getFilePointer() - startFP);
  if (missing) {
    long start = data.getFilePointer();
    writeMissingBitset(values);
    meta.writeLong(start);
    meta.writeLong(data.getFilePointer() - start);
  } else {
    meta.writeLong(-1L);
  }
  meta.writeVInt(minLength);
  meta.writeVInt(maxLength);
  // if minLength == maxLength the values are fixed-length and the addresses are implicit;
  // otherwise, we need to record the length fields...
  if (minLength != maxLength) {
    meta.writeVInt(PackedInts.VERSION_CURRENT);
    meta.writeVInt(BLOCK_SIZE);
    final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
    long addr = 0;
    for (BytesRef v : values) {
      if (v != null) {
        addr += v.length;
      }
      writer.add(addr);
    }
    writer.finish();
  }
}
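
For orientation, here is a minimal standalone sketch (not taken from the Lucene sources; the class name, block size, and sample values are made up for illustration) of the address-writing pattern used above: each value's running end offset is appended to a MonotonicBlockPackedWriter, a missing value contributes length 0, and finish() flushes the last partial block.

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;

public class AddressWriterSketch {
  public static void main(String[] args) throws IOException {
    // hypothetical values; null stands for a document with no value
    List<BytesRef> values = Arrays.asList(
        new BytesRef("a"), new BytesRef("bc"), null, new BytesRef("defg"));
    RAMOutputStream data = new RAMOutputStream();   // stand-in for the codec's data output
    int blockSize = 4096;                            // must be a power of two
    MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, blockSize);
    long addr = 0;
    for (BytesRef v : values) {
      if (v != null) {
        addr += v.length;                            // end offset of this value's bytes
      }
      writer.add(addr);                              // running total, non-decreasing by construction
    }
    writer.finish();                                 // flush the last (partial) block
    System.out.println("entries=" + writer.ord() + ", encoded bytes=" + data.getFilePointer());
  }
}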
Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.
In class MemoryDocValuesConsumer, method addSortedNumericField:
@Override
public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {
  final Iterable<Number> docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc);
  final Iterable<Number> values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field);
  meta.writeVInt(field.number);
  if (isSingleValued(docToValueCount)) {
    meta.writeByte(SORTED_NUMERIC_SINGLETON);
    addNumericField(field, singletonView(docToValueCount, values, null), true);
  } else {
    meta.writeByte(SORTED_NUMERIC);
    // write the addresses:
    meta.writeVInt(PackedInts.VERSION_CURRENT);
    meta.writeVInt(BLOCK_SIZE);
    meta.writeLong(data.getFilePointer());
    final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
    long addr = 0;
    writer.add(addr);
    for (Number v : docToValueCount) {
      addr += v.longValue();
      writer.add(addr);
    }
    writer.finish();
    long valueCount = writer.ord();
    meta.writeLong(valueCount);
    // write the values
    addNumericField(field, values, true);
  }
}
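
In the multi-valued branch above, the address stream is simply the running total of per-document value counts with a leading 0, so document d owns the value range [addr[d], addr[d+1]) in the stream written by addNumericField. A plain-Java illustration with hypothetical counts:

public class SortedNumericAddressExample {
  public static void main(String[] args) {
    long[] docToValueCount = {2, 0, 3, 1};          // hypothetical counts for four documents
    long[] addr = new long[docToValueCount.length + 1];
    addr[0] = 0;                                     // the writer above adds this leading 0 explicitly
    for (int d = 0; d < docToValueCount.length; d++) {
      addr[d + 1] = addr[d] + docToValueCount[d];    // running total, a non-decreasing sequence
    }
    // addr == {0, 2, 2, 5, 6}:
    // doc 0 -> values [0,2), doc 1 -> no values, doc 2 -> [2,5), doc 3 -> [5,6)
    System.out.println(java.util.Arrays.toString(addr));
  }
}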
Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.
In class Lucene54DocValuesConsumer, method addTermsDict:
/** expert: writes a value dictionary for a sorted/sortedset field */
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
  // first check if it's a "fixed-length" terms dict, and compressibility if so
  int minLength = Integer.MAX_VALUE;
  int maxLength = Integer.MIN_VALUE;
  long numValues = 0;
  BytesRefBuilder previousValue = new BytesRefBuilder();
  // only valid for fixed-width data, as we have a choice there
  long prefixSum = 0;
  for (BytesRef v : values) {
    minLength = Math.min(minLength, v.length);
    maxLength = Math.max(maxLength, v.length);
    if (minLength == maxLength) {
      int termPosition = (int) (numValues & INTERVAL_MASK);
      if (termPosition == 0) {
        // first term in block, save it away to compare against the last term later
        previousValue.copyBytes(v);
      } else if (termPosition == INTERVAL_COUNT - 1) {
        // last term in block, accumulate shared prefix against first term
        prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
      }
    }
    numValues++;
  }
  // for fixed-width data, decide based on the average shared prefix per block:
  // so if we share at least 3 bytes on average, always compress.
  if (minLength == maxLength && prefixSum <= 3 * (numValues >> INTERVAL_SHIFT)) {
    // no index needed: not very compressible, direct addressing by mult
    addBinaryField(field, values);
  } else if (numValues < REVERSE_INTERVAL_COUNT) {
    // low cardinality: waste a few KB of ram, but can't really use fancy index etc
    addBinaryField(field, values);
  } else {
    // we don't have to handle the empty case
    assert numValues > 0;
    // header
    meta.writeVInt(field.number);
    meta.writeByte(Lucene54DocValuesFormat.BINARY);
    meta.writeVInt(BINARY_PREFIX_COMPRESSED);
    meta.writeLong(-1L);
    // now write the bytes: sharing prefixes within a block
    final long startFP = data.getFilePointer();
    // currently, we have to store the delta from expected for every 1/nth term
    // we could avoid this, but it's not much and less overall RAM than the previous approach!
    RAMOutputStream addressBuffer = new RAMOutputStream();
    MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE);
    // buffers up 16 terms
    RAMOutputStream bytesBuffer = new RAMOutputStream();
    // buffers up block header
    RAMOutputStream headerBuffer = new RAMOutputStream();
    BytesRefBuilder lastTerm = new BytesRefBuilder();
    lastTerm.grow(maxLength);
    long count = 0;
    int[] suffixDeltas = new int[INTERVAL_COUNT];
    for (BytesRef v : values) {
      int termPosition = (int) (count & INTERVAL_MASK);
      if (termPosition == 0) {
        termAddresses.add(data.getFilePointer() - startFP);
        // abs-encode first term
        headerBuffer.writeVInt(v.length);
        headerBuffer.writeBytes(v.bytes, v.offset, v.length);
        lastTerm.copyBytes(v);
      } else {
        // prefix-code: we only share at most 255 characters, to encode the length as a single
        // byte and have random access. Larger terms just get less compression.
        int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v));
        bytesBuffer.writeByte((byte) sharedPrefix);
        bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
        // we can encode one smaller, because terms are unique.
        suffixDeltas[termPosition] = v.length - sharedPrefix - 1;
      }
      count++;
      // flush block
      if ((count & INTERVAL_MASK) == 0) {
        flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
      }
    }
    // flush trailing partial block
    int leftover = (int) (count & INTERVAL_MASK);
    if (leftover > 0) {
      Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0);
      flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
    }
    final long indexStartFP = data.getFilePointer();
    // write addresses of indexed terms
    termAddresses.finish();
    addressBuffer.writeTo(data);
    addressBuffer = null;
    termAddresses = null;
    meta.writeVInt(minLength);
    meta.writeVInt(maxLength);
    meta.writeVLong(count);
    meta.writeLong(startFP);
    meta.writeLong(indexStartFP);
    meta.writeVInt(PackedInts.VERSION_CURRENT);
    meta.writeVInt(MONOTONIC_BLOCK_SIZE);
    addReverseTermIndex(field, values, maxLength);
  }
}
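
The per-block prefix coding above can be shown without the codec plumbing: the first term of each 16-term block is written whole, and every later term stores a single byte giving the length of the prefix it shares with the previous term (capped at 255), followed by its remaining suffix. Below is a standalone sketch with made-up terms; the shared-prefix length (the quantity StringHelper.bytesDifference computes for BytesRefs) is computed by hand here.

import java.nio.charset.StandardCharsets;

public class PrefixCodeSketch {
  // number of leading bytes shared by two byte arrays
  static int sharedPrefix(byte[] a, byte[] b) {
    int n = Math.min(a.length, b.length);
    int i = 0;
    while (i < n && a[i] == b[i]) {
      i++;
    }
    return i;
  }

  public static void main(String[] args) {
    String[] terms = {"lucene", "lucene-solr", "lucid", "lupine"};  // already sorted
    byte[] prev = null;
    for (String t : terms) {
      byte[] cur = t.getBytes(StandardCharsets.UTF_8);
      if (prev == null) {
        System.out.println("first term, stored whole: " + t);
      } else {
        int shared = Math.min(255, sharedPrefix(prev, cur));        // one byte, so cap at 255
        String suffix = new String(cur, shared, cur.length - shared, StandardCharsets.UTF_8);
        System.out.println("shared=" + shared + " suffix=" + suffix);
      }
      prev = cur;
    }
  }
}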
Use of org.apache.lucene.util.packed.MonotonicBlockPackedWriter in project lucene-solr by apache.
In class Lucene54DocValuesConsumer, method addReverseTermIndex:
// writes reverse term index: used for binary searching a term into a range of 64 blocks
// for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison
// terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries.
private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
  long count = 0;
  BytesRefBuilder priorTerm = new BytesRefBuilder();
  priorTerm.grow(maxLength);
  BytesRef indexTerm = new BytesRef();
  long startFP = data.getFilePointer();
  PagedBytes pagedBytes = new PagedBytes(15);
  MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
  for (BytesRef b : values) {
    int termPosition = (int) (count & REVERSE_INTERVAL_MASK);
    if (termPosition == 0) {
      int len = StringHelper.sortKeyLength(priorTerm.get(), b);
      indexTerm.bytes = b.bytes;
      indexTerm.offset = b.offset;
      indexTerm.length = len;
      addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm));
    } else if (termPosition == REVERSE_INTERVAL_MASK) {
      priorTerm.copyBytes(b);
    }
    count++;
  }
  addresses.finish();
  long numBytes = pagedBytes.getPointer();
  pagedBytes.freeze(true);
  PagedBytesDataInput in = pagedBytes.getDataInput();
  meta.writeLong(startFP);
  data.writeVLong(numBytes);
  data.copyBytes(in, numBytes);
}
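
StringHelper.sortKeyLength keeps only as many leading bytes of the index term as are needed for it to still sort after the previous interval's last term, which is what lets the reverse index trim suffixes. Below is a rough plain-Java re-implementation of that trimming idea (not the Lucene method itself, and with made-up terms):

import java.nio.charset.StandardCharsets;

public class SortKeyLengthSketch {
  // how many leading bytes of current are needed so the trimmed key still sorts after prior
  static int sortKeyLength(byte[] prior, byte[] current) {
    int limit = Math.min(prior.length, current.length);
    for (int i = 0; i < limit; i++) {
      if (prior[i] != current[i]) {
        return i + 1;                                // keep through the first differing byte
      }
    }
    return Math.min(prior.length + 1, current.length);
  }

  public static void main(String[] args) {
    byte[] prior = "apple".getBytes(StandardCharsets.UTF_8);      // hypothetical last term of the previous interval
    byte[] current = "approach".getBytes(StandardCharsets.UTF_8); // hypothetical first term of the new interval
    int len = sortKeyLength(prior, current);
    // prints "appr": four bytes are enough to sort after "apple"
    System.out.println(new String(current, 0, len, StandardCharsets.UTF_8));
  }
}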