Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.
The class TestLegacyNumericUtils, method testLongSpecialValues.
public void testLongSpecialValues() throws Exception {
  long[] vals = new long[] {
      Long.MIN_VALUE, Long.MIN_VALUE + 1, Long.MIN_VALUE + 2, -5003400000000L,
      -4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 300L, 50006789999999999L,
      Long.MAX_VALUE - 2, Long.MAX_VALUE - 1, Long.MAX_VALUE };
  BytesRefBuilder[] prefixVals = new BytesRefBuilder[vals.length];
  for (int i = 0; i < vals.length; i++) {
    prefixVals[i] = new BytesRefBuilder();
    LegacyNumericUtils.longToPrefixCoded(vals[i], 0, prefixVals[i]);
    // check forward and back conversion
    assertEquals("forward and back conversion should generate same long", vals[i],
        LegacyNumericUtils.prefixCodedToLong(prefixVals[i].get()));
    // check that decoding the long values as ints fails correctly
    final int index = i;
    expectThrows(NumberFormatException.class, () -> {
      LegacyNumericUtils.prefixCodedToInt(prefixVals[index].get());
    });
  }
  // check sort order (prefixVals should be ascending)
  for (int i = 1; i < prefixVals.length; i++) {
    assertTrue("check sort order", prefixVals[i - 1].get().compareTo(prefixVals[i].get()) < 0);
  }
  // check the prefix encoding: at lower precision, the difference from the original
  // value should equal the removed lower-order bits
  final BytesRefBuilder ref = new BytesRefBuilder();
  for (int i = 0; i < vals.length; i++) {
    for (int j = 0; j < 64; j++) {
      LegacyNumericUtils.longToPrefixCoded(vals[i], j, ref);
      long prefixVal = LegacyNumericUtils.prefixCodedToLong(ref.get());
      long mask = (1L << j) - 1L;
      assertEquals("difference between prefix val and original value for " + vals[i] + " with shift=" + j,
          vals[i] & mask, vals[i] - prefixVal);
    }
  }
}
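For context, a minimal standalone sketch of the round trip this test exercises, using only the two LegacyNumericUtils calls shown above. The import path is an assumption; the class has moved between org.apache.lucene.util, org.apache.lucene.legacy and org.apache.solr.legacy across versions:

import org.apache.lucene.legacy.LegacyNumericUtils; // package is version-dependent (assumption)
import org.apache.lucene.util.BytesRefBuilder;

public class PrefixCodedRoundTrip {
  public static void main(String[] args) {
    BytesRefBuilder b = new BytesRefBuilder();
    // shift=0 keeps full precision; a shift of j drops the j low-order bits
    LegacyNumericUtils.longToPrefixCoded(42L, 0, b);
    long decoded = LegacyNumericUtils.prefixCodedToLong(b.get());
    System.out.println(decoded); // prints 42
  }
}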
Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.
The class AtomicUpdateDocumentMerger, method doInc.
protected void doInc(SolrInputDocument toDoc, SolrInputField sif, Object fieldVal) {
  SolrInputField numericField = toDoc.get(sif.getName());
  SchemaField sf = schema.getField(sif.getName());
  if (numericField != null || sf.getDefaultValue() != null) {
    // TODO: fieldtype needs externalToObject?
    String oldValS = (numericField != null)
        ? numericField.getFirstValue().toString()
        : sf.getDefaultValue().toString();
    BytesRefBuilder term = new BytesRefBuilder();
    sf.getType().readableToIndexed(oldValS, term);
    Object oldVal = sf.getType().toObject(sf, term.get());
    String fieldValS = fieldVal.toString();
    Number result;
    if (oldVal instanceof Long) {
      result = ((Long) oldVal).longValue() + Long.parseLong(fieldValS);
    } else if (oldVal instanceof Float) {
      result = ((Float) oldVal).floatValue() + Float.parseFloat(fieldValS);
    } else if (oldVal instanceof Double) {
      result = ((Double) oldVal).doubleValue() + Double.parseDouble(fieldValS);
    } else {
      // int, short, byte
      result = ((Integer) oldVal).intValue() + Integer.parseInt(fieldValS);
    }
    toDoc.setField(sif.getName(), result);
  } else {
    toDoc.setField(sif.getName(), fieldVal);
  }
}
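The BytesRefBuilder here is only a scratch buffer for the readableToIndexed/toObject round trip that turns the stored string back into a typed Number; the increment itself is a plain instanceof dispatch. A standalone sketch of that dispatch, with the schema plumbing replaced by an already-typed oldVal (class and helper names are illustrative):

public class IncSketch {

  // illustrative helper, not a Solr API
  static Number inc(Object oldVal, String delta) {
    if (oldVal instanceof Long) {
      return ((Long) oldVal) + Long.parseLong(delta);
    } else if (oldVal instanceof Float) {
      return ((Float) oldVal) + Float.parseFloat(delta);
    } else if (oldVal instanceof Double) {
      return ((Double) oldVal) + Double.parseDouble(delta);
    } else {
      // int, short, byte
      return ((Integer) oldVal) + Integer.parseInt(delta);
    }
  }

  public static void main(String[] args) {
    System.out.println(inc(41L, "1"));    // 42
    System.out.println(inc(1.5D, "0.5")); // 2.0
  }
}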
Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.
The class Lucene54DocValuesConsumer, method addTermsDict.
/** expert: writes a value dictionary for a sorted/sortedset field */
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
  // first check if it's a "fixed-length" terms dict, and compressibility if so
  int minLength = Integer.MAX_VALUE;
  int maxLength = Integer.MIN_VALUE;
  long numValues = 0;
  BytesRefBuilder previousValue = new BytesRefBuilder();
  // only valid for fixed-width data, as we have a choice there
  long prefixSum = 0;
  for (BytesRef v : values) {
    minLength = Math.min(minLength, v.length);
    maxLength = Math.max(maxLength, v.length);
    if (minLength == maxLength) {
      int termPosition = (int) (numValues & INTERVAL_MASK);
      if (termPosition == 0) {
        // first term in block, save it away to compare against the last term later
        previousValue.copyBytes(v);
      } else if (termPosition == INTERVAL_COUNT - 1) {
        // last term in block, accumulate shared prefix against first term
        prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
      }
    }
    numValues++;
  }
  // each block holds INTERVAL_COUNT terms: if we share at least 3 bytes on average
  // per block, always compress.
  if (minLength == maxLength && prefixSum <= 3 * (numValues >> INTERVAL_SHIFT)) {
    // no index needed: not very compressible, direct addressing by mult
    addBinaryField(field, values);
  } else if (numValues < REVERSE_INTERVAL_COUNT) {
    // low cardinality: waste a few KB of ram, but can't really use fancy index etc
    addBinaryField(field, values);
  } else {
    // we don't have to handle the empty case
    assert numValues > 0;
    // header
    meta.writeVInt(field.number);
    meta.writeByte(Lucene54DocValuesFormat.BINARY);
    meta.writeVInt(BINARY_PREFIX_COMPRESSED);
    meta.writeLong(-1L);
    // now write the bytes: sharing prefixes within a block
    final long startFP = data.getFilePointer();
    // currently, we have to store the delta from expected for every 1/nth term
    // we could avoid this, but it's not much and less overall RAM than the previous approach!
    RAMOutputStream addressBuffer = new RAMOutputStream();
    MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE);
    // buffers up 16 terms
    RAMOutputStream bytesBuffer = new RAMOutputStream();
    // buffers up block header
    RAMOutputStream headerBuffer = new RAMOutputStream();
    BytesRefBuilder lastTerm = new BytesRefBuilder();
    lastTerm.grow(maxLength);
    long count = 0;
    int[] suffixDeltas = new int[INTERVAL_COUNT];
    for (BytesRef v : values) {
      int termPosition = (int) (count & INTERVAL_MASK);
      if (termPosition == 0) {
        termAddresses.add(data.getFilePointer() - startFP);
        // abs-encode first term
        headerBuffer.writeVInt(v.length);
        headerBuffer.writeBytes(v.bytes, v.offset, v.length);
        lastTerm.copyBytes(v);
      } else {
        // prefix-code: we only share at most 255 characters, to encode the length as a single
        // byte and have random access. Larger terms just get less compression.
        int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v));
        bytesBuffer.writeByte((byte) sharedPrefix);
        bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
        // we can encode one smaller, because terms are unique.
        suffixDeltas[termPosition] = v.length - sharedPrefix - 1;
      }
      count++;
      // flush block
      if ((count & INTERVAL_MASK) == 0) {
        flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
      }
    }
    // flush any trailing, partially filled block
    int leftover = (int) (count & INTERVAL_MASK);
    if (leftover > 0) {
      Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0);
      flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
    }
    final long indexStartFP = data.getFilePointer();
    // write addresses of indexed terms
    termAddresses.finish();
    addressBuffer.writeTo(data);
    addressBuffer = null;
    termAddresses = null;
    meta.writeVInt(minLength);
    meta.writeVInt(maxLength);
    meta.writeVLong(count);
    meta.writeLong(startFP);
    meta.writeLong(indexStartFP);
    meta.writeVInt(PackedInts.VERSION_CURRENT);
    meta.writeVInt(MONOTONIC_BLOCK_SIZE);
    addReverseTermIndex(field, values, maxLength);
  }
}
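The heart of the format is the per-block shared-prefix measurement (StringHelper.bytesDifference above, capped at 255 so the shared length fits in a single byte). A standalone sketch of that measurement over plain byte arrays, with an illustrative helper standing in for the BytesRef-based Lucene method:

import java.nio.charset.StandardCharsets;

public class SharedPrefixSketch {

  // illustrative stand-in for StringHelper.bytesDifference on two BytesRefs
  static int sharedPrefix(byte[] prev, byte[] cur) {
    int limit = Math.min(prev.length, cur.length);
    int i = 0;
    while (i < limit && prev[i] == cur[i]) {
      i++;
    }
    return i;
  }

  public static void main(String[] args) {
    byte[] a = "lucene".getBytes(StandardCharsets.UTF_8);
    byte[] b = "lucid".getBytes(StandardCharsets.UTF_8);
    // cap at 255 so the shared length can be written as one byte
    int shared = Math.min(255, sharedPrefix(a, b));
    System.out.println(shared); // prints 3 ("luc")
  }
}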
Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.
The class Dictionary, method readDictionaryFiles.
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map
 *
 * @param tempDir Directory to use for the offline sort of the entries
 * @param tempFileNamePrefix prefix to use for the temporary sort files
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words Builder that receives each word together with its flag/stem-exception ordinals
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
  BytesRefBuilder flagsScratch = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  StringBuilder sb = new StringBuilder();
  IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
  try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
    for (InputStream dictionary : dictionaries) {
      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
      // first line is number of entries (approximately, sometimes)
      String line = lines.readLine();
      while ((line = lines.readLine()) != null) {
        // wild and unpredictable code comment rules
        if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
          continue;
        }
        line = unescapeEntry(line);
        // if we haven't seen any stem exceptions, try to parse one
        if (hasStemExceptions == false) {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0 && morphStart < line.length()) {
            hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
          }
        }
        if (needsInputCleaning) {
          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
            flagSep = line.indexOf(MORPH_SEPARATOR);
          }
          if (flagSep == -1) {
            CharSequence cleansed = cleanInput(line, sb);
            writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
          } else {
            String text = line.substring(0, flagSep);
            CharSequence cleansed = cleanInput(text, sb);
            if (cleansed != sb) {
              sb.setLength(0);
              sb.append(cleansed);
            }
            sb.append(line.substring(flagSep));
            writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
          }
        } else {
          writer.write(line.getBytes(StandardCharsets.UTF_8));
        }
      }
    }
    CodecUtil.writeFooter(unsorted);
  }
  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
    BytesRef scratch1 = new BytesRef();
    BytesRef scratch2 = new BytesRef();

    @Override
    public int compare(BytesRef o1, BytesRef o2) {
      scratch1.bytes = o1.bytes;
      scratch1.offset = o1.offset;
      scratch1.length = o1.length;
      for (int i = scratch1.length - 1; i >= 0; i--) {
        if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
          scratch1.length = i;
          break;
        }
      }
      scratch2.bytes = o2.bytes;
      scratch2.offset = o2.offset;
      scratch2.length = o2.length;
      for (int i = scratch2.length - 1; i >= 0; i--) {
        if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
          scratch2.length = i;
          break;
        }
      }
      int cmp = scratch1.compareTo(scratch2);
      if (cmp == 0) {
        // tie break on whole row
        return o1.compareTo(o2);
      } else {
        return cmp;
      }
    }
  });
  String sorted;
  boolean success = false;
  try {
    sorted = sorter.sort(unsorted.getName());
    success = true;
  } finally {
    if (success) {
      tempDir.deleteFile(unsorted.getName());
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
    }
  }
  boolean success2 = false;
  try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    String currentEntry = null;
    IntsRefBuilder currentOrds = new IntsRefBuilder();
    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break;
      }
      String line = scratch.utf8ToString();
      String entry;
      char[] wordForm;
      int end;
      int flagSep = line.indexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        end = line.indexOf(MORPH_SEPARATOR);
        entry = line.substring(0, end);
      } else {
        end = line.indexOf(MORPH_SEPARATOR);
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }
        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm);
        entry = line.substring(0, flagSep);
      }
      // we possibly have morphological data
      int stemExceptionID = 0;
      if (hasStemExceptions && end + 1 < line.length()) {
        String stemException = parseStemException(line.substring(end + 1));
        if (stemException != null) {
          if (stemExceptionCount == stemExceptions.length) {
            int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
            stemExceptions = Arrays.copyOf(stemExceptions, newSize);
          }
          // we use '0' to indicate no exception for the form
          stemExceptionID = stemExceptionCount + 1;
          stemExceptions[stemExceptionCount++] = stemException;
        }
      }
      int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
      if (cmp < 0) {
        throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
      } else {
        encodeFlags(flagsScratch, wordForm);
        int ord = flagLookup.add(flagsScratch.get());
        if (ord < 0) {
          // already exists in our hash
          ord = (-ord) - 1;
        }
        // finalize current entry, and switch "current" if necessary
        if (cmp > 0 && currentEntry != null) {
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
        }
        // swap current
        if (cmp > 0 || currentEntry == null) {
          currentEntry = entry;
          // must be this way
          currentOrds = new IntsRefBuilder();
        }
        if (hasStemExceptions) {
          currentOrds.append(ord);
          currentOrds.append(stemExceptionID);
        } else {
          currentOrds.append(ord);
        }
      }
    }
    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts.get(), currentOrds.get());
    success2 = true;
  } finally {
    if (success2) {
      tempDir.deleteFile(sorted);
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
    }
  }
}
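The offline-sort comparator above orders rows by the bare word, ignoring the trailing flag/morphology portion, and only tie-breaks on the whole row; that is what guarantees all rows for the same word arrive adjacent in the merge loop that follows. A standalone sketch of that ordering on plain strings (the separator characters and the key helper are illustrative simplifications of FLAG_SEPARATOR and MORPH_SEPARATOR):

import java.util.Arrays;
import java.util.Comparator;

public class EntrySortSketch {

  static final char FLAG_SEP = '/';   // illustrative stand-in for FLAG_SEPARATOR
  static final char MORPH_SEP = '\t'; // illustrative stand-in for MORPH_SEPARATOR

  // the comparison key: the row with the flag/morph suffix stripped
  static String key(String row) {
    for (int i = 0; i < row.length(); i++) {
      char c = row.charAt(i);
      if (c == FLAG_SEP || c == MORPH_SEP) {
        return row.substring(0, i);
      }
    }
    return row;
  }

  public static void main(String[] args) {
    String[] rows = { "work/B", "worker/A", "work/A" };
    Comparator<String> byKey = Comparator.comparing(EntrySortSketch::key);
    Arrays.sort(rows, byKey.thenComparing(Comparator.naturalOrder()));
    // rows for the same bare word are now adjacent:
    System.out.println(Arrays.toString(rows)); // [work/A, work/B, worker/A]
  }
}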
Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.
The class TestLucene54DocValuesFormat, method testSortedSetAroundBlockSize.
@Slow
public void testSortedSetAroundBlockSize() throws IOException {
  final int frontier = 1 << Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
  for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
    final Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
    RAMFile buffer = new RAMFile();
    RAMOutputStream out = new RAMOutputStream(buffer, false);
    Document doc = new Document();
    SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field1);
    SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field2);
    for (int i = 0; i < maxDoc; ++i) {
      BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      field1.setBytesValue(s1);
      field2.setBytesValue(s2);
      w.addDocument(doc);
      Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
      out.writeVInt(set.size());
      for (BytesRef ref : set) {
        out.writeVInt(ref.length);
        out.writeBytes(ref.bytes, ref.offset, ref.length);
      }
    }
    out.close();
    w.forceMerge(1);
    DirectoryReader r = DirectoryReader.open(w);
    w.close();
    LeafReader sr = getOnlyLeafReader(r);
    assertEquals(maxDoc, sr.maxDoc());
    SortedSetDocValues values = sr.getSortedSetDocValues("sset");
    assertNotNull(values);
    RAMInputStream in = new RAMInputStream("", buffer);
    BytesRefBuilder b = new BytesRefBuilder();
    for (int i = 0; i < maxDoc; ++i) {
      assertEquals(i, values.nextDoc());
      final int numValues = in.readVInt();
      for (int j = 0; j < numValues; ++j) {
        b.setLength(in.readVInt());
        b.grow(b.length());
        in.readBytes(b.bytes(), 0, b.length());
        assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
      }
      assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
    }
    r.close();
    dir.close();
  }
}
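The verify loop shows the standard pattern for reusing a single BytesRefBuilder across many reads: set the length from the stream, grow to guarantee capacity, then fill the backing array in place. A minimal standalone sketch of that pattern against a ByteArrayDataInput:

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRefBuilder;

public class ReuseBuilderSketch {
  public static void main(String[] args) {
    // one length-prefixed value: vInt length 3, then the bytes 'f', 'o', 'o'
    byte[] data = new byte[] { 3, 'f', 'o', 'o' };
    ByteArrayDataInput in = new ByteArrayDataInput(data);
    BytesRefBuilder b = new BytesRefBuilder();
    b.setLength(in.readVInt()); // record the value's length
    b.grow(b.length());         // ensure the backing array can hold it
    in.readBytes(b.bytes(), 0, b.length());
    System.out.println(b.get().utf8ToString()); // prints foo
  }
}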