Search in sources:

Example 46 with BytesRefBuilder

Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by Apache.

From class TestLegacyNumericUtils, method testLongSpecialValues.

public void testLongSpecialValues() throws Exception {
    long[] vals = new long[] { Long.MIN_VALUE, Long.MIN_VALUE + 1, Long.MIN_VALUE + 2, -5003400000000L, -4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 300L, 50006789999999999L, Long.MAX_VALUE - 2, Long.MAX_VALUE - 1, Long.MAX_VALUE };
    BytesRefBuilder[] prefixVals = new BytesRefBuilder[vals.length];
    for (int i = 0; i < vals.length; i++) {
        prefixVals[i] = new BytesRefBuilder();
        LegacyNumericUtils.longToPrefixCoded(vals[i], 0, prefixVals[i]);
        // check forward and back conversion
        assertEquals("forward and back conversion should generate same long", vals[i], LegacyNumericUtils.prefixCodedToLong(prefixVals[i].get()));
        // test if decoding values as int fails correctly
        final int index = i;
        expectThrows(NumberFormatException.class, () -> {
            LegacyNumericUtils.prefixCodedToInt(prefixVals[index].get());
        });
    }
    // check sort order (prefixVals should be ascending)
    for (int i = 1; i < prefixVals.length; i++) {
        assertTrue("check sort order", prefixVals[i - 1].get().compareTo(prefixVals[i].get()) < 0);
    }
    // check the prefix encoding: at lower precision, the difference from the original value should equal the removed lower bits
    final BytesRefBuilder ref = new BytesRefBuilder();
    for (int i = 0; i < vals.length; i++) {
        for (int j = 0; j < 64; j++) {
            LegacyNumericUtils.longToPrefixCoded(vals[i], j, ref);
            long prefixVal = LegacyNumericUtils.prefixCodedToLong(ref.get());
            long mask = (1L << j) - 1L;
            assertEquals("difference between prefix val and original value for " + vals[i] + " with shift=" + j, vals[i] & mask, vals[i] - prefixVal);
        }
    }
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)
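
The identity this test asserts is plain two's-complement arithmetic: prefix coding with shift j zeroes the low j bits, so the amount lost is exactly the value masked to those bits. A standalone sketch of that arithmetic (class and variable names are illustrative; no Lucene required):

public class PrefixMaskDemo {
    public static void main(String[] args) {
        long value = 50006789999999999L;
        int shift = 8;
        long mask = (1L << shift) - 1L;   // mask covering the low 8 bits (shift = 8)
        long truncated = value & ~mask;   // value with those bits zeroed, as prefix coding does
        // the removed amount is exactly the masked-off low bits,
        // which is what testLongSpecialValues asserts for every shift 0..63
        System.out.println(value - truncated == (value & mask));  // true
    }
}

The identity value - (value & ~mask) == (value & mask) holds for every long, negative values included, because the two masked parts partition the bits.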

Example 47 with BytesRefBuilder

Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by Apache.

From class AtomicUpdateDocumentMerger, method doInc.

protected void doInc(SolrInputDocument toDoc, SolrInputField sif, Object fieldVal) {
    SolrInputField numericField = toDoc.get(sif.getName());
    SchemaField sf = schema.getField(sif.getName());
    if (numericField != null || sf.getDefaultValue() != null) {
        // TODO: fieldtype needs externalToObject?
        String oldValS = (numericField != null) ? numericField.getFirstValue().toString() : sf.getDefaultValue().toString();
        BytesRefBuilder term = new BytesRefBuilder();
        sf.getType().readableToIndexed(oldValS, term);
        Object oldVal = sf.getType().toObject(sf, term.get());
        String fieldValS = fieldVal.toString();
        Number result;
        if (oldVal instanceof Long) {
            result = ((Long) oldVal).longValue() + Long.parseLong(fieldValS);
        } else if (oldVal instanceof Float) {
            result = ((Float) oldVal).floatValue() + Float.parseFloat(fieldValS);
        } else if (oldVal instanceof Double) {
            result = ((Double) oldVal).doubleValue() + Double.parseDouble(fieldValS);
        } else {
            // int, short, byte
            result = ((Integer) oldVal).intValue() + Integer.parseInt(fieldValS);
        }
        toDoc.setField(sif.getName(), result);
    } else {
        toDoc.setField(sif.getName(), fieldVal);
    }
}
Also used: SchemaField (org.apache.solr.schema.SchemaField), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), SolrInputField (org.apache.solr.common.SolrInputField)
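
doInc is the server-side half of Solr's atomic "inc" update: the increment arrives as the value of a map keyed by "inc" on the field being updated. A minimal SolrJ sketch of the kind of request that eventually reaches this method (the id and field names here are illustrative, not taken from the source above):

import java.util.Collections;
import org.apache.solr.common.SolrInputDocument;

public class IncUpdateDemo {
    public static void main(String[] args) {
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "doc1");
        // atomic update: ask Solr to add 5 to the stored value of "popularity";
        // on the server, AtomicUpdateDocumentMerger.doInc performs the addition
        // using the numeric type declared in the schema
        doc.addField("popularity", Collections.singletonMap("inc", 5));
        // client.add(doc); client.commit();  // assuming an initialized SolrClient
    }
}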

Example 48 with BytesRefBuilder

Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by Apache.

From class Lucene54DocValuesConsumer, method addTermsDict.

/** expert: writes a value dictionary for a sorted/sortedset field */
private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // first pass: check whether this is a "fixed-length" terms dict and, if so, how compressible it is
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    long numValues = 0;
    BytesRefBuilder previousValue = new BytesRefBuilder();
    // only valid for fixed-width data, as we have a choice there
    long prefixSum = 0;
    for (BytesRef v : values) {
        minLength = Math.min(minLength, v.length);
        maxLength = Math.max(maxLength, v.length);
        if (minLength == maxLength) {
            int termPosition = (int) (numValues & INTERVAL_MASK);
            if (termPosition == 0) {
                // first term in block, save it away to compare against the last term later
                previousValue.copyBytes(v);
            } else if (termPosition == INTERVAL_COUNT - 1) {
                // last term in block, accumulate shared prefix against first term
                prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
            }
        }
        numValues++;
    }
    // prefix compression costs roughly 2 bytes of overhead per term,
    // so only compress if terms share at least 3 bytes on average.
    if (minLength == maxLength && prefixSum <= 3 * (numValues >> INTERVAL_SHIFT)) {
        // no index needed: not very compressible, direct addressing by mult
        addBinaryField(field, values);
    } else if (numValues < REVERSE_INTERVAL_COUNT) {
        // low cardinality: waste a few KB of ram, but can't really use fancy index etc
        addBinaryField(field, values);
    } else {
        // we don't have to handle the empty case
        assert numValues > 0;
        // header
        meta.writeVInt(field.number);
        meta.writeByte(Lucene54DocValuesFormat.BINARY);
        meta.writeVInt(BINARY_PREFIX_COMPRESSED);
        meta.writeLong(-1L);
        // now write the bytes: sharing prefixes within a block
        final long startFP = data.getFilePointer();
        // currently, we have to store the delta from expected for every 1/nth term
        // we could avoid this, but it's not much and less overall RAM than the previous approach!
        RAMOutputStream addressBuffer = new RAMOutputStream();
        MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE);
        // buffers up 16 terms
        RAMOutputStream bytesBuffer = new RAMOutputStream();
        // buffers up block header
        RAMOutputStream headerBuffer = new RAMOutputStream();
        BytesRefBuilder lastTerm = new BytesRefBuilder();
        lastTerm.grow(maxLength);
        long count = 0;
        int[] suffixDeltas = new int[INTERVAL_COUNT];
        for (BytesRef v : values) {
            int termPosition = (int) (count & INTERVAL_MASK);
            if (termPosition == 0) {
                termAddresses.add(data.getFilePointer() - startFP);
                // abs-encode first term
                headerBuffer.writeVInt(v.length);
                headerBuffer.writeBytes(v.bytes, v.offset, v.length);
                lastTerm.copyBytes(v);
            } else {
                // prefix-code: we only share at most 255 characters, to encode the length as a single
                // byte and have random access. Larger terms just get less compression.
                int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v));
                bytesBuffer.writeByte((byte) sharedPrefix);
                bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
                // we can encode one smaller, because terms are unique.
                suffixDeltas[termPosition] = v.length - sharedPrefix - 1;
            }
            count++;
            // flush block
            if ((count & INTERVAL_MASK) == 0) {
                flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
            }
        }
        // flush any leftover partial block
        int leftover = (int) (count & INTERVAL_MASK);
        if (leftover > 0) {
            Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0);
            flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas);
        }
        final long indexStartFP = data.getFilePointer();
        // write addresses of indexed terms
        termAddresses.finish();
        addressBuffer.writeTo(data);
        addressBuffer = null;
        termAddresses = null;
        meta.writeVInt(minLength);
        meta.writeVInt(maxLength);
        meta.writeVLong(count);
        meta.writeLong(startFP);
        meta.writeLong(indexStartFP);
        meta.writeVInt(PackedInts.VERSION_CURRENT);
        meta.writeVInt(MONOTONIC_BLOCK_SIZE);
        addReverseTermIndex(field, values, maxLength);
    }
}
Also used: MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), RAMOutputStream (org.apache.lucene.store.RAMOutputStream), BytesRef (org.apache.lucene.util.BytesRef)
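
The heart of the block encoding above is measuring how many leading bytes consecutive terms share, with BytesRefBuilder holding the previous term as a reusable buffer. A standalone sketch of that measurement, using the same Lucene utilities the method itself uses (term values are illustrative):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;

public class SharedPrefixDemo {
    public static void main(String[] args) {
        BytesRefBuilder lastTerm = new BytesRefBuilder();
        lastTerm.copyChars("lucene");              // reusable buffer, as in addTermsDict
        BytesRef next = new BytesRef("lucent");
        // bytesDifference returns the index of the first differing byte,
        // i.e. the length of the common prefix; capped at 255 so the
        // shared length fits in the single header byte written above
        int shared = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), next));
        System.out.println(shared);                // 5 ("lucen")
        System.out.println(next.length - shared);  // 1 suffix byte ('t') left to store
    }
}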

Example 49 with BytesRefBuilder

Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by Apache.

From class Dictionary, method readDictionaryFiles.

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map.
   *
   * @param tempDir Directory to use for temporary files while sorting entries offline
   * @param tempFileNamePrefix prefix for the names of those temporary files
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @param words builder that accumulates each word and its flag/stem ordinals
   * @throws IOException Can be thrown while reading from the file
   */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    StringBuilder sb = new StringBuilder();
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            // first line is number of entries (approximately, sometimes)
            String line = lines.readLine();
            while ((line = lines.readLine()) != null) {
                // skip blank lines and comments (hunspell comment rules are wild and unpredictable)
                if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
                    continue;
                }
                line = unescapeEntry(line);
                // if we haven't seen any stem exceptions yet, try to parse one
                if (hasStemExceptions == false) {
                    int morphStart = line.indexOf(MORPH_SEPARATOR);
                    if (morphStart >= 0 && morphStart < line.length()) {
                        hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                    }
                }
                if (needsInputCleaning) {
                    int flagSep = line.indexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        flagSep = line.indexOf(MORPH_SEPARATOR);
                    }
                    if (flagSep == -1) {
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        if (cleansed != sb) {
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        CodecUtil.writeFooter(unsorted);
    }
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {

        BytesRef scratch1 = new BytesRef();

        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }
            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;
            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }
            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    String sorted;
    boolean success = false;
    try {
        sorted = sorter.sort(unsorted.getName());
        success = true;
    } finally {
        if (success) {
            tempDir.deleteFile(unsorted.getName());
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
        }
    }
    boolean success2 = false;
    try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
        // TODO: the flags themselves can be double-chars (long) or also numeric
        // either way the trick is to encode them as char... but they must be parsed differently
        String currentEntry = null;
        IntsRefBuilder currentOrds = new IntsRefBuilder();
        while (true) {
            BytesRef scratch = reader.next();
            if (scratch == null) {
                break;
            }
            String line = scratch.utf8ToString();
            String entry;
            char[] wordForm;
            int end;
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
                wordForm = NOFLAGS;
                end = line.indexOf(MORPH_SEPARATOR);
                entry = line.substring(0, end);
            } else {
                end = line.indexOf(MORPH_SEPARATOR);
                String flagPart = line.substring(flagSep + 1, end);
                if (aliasCount > 0) {
                    flagPart = getAliasValue(Integer.parseInt(flagPart));
                }
                wordForm = flagParsingStrategy.parseFlags(flagPart);
                Arrays.sort(wordForm);
                entry = line.substring(0, flagSep);
            }
            // we possibly have morphological data
            int stemExceptionID = 0;
            if (hasStemExceptions && end + 1 < line.length()) {
                String stemException = parseStemException(line.substring(end + 1));
                if (stemException != null) {
                    if (stemExceptionCount == stemExceptions.length) {
                        int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                        stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                    }
                    // we use '0' to indicate no exception for the form
                    stemExceptionID = stemExceptionCount + 1;
                    stemExceptions[stemExceptionCount++] = stemException;
                }
            }
            int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
            if (cmp < 0) {
                throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
            } else {
                encodeFlags(flagsScratch, wordForm);
                int ord = flagLookup.add(flagsScratch.get());
                if (ord < 0) {
                    // already exists in our hash
                    ord = (-ord) - 1;
                }
                // finalize current entry, and switch "current" if necessary
                if (cmp > 0 && currentEntry != null) {
                    Util.toUTF32(currentEntry, scratchInts);
                    words.add(scratchInts.get(), currentOrds.get());
                }
                // swap current
                if (cmp > 0 || currentEntry == null) {
                    currentEntry = entry;
                    // must be this way
                    currentOrds = new IntsRefBuilder();
                }
                if (hasStemExceptions) {
                    currentOrds.append(ord);
                    currentOrds.append(stemExceptionID);
                } else {
                    currentOrds.append(ord);
                }
            }
        }
        // finalize last entry
        Util.toUTF32(currentEntry, scratchInts);
        words.add(scratchInts.get(), currentOrds.get());
        success2 = true;
    } finally {
        if (success2) {
            tempDir.deleteFile(sorted);
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
        }
    }
}
Also used: OfflineSorter (org.apache.lucene.util.OfflineSorter), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), InputStreamReader (java.io.InputStreamReader), BufferedInputStream (java.io.BufferedInputStream), InputStream (java.io.InputStream), IndexOutput (org.apache.lucene.store.IndexOutput), IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder), ByteSequencesReader (org.apache.lucene.util.OfflineSorter.ByteSequencesReader), BufferedReader (java.io.BufferedReader), ByteSequencesWriter (org.apache.lucene.util.OfflineSorter.ByteSequencesWriter), BytesRef (org.apache.lucene.util.BytesRef)
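
Throughout this method, BytesRefBuilder acts as a reusable scratch buffer: encodeFlags fills flagsScratch in place, and flagsScratch.get() hands a view of the bytes to flagLookup without allocating per entry. A minimal sketch of that pattern (the two-bytes-per-char layout shown is an assumption for illustration; the real encodeFlags may pack flags differently):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

public class ScratchBufferDemo {
    public static void main(String[] args) {
        BytesRefBuilder scratch = new BytesRefBuilder();
        scratch.clear();                         // reset the length, keep the backing array
        for (char flag : new char[] { 'A', 'B' }) {
            scratch.append((byte) (flag >> 8));  // high byte of the char
            scratch.append((byte) flag);         // low byte of the char
        }
        BytesRef encoded = scratch.get();        // a view over the builder's bytes,
                                                 // valid only until the next mutation
        System.out.println(encoded.length);      // 4 bytes for 2 flags
    }
}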

Example 50 with BytesRefBuilder

Use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by Apache.

From class TestLucene54DocValuesFormat, method testSortedSetAroundBlockSize.

@Slow
public void testSortedSetAroundBlockSize() throws IOException {
    final int frontier = 1 << Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
    for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
        final Directory dir = newDirectory();
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
        RAMFile buffer = new RAMFile();
        RAMOutputStream out = new RAMOutputStream(buffer, false);
        Document doc = new Document();
        SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
        doc.add(field1);
        SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
        doc.add(field2);
        for (int i = 0; i < maxDoc; ++i) {
            BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
            BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
            field1.setBytesValue(s1);
            field2.setBytesValue(s2);
            w.addDocument(doc);
            Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
            out.writeVInt(set.size());
            for (BytesRef ref : set) {
                out.writeVInt(ref.length);
                out.writeBytes(ref.bytes, ref.offset, ref.length);
            }
        }
        out.close();
        w.forceMerge(1);
        DirectoryReader r = DirectoryReader.open(w);
        w.close();
        LeafReader sr = getOnlyLeafReader(r);
        assertEquals(maxDoc, sr.maxDoc());
        SortedSetDocValues values = sr.getSortedSetDocValues("sset");
        assertNotNull(values);
        RAMInputStream in = new RAMInputStream("", buffer);
        BytesRefBuilder b = new BytesRefBuilder();
        for (int i = 0; i < maxDoc; ++i) {
            assertEquals(i, values.nextDoc());
            final int numValues = in.readVInt();
            for (int j = 0; j < numValues; ++j) {
                b.setLength(in.readVInt());
                b.grow(b.length());
                in.readBytes(b.bytes(), 0, b.length());
                assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
            }
            assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
        }
        r.close();
        dir.close();
    }
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), LeafReader (org.apache.lucene.index.LeafReader), DirectoryReader (org.apache.lucene.index.DirectoryReader), RAMInputStream (org.apache.lucene.store.RAMInputStream), Document (org.apache.lucene.document.Document), RAMFile (org.apache.lucene.store.RAMFile), SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues), IndexWriter (org.apache.lucene.index.IndexWriter), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), TreeSet (java.util.TreeSet), RAMOutputStream (org.apache.lucene.store.RAMOutputStream), SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
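
The read-back loop above shows the standard pattern for filling a BytesRefBuilder from a stream: declare the length, grow the backing array to match, then read directly into bytes(). A condensed, self-contained sketch of just that round trip (names and the sample value are illustrative):

import org.apache.lucene.store.RAMFile;
import org.apache.lucene.store.RAMInputStream;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

public class ReadBackDemo {
    public static void main(String[] args) throws Exception {
        // write one length-prefixed value, as the test does per document
        RAMFile buffer = new RAMFile();
        RAMOutputStream out = new RAMOutputStream(buffer, false);
        BytesRef written = new BytesRef("hello");
        out.writeVInt(written.length);
        out.writeBytes(written.bytes, written.offset, written.length);
        out.close();
        // read it back into a reusable builder
        RAMInputStream in = new RAMInputStream("demo", buffer);
        BytesRefBuilder b = new BytesRefBuilder();
        b.setLength(in.readVInt());              // declare how many bytes will be valid
        b.grow(b.length());                      // ensure the backing array is large enough
        in.readBytes(b.bytes(), 0, b.length());
        System.out.println(b.get().utf8ToString());  // hello
    }
}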

Aggregations

BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 150
BytesRef (org.apache.lucene.util.BytesRef): 79
ArrayList (java.util.ArrayList): 21
IOException (java.io.IOException): 17
Term (org.apache.lucene.index.Term): 16
HashSet (java.util.HashSet): 15
ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 14
FieldType (org.apache.solr.schema.FieldType): 14
IndexInput (org.apache.lucene.store.IndexInput): 12
BytesRefIterator (org.apache.lucene.util.BytesRefIterator): 10
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 10
IntsRef (org.apache.lucene.util.IntsRef): 10
SchemaField (org.apache.solr.schema.SchemaField): 10
BufferedChecksumIndexInput (org.apache.lucene.store.BufferedChecksumIndexInput): 9
ParseException (java.text.ParseException): 8
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 8
DecimalFormat (java.text.DecimalFormat): 7
HashMap (java.util.HashMap): 7
Map (java.util.Map): 7
Directory (org.apache.lucene.store.Directory): 7