Search in sources :

Example 6 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.

the class BytesRefHashTests method testFind.

public void testFind() throws Exception {
    BytesRefBuilder ref = new BytesRefBuilder();
    BytesRef scratch = new BytesRef();
    int num = scaledRandomIntBetween(2, 20);
    for (int j = 0; j < num; j++) {
        Set<String> strings = new HashSet<>();
        int uniqueCount = 0;
        for (int i = 0; i < 797; i++) {
            String str;
            do {
                str = TestUtil.randomRealisticUnicodeString(random(), 1000);
            } while (str.length() == 0);
            ref.copyChars(str);
            long count = hash.size();
            //hash.add(ref);
            long key = hash.find(ref.get());
            if (key >= 0) {
                // string found in hash
                assertFalse(strings.add(str));
                assertTrue(key < count);
                assertEquals(str, hash.get(key, scratch).utf8ToString());
                assertEquals(count, hash.size());
            } else {
                key = hash.add(ref.get());
                assertTrue(strings.add(str));
                assertEquals(uniqueCount, key);
                assertEquals(hash.size(), count + 1);
                uniqueCount++;
            }
        }
        assertAllIn(strings, hash);
        newHash();
    }
    hash.close();
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 7 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.

the class XAnalyzingSuggester method build.

@Override
public void build(InputIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    hasPayloads = iterator.hasPayloads();
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the hightest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    //PrintWriter pw = new PrintWriter("/tmp/out.dot");
    //Util.toDot(fst, pw, true, true);
    //pw.close();
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 8 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.

the class CommonTermsQueryBuilder method parseQueryString.

private static Query parseQueryString(ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer, String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException {
    // Logic similar to QueryParser#getFieldQuery
    try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        BytesRefBuilder builder = new BytesRefBuilder();
        while (source.incrementToken()) {
            // UTF-8
            builder.copyChars(termAtt);
            query.add(new Term(field, builder.toBytesRef()));
        }
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return query;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Term(org.apache.lucene.index.Term)

Example 9 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.

the class Uid method createUidsForTypesAndIds.

public static BytesRef[] createUidsForTypesAndIds(Collection<String> types, Collection<?> ids) {
    BytesRef[] uids = new BytesRef[types.size() * ids.size()];
    BytesRefBuilder typeBytes = new BytesRefBuilder();
    BytesRefBuilder idBytes = new BytesRefBuilder();
    int index = 0;
    for (String type : types) {
        typeBytes.copyChars(type);
        for (Object id : ids) {
            uids[index++] = Uid.createUidAsBytes(typeBytes.get(), BytesRefs.toBytesRef(id, idBytes));
        }
    }
    return uids;
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 10 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.

the class SortingBinaryDocValues method grow.

/**
     * Make sure the {@link #values} array can store at least {@link #count} entries.
     */
protected final void grow() {
    if (values.length < count) {
        final int oldLen = values.length;
        final int newLen = ArrayUtil.oversize(count, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
        values = Arrays.copyOf(values, newLen);
        for (int i = oldLen; i < newLen; ++i) {
            values[i] = new BytesRefBuilder();
        }
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder)

Aggregations

BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)141 BytesRef (org.apache.lucene.util.BytesRef)73 ArrayList (java.util.ArrayList)20 IOException (java.io.IOException)16 HashSet (java.util.HashSet)14 Term (org.apache.lucene.index.Term)14 ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput)14 FieldType (org.apache.solr.schema.FieldType)13 IndexInput (org.apache.lucene.store.IndexInput)12 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)10 IntsRef (org.apache.lucene.util.IntsRef)10 BufferedChecksumIndexInput (org.apache.lucene.store.BufferedChecksumIndexInput)9 SchemaField (org.apache.solr.schema.SchemaField)9 ParseException (java.text.ParseException)8 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)8 DecimalFormat (java.text.DecimalFormat)7 HashMap (java.util.HashMap)7 Map (java.util.Map)7 Directory (org.apache.lucene.store.Directory)7 IndexOutput (org.apache.lucene.store.IndexOutput)6