
Example 6 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From the class TestSynonymGraphFilter, method add.

private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
    if (VERBOSE) {
        System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
    }
    CharsRefBuilder inputCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(input.split(" +"), inputCharsRef);
    CharsRefBuilder outputCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(output.split(" +"), outputCharsRef);
    b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder)
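
For context, SynonymMap.Builder.join concatenates the whitespace-split tokens into the reusable CharsRefBuilder, separated by SynonymMap.WORD_SEPARATOR, which is how a multi-word synonym side becomes a single map entry. A minimal sketch, using only the public Lucene API shown in the example (the class name JoinDemo is made up for illustration):

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRefBuilder;

public class JoinDemo {
    public static void main(String[] args) {
        CharsRefBuilder reuse = new CharsRefBuilder();
        // join(...) writes the tokens into the builder with
        // SynonymMap.WORD_SEPARATOR between them and returns the backing CharsRef.
        SynonymMap.Builder.join("wi fi network".split(" +"), reuse);
        System.out.println(reuse.length()); // 13: 11 token chars + 2 separators
    }
}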

Example 7 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From the class SimpleTextLiveDocsFormat, method readLiveDocs.

@Override
public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
    assert info.hasDeletions();
    BytesRefBuilder scratch = new BytesRefBuilder();
    CharsRefBuilder scratchUTF16 = new CharsRefBuilder();
    String fileName = IndexFileNames.fileNameFromGeneration(info.info.name, LIVEDOCS_EXTENSION, info.getDelGen());
    ChecksumIndexInput in = null;
    boolean success = false;
    try {
        in = dir.openChecksumInput(fileName, context);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch.get(), SIZE);
        int size = parseIntAt(scratch.get(), SIZE.length, scratchUTF16);
        BitSet bits = new BitSet(size);
        SimpleTextUtil.readLine(in, scratch);
        while (!scratch.get().equals(END)) {
            assert StringHelper.startsWith(scratch.get(), DOC);
            int docid = parseIntAt(scratch.get(), DOC.length, scratchUTF16);
            bits.set(docid);
            SimpleTextUtil.readLine(in, scratch);
        }
        SimpleTextUtil.checkFooter(in);
        success = true;
        return new SimpleTextBits(bits, size);
    } finally {
        if (success) {
            IOUtils.close(in);
        } else {
            IOUtils.closeWhileHandlingException(in);
        }
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) BitSet(java.util.BitSet) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder)
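
The parseIntAt call above is a private helper of SimpleTextLiveDocsFormat; in essence it decodes the UTF-8 bytes after a known prefix into the reusable CharsRefBuilder and parses the digits. A hedged sketch of such a helper, with an illustrative wrapper class and demo input rather than the actual Lucene source:

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class ParseIntAtSketch {
    // Decode the UTF-8 tail of a line (everything past offset) into the
    // reusable char buffer, then parse it as a decimal int.
    static int parseIntAt(BytesRef bytes, int offset, CharsRefBuilder scratch) {
        scratch.copyUTF8Bytes(bytes.bytes, bytes.offset + offset, bytes.length - offset);
        return ArrayUtil.parseInt(scratch.chars(), 0, scratch.length());
    }

    public static void main(String[] args) {
        CharsRefBuilder scratch = new CharsRefBuilder();
        System.out.println(parseIntAt(new BytesRef("size 42"), 5, scratch)); // 42
    }
}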

Example 8 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From the class TestIndexWriterUnicode, method testRandomUnicodeStrings.

// LUCENE-510
public void testRandomUnicodeStrings() throws Throwable {
    char[] buffer = new char[20];
    char[] expected = new char[20];
    CharsRefBuilder utf16 = new CharsRefBuilder();
    int num = atLeast(100000);
    for (int iter = 0; iter < num; iter++) {
        boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);
        BytesRef utf8 = new BytesRef(CharBuffer.wrap(buffer, 0, 20));
        if (!hasIllegal) {
            byte[] b = new String(buffer, 0, 20).getBytes(StandardCharsets.UTF_8);
            assertEquals(b.length, utf8.length);
            for (int i = 0; i < b.length; i++) assertEquals(b[i], utf8.bytes[i]);
        }
        utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
        assertEquals(utf16.length(), 20);
        for (int i = 0; i < 20; i++) assertEquals(expected[i], utf16.charAt(i));
    }
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)
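
The same UTF-8 to UTF-16 conversion can be exercised in isolation. A minimal sketch (the class name RoundTripDemo is made up for illustration):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class RoundTripDemo {
    public static void main(String[] args) {
        String s = "héllo \uD83D\uDE00"; // includes a surrogate pair
        BytesRef utf8 = new BytesRef(s); // BytesRef(CharSequence) encodes to UTF-8
        CharsRefBuilder utf16 = new CharsRefBuilder();
        utf16.copyUTF8Bytes(utf8.bytes, utf8.offset, utf8.length);
        System.out.println(s.equals(utf16.toCharsRef().toString())); // true
    }
}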

Example 9 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From the class CompletionTokenStreamTest, method testWithMultipleTokens.

@Test
public void testWithMultipleTokens() throws Exception {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword another keyword";
    tokenStream.setReader(new StringReader(input));
    BytesRef payload = new BytesRef("payload");
    CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream);
    completionTokenStream.setPayload(payload);
    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
    CharsRefBuilder builder = new CharsRefBuilder();
    builder.append("mykeyword");
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("another");
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("keyword");
    assertTokenStreamContents(stream, new String[] { builder.toCharsRef().toString() }, null, null, new String[] { payload.utf8ToString() }, new int[] { 1 }, null, null);
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) StringReader(java.io.StringReader) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)
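
Here CompletionAnalyzer.SEP_LABEL is the separator label the suggester places between analyzed tokens, so the assertion expects a single completion key: the three tokens joined by that separator and carried alongside the payload, rather than three separate tokens.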

Example 10 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

From the class DocValuesFacets, method getCounts.

public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
    SchemaField schemaField = searcher.getSchema().getField(fieldName);
    FieldType ft = schemaField.getType();
    NamedList<Integer> res = new NamedList<>();
    // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
    final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
    // for term lookups only
    final SortedSetDocValues si;
    // for mapping per-segment ords to global ones
    OrdinalMap ordinalMap = null;
    if (multiValued) {
        si = searcher.getSlowAtomicReader().getSortedSetDocValues(fieldName);
        if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
            ordinalMap = ((MultiSortedSetDocValues) si).mapping;
        }
    } else {
        SortedDocValues single = searcher.getSlowAtomicReader().getSortedDocValues(fieldName);
        si = single == null ? null : DocValues.singleton(single);
        if (single instanceof MultiDocValues.MultiSortedDocValues) {
            ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
        }
    }
    if (si == null) {
        return finalize(res, searcher, schemaField, docs, -1, missing);
    }
    if (si.getValueCount() >= Integer.MAX_VALUE) {
        throw new UnsupportedOperationException("Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
    }
    final BytesRefBuilder prefixRef;
    if (prefix == null) {
        prefixRef = null;
    } else if (prefix.length() == 0) {
        prefix = null;
        prefixRef = null;
    } else {
        prefixRef = new BytesRefBuilder();
        prefixRef.copyChars(prefix);
    }
    int startTermIndex, endTermIndex;
    if (prefix != null) {
        startTermIndex = (int) si.lookupTerm(prefixRef.get());
        if (startTermIndex < 0)
            startTermIndex = -startTermIndex - 1;
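        // Appending UnicodeUtil.BIG_TERM (bytes that sort after any valid UTF-8 term)
        // turns the prefix into an exclusive upper bound on the prefix's ord range.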
        prefixRef.append(UnicodeUtil.BIG_TERM);
        endTermIndex = (int) si.lookupTerm(prefixRef.get());
        assert endTermIndex < 0;
        endTermIndex = -endTermIndex - 1;
    } else {
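        // With no prefix, start at ord -1 so docs with no value land in
        // counts[0]; that slot is read back below as the "missing" count.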
        startTermIndex = -1;
        endTermIndex = (int) si.getValueCount();
    }
    final int nTerms = endTermIndex - startTermIndex;
    int missingCount = -1;
    final CharsRefBuilder charsRef = new CharsRefBuilder();
    if (nTerms > 0 && docs.size() >= mincount) {
        // count collection array only needs to be as big as the number of terms we are
        // going to collect counts for.
        final int[] counts = new int[nTerms];
        if (fdebug != null) {
            fdebug.putInfoItem("numBuckets", nTerms);
        }
        Filter filter = docs.getTopFilter();
        List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
        for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
            LeafReaderContext leaf = leaves.get(subIndex);
            // solr docsets already exclude any deleted docs
            DocIdSet dis = filter.getDocIdSet(leaf, null);
            DocIdSetIterator disi = null;
            if (dis != null) {
                disi = dis.iterator();
            }
            if (disi != null) {
                if (multiValued) {
                    SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
                    if (sub == null) {
                        sub = DocValues.emptySortedSet();
                    }
                    final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
                    if (singleton != null) {
                        // some codecs may optimize SORTED_SET storage for single-valued fields
                        accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
                    } else {
                        accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
                    }
                } else {
                    SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
                    if (sub == null) {
                        sub = DocValues.emptySorted();
                    }
                    accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
                }
            }
        }
        if (startTermIndex == -1) {
            missingCount = counts[0];
        }
        // IDEA: we could also maintain a count of "other"... everything that fell outside
        // of the top 'N'
        int off = offset;
        int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
        if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
            int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
            maxsize = Math.min(maxsize, nTerms);
            LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);
            // the smallest value in the top 'N' values
            int min = mincount - 1;
            for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
                int c = counts[i];
                if (c > min) {
                    if (termFilter != null) {
                        final BytesRef term = si.lookupOrd(startTermIndex + i);
                        if (!termFilter.test(term)) {
                            continue;
                        }
                    }
                    // smaller term numbers sort higher, so subtract the term number instead
                    long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
                    boolean displaced = queue.insert(pair);
                    if (displaced)
                        min = (int) (queue.top() >>> 32);
                }
            }
            // if we are deep paging, we don't have to order the highest "offset" counts.
            int collectCount = Math.max(0, queue.size() - off);
            assert collectCount <= lim;
            // the start and end indexes of our list "sorted" (starting with the highest value)
            int sortedIdxStart = queue.size() - (collectCount - 1);
            int sortedIdxEnd = queue.size() + 1;
            final long[] sorted = queue.sort(collectCount);
            for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
                long pair = sorted[i];
                int c = (int) (pair >>> 32);
                int tnum = Integer.MAX_VALUE - (int) pair;
                final BytesRef term = si.lookupOrd(startTermIndex + tnum);
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
            }
        } else {
            // add results in index order
            int i = (startTermIndex == -1) ? 1 : 0;
            if (mincount <= 0 && termFilter == null) {
                // if mincount<=0 and we're not examining the values for the term filter, then
                // we won't discard any terms and we know exactly where to start.
                i += off;
                off = 0;
            }
            for (; i < nTerms; i++) {
                int c = counts[i];
                if (c < mincount)
                    continue;
                BytesRef term = null;
                if (termFilter != null) {
                    term = si.lookupOrd(startTermIndex + i);
                    if (!termFilter.test(term)) {
                        continue;
                    }
                }
                if (--off >= 0)
                    continue;
                if (--lim < 0)
                    break;
                if (term == null) {
                    term = si.lookupOrd(startTermIndex + i);
                }
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
            }
        }
    }
    return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
Also used : MultiSortedSetDocValues(org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues) DocIdSet(org.apache.lucene.search.DocIdSet) MultiDocValues(org.apache.lucene.index.MultiDocValues) OrdinalMap(org.apache.lucene.index.MultiDocValues.OrdinalMap) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) LongPriorityQueue(org.apache.solr.util.LongPriorityQueue) NamedList(org.apache.solr.common.util.NamedList) SortedDocValues(org.apache.lucene.index.SortedDocValues) FieldType(org.apache.solr.schema.FieldType) SchemaField(org.apache.solr.schema.SchemaField) MultiSortedSetDocValues(org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) Filter(org.apache.solr.search.Filter) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator)
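
A note on the sort-by-count branch above: the expression (((long) c) << 32) + (Integer.MAX_VALUE - i) packs each count into the high 32 bits of a long and an inverted term ordinal into the low 32 bits, so equal counts tie-break toward the earlier term. A self-contained sketch of the packing and unpacking (the class name PackedPairDemo is made up for illustration):

public class PackedPairDemo {
    // Pack a count and a term ordinal the same way the facet code does:
    // count in the high 32 bits, Integer.MAX_VALUE - ordinal in the low 32 bits.
    static long pack(int count, int ordinal) {
        return (((long) count) << 32) + (Integer.MAX_VALUE - ordinal);
    }

    public static void main(String[] args) {
        long a = pack(5, 3);
        long b = pack(5, 7);
        // Equal counts: the smaller ordinal yields the larger packed value,
        // so it wins the tie in the count-descending ordering.
        System.out.println(a > b); // true
        // Unpacking mirrors the loop over the sorted queue entries above.
        System.out.println((int) (a >>> 32));            // 5  (the count)
        System.out.println(Integer.MAX_VALUE - (int) a); // 3  (the ordinal)
    }
}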

Aggregations

CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 52
BytesRef (org.apache.lucene.util.BytesRef): 30
ArrayList (java.util.ArrayList): 11
IOException (java.io.IOException): 10
NamedList (org.apache.solr.common.util.NamedList): 10
FieldType (org.apache.solr.schema.FieldType): 10
TermsEnum (org.apache.lucene.index.TermsEnum): 9
SchemaField (org.apache.solr.schema.SchemaField): 7
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 6
HashSet (java.util.HashSet): 5
Test (org.junit.Test): 5
TokenStream (org.apache.lucene.analysis.TokenStream): 4
PostingsEnum (org.apache.lucene.index.PostingsEnum): 4
Terms (org.apache.lucene.index.Terms): 4
SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap): 4
LeafReader (org.apache.lucene.index.LeafReader): 3
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 3
CharsRef (org.apache.lucene.util.CharsRef): 3
Util (org.apache.lucene.util.fst.Util): 3
SolrException (org.apache.solr.common.SolrException): 3