use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class TestSynonymGraphFilter method add.
/**
 * Registers a single synonym rule on the supplied builder.
 *
 * <p>Both {@code input} and {@code output} are split on runs of one or more spaces, and the
 * resulting words are combined with {@link SynonymMap.Builder#join} before being handed to
 * {@code b.add(...)}.
 *
 * @param b the synonym map builder that receives the rule
 * @param input space-separated words of the phrase to match
 * @param output space-separated words of the replacement phrase
 * @param keepOrig forwarded unchanged to {@code SynonymMap.Builder.add}
 */
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
  final CharsRefBuilder joinedInput = new CharsRefBuilder();
  final String[] inputWords = input.split(" +");
  SynonymMap.Builder.join(inputWords, joinedInput);

  final CharsRefBuilder joinedOutput = new CharsRefBuilder();
  final String[] outputWords = output.split(" +");
  SynonymMap.Builder.join(outputWords, joinedOutput);

  b.add(joinedInput.get(), joinedOutput.get(), keepOrig);
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class SimpleTextLiveDocsFormat method readLiveDocs.
/**
 * Reads the live-docs file of a segment that has deletions and returns it as a {@link Bits}.
 *
 * <p>The file is read line by line: first a {@code SIZE} line giving the number of bits, then
 * zero or more {@code DOC} lines each naming a document id whose bit is set, terminated by an
 * {@code END} line and followed by a footer that is verified with
 * {@code SimpleTextUtil.checkFooter}.
 *
 * @param dir directory the live-docs file is opened from
 * @param info segment commit info; must report {@code hasDeletions()} (checked by assertion)
 * @param context IO context passed to {@code openChecksumInput}
 * @return a {@code SimpleTextBits} wrapping the bit set that was read and its size
 * @throws IOException if opening, reading, footer verification or closing fails
 */
@Override
public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
  assert info.hasDeletions();
  final BytesRefBuilder line = new BytesRefBuilder();
  final CharsRefBuilder utf16Scratch = new CharsRefBuilder();
  final String liveDocsFile =
      IndexFileNames.fileNameFromGeneration(info.info.name, LIVEDOCS_EXTENSION, info.getDelGen());

  ChecksumIndexInput input = null;
  boolean succeeded = false;
  try {
    input = dir.openChecksumInput(liveDocsFile, context);

    // Header line: total number of bits.
    SimpleTextUtil.readLine(input, line);
    assert StringHelper.startsWith(line.get(), SIZE);
    final int size = parseIntAt(line.get(), SIZE.length, utf16Scratch);

    // One DOC line per set bit, until the END marker is reached.
    final BitSet docBits = new BitSet(size);
    for (SimpleTextUtil.readLine(input, line);
        !line.get().equals(END);
        SimpleTextUtil.readLine(input, line)) {
      assert StringHelper.startsWith(line.get(), DOC);
      final int docId = parseIntAt(line.get(), DOC.length, utf16Scratch);
      docBits.set(docId);
    }

    SimpleTextUtil.checkFooter(input);
    succeeded = true;
    return new SimpleTextBits(docBits, size);
  } finally {
    if (!succeeded) {
      // An exception is already propagating; don't let a close failure mask it.
      IOUtils.closeWhileHandlingException(input);
    } else {
      IOUtils.close(input);
    }
  }
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class TestIndexWriterUnicode method testRandomUnicodeStrings.
// LUCENE-510
/**
 * Round-trips many random 20-char UTF-16 buffers through UTF-8 and back.
 *
 * <p>For every iteration {@code fillUnicode} fills {@code buffer} with the raw chars and
 * {@code expected} with the chars expected after the round trip. When the buffer is reported
 * to contain no illegal sequences, the UTF-8 bytes produced by {@link BytesRef} are compared
 * byte-for-byte with the JDK's own UTF-8 encoding. In all cases the bytes are decoded back with
 * {@code CharsRefBuilder.copyUTF8Bytes} and compared with {@code expected}.
 */
public void testRandomUnicodeStrings() throws Throwable {
  final int length = 20;
  char[] buffer = new char[length];
  char[] expected = new char[length];
  CharsRefBuilder utf16 = new CharsRefBuilder();
  int num = atLeast(100000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, length);
    BytesRef utf8 = new BytesRef(CharBuffer.wrap(buffer, 0, length));
    if (!hasIllegal) {
      // Only well-formed input can be cross-checked against the JDK encoder.
      byte[] b = new String(buffer, 0, length).getBytes(StandardCharsets.UTF_8);
      assertEquals(b.length, utf8.length);
      for (int i = 0; i < b.length; i++) {
        assertEquals(b[i], utf8.bytes[i]);
      }
    }
    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    // Expected value first, so a failure message reports expected/actual correctly.
    assertEquals(length, utf16.length());
    for (int i = 0; i < length; i++) {
      assertEquals(expected[i], utf16.charAt(i));
    }
  }
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class CompletionTokenStreamTest method testWithMultipleTokens.
/**
 * Feeds three whitespace-separated tokens through a {@code CompletionTokenStream} and checks
 * that a single token comes out: the three words joined by {@code CompletionAnalyzer.SEP_LABEL},
 * with the configured payload surfaced as the token type by {@code PayloadAttrToTypeAttrFilter}
 * and a position increment of 1.
 */
@Test
public void testWithMultipleTokens() throws Exception {
  final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  final String input = "mykeyword another keyword";
  tokenizer.setReader(new StringReader(input));

  final BytesRef payload = new BytesRef("payload");
  final CompletionTokenStream completionStream = new CompletionTokenStream(tokenizer);
  completionStream.setPayload(payload);
  final PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionStream);

  // Build the expected term: the words separated by the completion separator label.
  final char separator = (char) CompletionAnalyzer.SEP_LABEL;
  final CharsRefBuilder expectedTerm = new CharsRefBuilder();
  expectedTerm.append("mykeyword");
  expectedTerm.append(separator);
  expectedTerm.append("another");
  expectedTerm.append(separator);
  expectedTerm.append("keyword");

  final String[] expectedTerms = new String[] { expectedTerm.toCharsRef().toString() };
  final String[] expectedTypes = new String[] { payload.utf8ToString() };
  final int[] expectedPosIncrements = new int[] { 1 };
  assertTokenStreamContents(
      stream, expectedTerms, null, null, expectedTypes, expectedPosIncrements, null, null);
}
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class DocValuesFacets method getCounts.
/**
 * Computes facet counts for {@code fieldName} over the documents in {@code docs} using the
 * field's doc values.
 *
 * <p>Per-segment ordinals are accumulated into one {@code int[]} indexed by (global) term
 * ordinal relative to {@code startTermIndex}; the selected terms are then emitted either in
 * descending count order or in index order, and the list is passed through
 * {@code finalize(...)} before being returned.
 *
 * @param searcher searcher supplying the schema, the top-level reader and its leaves
 * @param docs the document set to count over
 * @param fieldName the field to facet on
 * @param offset number of qualifying terms to skip before emitting results
 * @param limit maximum number of terms to emit; a negative value means no limit
 * @param mincount minimum count a term needs in order to be emitted
 * @param missing forwarded to {@code finalize(...)} together with the computed missing count
 * @param sort {@code FacetParams.FACET_SORT_COUNT} (or its legacy form) for count order;
 *     any other value yields index order
 * @param prefix if non-null and non-empty, only terms starting with this prefix are considered
 * @param termFilter optional predicate; terms it rejects are skipped
 * @param fdebug optional debug sink; receives {@code "numBuckets"} when non-null
 * @return the facet counts as (readable term, count) pairs, as returned by {@code finalize(...)}
 * @throws IOException propagated from the underlying doc-values / filter access
 * @throws UnsupportedOperationException if the field has {@code Integer.MAX_VALUE} or more
 *     unique terms
 */
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
SchemaField schemaField = searcher.getSchema().getField(fieldName);
FieldType ft = schemaField.getType();
NamedList<Integer> res = new NamedList<>();
// TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
// for term lookups only
final SortedSetDocValues si;
// for mapping per-segment ords to global ones
OrdinalMap ordinalMap = null;
if (multiValued) {
si = searcher.getSlowAtomicReader().getSortedSetDocValues(fieldName);
if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
ordinalMap = ((MultiSortedSetDocValues) si).mapping;
}
} else {
// single-valued: wrap the SortedDocValues so the rest of the method can use one API
SortedDocValues single = searcher.getSlowAtomicReader().getSortedDocValues(fieldName);
si = single == null ? null : DocValues.singleton(single);
if (single instanceof MultiDocValues.MultiSortedDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
}
}
// no doc values for this field: nothing to count (missing count is passed as -1)
if (si == null) {
return finalize(res, searcher, schemaField, docs, -1, missing);
}
// ordinals are narrowed to int below, so the value count must fit
if (si.getValueCount() >= Integer.MAX_VALUE) {
throw new UnsupportedOperationException("Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
}
// normalize the prefix: an empty prefix is treated exactly like no prefix
final BytesRefBuilder prefixRef;
if (prefix == null) {
prefixRef = null;
} else if (prefix.length() == 0) {
prefix = null;
prefixRef = null;
} else {
prefixRef = new BytesRefBuilder();
prefixRef.copyChars(prefix);
}
// [startTermIndex, endTermIndex) is the ordinal range of terms to count
int startTermIndex, endTermIndex;
if (prefix != null) {
// a negative lookupTerm result is turned into an ordinal via -x-1
// (presumably the usual "-(insertion point)-1" convention -- confirm against lookupTerm docs)
startTermIndex = (int) si.lookupTerm(prefixRef.get());
if (startTermIndex < 0)
startTermIndex = -startTermIndex - 1;
// prefix + BIG_TERM is looked up to find the end of the prefix range
prefixRef.append(UnicodeUtil.BIG_TERM);
endTermIndex = (int) si.lookupTerm(prefixRef.get());
assert endTermIndex < 0;
endTermIndex = -endTermIndex - 1;
} else {
// start at -1 so that slot 0 of the counts array is reserved (read back as missingCount below)
startTermIndex = -1;
endTermIndex = (int) si.getValueCount();
}
final int nTerms = endTermIndex - startTermIndex;
int missingCount = -1;
// reused scratch buffer for converting indexed terms to their readable form
final CharsRefBuilder charsRef = new CharsRefBuilder();
if (nTerms > 0 && docs.size() >= mincount) {
// count collection array only needs to be as big as the number of terms we are
// going to collect counts for.
final int[] counts = new int[nTerms];
if (fdebug != null) {
fdebug.putInfoItem("numBuckets", nTerms);
}
Filter filter = docs.getTopFilter();
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
// accumulate counts segment by segment
for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
LeafReaderContext leaf = leaves.get(subIndex);
// solr docsets already exclude any deleted docs
DocIdSet dis = filter.getDocIdSet(leaf, null);
DocIdSetIterator disi = null;
if (dis != null) {
disi = dis.iterator();
}
// a null set or iterator means there is nothing to iterate in this segment
if (disi != null) {
if (multiValued) {
SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySortedSet();
}
final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
if (singleton != null) {
// some codecs may optimize SORTED_SET storage for single-valued fields
accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
} else {
accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
}
} else {
SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySorted();
}
accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
}
}
}
// without a prefix, slot 0 corresponds to ordinal -1 and is reported as the missing count
if (startTermIndex == -1) {
missingCount = counts[0];
}
// IDEA: we could also maintain a count of "other"... everything that fell outside
// of the top 'N'
int off = offset;
int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
// count order: keep the best (offset + limit) entries in a bounded priority queue
int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
maxsize = Math.min(maxsize, nTerms);
LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);
// the smallest value in the top 'N' values
int min = mincount - 1;
// skip slot 0 when it is the reserved "missing" slot
for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
int c = counts[i];
if (c > min) {
if (termFilter != null) {
final BytesRef term = si.lookupOrd(startTermIndex + i);
if (!termFilter.test(term)) {
continue;
}
}
// pack (count, term slot) into one long: count in the high 32 bits,
// smaller term numbers sort higher, so subtract the term number instead
long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
boolean displaced = queue.insert(pair);
// once an insert displaces an entry, raise the threshold to the queue's current top count
if (displaced)
min = (int) (queue.top() >>> 32);
}
}
// if we are deep paging, we don't have to order the highest "offset" counts.
int collectCount = Math.max(0, queue.size() - off);
assert collectCount <= lim;
// the start and end indexes of our list "sorted" (starting with the highest value)
int sortedIdxStart = queue.size() - (collectCount - 1);
int sortedIdxEnd = queue.size() + 1;
final long[] sorted = queue.sort(collectCount);
for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
// unpack the count (high bits) and the term slot (low bits) again
long pair = sorted[i];
int c = (int) (pair >>> 32);
int tnum = Integer.MAX_VALUE - (int) pair;
final BytesRef term = si.lookupOrd(startTermIndex + tnum);
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
} else {
// add results in index order
int i = (startTermIndex == -1) ? 1 : 0;
if (mincount <= 0 && termFilter == null) {
// if mincount<=0 and we're not examining the values for the term filter, then
// we won't discard any terms and we know exactly where to start.
i += off;
off = 0;
}
for (; i < nTerms; i++) {
int c = counts[i];
if (c < mincount)
continue;
BytesRef term = null;
if (termFilter != null) {
term = si.lookupOrd(startTermIndex + i);
if (!termFilter.test(term)) {
continue;
}
}
// consume the remaining offset first, then the limit
if (--off >= 0)
continue;
if (--lim < 0)
break;
// the term is only looked up here if the filter branch did not already fetch it
if (term == null) {
term = si.lookupOrd(startTermIndex + i);
}
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
}
}
return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
Aggregations