use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class CompletionTokenStreamTest, method testWithSynonyms.
@Test
public void testWithSynonyms() throws Exception {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = "mykeyword another keyword";
  tokenStream.setReader(new StringReader(input));
  SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
  BytesRef payload = new BytesRef("payload");
  CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter, true, false, 100);
  completionTokenStream.setPayload(payload);
  PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
  String[] expectedOutputs = new String[2];
  CharsRefBuilder expectedOutput = new CharsRefBuilder();
  expectedOutput.append("mykeyword");
  expectedOutput.append((char) CompletionAnalyzer.SEP_LABEL);
  expectedOutput.append("another");
  expectedOutput.append((char) CompletionAnalyzer.SEP_LABEL);
  expectedOutput.append("keyword");
  expectedOutputs[0] = expectedOutput.toCharsRef().toString();
  expectedOutput.clear();
  expectedOutput.append("mysynonym");
  expectedOutput.append((char) CompletionAnalyzer.SEP_LABEL);
  expectedOutput.append("another");
  expectedOutput.append((char) CompletionAnalyzer.SEP_LABEL);
  expectedOutput.append("keyword");
  expectedOutputs[1] = expectedOutput.toCharsRef().toString();
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
}
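The detail worth calling out in this test is the builder reuse: toCharsRef() copies the current content, and clear() resets the length while keeping the backing char[] array. Below is a minimal standalone sketch of that pattern; SEP is a hypothetical stand-in for CompletionAnalyzer.SEP_LABEL, which the sketch does not assume access to.

import org.apache.lucene.util.CharsRefBuilder;

public class CharsRefBuilderReuseSketch {
  // Hypothetical separator; the test above uses CompletionAnalyzer.SEP_LABEL.
  private static final char SEP = '\u001F';

  public static void main(String[] args) {
    CharsRefBuilder builder = new CharsRefBuilder();
    String[] expected = new String[2];
    // First analyzed path: the original keyword.
    builder.append("mykeyword");
    builder.append(SEP);
    builder.append("another");
    builder.append(SEP);
    builder.append("keyword");
    // toCharsRef() copies the content, so the snapshot survives clear().
    expected[0] = builder.toCharsRef().toString();
    // clear() resets the length but keeps the backing array, so the same
    // builder is reused for the synonym path without reallocating.
    builder.clear();
    builder.append("mysynonym");
    builder.append(SEP);
    builder.append("another");
    builder.append(SEP);
    builder.append("keyword");
    expected[1] = builder.toCharsRef().toString();
    for (String s : expected) {
      System.out.println(s.replace(SEP, '|'));
    }
  }
}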
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class TestContextSuggestField, method testTokenStream.
@Test
public void testTokenStream() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  ContextSuggestField field = new ContextSuggestField("field", "input", 1, "context1", "context2");
  BytesRef surfaceForm = new BytesRef("input");
  ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
  try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
    output.writeVInt(surfaceForm.length);
    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
    output.writeVInt(1 + 1);
    output.writeByte(ContextSuggestField.TYPE);
  }
  BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
  String[] expectedOutputs = new String[2];
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("context1");
  builder.append((char) ContextSuggestField.CONTEXT_SEPARATOR);
  builder.append((char) CompletionAnalyzer.SEP_LABEL);
  builder.append("input");
  expectedOutputs[0] = builder.toCharsRef().toString();
  builder.clear();
  builder.append("context2");
  builder.append((char) ContextSuggestField.CONTEXT_SEPARATOR);
  builder.append((char) CompletionAnalyzer.SEP_LABEL);
  builder.append("input");
  expectedOutputs[1] = builder.toCharsRef().toString();
  TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
  CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
  stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
}
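The expected payload layout here is a vInt-prefixed surface form followed by a vInt and a type byte. As a sanity check on that layout, here is a hedged sketch that writes the same bytes and reads them back with ByteArrayDataInput; the type byte is a stand-in, since ContextSuggestField.TYPE is not assumed to be accessible outside the test.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BytesRef;

public class PayloadRoundTripSketch {
  public static void main(String[] args) throws IOException {
    BytesRef surfaceForm = new BytesRef("input");
    // Encode: vInt length, surface-form bytes, a vInt, and a type byte.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (OutputStreamDataOutput out = new OutputStreamDataOutput(bytes)) {
      out.writeVInt(surfaceForm.length);
      out.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
      out.writeVInt(1 + 1);    // same arithmetic as the test above
      out.writeByte((byte) 2); // hypothetical stand-in for ContextSuggestField.TYPE
    }
    // Decode with the matching DataInput to verify the layout.
    ByteArrayDataInput in = new ByteArrayDataInput(bytes.toByteArray());
    int len = in.readVInt();
    byte[] surface = new byte[len];
    in.readBytes(surface, 0, len);
    System.out.println(new String(surface, StandardCharsets.UTF_8)); // input
    System.out.println(in.readVInt());                               // 2
    System.out.println(in.readByte());                               // 2
  }
}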
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class TermsComponent, method process.
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.get(TermsParams.TERMS, "false").equals("true")) {
    return;
  }
  String[] fields = params.getParams(TermsParams.TERMS_FIELD);
  NamedList<Object> termsResult = new SimpleOrderedMap<>();
  rb.rsp.add("terms", termsResult);
  if (fields == null || fields.length == 0) {
    return;
  }
  boolean termStats = params.getBool(TermsParams.TERMS_STATS, false);
  if (termStats) {
    NamedList<Number> stats = new SimpleOrderedMap<>();
    rb.rsp.add("indexstats", stats);
    collectStats(rb.req.getSearcher(), stats);
  }
  String termList = params.get(TermsParams.TERMS_LIST);
  if (termList != null) {
    boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
    fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
    return;
  }
  int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
  if (limit < 0) {
    limit = Integer.MAX_VALUE;
  }
  String lowerStr = params.get(TermsParams.TERMS_LOWER);
  String upperStr = params.get(TermsParams.TERMS_UPPER);
  boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
  boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
  boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
  int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
  int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
  if (freqmax < 0) {
    freqmax = Integer.MAX_VALUE;
  }
  String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
  String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
  Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;
  boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
  final LeafReader indexReader = rb.req.getSearcher().getSlowAtomicReader();
  Fields lfields = indexReader.fields();
  for (String field : fields) {
    NamedList<Integer> fieldTerms = new NamedList<>();
    termsResult.add(field, fieldTerms);
    Terms terms = lfields.terms(field);
    if (terms == null) {
      // field does not exist
      continue;
    }
    FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
    if (ft == null) {
      ft = new StrField();
    }
    // prefix must currently be text
    BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);
    BytesRef upperBytes = null;
    if (upperStr != null) {
      BytesRefBuilder b = new BytesRefBuilder();
      ft.readableToIndexed(upperStr, b);
      upperBytes = b.get();
    }
    BytesRef lowerBytes;
    if (lowerStr == null) {
      // If no lower bound was specified, use the prefix
      lowerBytes = prefixBytes;
    } else if (raw) {
      // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
      // perhaps we detect if the FieldType is non-character and expect hex if so?
      lowerBytes = new BytesRef(lowerStr);
    } else {
      BytesRefBuilder b = new BytesRefBuilder();
      ft.readableToIndexed(lowerStr, b);
      lowerBytes = b.get();
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term = null;
    if (lowerBytes != null) {
      if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
        // Only advance the enum if we are excluding the lower bound and the lower term actually matches
        if (!lowerIncl && term.equals(lowerBytes)) {
          term = termsEnum.next();
        }
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
    int i = 0;
    BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
    CharsRefBuilder external = new CharsRefBuilder();
    while (term != null && (i < limit || sort)) {
      // did we fill in "external" yet for this term?
      boolean externalized = false;
      // stop if the prefix doesn't match
      if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) {
        break;
      }
      if (pattern != null) {
        // indexed text or external text?
        // TODO: support "raw" mode?
        ft.indexedToReadable(term, external);
        externalized = true;
        if (!pattern.matcher(external.get()).matches()) {
          term = termsEnum.next();
          continue;
        }
      }
      if (upperBytes != null) {
        int upperCmp = term.compareTo(upperBytes);
        // stop if we are past the upper term, or equal to it when the upper bound is exclusive
        if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) {
          break;
        }
      }
      // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
      int docFreq = termsEnum.docFreq();
      if (docFreq >= freqmin && docFreq <= freqmax) {
        // add the term to the list
        if (sort) {
          queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
        } else {
          // TODO: handle raw somehow
          if (!externalized) {
            ft.indexedToReadable(term, external);
          }
          fieldTerms.add(external.toString(), docFreq);
          i++;
        }
      }
      term = termsEnum.next();
    }
    if (sort) {
      for (CountPair<BytesRef, Integer> item : queue) {
        if (i >= limit) {
          break;
        }
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
        i++;
      }
    }
  }
}
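Stripped of the Solr plumbing, the enumeration idiom here is: seekCeil to the lower bound, walk forward with next(), and reuse a single CharsRefBuilder to render each term. A minimal sketch against a throwaway index, assuming a 6.x/7.x Lucene where RAMDirectory is still available; copyUTF8Bytes stands in for the schema-aware indexedToReadable call.

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class TermRangeEnumerationSketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
      for (String value : new String[] { "apple", "banana", "banana", "cherry" }) {
        Document doc = new Document();
        doc.add(new StringField("f", value, Field.Store.NO));
        writer.addDocument(doc);
      }
    }
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      // This tiny index has a single segment, so one leaf suffices.
      LeafReader leaf = reader.leaves().get(0).reader();
      TermsEnum termsEnum = leaf.terms("f").iterator();
      // Position the enum at the first term >= the lower bound, as process() does.
      BytesRef term = null;
      if (termsEnum.seekCeil(new BytesRef("b")) != TermsEnum.SeekStatus.END) {
        term = termsEnum.term();
      }
      // One CharsRefBuilder is reused for every term's readable form.
      CharsRefBuilder external = new CharsRefBuilder();
      while (term != null) {
        external.copyUTF8Bytes(term);
        System.out.println(external.toString() + " docFreq=" + termsEnum.docFreq());
        term = termsEnum.next();
      }
    }
  }
}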
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class LukeRequestHandler, method getDocumentFieldsInfo.
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
  final CharsRefBuilder spare = new CharsRefBuilder();
  SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
  for (Object o : doc.getFields()) {
    Field field = (Field) o;
    SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();
    SchemaField sfield = schema.getFieldOrNull(field.name());
    FieldType ftype = (sfield == null) ? null : sfield.getType();
    f.add("type", (ftype == null) ? null : ftype.getTypeName());
    f.add("schema", getFieldFlags(sfield));
    f.add("flags", getFieldFlags(field));
    f.add("value", (ftype == null) ? null : ftype.toExternal(field));
    // TODO: this really should be "stored"
    f.add("internal", field.stringValue()); // may be a binary number
    BytesRef bytes = field.binaryValue();
    if (bytes != null) {
      f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
    }
    // guard against a null ftype, which is possible for fields not in the schema
    if (ftype != null && !ftype.isPointField()) {
      Term t = new Term(field.name(), ftype.storedToIndexed(field));
      // this can be 0 for non-indexed fields
      f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t));
    }
    // If we have a term vector, return that
    if (field.fieldType().storeTermVectors()) {
      try {
        Terms v = reader.getTermVector(docId, field.name());
        if (v != null) {
          SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
          final TermsEnum termsEnum = v.iterator();
          BytesRef text;
          while ((text = termsEnum.next()) != null) {
            final int freq = (int) termsEnum.totalTermFreq();
            spare.copyUTF8Bytes(text);
            tfv.add(spare.toString(), freq);
          }
          f.add("termVector", tfv);
        }
      } catch (Exception ex) {
        log.warn("error writing term vector", ex);
      }
    }
    finfo.add(field.name(), f);
  }
  return finfo;
}
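The same reuse shows up in the term-vector loop: one spare builder converts every term-vector entry from its UTF-8 BytesRef to a String. A standalone sketch of that walk, again assuming a 6.x/7.x Lucene with RAMDirectory available.

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class TermVectorWalkSketch {
  public static void main(String[] args) throws Exception {
    // An indexed text field that also stores per-document term vectors.
    FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
    type.setStoreTermVectors(true);

    RAMDirectory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
      Document doc = new Document();
      doc.add(new Field("body", "to be or not to be", type));
      writer.addDocument(doc);
    }

    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      Terms vector = reader.getTermVector(0, "body"); // term vector for docId 0
      TermsEnum termsEnum = vector.iterator();
      CharsRefBuilder spare = new CharsRefBuilder(); // reused for every term
      BytesRef text;
      while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        System.out.println(spare.toString() + " freq=" + termsEnum.totalTermFreq());
      }
    }
  }
}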
use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.
the class TestIndexWriterUnicode, method testAllUnicodeChars.
// LUCENE-510
public void testAllUnicodeChars() throws Throwable {
  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  for (int ch = 0; ch < 0x0010FFFF; ch++) {
    if (ch == 0xd800) {
      // skip the surrogate range (0xD800-0xDFFF), which holds no valid code points
      ch = 0xe000;
    }
    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }
    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));
    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);
    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());
    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++) {
      assertEquals(utf8.bytes[j], b[j]);
    }
  }
}
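The round trip this test exercises can be shown on a single supplementary character: new BytesRef(CharSequence) encodes UTF-16 text as UTF-8, and copyUTF8Bytes decodes it back into the builder's char buffer. A minimal sketch:

import java.nio.charset.StandardCharsets;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class Utf8RoundTripSketch {
  public static void main(String[] args) {
    // "a" + U+1D11E (musical G clef, a supplementary code point) + "b".
    String s = "a\uD834\uDD1Eb";

    // new BytesRef(CharSequence) encodes the text as UTF-8.
    BytesRef utf8 = new BytesRef(s);

    // copyUTF8Bytes decodes the UTF-8 back into the builder's UTF-16 buffer.
    CharsRefBuilder utf16 = new CharsRefBuilder();
    utf16.copyUTF8Bytes(utf8.bytes, utf8.offset, utf8.length);

    System.out.println(s.equals(utf16.toString()));                          // true
    System.out.println(utf8.length == s.getBytes(StandardCharsets.UTF_8).length); // true
  }
}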