Search in sources:

Example 31 with CharsRefBuilder

Example use of org.apache.lucene.util.CharsRefBuilder in the Apache lucene-solr project.

From the class CompletionTokenStreamTest, method testWithSynonyms:

@Test
public void testWithSynonyms() throws Exception {
    // One synonym rule: "mykeyword" -> "mysynonym".
    SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
    synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);

    // Whitespace-tokenize the input and run it through synonym expansion.
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword another keyword"));
    SynonymFilter synonymFilter = new SynonymFilter(tokenizer, synonyms.build(), true);

    BytesRef payload = new BytesRef("payload");
    CompletionTokenStream completionTokenStream = new CompletionTokenStream(synonymFilter, true, false, 100);
    completionTokenStream.setPayload(payload);
    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);

    // Expect one completion per expansion of the first token, joining the three
    // words with the completion separator label.
    char sep = (char) CompletionAnalyzer.SEP_LABEL;
    String[] expectedOutputs = new String[2];
    CharsRefBuilder out = new CharsRefBuilder();
    int slot = 0;
    for (String first : new String[] { "mykeyword", "mysynonym" }) {
        out.clear();
        out.append(first);
        out.append(sep);
        out.append("another");
        out.append(sep);
        out.append("keyword");
        expectedOutputs[slot++] = out.toCharsRef().toString();
    }
    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) SynonymFilter(org.apache.lucene.analysis.synonym.SynonymFilter) CharsRef(org.apache.lucene.util.CharsRef) SynonymMap(org.apache.lucene.analysis.synonym.SynonymMap) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) StringReader(java.io.StringReader) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 32 with CharsRefBuilder

Example use of org.apache.lucene.util.CharsRefBuilder in the Apache lucene-solr project.

From the class TestContextSuggestField, method testTokenStream:

@Test
public void testTokenStream() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    ContextSuggestField field = new ContextSuggestField("field", "input", 1, "context1", "context2");

    // Serialize the expected payload: surface form length + bytes, then the
    // payload-end marker and the field's type byte.
    BytesRef surfaceForm = new BytesRef("input");
    ByteArrayOutputStream serialized = new ByteArrayOutputStream();
    try (OutputStreamDataOutput out = new OutputStreamDataOutput(serialized)) {
        out.writeVInt(surfaceForm.length);
        out.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
        out.writeVInt(1 + 1);
        out.writeByte(ContextSuggestField.TYPE);
    }
    BytesRef payload = new BytesRef(serialized.toByteArray());

    // Each context yields one token: <context><CONTEXT_SEPARATOR><SEP_LABEL><input>.
    String[] expectedOutputs = new String[2];
    CharsRefBuilder token = new CharsRefBuilder();
    int slot = 0;
    for (String context : new String[] { "context1", "context2" }) {
        token.clear();
        token.append(context);
        token.append((char) ContextSuggestField.CONTEXT_SEPARATOR);
        token.append((char) CompletionAnalyzer.SEP_LABEL);
        token.append("input");
        expectedOutputs[slot++] = token.toCharsRef().toString();
    }

    TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);

    // Wrapping the analyzer in a CompletionAnalyzer must produce the same tokens.
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
    stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) OutputStreamDataOutput(org.apache.lucene.store.OutputStreamDataOutput) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 33 with CharsRefBuilder

Example use of org.apache.lucene.util.CharsRefBuilder in the Apache lucene-solr project.

From the class TermsComponent, method process:

@Override
public void process(ResponseBuilder rb) throws IOException {
    // Handles the /terms request: enumerates indexed terms for the requested
    // fields, optionally filtered by prefix/regex/lower/upper bounds and
    // mincount/maxcount, sorted either by index order or by document frequency.
    SolrParams params = rb.req.getParams();
    // No-op unless terms=true was explicitly requested.
    if (!params.get(TermsParams.TERMS, "false").equals("true")) {
        return;
    }
    String[] fields = params.getParams(TermsParams.TERMS_FIELD);
    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);
    if (fields == null || fields.length == 0)
        return;
    // Optional whole-index statistics section.
    boolean termStats = params.getBool(TermsParams.TERMS_STATS, false);
    if (termStats) {
        NamedList<Number> stats = new SimpleOrderedMap<>();
        rb.rsp.add("indexstats", stats);
        collectStats(rb.req.getSearcher(), stats);
    }
    // terms.list short-circuits the range/prefix/regex enumeration below:
    // only the explicitly listed terms are fetched.
    String termList = params.get(TermsParams.TERMS_LIST);
    if (termList != null) {
        boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
        fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
        return;
    }
    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
        // Negative limit means "unlimited".
        limit = Integer.MAX_VALUE;
    }
    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    // sort == true means "sort by count"; the alternative is raw index order.
    boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
        freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;
    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
    final LeafReader indexReader = rb.req.getSearcher().getSlowAtomicReader();
    Fields lfields = indexReader.fields();
    for (String field : fields) {
        NamedList<Integer> fieldTerms = new NamedList<>();
        termsResult.add(field, fieldTerms);
        Terms terms = lfields.terms(field);
        if (terms == null) {
            // field does not exist
            continue;
        }
        // In raw mode terms are reported as-is; otherwise the FieldType is used
        // to convert between readable and indexed representations.
        FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
        if (ft == null)
            ft = new StrField();
        // prefix must currently be text
        BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);
        BytesRef upperBytes = null;
        if (upperStr != null) {
            BytesRefBuilder b = new BytesRefBuilder();
            ft.readableToIndexed(upperStr, b);
            upperBytes = b.get();
        }
        BytesRef lowerBytes;
        if (lowerStr == null) {
            // If no lower bound was specified, use the prefix
            lowerBytes = prefixBytes;
        } else {
            lowerBytes = new BytesRef();
            if (raw) {
                // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
                // perhaps we detect if the FieldType is non-character and expect hex if so?
                lowerBytes = new BytesRef(lowerStr);
            } else {
                BytesRefBuilder b = new BytesRefBuilder();
                ft.readableToIndexed(lowerStr, b);
                lowerBytes = b.get();
            }
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term = null;
        if (lowerBytes != null) {
            // Seek to the first term >= the lower bound; END means nothing matches.
            if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
                termsEnum = null;
            } else {
                term = termsEnum.term();
                //Only advance the enum if we are excluding the lower bound and the lower Term actually matches
                if (lowerIncl == false && term.equals(lowerBytes)) {
                    term = termsEnum.next();
                }
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }
        int i = 0;
        // When sorting by count, ALL in-range terms are collected into a bounded
        // priority set and only the top `limit` are emitted afterwards.
        BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
        CharsRefBuilder external = new CharsRefBuilder();
        while (term != null && (i < limit || sort)) {
            // did we fill in "external" yet for this term?
            boolean externalized = false;
            // stop if the prefix doesn't match
            if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes))
                break;
            if (pattern != null) {
                // indexed text or external text?
                // TODO: support "raw" mode?
                ft.indexedToReadable(term, external);
                externalized = true;
                if (!pattern.matcher(external.get()).matches()) {
                    term = termsEnum.next();
                    continue;
                }
            }
            if (upperBytes != null) {
                int upperCmp = term.compareTo(upperBytes);
                // if we are past the upper term, or equal to it (when don't include upper) then stop.
                if (upperCmp > 0 || (upperCmp == 0 && !upperIncl))
                    break;
            }
            // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
            int docFreq = termsEnum.docFreq();
            if (docFreq >= freqmin && docFreq <= freqmax) {
                // add the term to the list
                if (sort) {
                    // deepCopyOf is required: the enum reuses its BytesRef.
                    queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
                } else {
                    // TODO: handle raw somehow
                    if (!externalized) {
                        ft.indexedToReadable(term, external);
                    }
                    fieldTerms.add(external.toString(), docFreq);
                    i++;
                }
            }
            term = termsEnum.next();
        }
        if (sort) {
            // Emit the top `limit` collected terms in count order.
            for (CountPair<BytesRef, Integer> item : queue) {
                if (i >= limit)
                    break;
                ft.indexedToReadable(item.key, external);
                fieldTerms.add(external.toString(), item.val);
                i++;
            }
        }
    }
}
Also used : StrField(org.apache.solr.schema.StrField) BoundedTreeSet(org.apache.solr.util.BoundedTreeSet) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) Pattern(java.util.regex.Pattern) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) NamedList(org.apache.solr.common.util.NamedList) CountPair(org.apache.solr.request.SimpleFacets.CountPair) FieldType(org.apache.solr.schema.FieldType)

Example 34 with CharsRefBuilder

Example use of org.apache.lucene.util.CharsRefBuilder in the Apache lucene-solr project.

From the class LukeRequestHandler, method getDocumentFieldsInfo:

/**
 * Builds a per-field summary (type, schema/index flags, stored and internal
 * values, docFreq, and term vectors when stored) for every stored field of
 * the given document.
 *
 * @param doc    the stored document to describe
 * @param docId  internal Lucene doc id, used to fetch term vectors
 * @param reader index reader used for docFreq and term-vector lookups
 * @param schema the Solr schema, consulted for each field's type (may lack the field)
 * @return an ordered map of field name -> field info
 * @throws IOException if reading from the index fails
 */
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
    // Reused scratch buffer for UTF-8 -> char conversion of term-vector terms.
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();
        // The field may be absent from the schema (e.g. removed after indexing),
        // so sfield and ftype are both nullable from here on.
        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();
        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));
        f.add("value", (ftype == null) ? null : ftype.toExternal(field));
        // TODO: this really should be "stored"
        // may be a binary number
        f.add("internal", field.stringValue());
        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        // Fix: guard against ftype == null before calling isPointField() — the
        // original dereferenced ftype unconditionally here and would NPE for
        // fields not present in the schema, even though every other use of
        // ftype in this method is null-checked. Point fields have no indexed
        // terms, so docFreq is only reported for non-point (or unknown) types.
        if (ftype == null || !ftype.isPointField()) {
            Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());
            // this can be 0 for non-indexed fields
            f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t));
        }
        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                // Term vectors are best-effort diagnostics; log and continue.
                log.warn("error writing term vector", ex);
            }
        }
        finfo.add(field.name(), f);
    }
    return finfo;
}
Also used : Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) SolrException(org.apache.solr.common.SolrException) IOException(java.io.IOException) FieldType(org.apache.solr.schema.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) SchemaField(org.apache.solr.schema.SchemaField) CopyField(org.apache.solr.schema.CopyField) IndexableField(org.apache.lucene.index.IndexableField) SchemaField(org.apache.solr.schema.SchemaField) Field(org.apache.lucene.document.Field) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 35 with CharsRefBuilder

Example use of org.apache.lucene.util.CharsRefBuilder in the Apache lucene-solr project.

From the class TestIndexWriterUnicode, method testAllUnicodeChars:

// LUCENE-510
public void testAllUnicodeChars() throws Throwable {
    // Round-trips every valid Unicode code point through UTF-8 and back,
    // checking that BytesRef encoding, CharsRefBuilder decoding, and the
    // JDK's own converters all agree.
    CharsRefBuilder decoded = new CharsRefBuilder();
    char[] buf = new char[2];
    for (int cp = 0; cp < 0x0010FFFF; cp++) {
        // Skip invalid code points (the surrogate range).
        if (cp == 0xd800) {
            cp = 0xe000;
        }
        int len;
        if (cp <= 0xffff) {
            // Fits in a single UTF-16 code unit.
            buf[0] = (char) cp;
            len = 1;
        } else {
            // Encode supplementary code points as a surrogate pair.
            buf[0] = (char) (((cp - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
            buf[1] = (char) (((cp - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
            len = 2;
        }
        BytesRef utf8 = new BytesRef(CharBuffer.wrap(buf, 0, len));
        String fromChars = new String(buf, 0, len);
        String fromBytes = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
        assertEquals("codepoint " + cp, fromChars, fromBytes);
        decoded.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
        assertEquals("codepoint " + cp, fromChars, decoded.toString());
        byte[] jdkUtf8 = fromChars.getBytes(StandardCharsets.UTF_8);
        assertEquals(utf8.length, jdkUtf8.length);
        for (int j = 0; j < utf8.length; j++) {
            assertEquals(utf8.bytes[j], jdkUtf8[j]);
        }
    }
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)52 BytesRef (org.apache.lucene.util.BytesRef)30 ArrayList (java.util.ArrayList)11 IOException (java.io.IOException)10 NamedList (org.apache.solr.common.util.NamedList)10 FieldType (org.apache.solr.schema.FieldType)10 TermsEnum (org.apache.lucene.index.TermsEnum)9 SchemaField (org.apache.solr.schema.SchemaField)7 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)6 HashSet (java.util.HashSet)5 Test (org.junit.Test)5 TokenStream (org.apache.lucene.analysis.TokenStream)4 PostingsEnum (org.apache.lucene.index.PostingsEnum)4 Terms (org.apache.lucene.index.Terms)4 SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap)4 LeafReader (org.apache.lucene.index.LeafReader)3 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)3 CharsRef (org.apache.lucene.util.CharsRef)3 Util (org.apache.lucene.util.fst.Util)3 SolrException (org.apache.solr.common.SolrException)3