Search in sources :

Example 51 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class CustomPassageFormatterTests method testHtmlEncodeFormat.

public void testHtmlEncodeFormat() {
    String content = "<b>This is a really cool highlighter.</b> Postings highlighter gives nice snippets back.";
    CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
    Passage[] passages = new Passage[2];
    String match = "highlighter";
    BytesRef matchBytesRef = new BytesRef(match);
    Passage passage1 = new Passage();
    int start = content.indexOf(match);
    int end = start + match.length();
    passage1.startOffset = 0;
    //lets include the whitespace at the end to make sure we trim it
    passage1.endOffset = end + 6;
    passage1.addMatch(start, end, matchBytesRef);
    passages[0] = passage1;
    Passage passage2 = new Passage();
    start = content.lastIndexOf(match);
    end = start + match.length();
    passage2.startOffset = passage1.endOffset;
    passage2.endOffset = content.length();
    passage2.addMatch(start, end, matchBytesRef);
    passages[1] = passage2;
    Snippet[] fragments = passageFormatter.format(passages, content);
    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(2));
    assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
    assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
}
Also used : SimpleHTMLEncoder(org.apache.lucene.search.highlight.SimpleHTMLEncoder) Snippet(org.apache.lucene.search.highlight.Snippet) BytesRef(org.apache.lucene.util.BytesRef)

Example 52 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class NoisyChannelSpellCheckerTests method testMultiGenerator.

public void testMultiGenerator() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    mapping.put("body_reverse", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
        }
    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
    for (String line : strings) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }
    DirectoryReader ir = DirectoryReader.open(writer);
    LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
    DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
    CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    // only use forward with constant prefix
    assertThat(corrections.length, equalTo(0));
    corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
}
Also used : HashMap(java.util.HashMap) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) DirectSpellChecker(org.apache.lucene.search.spell.DirectSpellChecker) BytesRef(org.apache.lucene.util.BytesRef) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) DirectoryReader(org.apache.lucene.index.DirectoryReader) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 53 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class PercolatorFieldMapperTests method testMultiplePercolatorFields.

// multiple percolator fields are allowed in the mapping, but only one field can be used at index time.
public void testMultiplePercolatorFields() throws Exception {
    String typeName = "another_type";
    String percolatorMapper = XContentFactory.jsonBuilder().startObject().startObject(typeName).startObject("_field_names").field("enabled", false).endObject().startObject("properties").startObject("query_field1").field("type", "percolator").endObject().startObject("query_field2").field("type", "percolator").endObject().endObject().endObject().endObject().string();
    mapperService.merge(typeName, new CompressedXContent(percolatorMapper), MapperService.MergeReason.MAPPING_UPDATE, true);
    QueryBuilder queryBuilder = matchQuery("field", "value");
    ParsedDocument doc = mapperService.documentMapper(typeName).parse("test", typeName, "1", jsonBuilder().startObject().field("query_field1", queryBuilder).field("query_field2", queryBuilder).endObject().bytes());
    // also includes all other meta fields
    assertThat(doc.rootDoc().getFields().size(), equalTo(14));
    BytesRef queryBuilderAsBytes = doc.rootDoc().getField("query_field1.query_builder_field").binaryValue();
    assertQueryBuilder(queryBuilderAsBytes, queryBuilder);
    queryBuilderAsBytes = doc.rootDoc().getField("query_field2.query_builder_field").binaryValue();
    assertQueryBuilder(queryBuilderAsBytes, queryBuilder);
}
Also used : ParsedDocument(org.elasticsearch.index.mapper.ParsedDocument) CompressedXContent(org.elasticsearch.common.compress.CompressedXContent) Matchers.containsString(org.hamcrest.Matchers.containsString) HasParentQueryBuilder(org.elasticsearch.index.query.HasParentQueryBuilder) RangeQueryBuilder(org.elasticsearch.index.query.RangeQueryBuilder) MatchAllQueryBuilder(org.elasticsearch.index.query.MatchAllQueryBuilder) BoolQueryBuilder(org.elasticsearch.index.query.BoolQueryBuilder) ConstantScoreQueryBuilder(org.elasticsearch.index.query.ConstantScoreQueryBuilder) FunctionScoreQueryBuilder(org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder) QueryBuilder(org.elasticsearch.index.query.QueryBuilder) BoostingQueryBuilder(org.elasticsearch.index.query.BoostingQueryBuilder) HasChildQueryBuilder(org.elasticsearch.index.query.HasChildQueryBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 54 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class PercolatorFieldMapperTests method testPercolatorFieldMapper.

public void testPercolatorFieldMapper() throws Exception {
    addQueryMapping();
    QueryBuilder queryBuilder = termQuery("field", "value");
    ParsedDocument doc = mapperService.documentMapper(typeName).parse("test", typeName, "1", XContentFactory.jsonBuilder().startObject().field(fieldName, queryBuilder).endObject().bytes());
    assertThat(doc.rootDoc().getFields(fieldType.queryTermsField.name()).length, equalTo(1));
    assertThat(doc.rootDoc().getFields(fieldType.queryTermsField.name())[0].binaryValue().utf8ToString(), equalTo("field\0value"));
    assertThat(doc.rootDoc().getFields(fieldType.queryBuilderField.name()).length, equalTo(1));
    assertThat(doc.rootDoc().getFields(fieldType.extractionResultField.name()).length, equalTo(1));
    assertThat(doc.rootDoc().getFields(fieldType.extractionResultField.name())[0].stringValue(), equalTo(EXTRACTION_COMPLETE));
    BytesRef qbSource = doc.rootDoc().getFields(fieldType.queryBuilderField.name())[0].binaryValue();
    assertQueryBuilder(qbSource, queryBuilder);
    // add an query for which we don't extract terms from
    queryBuilder = rangeQuery("field").from("a").to("z");
    doc = mapperService.documentMapper(typeName).parse("test", typeName, "1", XContentFactory.jsonBuilder().startObject().field(fieldName, queryBuilder).endObject().bytes());
    assertThat(doc.rootDoc().getFields(fieldType.extractionResultField.name()).length, equalTo(1));
    assertThat(doc.rootDoc().getFields(fieldType.extractionResultField.name())[0].stringValue(), equalTo(EXTRACTION_FAILED));
    assertThat(doc.rootDoc().getFields(fieldType.queryTermsField.name()).length, equalTo(0));
    assertThat(doc.rootDoc().getFields(fieldType.queryBuilderField.name()).length, equalTo(1));
    qbSource = doc.rootDoc().getFields(fieldType.queryBuilderField.name())[0].binaryValue();
    assertQueryBuilder(qbSource, queryBuilder);
}
Also used : ParsedDocument(org.elasticsearch.index.mapper.ParsedDocument) HasParentQueryBuilder(org.elasticsearch.index.query.HasParentQueryBuilder) RangeQueryBuilder(org.elasticsearch.index.query.RangeQueryBuilder) MatchAllQueryBuilder(org.elasticsearch.index.query.MatchAllQueryBuilder) BoolQueryBuilder(org.elasticsearch.index.query.BoolQueryBuilder) ConstantScoreQueryBuilder(org.elasticsearch.index.query.ConstantScoreQueryBuilder) FunctionScoreQueryBuilder(org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder) QueryBuilder(org.elasticsearch.index.query.QueryBuilder) BoostingQueryBuilder(org.elasticsearch.index.query.BoostingQueryBuilder) HasChildQueryBuilder(org.elasticsearch.index.query.HasChildQueryBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 55 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class PercolatorFieldMapperTests method testStoringQueries.

public void testStoringQueries() throws Exception {
    addQueryMapping();
    QueryBuilder[] queries = new QueryBuilder[] { termQuery("field", "value"), matchAllQuery(), matchQuery("field", "value"), matchPhraseQuery("field", "value"), prefixQuery("field", "v"), wildcardQuery("field", "v*"), rangeQuery("number_field").gte(0).lte(9), rangeQuery("date_field").from("2015-01-01T00:00").to("2015-01-01T00:00") };
    for (QueryBuilder query : queries) {
        ParsedDocument doc = mapperService.documentMapper(typeName).parse("test", typeName, "1", XContentFactory.jsonBuilder().startObject().field(fieldName, query).endObject().bytes());
        BytesRef qbSource = doc.rootDoc().getFields(fieldType.queryBuilderField.name())[0].binaryValue();
        assertQueryBuilder(qbSource, query);
    }
}
Also used : ParsedDocument(org.elasticsearch.index.mapper.ParsedDocument) HasParentQueryBuilder(org.elasticsearch.index.query.HasParentQueryBuilder) RangeQueryBuilder(org.elasticsearch.index.query.RangeQueryBuilder) MatchAllQueryBuilder(org.elasticsearch.index.query.MatchAllQueryBuilder) BoolQueryBuilder(org.elasticsearch.index.query.BoolQueryBuilder) ConstantScoreQueryBuilder(org.elasticsearch.index.query.ConstantScoreQueryBuilder) FunctionScoreQueryBuilder(org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder) QueryBuilder(org.elasticsearch.index.query.QueryBuilder) BoostingQueryBuilder(org.elasticsearch.index.query.BoostingQueryBuilder) HasChildQueryBuilder(org.elasticsearch.index.query.HasChildQueryBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

BytesRef (org.apache.lucene.util.BytesRef)1449 Document (org.apache.lucene.document.Document)410 Directory (org.apache.lucene.store.Directory)370 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)266 ArrayList (java.util.ArrayList)186 Test (org.junit.Test)182 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)164 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)152 Term (org.apache.lucene.index.Term)124 Analyzer (org.apache.lucene.analysis.Analyzer)121 IndexReader (org.apache.lucene.index.IndexReader)121 TermsEnum (org.apache.lucene.index.TermsEnum)116 SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField)110 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)105 IOException (java.io.IOException)104 Field (org.apache.lucene.document.Field)101 StringField (org.apache.lucene.document.StringField)101 CrateUnitTest (io.crate.test.integration.CrateUnitTest)95 TextField (org.apache.lucene.document.TextField)95 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)87