Search in sources :

Example 1 with MockAnalyzer

use of org.apache.lucene.analysis.MockAnalyzer in project elasticsearch by elastic.

the class XMoreLikeThisTests method testTopN.

public void testTopN() throws Exception {
    int numDocs = 100;
    int topN = 25;
    // add series of docs with terms of decreasing df
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
        addDoc(writer, generateStrSeq(0, i + 1));
    }
    IndexReader reader = writer.getReader();
    writer.close();
    // setup MLT query
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    mlt.setMaxQueryTerms(topN);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] { "text" });
    // perform MLT query
    String likeText = "";
    for (String text : generateStrSeq(0, numDocs)) {
        likeText += text + " ";
    }
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));
    // check best terms are topN of highest idf
    List<BooleanClause> clauses = query.clauses();
    assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());
    Term[] expectedTerms = new Term[topN];
    int idx = 0;
    for (String text : generateStrSeq(numDocs - topN, topN)) {
        expectedTerms[idx++] = new Term("text", text);
    }
    for (BooleanClause clause : clauses) {
        Term term = ((TermQuery) clause.getQuery()).getTerm();
        assertTrue(Arrays.asList(expectedTerms).contains(term));
    }
    // clean up
    reader.close();
    dir.close();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) MoreLikeThis(org.apache.lucene.queries.mlt.MoreLikeThis) Term(org.apache.lucene.index.Term) BooleanClause(org.apache.lucene.search.BooleanClause) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) StringReader(java.io.StringReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 2 with MockAnalyzer

use of org.apache.lucene.analysis.MockAnalyzer in project elasticsearch by elastic.

the class BooleanFieldMapperTests method testDefaults.

public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties").startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject().string();
    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));
    ParsedDocument doc = defaultMapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());
    try (Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}
Also used : SortedNumericDocValues(org.apache.lucene.index.SortedNumericDocValues) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Matchers.containsString(org.hamcrest.Matchers.containsString) RAMDirectory(org.apache.lucene.store.RAMDirectory) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) CompressedXContent(org.elasticsearch.common.compress.CompressedXContent) BytesRef(org.apache.lucene.util.BytesRef) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 3 with MockAnalyzer

use of org.apache.lucene.analysis.MockAnalyzer in project elasticsearch by elastic.

the class StoreTests method testUserDataRead.

public void testUserDataRead() throws IOException {
    final ShardId shardId = new ShardId("index", "_na_", 1);
    DirectoryService directoryService = new LuceneManagedDirectoryService(random());
    Store store = new Store(shardId, INDEX_SETTINGS, directoryService, new DummyShardLock(shardId));
    IndexWriterConfig config = newIndexWriterConfig(random(), new MockAnalyzer(random())).setCodec(TestUtil.getDefaultCodec());
    SnapshotDeletionPolicy deletionPolicy = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
    config.setIndexDeletionPolicy(deletionPolicy);
    IndexWriter writer = new IndexWriter(store.directory(), config);
    Document doc = new Document();
    doc.add(new TextField("id", "1", Field.Store.NO));
    writer.addDocument(doc);
    Map<String, String> commitData = new HashMap<>(2);
    String syncId = "a sync id";
    String translogId = "a translog id";
    commitData.put(Engine.SYNC_COMMIT_ID, syncId);
    commitData.put(Translog.TRANSLOG_GENERATION_KEY, translogId);
    writer.setCommitData(commitData);
    writer.commit();
    writer.close();
    Store.MetadataSnapshot metadata;
    metadata = store.getMetadata(randomBoolean() ? null : deletionPolicy.snapshot());
    assertFalse(metadata.asMap().isEmpty());
    // do not check for correct files, we have enough tests for that above
    assertThat(metadata.getCommitUserData().get(Engine.SYNC_COMMIT_ID), equalTo(syncId));
    assertThat(metadata.getCommitUserData().get(Translog.TRANSLOG_GENERATION_KEY), equalTo(translogId));
    TestUtil.checkIndex(store.directory());
    assertDeleteContent(store, directoryService);
    IOUtils.close(store);
}
Also used : KeepOnlyLastCommitDeletionPolicy(org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy) HashMap(java.util.HashMap) Document(org.apache.lucene.document.Document) SnapshotDeletionPolicy(org.apache.lucene.index.SnapshotDeletionPolicy) ShardId(org.elasticsearch.index.shard.ShardId) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) TextField(org.apache.lucene.document.TextField) DummyShardLock(org.elasticsearch.test.DummyShardLock) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 4 with MockAnalyzer

use of org.apache.lucene.analysis.MockAnalyzer in project elasticsearch by elastic.

the class BlendedTermQueryTests method testDismaxQuery.

public void testDismaxQuery() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    String[] username = new String[] { "foo fighters", "some cool fan", "cover band" };
    String[] song = new String[] { "generator", "foo fighers - generator", "foo fighters generator" };
    final boolean omitNorms = random().nextBoolean();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(random().nextBoolean() ? IndexOptions.DOCS : IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(omitNorms);
    ft.freeze();
    FieldType ft1 = new FieldType(TextField.TYPE_NOT_STORED);
    ft1.setIndexOptions(random().nextBoolean() ? IndexOptions.DOCS : IndexOptions.DOCS_AND_FREQS);
    ft1.setOmitNorms(omitNorms);
    ft1.freeze();
    for (int i = 0; i < username.length; i++) {
        Document d = new Document();
        d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
        d.add(new Field("username", username[i], ft));
        d.add(new Field("song", song[i], ft));
        w.addDocument(d);
    }
    int iters = scaledRandomIntBetween(25, 100);
    for (int j = 0; j < iters; j++) {
        Document d = new Document();
        d.add(new TextField("id", Integer.toString(username.length + j), Field.Store.YES));
        d.add(new Field("username", "foo fighters", ft1));
        d.add(new Field("song", "some bogus text to bump up IDF", ft1));
        w.addDocument(d);
    }
    w.commit();
    DirectoryReader reader = DirectoryReader.open(w);
    IndexSearcher searcher = setSimilarity(newSearcher(reader));
    {
        String[] fields = new String[] { "username", "song" };
        BooleanQuery.Builder query = new BooleanQuery.Builder();
        query.setDisableCoord(true);
        query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f), BooleanClause.Occur.SHOULD);
        query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "fighters"), 0.1f), BooleanClause.Occur.SHOULD);
        query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "generator"), 0.1f), BooleanClause.Occur.SHOULD);
        TopDocs search = searcher.search(query.build(), 10);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue());
    }
    {
        BooleanQuery.Builder query = new BooleanQuery.Builder();
        query.setDisableCoord(true);
        DisjunctionMaxQuery uname = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("username", "foo")), new TermQuery(new Term("song", "foo"))), 0.0f);
        DisjunctionMaxQuery s = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("username", "fighers")), new TermQuery(new Term("song", "fighers"))), 0.0f);
        DisjunctionMaxQuery gen = new DisjunctionMaxQuery(Arrays.asList(new TermQuery(new Term("username", "generator")), new TermQuery(new Term("song", "generator"))), 0f);
        query.add(uname, BooleanClause.Occur.SHOULD);
        query.add(s, BooleanClause.Occur.SHOULD);
        query.add(gen, BooleanClause.Occur.SHOULD);
        TopDocs search = searcher.search(query.build(), 4);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
    }
    reader.close();
    w.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) DirectoryReader(org.apache.lucene.index.DirectoryReader) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) TextField(org.apache.lucene.document.TextField) Directory(org.apache.lucene.store.Directory)

Example 5 with MockAnalyzer

use of org.apache.lucene.analysis.MockAnalyzer in project elasticsearch by elastic.

the class CustomPostingsHighlighterTests method testCustomPostingsHighlighter.

public void testCustomPostingsHighlighter() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    //good position but only one match
    final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.add(body);
    body.setStringValue(firstValue);
    //two matches, not the best snippet due to its length though
    final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
    Field body2 = new Field("body", "", offsetsType);
    doc.add(body2);
    body2.setStringValue(secondValue);
    //two matches and short, will be scored highest
    final String thirdValue = "This is highlighting the third short highlighting value.";
    Field body3 = new Field("body", "", offsetsType);
    doc.add(body3);
    body3.setStringValue(thirdValue);
    //one match, same as first but at the end, will be scored lower due to its position
    final String fourthValue = "Just a test4 highlighting from postings highlighter.";
    Field body4 = new Field("body", "", offsetsType);
    doc.add(body4);
    body4.setStringValue(fourthValue);
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
    String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
    String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
    String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";
    IndexSearcher searcher = newSearcher(ir);
    Query query = new TermQuery(new Term("body", "highlighting"));
    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    int docId = topDocs.scoreDocs[0].doc;
    String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
    CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
    Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
    assertThat(snippets.length, equalTo(4));
    assertThat(snippets[0].getText(), equalTo(firstHlValue));
    assertThat(snippets[1].getText(), equalTo(secondHlValue));
    assertThat(snippets[2].getText(), equalTo(thirdHlValue));
    assertThat(snippets[3].getText(), equalTo(fourthHlValue));
    ir.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) Term(org.apache.lucene.index.Term) Snippet(org.apache.lucene.search.highlight.Snippet) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)1164 Directory (org.apache.lucene.store.Directory)785 Document (org.apache.lucene.document.Document)775 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)265 Analyzer (org.apache.lucene.analysis.Analyzer)259 BytesRef (org.apache.lucene.util.BytesRef)252 StringField (org.apache.lucene.document.StringField)183 Term (org.apache.lucene.index.Term)183 RAMDirectory (org.apache.lucene.store.RAMDirectory)168 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)165 Field (org.apache.lucene.document.Field)164 TextField (org.apache.lucene.document.TextField)159 Test (org.junit.Test)142 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)136 IndexReader (org.apache.lucene.index.IndexReader)134 IndexWriter (org.apache.lucene.index.IndexWriter)133 TermQuery (org.apache.lucene.search.TermQuery)121 FieldType (org.apache.lucene.document.FieldType)119 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)116 IndexSearcher (org.apache.lucene.search.IndexSearcher)111