
Example 41 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

From the class SynonymTokenizer, method testGetSimpleHighlight.

public void testGetSimpleHighlight() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            numHighlights = 0;
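            // doSearching runs the query against the test index and stores the
            // resulting hits and query in fields of the enclosing test, used below.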
            doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
            for (int i = 0; i < hits.totalHits; i++) {
                String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
                TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
                Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this);
                String result = highlighter.getBestFragment(tokenStream, text);
                if (VERBOSE)
                    System.out.println("\t" + result);
            }
            assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
        }
    };
    helper.start();
}
Also used: MultiTermQuery (org.apache.lucene.search.MultiTermQuery), SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery), TermQuery (org.apache.lucene.search.TermQuery), TestHighlightRunner (org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term), IntPoint (org.apache.lucene.document.IntPoint)
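
For orientation, here is a minimal, self-contained sketch of the flow this test exercises: fetch the stored text, re-analyze it into a fresh TokenStream, and pass both to a Highlighter driven by a QueryScorer. It is only a sketch; the field name "f", the sample text, and the plain StandardAnalyzer are stand-ins rather than values from the test.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;

public class HighlightSketch {
    public static void main(String[] args) throws Exception {
        String text = "John Kennedy has been shot";
        // Score fragments against the same kind of term query the test uses.
        TermQuery query = new TermQuery(new Term("f", "kennedy"));
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            // The analyzer turns the raw text into the TokenStream the
            // highlighter consumes; token offsets drive the <B>...</B> markup.
            TokenStream stream = analyzer.tokenStream("f", text);
            // Prints "John <B>Kennedy</B> has been shot" (null if no match).
            System.out.println(highlighter.getBestFragment(stream, text));
        }
    }
}

The same fetch, re-analyze, highlight pattern recurs in every example below.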

Example 42 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

From the class SynonymTokenizer, method testHighlightingCommonTermsQuery.

public void testHighlightingCommonTermsQuery() throws Exception {
    CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
    // "this" is a stop word
    query.add(new Term(FIELD_NAME, "this"));
    query.add(new Term(FIELD_NAME, "long"));
    query.add(new Term(FIELD_NAME, "very"));
    searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
    assertEquals(2, hits.totalHits);
    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);
    final int docId0 = hits.scoreDocs[0].doc;
    Document doc = searcher.doc(docId0);
    String storedField = doc.get(FIELD_NAME);
    TokenStream stream = getAnyTokenStream(FIELD_NAME, docId0);
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
    highlighter.setTextFragmenter(fragmenter);
    String fragment = highlighter.getBestFragment(stream, storedField);
    assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
    final int docId1 = hits.scoreDocs[1].doc;
    doc = searcher.doc(docId1);
    storedField = doc.get(FIELD_NAME);
    stream = getAnyTokenStream(FIELD_NAME, docId1);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    fragment = highlighter.getBestFragment(stream, storedField);
    assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
Also used: TopDocs (org.apache.lucene.search.TopDocs), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Sort (org.apache.lucene.search.Sort), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), IntPoint (org.apache.lucene.document.IntPoint), CommonTermsQuery (org.apache.lucene.queries.CommonTermsQuery)
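
A note on the wiring here: SimpleSpanFragmenter is handed the same QueryScorer the Highlighter uses, so fragment boundaries are derived from the span positions the scorer is already tracking, rather than from a fixed character count as with the plain SimpleFragmenter. In the CommonTermsQuery constructor, the first two arguments set the Occur for high- and low-frequency terms respectively, and the third (3) is the frequency cutoff that decides which bucket a term falls into.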

Example 43 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

From the class SynonymTokenizer, method testGetTextFragments.

public void testGetTextFragments() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
            for (int i = 0; i < hits.totalHits; i++) {
                final int docId = hits.scoreDocs[i].doc;
                final Document doc = searcher.doc(docId);
                String text = doc.get(FIELD_NAME);
                TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
                // Alternative construction: new Highlighter(this, new QueryTermScorer(query));
                Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this);
                highlighter.setTextFragmenter(new SimpleFragmenter(20));
                String[] stringResults = highlighter.getBestFragments(tokenStream, text, 10);
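                // A TokenStream can only be consumed once, so build a fresh one
                // before extracting TextFragment objects from the same text.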
                tokenStream = analyzer.tokenStream(FIELD_NAME, text);
                TextFragment[] fragmentResults = highlighter.getBestTextFragments(tokenStream, text, true, 10);
                assertTrue("Failed to find correct number of text Fragments: " + fragmentResults.length + " vs " + stringResults.length, fragmentResults.length == stringResults.length);
                for (int j = 0; j < stringResults.length; j++) {
                    if (VERBOSE)
                        System.out.println(fragmentResults[j]);
                    assertTrue("Failed to find same text Fragments: " + fragmentResults[j] + " found", fragmentResults[j].toString().equals(stringResults[j]));
                }
            }
        }
    };
    helper.start();
}
Also used: MultiTermQuery (org.apache.lucene.search.MultiTermQuery), SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery), TermQuery (org.apache.lucene.search.TermQuery), TestHighlightRunner (org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), IntPoint (org.apache.lucene.document.IntPoint)
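
The two extraction paths are meant to agree: getBestFragments returns plain strings, while getBestTextFragments returns TextFragment objects that additionally carry a fragment score and whose toString() yields the fragment text, which is what the assertions compare.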

Example 44 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

From the class SynonymTokenizer, method testUnRewrittenQuery.

public void testUnRewrittenQuery() throws Exception {
    final TestHighlightRunner helper = new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            numHighlights = 0;
            // test to show how rewritten query can still be used
            searcher = newSearcher(reader);
            BooleanQuery.Builder query = new BooleanQuery.Builder();
            query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
            query.add(new WildcardQuery(new Term(FIELD_NAME, "kenned*")), Occur.SHOULD);
            if (VERBOSE)
                System.out.println("Searching with primitive query");
            // Deliberately skip the rewrite that would expand the wildcards:
            // query = query.rewrite(reader);
            TopDocs hits = searcher.search(query.build(), 1000);
            // Create an instance of the highlighter with the tags used to surround
            // highlighted text, e.g.:
            // QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer(TEST_VERSION));
            int maxNumFragmentsRequired = 3;
            for (int i = 0; i < hits.totalHits; i++) {
                final int docId = hits.scoreDocs[i].doc;
                final Document doc = searcher.doc(docId);
                String text = doc.get(FIELD_NAME);
                TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
                Highlighter highlighter = getHighlighter(query.build(), FIELD_NAME, HighlighterTest.this, false);
                highlighter.setTextFragmenter(new SimpleFragmenter(40));
                String highlightedText = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                if (VERBOSE)
                    System.out.println(highlightedText);
            }
            // We expect zero highlights if the query is multi-term and is not rewritten!
            assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0);
        }
    };
    helper.start();
}
Also used: BooleanQuery (org.apache.lucene.search.BooleanQuery), WildcardQuery (org.apache.lucene.search.WildcardQuery), TestHighlightRunner (org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Builder (org.apache.lucene.search.PhraseQuery.Builder), DocumentBuilder (javax.xml.parsers.DocumentBuilder), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), IntPoint (org.apache.lucene.document.IntPoint), TopDocs (org.apache.lucene.search.TopDocs)
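
As a reference point, here is a minimal, self-contained sketch of the behavior this test probes. By default QueryScorer expands multi-term queries such as wildcards before scoring; the final false argument to getHighlighter above appears to switch that expansion off, which is why the test expects zero highlights. Everything below (the field name "f", the sample text) is illustrative, not taken from the test.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;

public class WildcardHighlightSketch {
    public static void main(String[] args) throws Exception {
        String text = "kennedy has been shot";
        WildcardQuery query = new WildcardQuery(new Term("f", "kenned*"));
        QueryScorer scorer = new QueryScorer(query, "f");
        // true is the default; set to false to reproduce the zero-highlight
        // behavior the test asserts on.
        scorer.setExpandMultiTermQuery(true);
        Highlighter highlighter = new Highlighter(scorer);
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("f", text);
            // Prints "<B>kennedy</B> has been shot"; with expansion disabled,
            // the wildcard never matches concrete terms and this prints null.
            System.out.println(highlighter.getBestFragment(stream, text));
        }
    }
}

The commented-out query = query.rewrite(reader) line in the test is the other classic fix: rewriting against the reader replaces the wildcards with concrete term queries the scorer can see.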

Example 45 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

From the class SynonymTokenizer, method testSimpleSpanHighlighterWithStopWordsStraddlingFragmentBoundaries.

// LUCENE-2229
public void testSimpleSpanHighlighterWithStopWordsStraddlingFragmentBoundaries() throws Exception {
    doSearching(new PhraseQuery(FIELD_NAME, "all", "tokens"));
    int maxNumFragmentsRequired = 1;
    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);
    assertEquals("Must have one hit", 1, hits.totalHits);
    for (int i = 0; i < hits.totalHits; i++) {
        String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 36));
        String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
        if (VERBOSE)
            System.out.println("\t" + result);
        assertTrue("Fragment must be less than 60 characters long", result.length() < 60);
    }
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), PhraseQuery (org.apache.lucene.search.PhraseQuery), MultiPhraseQuery (org.apache.lucene.search.MultiPhraseQuery), IntPoint (org.apache.lucene.document.IntPoint)
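
The LUCENE-2229 tag names the regression this test guards against: SimpleSpanFragmenter could run well past its target size when stop words sat on a fragment boundary. The 36 passed to SimpleSpanFragmenter is the target fragment size in characters, which is why the assertion checks the fragment length (under 60) rather than its exact text.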

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream): 848
StringReader (java.io.StringReader): 336
Tokenizer (org.apache.lucene.analysis.Tokenizer): 244
Reader (java.io.Reader): 175
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 140
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 128
Analyzer (org.apache.lucene.analysis.Analyzer): 121
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 94
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 88
IOException (java.io.IOException): 85
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 73
Term (org.apache.lucene.index.Term): 66
Document (org.apache.lucene.document.Document): 64
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 59
ArrayList (java.util.ArrayList): 58
StopFilter (org.apache.lucene.analysis.StopFilter): 58
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 57
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 53
Test (org.junit.Test): 53
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 46