Search in sources:

Example 46 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class SynonymTokenizer method testExternalReader.

/**
 * Highlights every returned hit of the regexp query {@code ken.*} with a {@link QueryScorer}
 * that is handed the index reader explicitly, then verifies the highlight count.
 *
 * <p>{@code numHighlights} is not modified in this method; presumably it is updated by this
 * test instance acting as the highlighter's formatter (it is passed as the first
 * {@code Highlighter} argument) - TODO confirm against the enclosing class.
 *
 * @throws Exception if searching, token stream creation or highlighting fails
 */
public void testExternalReader() throws Exception {
    query = new RegexpQuery(new Term(FIELD_NAME, "ken.*"));
    searcher = newSearcher(reader);
    hits = searcher.search(query, 100);
    int maxNumFragmentsRequired = 2;
    QueryScorer scorer = new QueryScorer(query, reader, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);
    // The fragmenter does not depend on the hit being processed, so configure it once.
    highlighter.setTextFragmenter(new SimpleFragmenter(40));
    // Bound the loop by the documents actually returned: totalHits counts every match and can
    // exceed the 100 entries requested above, which would index past the end of scoreDocs.
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        final int docId = hits.scoreDocs[i].doc;
        final Document doc = searcher.doc(docId);
        String text = doc.get(FIELD_NAME);
        TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
        String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
        if (VERBOSE) {
            System.out.println("\t" + result);
        }
    }
    // Sanity check that the reader used by the scorer contains the term "hello".
    assertTrue(reader.docFreq(new Term(FIELD_NAME, "hello")) > 0);
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) IntPoint(org.apache.lucene.document.IntPoint) RegexpQuery(org.apache.lucene.search.RegexpQuery)

Example 47 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class HighlightCustomQueryTest method highlightField.

/**
 * Highlights {@code text} for the given query and returns the best single fragment.
 * This method is intended for use with {@code testHighlightingWithDefaultField()}.
 *
 * @param query     the query whose terms should be highlighted
 * @param fieldName the field name used to analyze {@code text} and passed to the scorer
 * @param text      the raw field text to highlight
 * @return the highlighted text, or {@code text} unchanged when the highlighter returned an
 *         empty string
 * @throws IOException                  if analysis of the text fails
 * @throws InvalidTokenOffsetsException if the token stream reports offsets outside the text
 */
private String highlightField(Query query, String fieldName, String text) throws IOException, InvalidTokenOffsetsException {
    // The analyzer is Closeable; release it once highlighting has consumed the token stream.
    try (MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)) {
        TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
        // Assuming "<B>", "</B>" used to highlight
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
        MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // A fragment size of Integer.MAX_VALUE keeps the whole text in a single fragment.
        highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
        String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
        return rv.length() == 0 ? text : rv;
    }
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) SimpleFragmenter(org.apache.lucene.search.highlight.SimpleFragmenter) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) Highlighter(org.apache.lucene.search.highlight.Highlighter)

Example 48 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class SynonymTokenizer method testQueryScorerHits.

/**
 * Runs the phrase query "very long" and highlights each returned hit using a
 * {@code SimpleSpanFragmenter} built from the query scorer. The method makes no assertions;
 * it only prints each best fragment when {@code VERBOSE} is set.
 *
 * @throws Exception if searching, token stream creation or highlighting fails
 */
public void testQueryScorerHits() throws Exception {
    query = new PhraseQuery(FIELD_NAME, "very", "long");
    searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);

    final int hitCount = hits.scoreDocs.length;
    for (int hitIndex = 0; hitIndex < hitCount; hitIndex++) {
        final int docId = hits.scoreDocs[hitIndex].doc;
        String storedField = searcher.doc(docId).get(FIELD_NAME);
        TokenStream stream = getAnyTokenStream(FIELD_NAME, docId);
        // A fresh span-aware fragmenter is installed for every hit, as before.
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
        String fragment = highlighter.getBestFragment(stream, storedField);
        if (VERBOSE) {
            System.out.println(fragment);
        }
    }
}
Also used : TopDocs(org.apache.lucene.search.TopDocs) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PhraseQuery(org.apache.lucene.search.PhraseQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) Document(org.apache.lucene.document.Document) IntPoint(org.apache.lucene.document.IntPoint)

Example 49 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class SynonymTokenizer method testSpanRegexQuery.

/**
 * Highlights every returned hit of a {@code SpanOrQuery} wrapping the regexp query
 * {@code ken.*} (via {@code SpanMultiTermQueryWrapper}), then verifies the highlight count.
 *
 * <p>{@code numHighlights} is not modified in this method; presumably it is updated by this
 * test instance acting as the highlighter's formatter (it is passed as the first
 * {@code Highlighter} argument) - TODO confirm against the enclosing class.
 *
 * @throws Exception if searching, token stream creation or highlighting fails
 */
public void testSpanRegexQuery() throws Exception {
    query = new SpanOrQuery(new SpanMultiTermQueryWrapper<>(new RegexpQuery(new Term(FIELD_NAME, "ken.*"))));
    searcher = newSearcher(reader);
    hits = searcher.search(query, 100);
    int maxNumFragmentsRequired = 2;
    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);
    // The fragmenter does not depend on the hit being processed, so configure it once.
    highlighter.setTextFragmenter(new SimpleFragmenter(40));
    // Bound the loop by the documents actually returned: totalHits counts every match and can
    // exceed the 100 entries requested above, which would index past the end of scoreDocs.
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        final int docId = hits.scoreDocs[i].doc;
        final Document doc = searcher.doc(docId);
        String text = doc.get(FIELD_NAME);
        TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
        String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
        if (VERBOSE) {
            System.out.println("\t" + result);
        }
    }
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) SpanMultiTermQueryWrapper(org.apache.lucene.search.spans.SpanMultiTermQueryWrapper) Term(org.apache.lucene.index.Term) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) Document(org.apache.lucene.document.Document) IntPoint(org.apache.lucene.document.IntPoint) RegexpQuery(org.apache.lucene.search.RegexpQuery)

Example 50 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class SynonymTokenizer method testNoFragments.

/**
 * Searches for a term that is not expected to occur in any sample text and asserts that the
 * highlighter returns {@code null} as the best fragment for each of them.
 *
 * @throws Exception if the runner, analysis or highlighting fails
 */
public void testNoFragments() throws Exception {
    TestHighlightRunner runner = new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            Term unmatchedTerm = new Term(FIELD_NAME, "aninvalidquerywhichshouldyieldnoresults");
            doSearching(new TermQuery(unmatchedTerm));
            for (String sample : texts) {
                TokenStream sampleStream = analyzer.tokenStream(FIELD_NAME, sample);
                Highlighter sampleHighlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this);
                // With no query term in the text there is nothing to highlight.
                assertNull("The highlight result should be null for text with no query terms", sampleHighlighter.getBestFragment(sampleStream, sample));
            }
        }
    };
    runner.start();
}
Also used : MultiTermQuery(org.apache.lucene.search.MultiTermQuery) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) TermQuery(org.apache.lucene.search.TermQuery) TestHighlightRunner(org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Term(org.apache.lucene.index.Term)

Aggregations

- TokenStream (org.apache.lucene.analysis.TokenStream): 849
- StringReader (java.io.StringReader): 337
- Tokenizer (org.apache.lucene.analysis.Tokenizer): 244
- Reader (java.io.Reader): 175
- CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 141
- MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 128
- Analyzer (org.apache.lucene.analysis.Analyzer): 121
- CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 94
- LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 88
- IOException (java.io.IOException): 86
- StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 73
- Term (org.apache.lucene.index.Term): 66
- Document (org.apache.lucene.document.Document): 64
- ArrayList (java.util.ArrayList): 59
- StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 59
- StopFilter (org.apache.lucene.analysis.StopFilter): 58
- KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 57
- SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 53
- Test (org.junit.Test): 53
- OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 47