Search in sources :

Example 6 with SimpleHTMLFormatter

use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project ansj_seg by NLPchina.

the class IndexTest method toHighlighter.

/**
 * 高亮设置
 *
 * @param query
 * @param doc
 * @param field
 * @return
 */
private String toHighlighter(Analyzer analyzer, Query query, Document doc) {
    String field = "text";
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field)));
        String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
        return highlighterStr == null ? doc.get(field) : highlighterStr;
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (InvalidTokenOffsetsException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return null;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) StringReader(java.io.StringReader) IOException(java.io.IOException) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) Highlighter(org.apache.lucene.search.highlight.Highlighter)

Example 7 with SimpleHTMLFormatter

use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project zeppelin by apache.

the class LuceneSearch method query.

/* (non-Javadoc)
   * @see org.apache.zeppelin.search.Search#query(java.lang.String)
   */
@Override
public List<Map<String, String>> query(String queryStr) {
    if (null == indexDirectory) {
        throw new IllegalStateException("Something went wrong on instance creation time, index dir is null");
    }
    List<Map<String, String>> result = Collections.emptyList();
    try (IndexReader indexReader = DirectoryReader.open(indexDirectory)) {
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser = new MultiFieldQueryParser(new String[] { SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE }, analyzer);
        Query query = parser.parse(queryStr);
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("Searching for: {}", query.toString(SEARCH_FIELD_TEXT));
        }
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        result = doSearch(indexSearcher, query, analyzer, highlighter);
    } catch (IOException e) {
        LOGGER.error("Failed to open index dir {}, make sure indexing finished OK", indexDirectory, e);
    } catch (ParseException e) {
        LOGGER.error("Failed to parse query {}", queryStr, e);
    }
    return result;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) Query(org.apache.lucene.search.Query) WildcardQuery(org.apache.lucene.search.WildcardQuery) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) ParseException(org.apache.lucene.queryparser.classic.ParseException) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Highlighter(org.apache.lucene.search.highlight.Highlighter)

Example 8 with SimpleHTMLFormatter

use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project sppanblog4springboot by whoismy8023.

the class LuceneSearcher method setHighlighter.

/**
 * 高亮设置
 *
 * @param query
 * @param doc
 * @param field
 * @return
 */
private String setHighlighter(Query query, Document doc, String field) {
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        String fieldValue = doc.get(field);
        String highlighterStr = highlighter.getBestFragment(analyzer, field, fieldValue);
        return highlighterStr == null ? fieldValue : highlighterStr;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
Also used : QueryScorer(org.apache.lucene.search.highlight.QueryScorer) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) ParseException(org.apache.lucene.queryparser.classic.ParseException) IOException(java.io.IOException) Highlighter(org.apache.lucene.search.highlight.Highlighter)

Example 9 with SimpleHTMLFormatter

use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project jspwiki by apache.

the class LuceneSearchProvider method findPages.

/**
 *  Searches pages using a particular combination of flags.
 *
 *  @param query The query to perform in Lucene query language
 *  @param flags A set of flags
 *  @return A Collection of SearchResult instances
 *  @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags, WikiContext wikiContext) throws ProviderException {
    IndexSearcher searcher = null;
    ArrayList<SearchResult> list = null;
    Highlighter highlighter = null;
    try {
        String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS };
        QueryParser qp = new MultiFieldQueryParser(Version.LUCENE_47, queryfields, getLuceneAnalyzer());
        // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
        Query luceneQuery = qp.parse(query);
        if ((flags & FLAG_CONTEXTS) != 0) {
            highlighter = new Highlighter(new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"), new SimpleHTMLEncoder(), new QueryScorer(luceneQuery));
        }
        try {
            File dir = new File(m_luceneDirectory);
            Directory luceneDir = new SimpleFSDirectory(dir, null);
            IndexReader reader = DirectoryReader.open(luceneDir);
            searcher = new IndexSearcher(reader);
        } catch (Exception ex) {
            log.info("Lucene not yet ready; indexing not started", ex);
            return null;
        }
        ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;
        AuthorizationManager mgr = m_engine.getAuthorizationManager();
        list = new ArrayList<SearchResult>(hits.length);
        for (int curr = 0; curr < hits.length; curr++) {
            int docID = hits[curr].doc;
            Document doc = searcher.doc(docID);
            String pageName = doc.get(LUCENE_ID);
            WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);
            if (page != null) {
                if (page instanceof Attachment) {
                // Currently attachments don't look nice on the search-results page
                // When the search-results are cleaned up this can be enabled again.
                }
                PagePermission pp = new PagePermission(page, PagePermission.VIEW_ACTION);
                if (mgr.checkPermission(wikiContext.getWikiSession(), pp)) {
                    int score = (int) (hits[curr].score * 100);
                    // Get highlighted search contexts
                    String text = doc.get(LUCENE_PAGE_CONTENTS);
                    String[] fragments = new String[0];
                    if (text != null && highlighter != null) {
                        TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
                        fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
                    }
                    SearchResult result = new SearchResultImpl(page, score, fragments);
                    list.add(result);
                }
            } else {
                log.error("Lucene found a result page '" + pageName + "' that could not be loaded, removing from Lucene cache");
                pageRemoved(new WikiPage(m_engine, pageName));
            }
        }
    } catch (IOException e) {
        log.error("Failed during lucene search", e);
    } catch (ParseException e) {
        log.info("Broken query; cannot parse query ", e);
        throw new ProviderException("You have entered a query Lucene cannot process: " + e.getMessage());
    } catch (InvalidTokenOffsetsException e) {
        log.error("Tokens are incompatible with provided text ", e);
    } finally {
        if (searcher != null) {
            try {
                searcher.getIndexReader().close();
            } catch (IOException e) {
                log.error(e);
            }
        }
    }
    return list;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) ProviderException(org.apache.wiki.api.exceptions.ProviderException) WikiPage(org.apache.wiki.WikiPage) Attachment(org.apache.wiki.attachment.Attachment) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) StringReader(java.io.StringReader) Highlighter(org.apache.lucene.search.highlight.Highlighter) Directory(org.apache.lucene.store.Directory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) SimpleHTMLEncoder(org.apache.lucene.search.highlight.SimpleHTMLEncoder) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) IOException(java.io.IOException) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) NoRequiredPropertyException(org.apache.wiki.api.exceptions.NoRequiredPropertyException) InternalWikiException(org.apache.wiki.InternalWikiException) ParseException(org.apache.lucene.queryparser.classic.ParseException) LockObtainFailedException(org.apache.lucene.store.LockObtainFailedException) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) IOException(java.io.IOException) ProviderException(org.apache.wiki.api.exceptions.ProviderException) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader) AuthorizationManager(org.apache.wiki.auth.AuthorizationManager) ParseException(org.apache.lucene.queryparser.classic.ParseException) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) File(java.io.File) PagePermission(org.apache.wiki.auth.permissions.PagePermission)

Example 10 with SimpleHTMLFormatter

use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project openolat by klemens.

the class SearchResultsImpl method doHighlight.

/**
 * Highlight (bold,color) query words in result-document. Set HighlightResult for content or description.
 * @param query
 * @param analyzer
 * @param doc
 * @param resultDocument
 * @throws IOException
 */
private void doHighlight(Query query, Analyzer analyzer, Document doc, ResultDocument resultDocument) throws IOException {
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG), new SimpleHTMLEncoder(), new QueryScorer(query));
    // Get 3 best fragments of content and seperate with a "..."
    try {
        // highlight content
        String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME);
        TokenStream tokenStream = analyzer.tokenStream(AbstractOlatDocument.CONTENT_FIELD_NAME, new StringReader(content));
        String highlightResult = highlighter.getBestFragments(tokenStream, content, 3, HIGHLIGHT_SEPARATOR);
        // if no highlightResult is in content => look in description
        if (highlightResult.length() == 0) {
            String description = doc.get(AbstractOlatDocument.DESCRIPTION_FIELD_NAME);
            tokenStream = analyzer.tokenStream(AbstractOlatDocument.DESCRIPTION_FIELD_NAME, new StringReader(description));
            highlightResult = highlighter.getBestFragments(tokenStream, description, 3, HIGHLIGHT_SEPARATOR);
            resultDocument.setHighlightingDescription(true);
        }
        resultDocument.setHighlightResult(highlightResult);
        // highlight title
        String title = doc.get(AbstractOlatDocument.TITLE_FIELD_NAME);
        title = title.trim();
        if (title.length() > 128) {
            title = FilterFactory.getHtmlTagAndDescapingFilter().filter(title);
            title = Formatter.truncate(title, 128);
        }
        tokenStream = analyzer.tokenStream(AbstractOlatDocument.TITLE_FIELD_NAME, new StringReader(title));
        String highlightTitle = highlighter.getBestFragments(tokenStream, title, 3, " ");
        resultDocument.setHighlightTitle(highlightTitle);
    } catch (InvalidTokenOffsetsException e) {
        log.warn("", e);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) SimpleHTMLEncoder(org.apache.lucene.search.highlight.SimpleHTMLEncoder) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) StringReader(java.io.StringReader) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) Highlighter(org.apache.lucene.search.highlight.Highlighter)

Aggregations

SimpleHTMLFormatter (org.apache.lucene.search.highlight.SimpleHTMLFormatter)17 Highlighter (org.apache.lucene.search.highlight.Highlighter)16 QueryScorer (org.apache.lucene.search.highlight.QueryScorer)16 TokenStream (org.apache.lucene.analysis.TokenStream)13 IOException (java.io.IOException)12 StringReader (java.io.StringReader)10 InvalidTokenOffsetsException (org.apache.lucene.search.highlight.InvalidTokenOffsetsException)10 ParseException (org.apache.lucene.queryparser.classic.ParseException)4 Map (java.util.Map)3 Analyzer (org.apache.lucene.analysis.Analyzer)3 IndexReader (org.apache.lucene.index.IndexReader)3 MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser)3 IndexSearcher (org.apache.lucene.search.IndexSearcher)3 Query (org.apache.lucene.search.Query)3 SimpleFragmenter (org.apache.lucene.search.highlight.SimpleFragmenter)3 SimpleHTMLEncoder (org.apache.lucene.search.highlight.SimpleHTMLEncoder)3 ImmutableMap (com.google.common.collect.ImmutableMap)2 ArrayList (java.util.ArrayList)2 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)2 Document (org.apache.lucene.document.Document)2