Use of org.apache.lucene.search.highlight.SimpleHTMLEncoder in project elasticsearch by elastic.
From the class CustomPassageFormatterTests, method testHtmlEncodeFormat (unified highlighter):
public void testHtmlEncodeFormat() {
String content = "<b>This is a really cool highlighter.</b> Unified highlighter gives nice snippets back.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
Passage[] passages = new Passage[2];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.setStartOffset(0);
// let's include the whitespace at the end to make sure we trim it
passage1.setEndOffset(end + 6);
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.setStartOffset(passage1.getEndOffset());
passage2.setEndOffset(content.length());
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(2));
assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
}
Use of org.apache.lucene.search.highlight.SimpleHTMLEncoder in project elasticsearch by elastic.
From the class CustomPassageFormatterTests, method testHtmlEncodeFormat (postings highlighter):
public void testHtmlEncodeFormat() {
String content = "<b>This is a really cool highlighter.</b> Postings highlighter gives nice snippets back.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
Passage[] passages = new Passage[2];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.startOffset = 0;
// let's include the whitespace at the end to make sure we trim it
passage1.endOffset = end + 6;
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.startOffset = passage1.endOffset;
passage2.endOffset = content.length();
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(2));
assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
}
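The two tests above differ only in the highlighter generation they target (unified vs. postings); both verify the same contract: SimpleHTMLEncoder escapes markup that is already present in the source text, while the <em> tags appended by the formatter pass through unescaped. As a minimal standalone sketch of the encoder by itself (the exact entity set varies slightly across Lucene versions, so the printed output is illustrative):
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;

public class SimpleHTMLEncoderSketch {
    public static void main(String[] args) {
        SimpleHTMLEncoder encoder = new SimpleHTMLEncoder();
        // Markup in the input is treated as data and escaped; recent Lucene
        // versions also escape quotes, apostrophes, and slashes.
        System.out.println(encoder.encodeText("<b>cool & fast</b>"));
        // prints (on recent versions): &lt;b&gt;cool &amp; fast&lt;&#x2F;b&gt;
    }
}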
Use of org.apache.lucene.search.highlight.SimpleHTMLEncoder in project lucene-solr by apache.
From the class SimpleFragmentsBuilderTest, method testTagsAndEncoder:
public void testTagsAndEncoder() throws Exception {
FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "<h1> a </h1>");
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
String[] preTags = { "[" };
String[] postTags = { "]" };
assertEquals("<h1> [a] </h1>", sfb.createFragment(reader, 0, F, ffl, preTags, postTags, new SimpleHTMLEncoder()));
}
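The test above calls SimpleFragmentsBuilder.createFragment directly, using the ffl helper from its test base class; in application code the same pieces are normally wired through FastVectorHighlighter. A sketch under stated assumptions: the field name "body" and the highlight method are hypothetical, and the field must have been indexed with term vectors (positions and offsets), which the FVH requires.
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;

public class FvhEncoderSketch {
    // Assumes `reader` comes from an index whose "body" field stores
    // term vectors with positions and offsets.
    static String highlight(IndexReader reader, int docId) throws Exception {
        FastVectorHighlighter fvh = new FastVectorHighlighter();
        FieldQuery fieldQuery = fvh.getFieldQuery(new TermQuery(new Term("body", "a")));
        // Same pre/post tags and encoder as the test above, with a
        // 100-character fragment budget.
        return fvh.getBestFragment(fieldQuery, reader, docId, "body", 100,
                new SimpleFragListBuilder(), new SimpleFragmentsBuilder(),
                new String[] { "[" }, new String[] { "]" }, new SimpleHTMLEncoder());
    }
}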
Use of org.apache.lucene.search.highlight.SimpleHTMLEncoder in project OpenOLAT by OpenOLAT.
From the class SearchResultsImpl, method doHighlight:
/**
 * Highlights (bold, color) query words in the result document and sets the
 * highlight result for either the content or the description.
 * @param query the query whose terms are highlighted
 * @param analyzer the analyzer used to tokenize the highlighted fields
 * @param doc the Lucene document being highlighted
 * @param resultDocument the result document that receives the highlight result
 * @throws IOException
 */
private void doHighlight(Query query, Analyzer analyzer, Document doc, ResultDocument resultDocument) throws IOException {
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG), new SimpleHTMLEncoder(), new QueryScorer(query));
// Get 3 best fragments of content and separate with a "..."
try {
// highlight content
String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(AbstractOlatDocument.CONTENT_FIELD_NAME, new StringReader(content));
String highlightResult = highlighter.getBestFragments(tokenStream, content, 3, HIGHLIGHT_SEPARATOR);
// if nothing was highlighted in the content, fall back to the description
if (highlightResult.length() == 0) {
String description = doc.get(AbstractOlatDocument.DESCRIPTION_FIELD_NAME);
tokenStream = analyzer.tokenStream(AbstractOlatDocument.DESCRIPTION_FIELD_NAME, new StringReader(description));
highlightResult = highlighter.getBestFragments(tokenStream, description, 3, HIGHLIGHT_SEPARATOR);
resultDocument.setHighlightingDescription(true);
}
resultDocument.setHighlightResult(highlightResult);
// highlight title
String title = doc.get(AbstractOlatDocument.TITLE_FIELD_NAME);
title = title.trim();
if (title.length() > 128) {
title = FilterFactory.getHtmlTagAndDescapingFilter().filter(title);
title = Formatter.truncate(title, 128);
}
tokenStream = analyzer.tokenStream(AbstractOlatDocument.TITLE_FIELD_NAME, new StringReader(title));
String highlightTitle = highlighter.getBestFragments(tokenStream, title, 3, " ");
resultDocument.setHighlightTitle(highlightTitle);
} catch (InvalidTokenOffsetsException e) {
log.warn("", e);
}
}
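The three-argument Highlighter constructor used here (formatter, encoder, scorer) can be exercised without any of the OpenOLAT plumbing. A minimal, self-contained sketch with a hypothetical field name and query; the escaping in the output depends on the Lucene version:
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlighterEncoderSketch {
    public static void main(String[] args) throws Exception {
        String content = "<p>Lucene highlighting example</p>";
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<b>", "</b>"), // wraps matched terms
                new SimpleHTMLEncoder(),                // escapes everything else
                new QueryScorer(new TermQuery(new Term("content", "lucene"))));
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
            // "..." separates the best fragments, mirroring HIGHLIGHT_SEPARATOR above
            System.out.println(highlighter.getBestFragments(tokenStream, content, 3, "..."));
        }
    }
}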
Use of org.apache.lucene.search.highlight.SimpleHTMLEncoder in project jspwiki by apache.
From the class LuceneSearchProvider, method findPages:
/**
 * Searches pages using a particular combination of flags.
 *
 * @param query The query to perform in Lucene query language
 * @param flags A set of flags
 * @param wikiContext The wiki context in which the search runs; used for the permission check
 * @return A Collection of SearchResult instances
 * @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags, WikiContext wikiContext) throws ProviderException {
IndexSearcher searcher = null;
ArrayList<SearchResult> list = null;
Highlighter highlighter = null;
try {
String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS };
QueryParser qp = new MultiFieldQueryParser(Version.LUCENE_47, queryfields, getLuceneAnalyzer());
// QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
Query luceneQuery = qp.parse(query);
if ((flags & FLAG_CONTEXTS) != 0) {
highlighter = new Highlighter(new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"), new SimpleHTMLEncoder(), new QueryScorer(luceneQuery));
}
try {
File dir = new File(m_luceneDirectory);
Directory luceneDir = new SimpleFSDirectory(dir, null);
IndexReader reader = DirectoryReader.open(luceneDir);
searcher = new IndexSearcher(reader);
} catch (Exception ex) {
log.info("Lucene not yet ready; indexing not started", ex);
return null;
}
ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;
AuthorizationManager mgr = m_engine.getAuthorizationManager();
list = new ArrayList<SearchResult>(hits.length);
for (int curr = 0; curr < hits.length; curr++) {
int docID = hits[curr].doc;
Document doc = searcher.doc(docID);
String pageName = doc.get(LUCENE_ID);
WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);
if (page != null) {
if (page instanceof Attachment) {
// Currently attachments don't look nice on the search-results page
// When the search-results are cleaned up this can be enabled again.
}
PagePermission pp = new PagePermission(page, PagePermission.VIEW_ACTION);
if (mgr.checkPermission(wikiContext.getWikiSession(), pp)) {
int score = (int) (hits[curr].score * 100);
// Get highlighted search contexts
String text = doc.get(LUCENE_PAGE_CONTENTS);
String[] fragments = new String[0];
if (text != null && highlighter != null) {
TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
}
SearchResult result = new SearchResultImpl(page, score, fragments);
list.add(result);
}
} else {
log.error("Lucene found a result page '" + pageName + "' that could not be loaded, removing from Lucene cache");
pageRemoved(new WikiPage(m_engine, pageName));
}
}
} catch (IOException e) {
log.error("Failed during lucene search", e);
} catch (ParseException e) {
log.info("Broken query; cannot parse query ", e);
throw new ProviderException("You have entered a query Lucene cannot process: " + e.getMessage());
} catch (InvalidTokenOffsetsException e) {
log.error("Tokens are incompatible with provided text ", e);
} finally {
if (searcher != null) {
try {
searcher.getIndexReader().close();
} catch (IOException e) {
log.error(e);
}
}
}
return list;
}