Use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project SSM by Intel-bigdata.
The class LuceneSearch, method query.
/* (non-Javadoc)
 * @see org.apache.zeppelin.search.Search#query(java.lang.String)
 */
@Override
public List<Map<String, String>> query(String queryStr) {
    if (null == ramDirectory) {
        throw new IllegalStateException("Something went wrong at instance creation time, index dir is null");
    }
    List<Map<String, String>> result = Collections.emptyList();
    try (IndexReader indexReader = DirectoryReader.open(ramDirectory)) {
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser =
                new MultiFieldQueryParser(new String[] { SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE }, analyzer);
        Query query = parser.parse(queryStr);
        LOG.debug("Searching for: " + query.toString(SEARCH_FIELD_TEXT));
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        result = doSearch(indexSearcher, query, analyzer, highlighter);
        // the try-with-resources statement closes the reader; no explicit close() is needed
    } catch (IOException e) {
        LOG.error("Failed to open index dir {}, make sure indexing finished OK", ramDirectory, e);
    } catch (ParseException e) {
        LOG.error("Failed to parse query " + queryStr, e);
    }
    return result;
}
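Note that the no-argument SimpleHTMLFormatter used above wraps each matched term in Lucene's default <B>...</B> tags, while the actual result assembly happens in the project-internal doSearch helper, which is not shown here. The following is a minimal, standalone sketch of the same formatter/scorer wiring (the method name buildSnippet and its parameters are illustrative, not part of SSM):

// Minimal sketch, not SSM code. Assumes the usual org.apache.lucene.analysis,
// org.apache.lucene.search and org.apache.lucene.search.highlight imports.
static String buildSnippet(Analyzer analyzer, Query query, String fieldName, String text)
        throws IOException, InvalidTokenOffsetsException {
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); // defaults to <B> and </B>
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
    // getBestFragment returns null when no query term occurs in the text
    String fragment = highlighter.getBestFragment(analyzer, fieldName, text);
    return fragment != null ? fragment : text;
}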
Use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project gitblit by gitblit.
The class LuceneService, method getHighlightedFragment.
/**
 * Returns an HTML fragment of the content with the query terms highlighted.
 *
 * @param analyzer the analyzer used to tokenize the content
 * @param query the parsed search query
 * @param content the raw text to highlight
 * @param result the search result whose type and path drive the formatting
 * @return an HTML fragment with matched terms wrapped in highlight spans
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
private String getHighlightedFragment(Analyzer analyzer, Query query, String content,
        SearchResult result) throws IOException, InvalidTokenOffsetsException {
    if (content == null) {
        content = "";
    }
    int tabLength = storedSettings.getInteger(Keys.web.tabLength, 4);
    int fragmentLength = SearchObjectType.commit == result.type ? 512 : 150;
    QueryScorer scorer = new QueryScorer(query, "content");
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentLength);
    // use an artificial delimiter for the token
    String termTag = "!!--[";
    String termTagEnd = "]--!!";
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(termTag, termTagEnd);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(fragmenter);
    String[] fragments = highlighter.getBestFragments(analyzer, "content", content, 3);
    if (ArrayUtils.isEmpty(fragments)) {
        if (SearchObjectType.blob == result.type) {
            return "";
        }
        // clip commit message
        String fragment = content;
        if (fragment.length() > fragmentLength) {
            fragment = fragment.substring(0, fragmentLength) + "...";
        }
        return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true, tabLength) + "</pre>";
    }
    // make sure we have unique fragments
    Set<String> uniqueFragments = new LinkedHashSet<String>();
    for (String fragment : fragments) {
        uniqueFragments.add(fragment);
    }
    fragments = uniqueFragments.toArray(new String[uniqueFragments.size()]);
    StringBuilder sb = new StringBuilder();
    for (int i = 0, len = fragments.length; i < len; i++) {
        String fragment = fragments[i];
        String tag = "<pre class=\"text\">";
        // resurrect the raw fragment by removing the artificial delimiters
        String raw = fragment.replace(termTag, "").replace(termTagEnd, "");
        // determine the position of the raw fragment in the content
        int pos = content.indexOf(raw);
        // restore the complete first line of the fragment
        int c = pos;
        while (c > 0) {
            c--;
            if (content.charAt(c) == '\n') {
                break;
            }
        }
        if (c > 0) {
            // inject the leading chunk of the first fragment line
            fragment = content.substring(c + 1, pos) + fragment;
        }
        if (SearchObjectType.blob == result.type) {
            // count lines as an offset into the content for this fragment
            int line = Math.max(1, StringUtils.countLines(content.substring(0, pos)));
            // create a fragment tag with line number and language
            String lang = "";
            String ext = StringUtils.getFileExtension(result.path).toLowerCase();
            if (!StringUtils.isEmpty(ext)) {
                // maintain the leading space!
                lang = " lang-" + ext;
            }
            tag = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0}{1}\">", line, lang);
        }
        sb.append(tag);
        // replace the artificial delimiters with html tags
        String html = StringUtils.escapeForHtml(fragment, false);
        html = html.replace(termTag, "<span class=\"highlight\">").replace(termTagEnd, "</span>");
        sb.append(html);
        sb.append("</pre>");
        if (i < len - 1) {
            sb.append("<span class=\"ellipses\">...</span><br/>");
        }
    }
    return sb.toString();
}
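A design choice worth noting in the gitblit method above: it highlights with non-HTML delimiters (!!--[ and ]--!!), escapes the fragment for HTML, and only afterwards turns the delimiters into <span class="highlight"> tags, so the injected markup is never escaped while the user content always is. A condensed sketch of that escape-then-mark step (escapeThenMark is an illustrative name; the inline escaping is a simplified stand-in for gitblit's StringUtils.escapeForHtml):

// Sketch only, not gitblit code: the artificial delimiters survive HTML escaping,
// so they can safely become <span> tags afterwards.
static String escapeThenMark(String fragmentWithDelimiters) {
    String termTag = "!!--[";
    String termTagEnd = "]--!!";
    // simplified stand-in for StringUtils.escapeForHtml
    String html = fragmentWithDelimiters
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;");
    return html.replace(termTag, "<span class=\"highlight\">")
            .replace(termTagEnd, "</span>");
}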
Use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project jena by apache.
The class TextIndexLucene, method highlightResults.
private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query,
        List<String> fields, String highlight, String queryLang)
        throws IOException, InvalidTokenOffsetsException {
    List<TextHit> results = new ArrayList<>();
    HighlightOpts opts = new HighlightOpts(highlight);
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));
    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        String entity = doc.get(docDef.getEntityField());
        Node literal = null;
        String field = getDocField(doc, fields);
        String lexical = doc.get(field);
        Collection<Node> props = docDef.getPredicates(field);
        // pick one - should be only one normally
        Node prop = props.isEmpty() ? null : props.iterator().next();
        String docLang = doc.get(docDef.getLangField());
        String effectiveField = queryLang != null ? field + "_" + Util.getEffectiveLang(docLang, queryLang) : field;
        log.trace("highlightResults[{}]: {}, field: {}, lexical: {}, docLang: {}, effectiveField: {}",
                sd.doc, doc, field, lexical, docLang, effectiveField);
        if (lexical != null) {
            TokenStream tokenStream = indexAnalyzer.tokenStream(effectiveField, lexical);
            log.trace("tokenStream: {}", tokenStream.toString());
            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags, opts.maxFrags);
            String rez = frags2string(frags, opts);
            log.trace("result: {}, #frags: {}", rez, frags.length);
            literal = NodeFactory.createLiteral(rez, docLang);
        }
        String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
        Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;
        Node subject = TextQueryFuncs.stringToNode(entity);
        TextHit hit = new TextHit(subject, sd.score, literal, graph, prop);
        results.add(hit);
    }
    return results;
}
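The jena method drives the highlighter through the TokenStream-based getBestTextFragments call and delegates marker strings, fragment size, and fragment joining to the jena-internal HighlightOpts and frags2string helpers, which are not shown here. A standalone sketch of that path under assumed defaults (the <em> markers and the fragment size of 128 are illustrative, not jena's values):

// Standalone sketch, not jena code.
static String highlightLexicalForm(Analyzer analyzer, Query query, String field, String lexical)
        throws IOException, InvalidTokenOffsetsException {
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>"); // opts.start / opts.end in jena
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(128));                 // opts.fragSize in jena
    TokenStream tokenStream = analyzer.tokenStream(field, lexical);
    TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, true, 3);
    StringBuilder out = new StringBuilder();
    for (TextFragment frag : frags) {
        if (frag != null && frag.getScore() > 0) { // keep only fragments that actually matched
            out.append(frag.toString());
        }
    }
    return out.toString();
}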
Use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project ansj_seg by NLPchina.
The class HeightLightTest, method toHighlighter.
/**
 * Highlight configuration: wraps matched terms of the "text" field in red <font> tags.
 *
 * @param analyzer unused here; the class-level indexAnalyzer tokenizes the field instead
 * @param query the parsed query used to score the highlighted terms
 * @param doc the document whose "text" field is highlighted
 * @return the highlighted fragment, or the raw field value if nothing matched
 */
private static String toHighlighter(Analyzer analyzer, Query query, Document doc) {
    String field = "text";
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        TokenStream tokenStream = indexAnalyzer.tokenStream("text", new StringReader(doc.get(field)));
        String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(field));
        // fall back to the unhighlighted field value when the query did not match
        return highlighterStr == null ? doc.get(field) : highlighterStr;
    } catch (IOException | InvalidTokenOffsetsException e) {
        e.printStackTrace();
    }
    return null;
}
Use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project ansj_seg by NLPchina.
The class IndexTest, method toHighlighter.
/**
 * Highlight configuration: wraps matched terms of the "text" field in red <font> tags.
 *
 * @param analyzer the analyzer used to tokenize the "text" field
 * @param query the parsed query used to score the highlighted terms
 * @param doc the document whose "text" field is highlighted
 * @return the highlighted fragment, or the raw field value if nothing matched
 */
private String toHighlighter(Analyzer analyzer, Query query, Document doc) {
    String field = "text";
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(doc.get(field)));
        String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(field));
        // fall back to the unhighlighted field value when the query did not match
        return highlighterStr == null ? doc.get(field) : highlighterStr;
    } catch (IOException | InvalidTokenOffsetsException e) {
        e.printStackTrace();
    }
    return null;
}
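Both ansj_seg helpers return either the best highlighted fragment or, when getBestFragment yields null because the query did not hit the "text" field, the raw field value. A hedged usage sketch of how such a helper is typically driven from a search loop (the directory and analyzer setup is assumed; printHighlights is an illustrative name, not part of ansj_seg):

// Usage sketch, not ansj_seg code. Assumes `directory` holds an index whose
// documents carry a "text" field analyzed by `analyzer`.
static void printHighlights(Directory directory, Analyzer analyzer, String queryStr) throws Exception {
    try (IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new QueryParser("text", analyzer).parse(queryStr);
        TopDocs topDocs = searcher.search(query, 10);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            // falls back to the raw "text" value when the highlighter finds no match
            System.out.println(toHighlighter(analyzer, query, doc));
        }
    }
}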