use of org.apache.lucene.search.highlight.InvalidTokenOffsetsException in project zeppelin by apache.
the class LuceneSearch method doSearch.
private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) {
List<Map<String, String>> matchingParagraphs = Lists.newArrayList();
ScoreDoc[] hits;
try {
hits = searcher.search(query, 20).scoreDocs;
for (int i = 0; i < hits.length; i++) {
LOG.debug("doc={} score={}", hits[i].doc, hits[i].score);
int id = hits[i].doc;
Document doc = searcher.doc(id);
String path = doc.get(ID_FIELD);
if (path != null) {
LOG.debug((i + 1) + ". " + path);
String title = doc.get("title");
if (title != null) {
LOG.debug(" Title: {}", doc.get("title"));
}
String text = doc.get(SEARCH_FIELD_TEXT);
String header = doc.get(SEARCH_FIELD_TITLE);
String fragment = "";
if (text != null) {
TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer);
TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, true, 3);
LOG.debug(" {} fragments found for query '{}'", frag.length, query);
for (int j = 0; j < frag.length; j++) {
if ((frag[j] != null) && (frag[j].getScore() > 0)) {
LOG.debug(" Fragment: {}", frag[j].toString());
}
}
fragment = (frag != null && frag.length > 0) ? frag[0].toString() : "";
}
if (header != null) {
TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer);
TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3);
header = (frgTitle != null && frgTitle.length > 0) ? frgTitle[0].toString() : "";
} else {
header = "";
}
matchingParagraphs.add(// <noteId>/paragraph/<paragraphId>
ImmutableMap.of(// <noteId>/paragraph/<paragraphId>
"id", // <noteId>/paragraph/<paragraphId>
path, "name", title, "snippet", fragment, "text", text, "header", header));
} else {
LOG.info("{}. No {} for this document", i + 1, ID_FIELD);
}
}
} catch (IOException | InvalidTokenOffsetsException e) {
LOG.error("Exception on searching for {}", query, e);
}
return matchingParagraphs;
}
use of org.apache.lucene.search.highlight.InvalidTokenOffsetsException in project ansj_seg by NLPchina.
the class HeightLightTest method toHighlighter.
/**
* 高亮设置
*
* @param query
* @param doc
* @param field
* @return
*/
private static String toHighlighter(Analyzer analyzer, Query query, Document doc) {
String field = "text";
try {
SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
TokenStream tokenStream1 = indexAnalyzer.tokenStream("text", new StringReader(doc.get(field)));
String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
return highlighterStr == null ? doc.get(field) : highlighterStr;
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTokenOffsetsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
use of org.apache.lucene.search.highlight.InvalidTokenOffsetsException in project ansj_seg by NLPchina.
the class IndexAndTest method toHighlighter.
/**
* 高亮设置
*
* @param query
* @param doc
* @param field
* @return
*/
private String toHighlighter(Analyzer analyzer, Query query, Document doc) {
String field = "text";
try {
SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field)));
String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
return highlighterStr == null ? doc.get(field) : highlighterStr;
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTokenOffsetsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
use of org.apache.lucene.search.highlight.InvalidTokenOffsetsException in project ansj_seg by NLPchina.
the class IndexTest method toHighlighter.
/**
* 高亮设置
*
* @param query
* @param doc
* @param field
* @return
*/
private String toHighlighter(Analyzer analyzer, Query query, Document doc) {
String field = "text";
try {
SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field)));
String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
return highlighterStr == null ? doc.get(field) : highlighterStr;
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTokenOffsetsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
use of org.apache.lucene.search.highlight.InvalidTokenOffsetsException in project jackrabbit-oak by apache.
the class LuceneIndex method getExcerpt.
private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException {
StringBuilder excerpt = new StringBuilder();
for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
String name = field.name();
// only full text or analyzed fields
if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
String text = field.stringValue();
TokenStream tokenStream = analyzer.tokenStream(name, text);
try {
TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 2);
if (textFragments != null && textFragments.length > 0) {
for (TextFragment fragment : textFragments) {
if (excerpt.length() > 0) {
excerpt.append("...");
}
excerpt.append(fragment.toString());
}
break;
}
} catch (InvalidTokenOffsetsException e) {
LOG.error("higlighting failed", e);
}
}
}
return excerpt.toString();
}
Aggregations