use of org.apache.lucene.index.IndexWriterConfig in project elasticsearch by elastic.
the class CustomUnifiedHighlighterTests method assertHighlightOneDoc.
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, Locale locale, BreakIterator breakIterator, int noMatchSize, String[] expectedPassages) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.freeze();
Document doc = new Document();
for (String input : inputs) {
Field field = new Field(fieldName, "", ft);
field.setStringValue(input);
doc.add(field);
}
iw.addDocument(doc);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue, noMatchSize);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
assertEquals(snippets.length, expectedPassages.length);
for (int i = 0; i < snippets.length; i++) {
assertEquals(snippets[i].getText(), expectedPassages[i]);
}
reader.close();
dir.close();
}
use of org.apache.lucene.index.IndexWriterConfig in project elasticsearch by elastic.
the class CustomPostingsHighlighterTests method testCustomPostingsHighlighter.
public void testCustomPostingsHighlighter() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
//good position but only one match
final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue(firstValue);
//two matches, not the best snippet due to its length though
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
Field body2 = new Field("body", "", offsetsType);
doc.add(body2);
body2.setStringValue(secondValue);
//two matches and short, will be scored highest
final String thirdValue = "This is highlighting the third short highlighting value.";
Field body3 = new Field("body", "", offsetsType);
doc.add(body3);
body3.setStringValue(thirdValue);
//one match, same as first but at the end, will be scored lower due to its position
final String fourthValue = "Just a test4 highlighting from postings highlighter.";
Field body4 = new Field("body", "", offsetsType);
doc.add(body4);
body4.setStringValue(fourthValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";
IndexSearcher searcher = newSearcher(ir);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
assertThat(snippets.length, equalTo(4));
assertThat(snippets[0].getText(), equalTo(firstHlValue));
assertThat(snippets[1].getText(), equalTo(secondHlValue));
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
ir.close();
dir.close();
}
use of org.apache.lucene.index.IndexWriterConfig in project ansj_seg by NLPchina.
the class HeightLightTest method index.
private static void index(Analyzer analysis, String content) throws CorruptIndexException, IOException {
Document doc = new Document();
IndexWriter iwriter = new IndexWriter(directory, new IndexWriterConfig(analysis));
doc.add(new TextField("text", content, Field.Store.YES));
iwriter.addDocument(doc);
iwriter.commit();
iwriter.close();
}
use of org.apache.lucene.index.IndexWriterConfig in project ansj_seg by NLPchina.
the class IndexAndTest method test.
@Test
public void test() throws Exception {
DicLibrary.put(DicLibrary.DEFAULT, "../../library/default.dic");
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
Directory directory = null;
IndexWriter iwriter = null;
IndexWriterConfig ic = new IndexWriterConfig(analyzer);
String text = "旅游和服务是最好的";
System.out.println(IndexAnalysis.parse(text));
// 建立内存索引对象
directory = new RAMDirectory();
iwriter = new IndexWriter(directory, ic);
addContent(iwriter, text);
iwriter.commit();
iwriter.close();
System.out.println("索引建立完毕");
Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
System.out.println("index ok to search!");
for (Term t : IndexAnalysis.parse(text)) {
System.out.println(t.getName());
search(queryAnalyzer, directory, "\"" + t.getName() + "\"");
}
}
use of org.apache.lucene.index.IndexWriterConfig in project ansj_seg by NLPchina.
the class IndexTest method indexTest.
@Test
public void indexTest() throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
Directory directory = null;
IndexWriter iwriter = null;
IndexWriterConfig ic = new IndexWriterConfig(analyzer);
// 建立内存索引对象
directory = new RAMDirectory();
iwriter = new IndexWriter(directory, ic);
addContent(iwriter, "助推企业转型升级提供强有力的技术支持和服保障。中心的建成将使青岛的服务器承载能力突破10万台,达到世界一流水平。");
addContent(iwriter, "涉及民生的部分商品和服务成本监审政策");
addContent(iwriter, "我穿着和服");
iwriter.commit();
iwriter.close();
System.out.println("索引建立完毕");
Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.dic_ansj);
System.out.println("index ok to search!");
search(queryAnalyzer, directory, "\"和服\"");
}
Aggregations