Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class SynonymTokenizer, method testGetBestSingleFragmentWithWeights.
public void testGetBestSingleFragmentWithWeights() throws Exception {
  TestHighlightRunner helper = new TestHighlightRunner() {

    @Override
    public void run() throws Exception {
      WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2];
      wTerms[0] = new WeightedSpanTerm(10f, "hello");
      List<PositionSpan> positionSpans = new ArrayList<>();
      positionSpans.add(new PositionSpan(0, 0));
      wTerms[0].addPositionSpans(positionSpans);
      wTerms[1] = new WeightedSpanTerm(1f, "kennedy");
      positionSpans = new ArrayList<>();
      positionSpans.add(new PositionSpan(14, 14));
      wTerms[1].addPositionSpans(positionSpans);
      // equivalent to: new Highlighter(new QueryTermScorer(wTerms))
      Highlighter highlighter = getHighlighter(wTerms, HighlighterTest.this);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, texts[0]);
      highlighter.setTextFragmenter(new SimpleFragmenter(2));
      String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
      assertTrue("Failed to find best section using weighted terms. Found: [" + result + "]",
          "<B>Hello</B>".equals(result));
      // readjust weights
      wTerms[1].setWeight(50f);
      tokenStream = analyzer.tokenStream(FIELD_NAME, texts[0]);
      highlighter = getHighlighter(wTerms, HighlighterTest.this);
      highlighter.setTextFragmenter(new SimpleFragmenter(2));
      result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
      assertTrue("Failed to find best section using weighted terms. Found: " + result,
          "<B>kennedy</B>".equals(result));
    }
  };
  helper.start();
}
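The commented-out construction above hints at the non-helper equivalent: a Highlighter can be built directly from weighted terms through QueryTermScorer, with no test scaffolding. A minimal sketch, assuming an Analyzer, field name, and text as inputs supplied by the caller:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.WeightedTerm;

// Sketch: highlight `text` using explicit term weights instead of a Query.
// The method name and its inputs are assumptions for illustration,
// not part of the test above.
static String highlightWeighted(Analyzer analyzer, String field, String text,
    WeightedTerm[] terms) throws Exception {
  Highlighter highlighter = new Highlighter(new QueryTermScorer(terms));
  highlighter.setTextFragmenter(new SimpleFragmenter(20)); // ~20-char fragments
  TokenStream tokenStream = analyzer.tokenStream(field, text);
  return highlighter.getBestFragment(tokenStream, text);
}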
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class TokenSourcesTest, method testMaxStartOffsetConsistency.
public void testMaxStartOffsetConsistency() throws IOException {
  FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  tvFieldType.setStoreTermVectors(true);
  tvFieldType.setStoreTermVectorOffsets(true);
  tvFieldType.setStoreTermVectorPositions(true);
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  // we don't necessarily consume the whole stream because of limiting by startOffset
  analyzer.setEnableChecks(false);
  Document doc = new Document();
  final String TEXT = " f gg h";
  doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
  doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
  IndexReader reader;
  try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    writer.addDocument(doc);
    reader = writer.getReader();
  }
  try {
    Fields tvFields = reader.getTermVectors(0);
    for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
      TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
      TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
      // assert both streams yield the same tokens, none with a start offset > maxStartOffset
      final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
      final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
      tvStream.reset();
      anaStream.reset();
      while (tvStream.incrementToken()) {
        assertTrue(anaStream.incrementToken());
        assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
        if (maxStartOffset >= 0) {
          assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
        }
      }
      assertFalse(anaStream.incrementToken());
      tvStream.end();
      anaStream.end();
      tvStream.close();
      anaStream.close();
    }
  } finally {
    reader.close();
  }
  dir.close();
}
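The inner loop follows the standard TokenStream consumption contract. A minimal sketch of that contract in isolation, assuming only an arbitrary stream as input:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Sketch: the reset / incrementToken / end / close lifecycle used above.
// Attribute instances are fetched once and updated in place on each token.
static void consume(TokenStream stream) throws IOException {
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  OffsetAttribute offAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();   // mandatory before the first incrementToken()
  while (stream.incrementToken()) {
    System.out.println(termAtt + " [" + offAtt.startOffset() + "," + offAtt.endOffset() + ")");
  }
  stream.end();     // records end-of-stream offset state
  stream.close();   // releases resources
}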
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class TokenSourcesTest, method testOverlapWithOffset.
public void testOverlapWithOffset() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorOffsets(true); // offsets only, no positions!
    document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  assertEquals(1, indexReader.numDocs());
  final IndexSearcher indexSearcher = newSearcher(indexReader);
  try {
    final DisjunctionMaxQuery query = new DisjunctionMaxQuery(
        Arrays.asList(
            new SpanTermQuery(new Term(FIELD, "{fox}")),
            new SpanTermQuery(new Term(FIELD, "fox"))),
        1);
    // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
    //     new SpanTermQuery(new Term(FIELD, "{fox}")),
    //     new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
    TopDocs hits = indexSearcher.search(query, 1);
    assertEquals(1, hits.totalHits);
    final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query));
    final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
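OverlappingTokenStream itself is not shown in this excerpt. A hypothetical stand-in, built from the Lucene test framework's CannedTokenStream, illustrates the overlap the test depends on: a "{fox}" token stacked at an already-occupied position (position increment 0) whose offsets span "the fox", which is why the highlight covers both words. The exact tokens and offsets here are an assumption, not the class's actual contents.

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical sketch of an overlapping stream over "the fox did not jump".
static TokenStream overlappingStream() {
  Token the = new Token("the", 0, 3);
  Token foxAlias = new Token("{fox}", 0, 7);  // offsets span "the fox"
  foxAlias.setPositionIncrement(0);           // stacked on the previous position
  Token fox = new Token("fox", 4, 7);
  Token did = new Token("did", 8, 11);
  Token not = new Token("not", 12, 15);
  Token jump = new Token("jump", 16, 20);
  return new CannedTokenStream(the, foxAlias, fox, did, not, jump);
}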
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class SynonymTokenizer, method testCustomScoreQueryHighlight.
public void testCustomScoreQueryHighlight() throws Exception {
  TermQuery termQuery = new TermQuery(new Term(FIELD_NAME, "very"));
  CustomScoreQuery query = new CustomScoreQuery(termQuery);
  searcher = newSearcher(reader);
  TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
  assertEquals(2, hits.totalHits);
  QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
  Highlighter highlighter = new Highlighter(scorer);
  final int docId0 = hits.scoreDocs[0].doc;
  Document doc = searcher.doc(docId0);
  String storedField = doc.get(FIELD_NAME);
  TokenStream stream = getAnyTokenStream(FIELD_NAME, docId0);
  Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
  highlighter.setTextFragmenter(fragmenter);
  String fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("Hello this is a piece of text that is <B>very</B> long and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
}
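The test only inspects the first hit, even though the search matched two documents. A sketch, reusing the same fixtures (searcher, highlighter, FIELD_NAME, and the test's getAnyTokenStream helper), of extending the check to every hit:

// Sketch: highlight each matching document rather than just scoreDocs[0].
for (ScoreDoc sd : hits.scoreDocs) {
  Document d = searcher.doc(sd.doc);
  String stored = d.get(FIELD_NAME);
  TokenStream ts = getAnyTokenStream(FIELD_NAME, sd.doc);
  System.out.println(highlighter.getBestFragment(ts, stored));
}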
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
From the class TokenSourcesTest, method testOverlapWithOffsetExactPhrase.
public void testOverlapWithOffsetExactPhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorOffsets(true); // offsets only, no positions!
    document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
    // query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
    // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
    final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD, "the")),
        new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(1, hits.totalHits);
    final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
    final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
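Both overlap tests index with term-vector offsets, so getTermVectorTokenStreamOrNull succeeds; as the name suggests, it returns null when the field has no term vector with offsets. A sketch of a defensive caller that falls back to re-analyzing the text (the analyzer here is an assumed input, not part of the tests above):

// Sketch: tolerate missing term vectors by falling back to fresh analysis.
TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
if (ts == null) {
  ts = analyzer.tokenStream(FIELD, TEXT); // no vectors with offsets: re-analyze
}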