use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
the class TokenSourcesTest method testPayloads.
// LUCENE-5294
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);
  curOffset = 0;
  Token[] tokens = new Token[] { getToken("foxes"), getToken("can"), getToken("jump"), getToken("high") };
  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());
  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
  PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
  ts.reset();
  for (Token token : tokens) {
    assertTrue(ts.incrementToken());
    assertEquals(token.toString(), termAtt.toString());
    assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
    assertEquals(token.getPayload(), payloadAtt.getPayload());
    assertEquals(token.startOffset(), offsetAtt.startOffset());
    assertEquals(token.endOffset(), offsetAtt.endOffset());
  }
  assertFalse(ts.incrementToken());
  reader.close();
  dir.close();
}
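The snippet above relies on a curOffset field and a getToken helper that live elsewhere in TokenSourcesTest. A minimal sketch of what they look like, assuming the helper uses the term text as the payload and advances the offset by the token length plus one for a separator; treat the exact body as an approximation of the real helper:

int curOffset;

// Make a token for the given text, set its payload to the same bytes,
// and let offsets advance "naturally" through the virtual document.
private Token getToken(String text) {
  Token t = new Token(text, curOffset, curOffset + text.length());
  t.setPayload(new BytesRef(text));
  curOffset += text.length() + 1;  // +1 for an implied separator character
  return t;
}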
use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
the class SynonymTokenizer method testEncoding.
/**
 * Demonstrates creation of an XHTML-compliant document using the new encoding facilities.
 */
public void testEncoding() throws Exception {
  String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
  // Run the highlighter on the raw content (the scorer does not score any
  // tokens for highlighting but scores a single fragment for selection).
  Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new Scorer() {

    @Override
    public void startFragment(TextFragment newFragment) {
    }

    @Override
    public float getTokenScore() {
      return 0;
    }

    @Override
    public float getFragmentScore() {
      return 1;
    }

    @Override
    public TokenStream init(TokenStream tokenStream) {
      return null;
    }
  });
  highlighter.setTextFragmenter(new SimpleFragmenter(2000));
  TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, rawDocContent);
  String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, "");
  // An ugly bit of XML creation:
  String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
      + "<head>\n"
      + "<title>My Test HTML Document</title>\n"
      + "</head>\n"
      + "<body>\n"
      + "<h2>" + encodedSnippet + "</h2>\n"
      + "</body>\n"
      + "</html>";
  // Now an ugly bit of XML parsing to check that the snippet is encoded OK.
  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
  DocumentBuilder db = dbf.newDocumentBuilder();
  org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8)));
  Element root = doc.getDocumentElement();
  NodeList nodes = root.getElementsByTagName("body");
  Element body = (Element) nodes.item(0);
  nodes = body.getElementsByTagName("h2");
  Element h2 = (Element) nodes.item(0);
  String decodedSnippet = h2.getFirstChild().getNodeValue();
  assertEquals("XHTML Encoding should have worked:", rawDocContent, decodedSnippet);
}
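For readers who want to see the encoder in isolation: SimpleHTMLEncoder implements the highlighter's Encoder interface, and its encodeText method can be called directly. A minimal sketch; the exact entity forms emitted (for example, for the apostrophe) may vary between Lucene versions:

Encoder encoder = new SimpleHTMLEncoder();
// XML-significant characters are replaced with entities, producing
// something like "prices &lt; 3 and &gt; 4 &amp; more" for this input.
String encoded = encoder.encodeText("prices < 3 and > 4 & more");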
use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
the class SynonymTokenizer method testConstantScoreMultiTermQuery.
public void testConstantScoreMultiTermQuery() throws Exception {
  numHighlights = 0;
  WildcardQuery wildcardQuery = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
  wildcardQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
  query = wildcardQuery;
  searcher = newSearcher(reader);
  // query = unReWrittenQuery.rewrite(reader);
  if (VERBOSE) {
    System.out.println("Searching for: " + query.toString(FIELD_NAME));
  }
  hits = searcher.search(query, 1000);
  for (int i = 0; i < hits.totalHits; i++) {
    final int docId = hits.scoreDocs[i].doc;
    final Document doc = searcher.doc(docId);
    String text = doc.get(FIELD_NAME);
    TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
    int maxNumFragmentsRequired = 2;
    String fragmentSeparator = "...";
    QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(20));
    String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator);
    if (VERBOSE) {
      System.out.println("\t" + result);
    }
  }
  assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
  // Try a null field name.
  hits = searcher.search(query, 1000);
  numHighlights = 0;
  for (int i = 0; i < hits.totalHits; i++) {
    final int docId = hits.scoreDocs[i].doc;
    final Document doc = searcher.doc(docId);
    String text = doc.get(FIELD_NAME);
    TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
    int maxNumFragmentsRequired = 2;
    String fragmentSeparator = "...";
    QueryScorer scorer = new QueryScorer(query, null);
    Highlighter highlighter = new Highlighter(this, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(20));
    String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator);
    if (VERBOSE) {
      System.out.println("\t" + result);
    }
  }
  assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
  // Try a non-matching field name with a default fallback field.
  hits = searcher.search(query, 1000);
  numHighlights = 0;
  for (int i = 0; i < hits.totalHits; i++) {
    final int docId = hits.scoreDocs[i].doc;
    final Document doc = searcher.doc(docId);
    String text = doc.get(FIELD_NAME);
    TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
    int maxNumFragmentsRequired = 2;
    String fragmentSeparator = "...";
    QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(20));
    String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator);
    if (VERBOSE) {
      System.out.println("\t" + result);
    }
  }
  assertTrue("Failed to find correct number of highlights: " + numHighlights + " found", numHighlights == 5);
}
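The getAnyTokenStream helper called in each loop is defined elsewhere in HighlighterTest. A plausible sketch built on the TokenSources convenience API, assuming the test's searcher and analyzer fields; the real helper may differ:

// Prefer a term-vector-derived stream for the document; TokenSources
// falls back to re-analyzing the stored field when no vectors exist.
private TokenStream getAnyTokenStream(String fieldName, int docId) throws IOException {
  return TokenSources.getAnyTokenStream(searcher.getIndexReader(), docId, fieldName, analyzer);
}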
use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
the class TokenSourcesTest method testOverlapWithPositionsAndOffset.
public void testOverlapWithPositionsAndOffset() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectorPositions(true);
    document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final DisjunctionMaxQuery query = new DisjunctionMaxQuery(
        Arrays.asList(
            new SpanTermQuery(new Term(FIELD, "{fox}")),
            new SpanTermQuery(new Term(FIELD, "fox"))),
        1);
    // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
    //     new SpanTermQuery(new Term(FIELD, "{fox}")),
    //     new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
    TopDocs hits = indexSearcher.search(query, 1);
    assertEquals(1, hits.totalHits);
    final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query));
    final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
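This test (and the exact-phrase variant below) indexes an OverlappingTokenStream defined elsewhere in TokenSourcesTest. A plausible sketch consistent with the expected fragment: the synthetic token "{fox}" gets a position increment of 0 so it shares a position with "the" and spans the offsets of "the fox". Details of the real class may differ:

private static final class OverlappingTokenStream extends TokenStream {
  private Token[] tokens;
  private int i = -1;
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

  @Override
  public boolean incrementToken() {
    this.i++;
    if (this.i >= this.tokens.length) {
      return false;
    }
    clearAttributes();
    termAttribute.setEmpty().append(this.tokens[i]);
    offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i].endOffset());
    positionIncrementAttribute.setPositionIncrement(this.tokens[i].getPositionIncrement());
    return true;
  }

  @Override
  public void reset() {
    this.i = -1;
    this.tokens = new Token[] {
        new Token("the", 0, 3),
        // "{fox}" overlaps "the fox": same start offset as "the", same end as "fox"
        new Token("{fox}", 0, 7),
        new Token("fox", 4, 7),
        new Token("did", 8, 11),
        new Token("not", 12, 15),
        new Token("jump", 16, 20) };
    this.tokens[1].setPositionIncrement(0);
  }
}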
use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
the class TokenSourcesTest method testOverlapWithPositionsAndOffsetExactPhrase.
public void testOverlapWithPositionsAndOffsetExactPhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
    // query.add(new SpanTermQuery(new Term(FIELD, "the")));
    // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
    final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD, "the")),
        new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(1, hits.totalHits);
    final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
    final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
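Both overlap tests store term vectors, so getTermVectorTokenStreamOrNull never returns null here. In code that cannot guarantee vectors are present, the usual pattern is to fall back to re-analysis; a sketch assuming an analyzer is available:

TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
if (ts == null) {
  // The field was indexed without term vectors: re-analyze the original text.
  ts = analyzer.tokenStream(FIELD, TEXT);
}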