Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
The class PayloadUtils, method createSpanQuery.
/**
 * The generated SpanQuery will be either a SpanTermQuery or an ordered, zero slop SpanNearQuery, depending
 * on how many tokens are emitted.
 */
public static SpanQuery createSpanQuery(String field, String value, Analyzer analyzer) throws IOException {
  // adapted this from QueryBuilder.createSpanQuery (which isn't currently public) and added reset(), end(), and close() calls
  List<SpanTermQuery> terms = new ArrayList<>();
  try (TokenStream in = analyzer.tokenStream(field, value)) {
    in.reset();
    TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
    while (in.incrementToken()) {
      terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
    }
    in.end();
  }

  SpanQuery query;
  if (terms.isEmpty()) {
    query = null;
  } else if (terms.size() == 1) {
    query = terms.get(0);
  } else {
    query = new SpanNearQuery(terms.toArray(new SpanTermQuery[terms.size()]), 0, true);
  }
  return query;
}
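A minimal usage sketch (the field name, value, and analyzer below are illustrative assumptions, not taken from the source above):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.spans.SpanQuery;

// "quick brown fox" analyzes to three tokens, so this yields an ordered,
// zero-slop SpanNearQuery; a single-token value would yield a SpanTermQuery,
// and a value that analyzes to no tokens yields null, which callers must check.
SpanQuery q = PayloadUtils.createSpanQuery("body", "quick brown fox", new StandardAnalyzer());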
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
The class Highlighter (which defines the inner FragmentQueue helper), method getBestTextFragments.
/**
 * Low-level API to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @throws IOException If there is a low-level I/O error
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text, boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {
  ArrayList<TextFragment> docFrags = new ArrayList<>();
  StringBuilder newText = new StringBuilder();
  CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
  if (fragmentScorer instanceof QueryScorer) {
    ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
  }
  TokenStream newStream = fragmentScorer.init(tokenStream);
  if (newStream != null) {
    tokenStream = newStream;
  }
  fragmentScorer.startFragment(currentFrag);
  docFrags.add(currentFrag);
  FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
  try {
    String tokenText;
    int startOffset;
    int endOffset;
    int lastEndOffset = 0;
    textFragmenter.start(text, tokenStream);
    TokenGroup tokenGroup = new TokenGroup(tokenStream);
    tokenStream.reset();
    for (boolean next = tokenStream.incrementToken();
         next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
         next = tokenStream.incrementToken()) {
      if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
        throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            + " exceeds length of provided text sized " + text.length());
      }
      if ((tokenGroup.getNumTokens() > 0) && (tokenGroup.isDistinct())) {
        // the current token is distinct from previous tokens -
        // markup the cached token group info
        startOffset = tokenGroup.getStartOffset();
        endOffset = tokenGroup.getEndOffset();
        tokenText = text.substring(startOffset, endOffset);
        String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
        // store any whitespace etc from between this and last group
        if (startOffset > lastEndOffset) {
          newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
        }
        newText.append(markedUpText);
        lastEndOffset = Math.max(endOffset, lastEndOffset);
        tokenGroup.clear();
        // check if current token marks the start of a new fragment
        if (textFragmenter.isNewFragment()) {
          currentFrag.setScore(fragmentScorer.getFragmentScore());
          // record stats for a new fragment
          currentFrag.textEndPos = newText.length();
          currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
          fragmentScorer.startFragment(currentFrag);
          docFrags.add(currentFrag);
        }
      }
      tokenGroup.addToken(fragmentScorer.getTokenScore());
    }
    currentFrag.setScore(fragmentScorer.getFragmentScore());
    if (tokenGroup.getNumTokens() > 0) {
      // flush the accumulated text (same code as in above loop)
      startOffset = tokenGroup.getStartOffset();
      endOffset = tokenGroup.getEndOffset();
      tokenText = text.substring(startOffset, endOffset);
      String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
      // store any whitespace etc from between this and last group
      if (startOffset > lastEndOffset) {
        newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
      }
      newText.append(markedUpText);
      lastEndOffset = Math.max(lastEndOffset, endOffset);
    }
    // append any original text remaining beyond the point where we stopped analyzing,
    // provided the whole text was within the analysis limit
    if ((lastEndOffset < text.length()) && (text.length() <= maxDocCharsToAnalyze)) {
      newText.append(encoder.encodeText(text.substring(lastEndOffset)));
    }
    currentFrag.textEndPos = newText.length();
    // sort the most relevant sections of the text; insertWithOverflow keeps the
    // queue capped at maxNumFragments, dropping the lowest-scoring fragment on overflow
    for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext(); ) {
      currentFrag = i.next();
      fragQueue.insertWithOverflow(currentFrag);
    }
    // return the most relevant fragments
    TextFragment[] frag = new TextFragment[fragQueue.size()];
    for (int i = frag.length - 1; i >= 0; i--) {
      frag[i] = fragQueue.pop();
    }
    // merge any contiguous fragments to improve readability
    if (mergeContiguousFragments) {
      mergeContiguousFragments(frag);
      ArrayList<TextFragment> fragTexts = new ArrayList<>();
      for (int i = 0; i < frag.length; i++) {
        if ((frag[i] != null) && (frag[i].getScore() > 0)) {
          fragTexts.add(frag[i]);
        }
      }
      frag = fragTexts.toArray(new TextFragment[0]);
    }
    return frag;
  } finally {
    if (tokenStream != null) {
      try {
        tokenStream.end();
        tokenStream.close();
      } catch (Exception e) {
        // ignore: best-effort cleanup of the stream
      }
    }
  }
}
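The method is public chiefly so callers can see the scores held in the returned TextFragment objects. A hedged usage sketch (the analyzer, query, field name, and raw text below are assumptions, not from the source above):

Highlighter highlighter = new Highlighter(new QueryScorer(query));
TokenStream tokenStream = analyzer.tokenStream("body", text);
// getBestTextFragments ends and closes the stream itself (see the finally block above)
TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, true, 5);
for (TextFragment frag : frags) {
  if (frag != null && frag.getScore() > 0) {
    System.out.println(frag.getScore() + "\t" + frag.toString());
  }
}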
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
The class HighlighterPhraseTest, method testSparsePhraseWithNoPositions.
public void testSparsePhraseWithNoPositions() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectors(true);
    document.add(new Field(FIELD, TEXT, customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final PhraseQuery phraseQuery = new PhraseQuery(1, FIELD, "did", "jump");
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(1, hits.totalHits);
    final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(),
        new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
    final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD,
        indexReader.getTermVectors(0), -1);
    assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
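getTermVectorTokenStreamOrNull returns null when the document stores no term vector for the field, so production callers usually need a fallback. A hedged sketch of that pattern (the analyzer variable is an assumption; the test above can rely on term vectors being present):

TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD,
    indexReader.getTermVectors(0), -1);
if (tokenStream == null) {
  // no term vector stored: fall back to re-analyzing the stored text
  tokenStream = analyzer.tokenStream(FIELD, TEXT);
}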
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
The class HighlighterPhraseTest, method testSparsePhrase.
public void testSparsePhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectors(true);
    document.add(new Field(FIELD, new TokenStreamSparse(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "did", "jump");
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(0, hits.totalHits);
    final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(),
        new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
    final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD,
        indexReader.getTermVectors(0), -1);
    assertEquals(highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
        highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
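The exact phrase finds nothing because "did" and "jump" sit two positions apart in "the fox did not jump"; the intervening "not" costs one position of slop. A sketch of the sloppy variant (mirroring the slop-1 query in testSparsePhraseWithNoPositions) that would match:

PhraseQuery sloppy = new PhraseQuery(1, FIELD, "did", "jump"); // slop 1 absorbs the gap left by "not"
assertEquals(1, indexSearcher.search(sloppy, 1).totalHits);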
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.
The class HighlighterTest (which defines the SynonymTokenizer used by these tests), method testSimpleQueryTermScorerHighlighter.
public void testSimpleQueryTermScorerHighlighter() throws Exception {
  doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
  Highlighter highlighter = new Highlighter(new QueryTermScorer(query));
  highlighter.setTextFragmenter(new SimpleFragmenter(40));
  int maxNumFragmentsRequired = 2;
  for (int i = 0; i < hits.totalHits; i++) {
    final int docId = hits.scoreDocs[i].doc;
    final Document doc = searcher.doc(docId);
    String text = doc.get(FIELD_NAME);
    TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
    String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
    if (VERBOSE) {
      System.out.println("\t" + result);
    }
  }
  // Not sure we can assert anything here - just running to check we don't
  // throw any exceptions
}
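The loop above only prints results; a hedged way to turn the smoke test into a real check (an assumption, not in the source) would be to assert inside the loop that scored fragments carry the default <B> markup:

// hypothetical hardening of the smoke test: every non-empty result should
// contain at least one highlighted term
assertTrue("expected highlighted term in: " + result,
    result.isEmpty() || result.contains("<B>"));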