Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.
From the class TestTermVectorsWriter, method testEndOffsetPositionWithCachingTokenFilter:
// LUCENE-1448
public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  try (TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", "abcd "))) {
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = new Field("field", stream, customType);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
  }
  w.close();
  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
  assertNotNull(termsEnum.next());
  PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
  assertEquals(2, termsEnum.totalTermFreq());
  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());
  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(12, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
  r.close();
  dir.close();
}
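The single stream above is fed to two Field instances, so it is consumed twice during indexing; that only works because CachingTokenFilter records the token states on the first pass and replays them on subsequent resets. A minimal, self-contained sketch of that replay behavior (the class name is hypothetical; StandardAnalyzer stands in for the test's MockAnalyzer, and the lazy first-pass caching described in the comment reflects recent Lucene versions):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CachingReplayDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    try (CachingTokenFilter cached = new CachingTokenFilter(analyzer.tokenStream("field", "abcd "))) {
      CharTermAttribute term = cached.addAttribute(CharTermAttribute.class);
      for (int pass = 1; pass <= 2; pass++) {
        // The first reset() forwards to the underlying stream and the tokens are
        // cached as they are consumed; later reset() calls just rewind the cache.
        cached.reset();
        while (cached.incrementToken()) {
          System.out.println("pass " + pass + ": " + term);
        }
        cached.end();
      }
    }
  }
}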
Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.
From the class QueryBuilder, method createFieldQuery:
/**
 * Creates a query from a token stream.
 *
 * @param source the token stream to create the query from
 * @param operator default boolean operator used for this query
 * @param field field to create queries against
 * @param quoted true if phrases should be generated when terms occur at more than one position
 * @param phraseSlop slop factor for phrase/multiphrase queries
 */
protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) {
  assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
  // Build an appropriate query based on the analysis chain.
  try (CachingTokenFilter stream = new CachingTokenFilter(source)) {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
    if (termAtt == null) {
      return null;
    }
    // phase 1: read through the stream and assess the situation:
    // counting the number of tokens/positions and marking if we have any synonyms.
    int numTokens = 0;
    int positionCount = 0;
    boolean hasSynonyms = false;
    boolean isGraph = false;
    stream.reset();
    while (stream.incrementToken()) {
      numTokens++;
      int positionIncrement = posIncAtt.getPositionIncrement();
      if (positionIncrement != 0) {
        positionCount += positionIncrement;
      } else {
        hasSynonyms = true;
      }
      int positionLength = posLenAtt.getPositionLength();
      if (enableGraphQueries && positionLength > 1) {
        isGraph = true;
      }
    }
    if (numTokens == 0) {
      return null;
    } else if (numTokens == 1) {
      // single term
      return analyzeTerm(field, stream);
    } else if (isGraph) {
      // graph
      if (quoted) {
        return analyzeGraphPhrase(stream, field, phraseSlop);
      } else {
        return analyzeGraphBoolean(field, stream, operator);
      }
    } else if (quoted && positionCount > 1) {
      // phrase
      if (hasSynonyms) {
        // complex phrase with synonyms
        return analyzeMultiPhrase(field, stream, phraseSlop);
      } else {
        // simple phrase
        return analyzePhrase(field, stream, phraseSlop);
      }
    } else {
      // boolean
      if (positionCount == 1) {
        // only one position, with synonyms
        return analyzeBoolean(field, stream);
      } else {
        // complex case: multiple positions
        return analyzeMultiBoolean(field, stream, operator);
      }
    }
  } catch (IOException e) {
    throw new RuntimeException("Error analyzing query text", e);
  }
}
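createFieldQuery is protected; client code normally reaches it through QueryBuilder's public entry points, which supply the quoted flag and slop. A hedged usage sketch (the analyzer, field name, and query text are illustrative, not taken from the source above):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

public class QueryBuilderDemo {
  public static void main(String[] args) {
    QueryBuilder builder = new QueryBuilder(new StandardAnalyzer());
    // quoted=false path above: analyzeTerm / analyzeBoolean / analyzeMultiBoolean
    Query bool = builder.createBooleanQuery("body", "fast wifi router", BooleanClause.Occur.MUST);
    // quoted=true path above: analyzePhrase / analyzeMultiPhrase, here with slop 2
    Query phrase = builder.createPhraseQuery("body", "fast wifi router", 2);
    System.out.println(bool + "\n" + phrase);
  }
}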
Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.
From the class WeightedSpanTermExtractor, method getLeafContext:
protected LeafReaderContext getLeafContext() throws IOException {
  if (internalReader == null) {
    boolean cacheIt = wrapToCaching && !(tokenStream instanceof CachingTokenFilter);
    // If it's from term vectors, simply wrap the underlying Terms in a reader
    if (tokenStream instanceof TokenStreamFromTermVector) {
      cacheIt = false;
      Terms termVectorTerms = ((TokenStreamFromTermVector) tokenStream).getTermVectorTerms();
      if (termVectorTerms.hasPositions() && termVectorTerms.hasOffsets()) {
        internalReader = new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
      }
    }
    // Use MemoryIndex (index/invert this tokenStream now)
    if (internalReader == null) {
      // offsets and payloads
      final MemoryIndex indexer = new MemoryIndex(true, usePayloads);
      if (cacheIt) {
        assert !cachedTokenStream;
        tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
        cachedTokenStream = true;
        indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
      } else {
        indexer.addField(DelegatingLeafReader.FIELD_NAME, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
      }
      final IndexSearcher searcher = indexer.createSearcher();
      // MEM index has only atomic ctx
      internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
    }
    // Now wrap it so we always use a common field.
    this.internalReader = new DelegatingLeafReader(internalReader);
  }
  return internalReader.getContext();
}
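The cacheIt branch matters because MemoryIndex consumes the token stream while inverting it, yet the highlighter still needs to read the same stream afterwards; wrapping it in CachingTokenFilter first makes that second read possible. Stripped of the highlighter plumbing, the MemoryIndex pattern used above looks roughly like this (the class, field name, and text are hypothetical):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;

public class MemoryIndexDemo {
  public static void main(String[] args) {
    Analyzer analyzer = new StandardAnalyzer();
    // store offsets, no payloads -- mirrors new MemoryIndex(true, usePayloads) above
    MemoryIndex indexer = new MemoryIndex(true, false);
    indexer.addField("f", analyzer.tokenStream("f", "some text to invert"));
    IndexSearcher searcher = indexer.createSearcher();
    // a MemoryIndex always has exactly one leaf, hence the direct cast
    LeafReader leaf = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
    System.out.println("docs: " + leaf.numDocs());
  }
}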
Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.
From the class AnalyzerQueryNodeProcessor, method postProcessNode:
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
  if (node instanceof TextableQueryNode
      && !(node instanceof WildcardQueryNode)
      && !(node instanceof FuzzyQueryNode)
      && !(node instanceof RegexpQueryNode)
      && !(node.getParent() instanceof RangeQueryNode)) {
    FieldQueryNode fieldNode = ((FieldQueryNode) node);
    String text = fieldNode.getTextAsString();
    String field = fieldNode.getFieldAsString();
    CachingTokenFilter buffer = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    try {
      try (TokenStream source = this.analyzer.tokenStream(field, text)) {
        buffer = new CachingTokenFilter(source);
        buffer.reset();
        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
          posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }
        try {
          while (buffer.incrementToken()) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
          }
        } catch (IOException e) {
          // ignore
        }
        // rewind the buffer stream; will never throw on subsequent reset calls
        buffer.reset();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }
      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();
      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();
        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }
        fieldNode.setText(term);
        return fieldNode;
      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            LinkedList<QueryNode> children = new LinkedList<>();
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              children.add(new FieldQueryNode(field, term, -1, -1));
            }
            return new GroupQueryNode(new SynonymQueryNode(children));
          } else {
            // multiple positions
            QueryNode q = new BooleanQueryNode(Collections.<QueryNode>emptyList());
            QueryNode currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQueryNode)) {
                  QueryNode t = currentQuery;
                  currentQuery = new SynonymQueryNode(Collections.<QueryNode>emptyList());
                  ((BooleanQueryNode) currentQuery).add(t);
                }
                ((BooleanQueryNode) currentQuery).add(new FieldQueryNode(field, term, -1, -1));
              } else {
                if (currentQuery != null) {
                  if (this.defaultOperator == Operator.OR) {
                    q.add(currentQuery);
                  } else {
                    q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                  }
                }
                currentQuery = new FieldQueryNode(field, term, -1, -1);
              }
            }
            if (this.defaultOperator == Operator.OR) {
              q.add(currentQuery);
            } else {
              q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
            }
            if (q instanceof BooleanQueryNode) {
              q = new GroupQueryNode(q);
            }
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
          List<FieldQueryNode> multiTerms = new ArrayList<>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }
            if (positionIncrement > 0 && multiTerms.size() > 0) {
              for (FieldQueryNode termNode : multiTerms) {
                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }
                mpq.add(termNode);
              }
              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));
          }
          for (FieldQueryNode termNode : multiTerms) {
            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);
            } else {
              termNode.setPositionIncrement(termGroupCount);
            }
            mpq.add(termNode);
          }
          return mpq;
        }
      } else {
        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
        int position = -1;
        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;
          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }
          FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
          if (this.positionIncrementsEnabled) {
            position += positionIncrement;
            newFieldNode.setPositionIncrement(position);
          } else {
            newFieldNode.setPositionIncrement(i);
          }
          pq.add(newFieldNode);
        }
        return pq;
      }
    } finally {
      if (buffer != null) {
        try {
          buffer.close();
        } catch (IOException e) {
          // safe to ignore
        }
      }
    }
  }
  return node;
}
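AnalyzerQueryNodeProcessor runs as one stage in the flexible query parser's processor pipeline, so postProcessNode is rarely invoked directly. A hedged end-to-end sketch of how it is typically reached (the field name and query text are illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.Query;

public class FlexibleParserDemo {
  public static void main(String[] args) throws QueryNodeException {
    StandardQueryParser parser = new StandardQueryParser(new StandardAnalyzer());
    // The analyzer chain (and thus postProcessNode above) decides whether this
    // comes out as a term, boolean, synonym, or phrase query.
    Query q = parser.parse("fast wifi router", "body");
    System.out.println(q);
  }
}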
Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.
From the class SynonymTokenizer, method testPayloadQuery:
/** We can highlight based on payloads. It's supported both via term vectors and MemoryIndex since Lucene 5. */
public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
  // "words" at positions 1 & 4
  final String text = "random words and words";
  // sets payload to "pos: X" (where X is position #)
  Analyzer analyzer = new MockPayloadAnalyzer();
  try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
    writer.deleteAll();
    Document doc = new Document();
    doc.add(new Field(FIELD_NAME, text, fieldType));
    writer.addDocument(doc);
    writer.commit();
  }
  try (IndexReader reader = DirectoryReader.open(dir)) {
    Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")), // just match the first "words" occurrence
        Collections.singletonList(new BytesRef("pos: 1")));
    IndexSearcher searcher = newSearcher(reader);
    QueryScorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
    scorer.setUsePayloads(true);
    Highlighter h = new Highlighter(scorer);
    TopDocs hits = searcher.search(query, 10);
    assertEquals(1, hits.scoreDocs.length);
    TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
    if (random().nextBoolean()) {
      // conceals detection of TokenStreamFromTermVector
      stream = new CachingTokenFilter(stream);
    }
    String result = h.getBestFragment(stream, text);
    // only highlight the first "words"
    assertEquals("random <B>words</B> and words", result);
  }
}
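Note the coin-flip CachingTokenFilter wrap near the end: as the inline comment says, it conceals the TokenStreamFromTermVector type from the instanceof check in WeightedSpanTermExtractor.getLeafContext shown earlier, so the test randomly exercises both the term-vector reader path and the MemoryIndex path.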