Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class AnalysisRequestHandlerBase, method analyzeTokenStream.
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    // TODO should we capture?
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
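For reference, the same reset/incrementToken/end/close lifecycle can be exercised outside Solr. A minimal, self-contained sketch, assuming a stock WhitespaceAnalyzer with an illustrative field name and text (none of which come from the excerpt above):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class TokenStreamWalk {
  public static void main(String[] args) throws IOException {
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
         TokenStream stream = analyzer.tokenStream("field", "quick brown fox")) {
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
      stream.reset();                                  // mandatory before the first incrementToken()
      int position = 0;
      while (stream.incrementToken()) {
        position += posIncrAtt.getPositionIncrement(); // accumulate absolute position, as above
        System.out.println(position + ": " + termAtt);
      }
      stream.end();                                    // records end-of-stream state
    }
  }
}

Skipping reset() before the first incrementToken() typically fails with an IllegalStateException in modern Lucene, which is why every consumer in this listing calls it first.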
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class TestStopAnalyzer, method testStopListPositions.
public void testStopListPositions() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  String s = "This is a good test of the english stop analyzer with positions";
  int[] expectedIncr = { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
  try (TokenStream stream = newStop.tokenStream("test", s)) {
    assertNotNull(stream);
    int i = 0;
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));
      assertEquals(expectedIncr[i++], posIncrAtt.getPositionIncrement());
    }
    stream.end();
  }
  newStop.close();
}
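The expected increments encode the gaps the stop filter leaves behind: "good" and "test" are dropped after "a", so "of" arrives with an increment of 3, and "analyzer" is dropped before "with", giving an increment of 2. A minimal, self-contained sketch with the same stop set as the test (Arrays.asList stands in for the test framework's asSet helper) that prints each surviving term with its absolute position:

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class StopPositions {
  public static void main(String[] args) throws Exception {
    CharArraySet stops = new CharArraySet(Arrays.asList("good", "test", "analyzer"), false);
    try (StopAnalyzer analyzer = new StopAnalyzer(stops);
         TokenStream stream = analyzer.tokenStream("f",
             "This is a good test of the english stop analyzer with positions")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
      stream.reset();
      int pos = -1;                          // first token with increment 1 lands on position 0
      while (stream.incrementToken()) {
        pos += posIncr.getPositionIncrement();
        System.out.println(pos + " -> " + term);  // gaps appear where stop words were removed
      }
      stream.end();
    }
  }
}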
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class QueryBuilder, method analyzeMultiBoolean.
/**
 * Creates a complex boolean query from the cached TokenStream contents.
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<Term> currentQuery = new ArrayList<>();
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      add(q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new Term(field, termAtt.getBytesRef()));
  }
  add(q, currentQuery, operator);
  return q.build();
}
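The increment check is what makes this query "multi": tokens that arrive with a position increment of 0 are synonyms stacked on the previous token, so they accumulate in currentQuery and are emitted together as one clause; the trailing add() flushes the final stack. A stripped-down sketch of the same grouping logic over plain arrays, where the terms and increments are invented purely for illustration:

import java.util.ArrayList;
import java.util.List;

public class SynonymGrouping {
  public static void main(String[] args) {
    String[] terms = { "fast", "quick", "fox" };  // "quick" is a synonym of "fast"
    int[]    incr  = { 1,      0,       1 };      // increment 0 => same position as previous token
    List<List<String>> positions = new ArrayList<>();
    List<String> current = new ArrayList<>();
    for (int i = 0; i < terms.length; i++) {
      if (incr[i] != 0 && !current.isEmpty()) {   // new position: flush the synonym stack
        positions.add(current);
        current = new ArrayList<>();
      }
      current.add(terms[i]);
    }
    if (!current.isEmpty()) {
      positions.add(current);                     // final flush, like the add() after the loop
    }
    System.out.println(positions);                // [[fast, quick], [fox]]
  }
}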
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class QueryBuilder, method createFieldQuery.
/**
 * Creates a query from a token stream.
 *
 * @param source the token stream to create the query from
 * @param operator default boolean operator used for this query
 * @param field field to create queries against
 * @param quoted true if phrases should be generated when terms occur at more than one position
 * @param phraseSlop slop factor for phrase/multiphrase queries
 */
protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) {
  assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
  // Build an appropriate query based on the analysis chain.
  try (CachingTokenFilter stream = new CachingTokenFilter(source)) {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
    if (termAtt == null) {
      return null;
    }
    // phase 1: read through the stream and assess the situation:
    // counting the number of tokens/positions and marking if we have any synonyms.
    int numTokens = 0;
    int positionCount = 0;
    boolean hasSynonyms = false;
    boolean isGraph = false;
    stream.reset();
    while (stream.incrementToken()) {
      numTokens++;
      int positionIncrement = posIncAtt.getPositionIncrement();
      if (positionIncrement != 0) {
        positionCount += positionIncrement;
      } else {
        hasSynonyms = true;
      }
      int positionLength = posLenAtt.getPositionLength();
      if (enableGraphQueries && positionLength > 1) {
        isGraph = true;
      }
    }
    if (numTokens == 0) {
      return null;
    } else if (numTokens == 1) {
      // single term
      return analyzeTerm(field, stream);
    } else if (isGraph) {
      // graph
      if (quoted) {
        return analyzeGraphPhrase(stream, field, phraseSlop);
      } else {
        return analyzeGraphBoolean(field, stream, operator);
      }
    } else if (quoted && positionCount > 1) {
      // phrase
      if (hasSynonyms) {
        // complex phrase with synonyms
        return analyzeMultiPhrase(field, stream, phraseSlop);
      } else {
        // simple phrase
        return analyzePhrase(field, stream, phraseSlop);
      }
    } else {
      // boolean
      if (positionCount == 1) {
        // only one position, with synonyms
        return analyzeBoolean(field, stream);
      } else {
        // complex case: multiple positions
        return analyzeMultiBoolean(field, stream, operator);
      }
    }
  } catch (IOException e) {
    throw new RuntimeException("Error analyzing query text", e);
  }
}
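createFieldQuery is the dispatcher behind QueryBuilder's public entry points, which choose the quoted flag: createBooleanQuery passes quoted=false and createPhraseQuery passes quoted=true. A short sketch exercising both paths; the field name, text, and printed forms are illustrative, and StandardAnalyzer is assumed only for the demo:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

public class QueryBuilderDemo {
  public static void main(String[] args) throws Exception {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
      QueryBuilder builder = new QueryBuilder(analyzer);
      // boolean path: quoted=false inside createFieldQuery
      Query bool = builder.createBooleanQuery("body", "quick brown fox", BooleanClause.Occur.MUST);
      // phrase path: quoted=true, with a slop of 2
      Query phrase = builder.createPhraseQuery("body", "quick brown fox", 2);
      System.out.println(bool);    // e.g. +body:quick +body:brown +body:fox
      System.out.println(phrase);  // e.g. body:"quick brown fox"~2
    }
  }
}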
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class TestIndexWriterExceptions, method testTooManyTokens.
// kind of slow, but omits positions, so just CPU
@Nightly
public void testTooManyTokens() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  doc.add(new Field("foo", new TokenStream() {
    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    long num = 0;

    @Override
    public boolean incrementToken() throws IOException {
      if (num == Integer.MAX_VALUE + 1L) {  // long arithmetic; plain int would overflow
        return false;
      }
      clearAttributes();
      if (num == 0) {
        posIncAtt.setPositionIncrement(1);
      } else {
        posIncAtt.setPositionIncrement(0);
      }
      termAtt.append("a");
      num++;
      if (VERBOSE && num % 1000000 == 0) {
        System.out.println("indexed: " + num);
      }
      return true;
    }
  }, ft));
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    iw.addDocument(doc);
  });
  assertTrue(expected.getMessage().contains("too many tokens"));
  iw.close();
  dir.close();
}
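The test drives the field's token count past Integer.MAX_VALUE, which is what trips the "too many tokens" check. A related limit involves PositionIncrementAttribute directly: absolute positions may not exceed IndexWriter.MAX_POSITION, and a field indexed with positions (unlike the DOCS_AND_FREQS field above) is rejected when an increment pushes past it. A hedged sketch of that behavior — RAMDirectory matches this era of Lucene (it is deprecated in later releases), and the exact exception message varies by version:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;

public class MaxPositionDemo {
  public static void main(String[] args) throws Exception {
    try (RAMDirectory dir = new RAMDirectory();
         IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(null))) {
      Document doc = new Document();
      doc.add(new Field("foo", new TokenStream() {
        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
        boolean emitted = false;

        @Override
        public boolean incrementToken() {
          if (emitted) {
            return false;
          }
          clearAttributes();
          termAtt.append("a");
          // a single huge increment lands past IndexWriter.MAX_POSITION
          posIncAtt.setPositionIncrement(Integer.MAX_VALUE);
          emitted = true;
          return true;
        }
      }, TextField.TYPE_NOT_STORED));  // TextField indexes positions, so the cap applies
      try {
        iw.addDocument(doc);
      } catch (IllegalArgumentException e) {
        System.out.println("rejected: " + e.getMessage());
      }
    }
  }
}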