Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-skos by behas.
The class SKOSQueryNodeProcessor, method postProcessNode:
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
  if (node instanceof TextableQueryNode
      && !(node instanceof WildcardQueryNode)
      && !(node instanceof FuzzyQueryNode)
      && !(node instanceof RegexpQueryNode)
      && !(node.getParent() instanceof RangeQueryNode)) {
    FieldQueryNode fieldNode = ((FieldQueryNode) node);
    String text = fieldNode.getTextAsString();
    String field = fieldNode.getFieldAsString();
    CachingTokenFilter buffer = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    try {
      try (TokenStream source = this.analyzer.tokenStream(field, text)) {
        buffer = new CachingTokenFilter(source);
        buffer.reset();
        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
          posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }
        try {
          while (buffer.incrementToken()) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
          }
        } catch (IOException e) {
          // ignore
        }
        // rewind the buffer stream; a CachingTokenFilter
        // will never throw on subsequent reset calls
        buffer.reset();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }
      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();
      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext = buffer.incrementToken();
          assert hasNext;
          term = termAtt.toString();
        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }
        fieldNode.setText(term);
        return fieldNode;
      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            LinkedList<QueryNode> children = new LinkedList<>();
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext;
                term = termAtt.toString();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (buffer.hasAttribute(SKOSTypeAttribute.class) && boosts != null) {
                SKOSTypeAttribute skosAttr = buffer.getAttribute(SKOSTypeAttribute.class);
                children.add(new BoostQueryNode(new FieldQueryNode(field, term, -1, -1), getBoost(skosAttr.getSkosType())));
              } else {
                children.add(new FieldQueryNode(field, term, -1, -1));
              }
            }
            return new GroupQueryNode(new StandardBooleanQueryNode(children, positionCount == 1));
          } else {
            // multiple positions
            QueryNode q = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), false);
            QueryNode currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext;
                term = termAtt.toString();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQueryNode)) {
                  QueryNode t = currentQuery;
                  currentQuery = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), true);
                  ((BooleanQueryNode) currentQuery).add(t);
                }
                ((BooleanQueryNode) currentQuery).add(new FieldQueryNode(field, term, -1, -1));
              } else {
                if (currentQuery != null) {
                  if (this.defaultOperator == Operator.OR) {
                    q.add(currentQuery);
                  } else {
                    q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                  }
                }
                currentQuery = new FieldQueryNode(field, term, -1, -1);
              }
            }
            if (this.defaultOperator == Operator.OR) {
              q.add(currentQuery);
            } else {
              q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
            }
            if (q instanceof BooleanQueryNode) {
              q = new GroupQueryNode(q);
            }
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
          List<FieldQueryNode> multiTerms = new ArrayList<>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }
            if (positionIncrement > 0 && multiTerms.size() > 0) {
              for (FieldQueryNode termNode : multiTerms) {
                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }
                mpq.add(termNode);
              }
              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));
          }
          for (FieldQueryNode termNode : multiTerms) {
            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);
            } else {
              termNode.setPositionIncrement(termGroupCount);
            }
            mpq.add(termNode);
          }
          return mpq;
        }
      } else {
        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
        int position = -1;
        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;
          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext;
            term = termAtt.toString();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }
          FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
          if (this.positionIncrementsEnabled) {
            position += positionIncrement;
            newFieldNode.setPositionIncrement(position);
          } else {
            newFieldNode.setPositionIncrement(i);
          }
          pq.add(newFieldNode);
        }
        return pq;
      }
    } finally {
      if (buffer != null) {
        try {
          buffer.close();
        } catch (IOException e) {
          // safe to ignore
        }
      }
    }
  }
  return node;
}
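All of the branching above keys off one invariant of PositionIncrementAttribute: an increment of 0 stacks a token on the previous position (which is how SKOS synonym expansion appears in the stream), while an increment greater than 0 opens a new position. Below is a minimal, self-contained sketch of that counting pass; the class name is hypothetical, and a plain WhitespaceAnalyzer is assumed, so stacked never becomes true here since it emits no synonyms.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical sketch class, not part of lucene-skos.
public class PositionCountSketch {
  public static void main(String[] args) throws IOException {
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
         TokenStream ts = analyzer.tokenStream("field", "quick brown fox")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      int numTokens = 0;
      int positionCount = 0;
      boolean stacked = false;
      ts.reset();
      while (ts.incrementToken()) {
        numTokens++;
        int inc = posIncAtt.getPositionIncrement();
        if (inc != 0) {
          positionCount += inc; // token opens a new position
        } else {
          stacked = true;       // token shares the previous position (synonym-style)
        }
        System.out.println(termAtt + " +" + inc);
      }
      ts.end();
      System.out.println(numTokens + " tokens, " + positionCount + " positions, stacked=" + stacked);
    }
  }
}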
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project zm-mailbox by Zimbra.
The class UniversalAnalyzerTest, method testCJK:
private void testCJK(String src) throws IOException {
  TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
  CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
  OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);
  TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
  CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
  OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
  // The TokenStream contract requires reset() before the first incrementToken().
  cjk.reset();
  uni.reset();
  while (true) {
    boolean result = cjk.incrementToken();
    Assert.assertEquals(result, uni.incrementToken());
    if (!result) {
      break;
    }
    // The current term doubles as the assertion message, to identify failures.
    String term = cjkTermAttr.toString();
    Assert.assertEquals(cjkTermAttr, uniTermAttr);
    if (assertOffset) {
      Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
    }
    Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
  }
  cjk.close();
  uni.close();
}
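When a test only needs to check a single stream against expected terms and increments, rather than comparing two analyzers token by token, Lucene's test framework already bundles this loop. A hedged sketch, assuming the lucene-test-framework dependency and its BaseTokenStreamTestCase helper are available:

// assertTokenStreamContents resets the stream, walks it, and verifies
// terms and position increments in one call.
TokenStream ts = new WhitespaceAnalyzer().tokenStream("f", "hello world");
BaseTokenStreamTestCase.assertTokenStreamContents(
    ts, new String[] { "hello", "world" }, new int[] { 1, 1 });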
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class SimpleQueryConverter, method convert:
@Override
public Collection<Token> convert(String origQuery) {
  Collection<Token> result = new HashSet<>();
  WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
  try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      Token tok = new Token();
      tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      tok.setFlags(flagsAtt.getFlags());
      tok.setPayload(payloadAtt.getPayload());
      tok.setPositionIncrement(posIncAtt.getPositionIncrement());
      tok.setType(typeAtt.type());
      result.add(tok);
    }
    ts.end();
    return result;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
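A possible call site for this converter (the input string is illustrative); each returned Token carries the position increment that was copied from the attribute inside the loop above:

SimpleQueryConverter converter = new SimpleQueryConverter();
for (Token tok : converter.convert("wi fi network")) {
  // WhitespaceAnalyzer advances one position per token, so posInc is 1 here.
  System.out.println(tok + " posInc=" + tok.getPositionIncrement());
}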
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class TokenStreamToAutomaton, method toAutomaton:
/** Pulls the graph (including {@link
 *  PositionLengthAttribute}) from the provided {@link
 *  TokenStream}, and creates the corresponding
 *  automaton where arcs are bytes (or Unicode code points
 *  if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
  final Automaton.Builder builder = new Automaton.Builder();
  builder.createState();
  final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
  in.reset();
  // Only temporarily holds states ahead of our current
  // position:
  final RollingBuffer<Position> positions = new Positions();
  int pos = -1;
  int freedPos = 0;
  Position posData = null;
  int maxOffset = 0;
  while (in.incrementToken()) {
    int posInc = posIncAtt.getPositionIncrement();
    if (preservePositionIncrements == false && posInc > 1) {
      posInc = 1;
    }
    assert pos > -1 || posInc > 0;
    if (posInc > 0) {
      // New node:
      pos += posInc;
      posData = positions.get(pos);
      assert posData.leaving == -1;
      if (posData.arriving == -1) {
        // No token ever arrived to this position
        if (pos == 0) {
          // OK: this is the first token
          posData.leaving = 0;
        } else {
          // This means there's a hole (eg, StopFilter
          // does this):
          posData.leaving = builder.createState();
          addHoles(builder, positions, pos);
        }
      } else {
        posData.leaving = builder.createState();
        builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
        if (posInc > 1) {
          // A token spanned over a hole; add holes
          // "under" it:
          addHoles(builder, positions, pos);
        }
      }
      while (freedPos <= pos) {
        Position freePosData = positions.get(freedPos);
        // don't free this position yet if we may still need to fill holes over it:
        if (freePosData.arriving == -1 || freePosData.leaving == -1) {
          break;
        }
        positions.freeBefore(freedPos);
        freedPos++;
      }
    }
    final int endPos = pos + posLengthAtt.getPositionLength();
    final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
    int[] termUnicode = null;
    final Position endPosData = positions.get(endPos);
    if (endPosData.arriving == -1) {
      endPosData.arriving = builder.createState();
    }
    int termLen;
    if (unicodeArcs) {
      final String utf16 = termUTF8.utf8ToString();
      termUnicode = new int[utf16.codePointCount(0, utf16.length())];
      termLen = termUnicode.length;
      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
        termUnicode[j++] = cp = utf16.codePointAt(i);
      }
    } else {
      termLen = termUTF8.length;
    }
    int state = posData.leaving;
    for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
      final int nextState = byteIDX == termLen - 1 ? endPosData.arriving : builder.createState();
      int c;
      if (unicodeArcs) {
        c = termUnicode[byteIDX];
      } else {
        c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
      }
      builder.addTransition(state, nextState, c);
      state = nextState;
    }
    maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
  }
  in.end();
  int endState = -1;
  int endPosInc = posIncAtt.getPositionIncrement();
  if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
    endPosInc = 1;
  }
  if (endPosInc > 0) {
    // there were hole(s) after the last token
    endState = builder.createState();
    // add trailing holes now:
    int lastState = endState;
    while (true) {
      int state1 = builder.createState();
      builder.addTransition(lastState, state1, HOLE);
      endPosInc--;
      if (endPosInc == 0) {
        builder.setAccept(state1, true);
        break;
      }
      int state2 = builder.createState();
      builder.addTransition(state1, state2, POS_SEP);
      lastState = state2;
    }
  } else {
    endState = -1;
  }
  pos++;
  while (pos <= positions.getMaxPos()) {
    posData = positions.get(pos);
    if (posData.arriving != -1) {
      if (endState != -1) {
        builder.addTransition(posData.arriving, endState, POS_SEP);
      } else {
        builder.setAccept(posData.arriving, true);
      }
    }
    pos++;
  }
  return builder.finish();
}
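A sketch of driving this method from an analyzer; StandardAnalyzer and the input string are assumptions, while setPreservePositionIncrements and setUnicodeArcs are the switches whose fields the method body consults above:

try (Analyzer analyzer = new StandardAnalyzer();
     TokenStream ts = analyzer.tokenStream("f", "the quick fox")) {
  TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
  tsta.setPreservePositionIncrements(true); // keep holes such as those left by StopFilter
  tsta.setUnicodeArcs(false);               // label arcs with UTF-8 bytes, not code points
  Automaton a = tsta.toAutomaton(ts);       // reset()/end() are handled inside
  System.out.println(a.getNumStates() + " states");
}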
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class CompletionTokenStreamTest, method testValidNumberOfExpansions:
@Test
public void testValidNumberOfExpansions() throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  for (int i = 0; i < 256; i++) {
    builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
  }
  StringBuilder valueBuilder = new StringBuilder();
  for (int i = 0; i < 8; i++) {
    valueBuilder.append(i + 1);
    valueBuilder.append(" ");
  }
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(valueBuilder.toString()));
  SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
  CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter);
  completionTokenStream.setPayload(new BytesRef());
  PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
  stream.reset();
  CompletionTokenStream.BytesRefBuilderTermAttribute attr = stream.addAttribute(CompletionTokenStream.BytesRefBuilderTermAttribute.class);
  PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
  int maxPos = 0;
  int count = 0;
  while (stream.incrementToken()) {
    count++;
    assertNotNull(attr.getBytesRef());
    assertTrue(attr.getBytesRef().length > 0);
    maxPos += posAttr.getPositionIncrement();
  }
  stream.close();
  assertEquals(count, 256);
  assertEquals(count, maxPos);
}
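The expected numbers follow directly from the setup: the whitespace-tokenized input holds 8 tokens ("1" through "8"), the synonym map gives each of them exactly one single-token synonym, and the completion stream enumerates every path through the resulting graph, i.e. 2^8 = 256 expansions. Each emitted path advances the position by exactly one, which is what the final assertEquals(count, maxPos) verifies.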