Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class TokenStreamToAutomaton, method toAutomaton.
/** Pulls the graph (including {@link
* PositionLengthAttribute}) from the provided {@link
* TokenStream}, and creates the corresponding
* automaton where arcs are bytes (or Unicode code points
* if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
final Automaton.Builder builder = new Automaton.Builder();
builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
in.reset();
// Only temporarily holds states ahead of our current
// position:
final RollingBuffer<Position> positions = new Positions();
int pos = -1;
int freedPos = 0;
Position posData = null;
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
if (preservePositionIncrements == false && posInc > 1) {
posInc = 1;
}
assert pos > -1 || posInc > 0;
if (posInc > 0) {
// New node:
pos += posInc;
posData = positions.get(pos);
assert posData.leaving == -1;
if (posData.arriving == -1) {
// No token ever arrived to this position
if (pos == 0) {
// OK: this is the first token
posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
posData.leaving = builder.createState();
addHoles(builder, positions, pos);
}
} else {
posData.leaving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
addHoles(builder, positions, pos);
}
}
while (freedPos <= pos) {
Position freePosData = positions.get(freedPos);
// don't free this position yet if we may still need to fill holes over it:
if (freePosData.arriving == -1 || freePosData.leaving == -1) {
break;
}
positions.freeBefore(freedPos);
freedPos++;
}
}
final int endPos = pos + posLengthAtt.getPositionLength();
final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
if (endPosData.arriving == -1) {
endPosData.arriving = builder.createState();
}
int termLen;
if (unicodeArcs) {
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
}
} else {
termLen = termUTF8.length;
}
int state = posData.leaving;
for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
final int nextState = byteIDX == termLen - 1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
builder.addTransition(state, nextState, c);
state = nextState;
}
maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
}
in.end();
int endState = -1;
int endPosInc = posIncAtt.getPositionIncrement();
if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
endPosInc = 1;
}
if (endPosInc > 0) {
// there were hole(s) after the last token
endState = builder.createState();
// add trailing holes now:
int lastState = endState;
while (true) {
int state1 = builder.createState();
builder.addTransition(lastState, state1, HOLE);
endPosInc--;
if (endPosInc == 0) {
builder.setAccept(state1, true);
break;
}
int state2 = builder.createState();
builder.addTransition(state1, state2, POS_SEP);
lastState = state2;
}
}
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
if (posData.arriving != -1) {
if (endState != -1) {
builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
builder.setAccept(posData.arriving, true);
}
}
pos++;
}
return builder.finish();
}
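The method resets and fully consumes the stream itself, so callers hand it an unconsumed TokenStream. A minimal sketch of driving it end to end, assuming StandardAnalyzer is available and using a hypothetical field name and sample text (arcs in the resulting automaton are term bytes, with the special labels POS_SEP and HOLE separating positions and marking gaps):
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.automaton.Automaton;

public class TokenStreamAutomatonDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer();
         // "body" and the sample text are illustrative assumptions:
         TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
      TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
      // Keep holes (e.g. left by StopFilter) rather than collapsing them:
      tsta.setPreservePositionIncrements(true);
      Automaton a = tsta.toAutomaton(ts); // calls reset()/incrementToken()/end() internally
      System.out.println("states=" + a.getNumStates());
    }
  }
}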
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class SynonymTokenizer, method getTS2.
protected TokenStream getTS2() {
// String s = "Hi-Speed10 foo";
return new TokenStream() {
Iterator<Token> iter;
List<Token> lst;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
{
lst = new ArrayList<>();
Token t;
t = createToken("hi", 0, 2);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("hispeed", 0, 8);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("speed", 3, 8);
t.setPositionIncrement(0);
lst.add(t);
t = createToken("10", 8, 10);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("foo", 11, 14);
t.setPositionIncrement(1);
lst.add(t);
iter = lst.iterator();
}
@Override
public boolean incrementToken() {
if (iter.hasNext()) {
Token token = iter.next();
clearAttributes();
termAtt.setEmpty().append(token);
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
iter = lst.iterator();
}
};
}
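A sketch of consuming this stream through the standard TokenStream contract (reset, incrementToken until exhausted, end, close), written as a hypothetical helper inside the same test class. Because "speed" has a position increment of 0, it lands at the same position as "hispeed", which is exactly the synonym-style stacking the highlighter tests rely on:
void dumpTS2() throws IOException {
  TokenStream ts = getTS2();
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
  ts.reset();
  int pos = -1;
  while (ts.incrementToken()) {
    pos += posIncr.getPositionIncrement();
    // Prints, in order: hi@0 [0,2), hispeed@1 [0,8), speed@1 [3,8), 10@2 [8,10), foo@3 [11,14)
    System.out.println(term + "@" + pos + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
  }
  ts.end();
  ts.close();
}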
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class SynonymTokenizer, method getTS2a.
// Same token stream as getTS2, but the bigger token ("hispeed") comes first this time.
protected TokenStream getTS2a() {
// String s = "Hi-Speed10 foo";
return new TokenStream() {
Iterator<Token> iter;
List<Token> lst;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
{
lst = new ArrayList<>();
Token t;
t = createToken("hispeed", 0, 8);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("hi", 0, 2);
t.setPositionIncrement(0);
lst.add(t);
t = createToken("speed", 3, 8);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("10", 8, 10);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("foo", 11, 14);
t.setPositionIncrement(1);
lst.add(t);
iter = lst.iterator();
}
@Override
public boolean incrementToken() {
if (iter.hasNext()) {
Token token = iter.next();
clearAttributes();
termAtt.setEmpty().append(token);
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
iter = lst.iterator();
}
};
}
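Both orderings describe the same token graph; only the stream order and the increments differ. A hedged sketch of asserting the exact contents, assuming the surrounding test class extends BaseTokenStreamTestCase so its static assertTokenStreamContents helper is in scope:
// Terms, start offsets, end offsets, and position increments, in stream order:
assertTokenStreamContents(getTS2a(),
    new String[] { "hispeed", "hi", "speed", "10", "foo" },
    new int[]    { 0,          0,    3,       8,    11 },
    new int[]    { 8,          2,    8,       10,   14 },
    new int[]    { 1,          0,    1,       1,    1 });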
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class NGramTokenizerTest, method testNGrams.
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
// convert the string to code points
final int[] codePoints = toCodePoints(s);
final int[] offsets = new int[codePoints.length + 1];
for (int i = 0; i < codePoints.length; ++i) {
offsets[i + 1] = offsets[i] + Character.charCount(codePoints[i]);
}
final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
@Override
protected boolean isTokenChar(int chr) {
return nonTokenChars.indexOf(chr) < 0;
}
};
grams.setReader(new StringReader(s));
final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
grams.reset();
for (int start = 0; start < codePoints.length; ++start) {
nextGram: for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
// not on an edge
continue nextGram;
}
for (int j = start; j < end; ++j) {
if (!isTokenChar(nonTokenChars, codePoints[j])) {
continue nextGram;
}
}
assertTrue(grams.incrementToken());
assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
assertEquals(1, posIncAtt.getPositionIncrement());
assertEquals(1, posLenAtt.getPositionLength());
assertEquals(offsets[start], offsetAtt.startOffset());
assertEquals(offsets[end], offsetAtt.endOffset());
}
}
assertFalse(grams.incrementToken());
grams.end();
assertEquals(s.length(), offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
}
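The helper is parameterized over the gram range, the separator set, and the edge-only mode, so a handful of calls covers the tokenizer's behavior. The argument values below are illustrative, not taken from the actual test class:
testNGrams(1, 3, "abcde", "", false);     // every 1- to 3-gram of one token
testNGrams(2, 4, "a1b2c3", "123", false); // digits are non-token chars and split tokens
testNGrams(2, 3, "abcdef", "", true);     // edgesOnly: only grams starting at a token edge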
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class TestSimplePatternTokenizer, method testEndOffset.
public void testEndOffset() throws Exception {
Tokenizer t = new SimplePatternTokenizer("a+");
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
t.setReader(new StringReader("aaabbb"));
t.reset();
assertTrue(t.incrementToken());
assertEquals("aaa", termAtt.toString());
assertFalse(t.incrementToken());
t.end();
assertEquals(6, offsetAtt.endOffset());
}
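The same contract holds when non-matching text trails the last token: after end(), endOffset() reports the length of the entire input, not the end of the last token. A sketch along the same lines as the test above, with an illustrative pattern and input:
Tokenizer t = new SimplePatternTokenizer("[a-z]+");
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
t.setReader(new StringReader("foo 123 bar!!"));
t.reset();
while (t.incrementToken()) {
  // Prints foo [0,3) and bar [8,11); "123" and "!!" never match [a-z]+
  System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
}
t.end();
System.out.println(offsetAtt.endOffset()); // 13: the full input length, past "bar" at 11
t.close();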