Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the Apache lucene-solr project.
Class TestRemoveDuplicatesTokenFilter, method testDups.
public void testDups(final String expected, final Token... tokens) throws Exception {
final Iterator<Token> toks = Arrays.asList(tokens).iterator();
final TokenStream ts = new RemoveDuplicatesTokenFilter((new TokenStream() {
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public boolean incrementToken() {
if (toks.hasNext()) {
Token tok =;
offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
return true;
} else {
return false;
assertTokenStreamContents(ts, expected.split("\\s"));
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the Apache lucene-solr project.
Class TokenStreamFromTermVector, method init.
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
short dpEnumFlags = PostingsEnum.POSITIONS;
if (vector.hasOffsets()) {
dpEnumFlags |= PostingsEnum.OFFSETS;
offsetAttribute = addAttribute(OffsetAttribute.class);
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
//must ask for offsets too
dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
// We put term data here
termCharsBuilder = new CharsRefBuilder();
//7 is over-estimate of average term len
termCharsBuilder.grow((int) (vector.size() * 7));
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
TokenLL[] positionedTokens = initTokensArray();
int lastPosition = -1;
final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
//only for UTF8->UTF16 call
CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
//int sumFreq = 0;
while ((termBytesRef = != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
final int termCharsOff = termCharsBuilder.length();
termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
// presumably checked by TokenSources.hasPositions earlier
assert dpEnum != null;
final int freq = dpEnum.freq();
//sumFreq += freq;
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
token.termCharsOff = termCharsOff;
token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
if (token.startOffset > maxStartOffset) {
//filter this token out; exceeds threshold
token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
//divide by 8
pos = token.startOffset >> 3;
if (payloadAttribute != null) {
final BytesRef payload = dpEnum.getPayload();
token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
//Add token to an array indexed by position
if (positionedTokens.length <= pos) {
//grow, but not 2x since we think our original length estimate is close
TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
positionedTokens = newPositionedTokens;
positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
lastPosition = Math.max(lastPosition, pos);
// System.out.println(String.format(
// "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
// sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
// (originalPositionEstimate/(lastPosition + 1.0f))));
// Step 2: Link all Tokens into a linked-list and set position increments as we go
int prevTokenPos = -1;
TokenLL prevToken = null;
for (int pos = 0; pos <= lastPosition; pos++) {
TokenLL token = positionedTokens[pos];
if (token == null) {
if (prevToken != null) {
assert == null;
//concatenate linked-list = token;
} else {
assert firstToken == null;
firstToken = token;
//set increments
if (vector.hasPositions()) {
token.positionIncrement = pos - prevTokenPos;
while ( != null) {
token =;
token.positionIncrement = 0;
} else {
token.positionIncrement = 1;
while ( != null) {
prevToken = token;
token =;
if (prevToken.startOffset == token.startOffset) {
token.positionIncrement = 0;
} else {
token.positionIncrement = 1;
prevTokenPos = pos;
prevToken = token;
initialized = true;
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the Apache lucene-solr project.
Class JsonPreAnalyzedParser, method toFormattedString.
public String toFormattedString(Field f) throws IOException {
Map<String, Object> map = new LinkedHashMap<>();
if (f.fieldType().stored()) {
String stringValue = f.stringValue();
if (stringValue != null) {
map.put(STRING_KEY, stringValue);
BytesRef binaryValue = f.binaryValue();
if (binaryValue != null) {
map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
TokenStream ts = f.tokenStreamValue();
if (ts != null) {
List<Map<String, Object>> tokens = new LinkedList<>();
while (ts.incrementToken()) {
Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
String cTerm = null;
String tTerm = null;
Map<String, Object> tok = new TreeMap<>();
while (it.hasNext()) {
Class<? extends Attribute> cl =;
Attribute att = ts.getAttribute(cl);
if (att == null) {
if (cl.isAssignableFrom(CharTermAttribute.class)) {
CharTermAttribute catt = (CharTermAttribute) att;
cTerm = new String(catt.buffer(), 0, catt.length());
} else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
tTerm = tatt.getBytesRef().utf8ToString();
} else {
if (cl.isAssignableFrom(FlagsAttribute.class)) {
tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
} else if (cl.isAssignableFrom(OffsetAttribute.class)) {
tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
} else if (cl.isAssignableFrom(PayloadAttribute.class)) {
BytesRef p = ((PayloadAttribute) att).getPayload();
if (p != null && p.length > 0) {
tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
} else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
} else if (cl.isAssignableFrom(TypeAttribute.class)) {
tok.put(TYPE_KEY, ((TypeAttribute) att).type());
} else {
tok.put(cl.getName(), att.toString());
String term = null;
if (cTerm != null) {
term = cTerm;
} else {
term = tTerm;
if (term != null && term.length() > 0) {
tok.put(TOKEN_KEY, term);
map.put(TOKENS_KEY, tokens);
return JSONUtil.toJSON(map, -1);
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the Apache lucene-solr project.
Class SimplePreAnalyzedParser, method createState.
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
char[] tokChars = state.token.toString().toCharArray();
termAtt.copyBuffer(tokChars, 0, tokChars.length);
int tokenStart = tokenEnd - state.token.length();
for (Entry<String, String> e : state.attr.entrySet()) {
String k = e.getKey();
if (k.equals("i")) {
// position increment
int incr = Integer.parseInt(e.getValue());
PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
} else if (k.equals("s")) {
tokenStart = Integer.parseInt(e.getValue());
} else if (k.equals("e")) {
tokenEnd = Integer.parseInt(e.getValue());
} else if (k.equals("y")) {
TypeAttribute type = a.addAttribute(TypeAttribute.class);
} else if (k.equals("f")) {
FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
int f = Integer.parseInt(e.getValue(), 16);
} else if (k.equals("p")) {
PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
byte[] data = hexToBytes(e.getValue());
if (data != null && data.length > 0) {
p.setPayload(new BytesRef(data));
} else {
// unknown attribute
// handle offset attr
OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
offset.setOffset(tokenStart, tokenEnd);
State resState = a.captureState();
return resState;
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the Apache lucene-solr project.
Class TestDuelingAnalyzers, method assertEquals.
// we only check a few core attributes here.
// TODO: test other things
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
while (left.incrementToken()) {
assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());