use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
the class TestRemoveDuplicatesTokenFilter method testDups.
public void testDups(final String expected, final Token... tokens) throws Exception {
final Iterator<Token> toks = Arrays.asList(tokens).iterator();
final TokenStream ts = new RemoveDuplicatesTokenFilter((new TokenStream() {
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
@Override
public boolean incrementToken() {
if (toks.hasNext()) {
clearAttributes();
Token tok = toks.next();
termAtt.setEmpty().append(tok);
offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
posIncAtt.setPositionIncrement(tok.getPositionIncrement());
return true;
} else {
return false;
}
}
}));
assertTokenStreamContents(ts, expected.split("\\s"));
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
the class JsonPreAnalyzedParser method toFormattedString.
@Override
public String toFormattedString(Field f) throws IOException {
Map<String, Object> map = new LinkedHashMap<>();
map.put(VERSION_KEY, VERSION);
if (f.fieldType().stored()) {
String stringValue = f.stringValue();
if (stringValue != null) {
map.put(STRING_KEY, stringValue);
}
BytesRef binaryValue = f.binaryValue();
if (binaryValue != null) {
map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
}
}
TokenStream ts = f.tokenStreamValue();
if (ts != null) {
List<Map<String, Object>> tokens = new LinkedList<>();
while (ts.incrementToken()) {
Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
String cTerm = null;
String tTerm = null;
Map<String, Object> tok = new TreeMap<>();
while (it.hasNext()) {
Class<? extends Attribute> cl = it.next();
Attribute att = ts.getAttribute(cl);
if (att == null) {
continue;
}
if (cl.isAssignableFrom(CharTermAttribute.class)) {
CharTermAttribute catt = (CharTermAttribute) att;
cTerm = new String(catt.buffer(), 0, catt.length());
} else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
tTerm = tatt.getBytesRef().utf8ToString();
} else {
if (cl.isAssignableFrom(FlagsAttribute.class)) {
tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
} else if (cl.isAssignableFrom(OffsetAttribute.class)) {
tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
} else if (cl.isAssignableFrom(PayloadAttribute.class)) {
BytesRef p = ((PayloadAttribute) att).getPayload();
if (p != null && p.length > 0) {
tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
}
} else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
} else if (cl.isAssignableFrom(TypeAttribute.class)) {
tok.put(TYPE_KEY, ((TypeAttribute) att).type());
} else {
tok.put(cl.getName(), att.toString());
}
}
}
String term = null;
if (cTerm != null) {
term = cTerm;
} else {
term = tTerm;
}
if (term != null && term.length() > 0) {
tok.put(TOKEN_KEY, term);
}
tokens.add(tok);
}
map.put(TOKENS_KEY, tokens);
}
return JSONUtil.toJSON(map, -1);
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
the class SimplePreAnalyzedParser method createState.
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
a.clearAttributes();
CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
char[] tokChars = state.token.toString().toCharArray();
termAtt.copyBuffer(tokChars, 0, tokChars.length);
int tokenStart = tokenEnd - state.token.length();
for (Entry<String, String> e : state.attr.entrySet()) {
String k = e.getKey();
if (k.equals("i")) {
// position increment
int incr = Integer.parseInt(e.getValue());
PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
posIncr.setPositionIncrement(incr);
} else if (k.equals("s")) {
tokenStart = Integer.parseInt(e.getValue());
} else if (k.equals("e")) {
tokenEnd = Integer.parseInt(e.getValue());
} else if (k.equals("y")) {
TypeAttribute type = a.addAttribute(TypeAttribute.class);
type.setType(e.getValue());
} else if (k.equals("f")) {
FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
int f = Integer.parseInt(e.getValue(), 16);
flags.setFlags(f);
} else if (k.equals("p")) {
PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
byte[] data = hexToBytes(e.getValue());
if (data != null && data.length > 0) {
p.setPayload(new BytesRef(data));
}
} else {
// unknown attribute
}
}
// handle offset attr
OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
offset.setOffset(tokenStart, tokenEnd);
State resState = a.captureState();
a.clearAttributes();
return resState;
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
the class TestDuelingAnalyzers method assertEquals.
// we only check a few core attributes here.
// TODO: test other things
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
left.reset();
right.reset();
CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
while (left.incrementToken()) {
assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
}
;
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
left.end();
right.end();
assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
left.close();
right.close();
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project textdb by TextDB.
the class DataflowUtils method generatePayload.
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
List<Span> payload = new ArrayList<>();
try {
TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
int tokenPositionCounter = -1;
tokenStream.reset();
while (tokenStream.incrementToken()) {
tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
int tokenPosition = tokenPositionCounter;
int charStart = offsetAttribute.startOffset();
int charEnd = offsetAttribute.endOffset();
String analyzedTermStr = charTermAttribute.toString();
String originalTermStr = fieldValue.substring(charStart, charEnd);
payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
}
tokenStream.close();
} catch (IOException e) {
// return empty payload
payload.clear();
}
return payload;
}
Aggregations