Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.
The class SpellCheckComponent, method getTokens:
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
  Collection<Token> result = new ArrayList<>();
  assert analyzer != null;
  try (TokenStream ts = analyzer.tokenStream("", q)) {
    ts.reset();
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // materialize the current attribute state into a standalone Token
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    ts.end();
    return result;
  }
}
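For context, a minimal sketch of the same consume loop against a concrete analyzer. This is not from SpellCheckComponent: the class name TokenDump is made up, WhitespaceAnalyzer is an illustrative stand-in, and most tokenizers leave FlagsAttribute at its default of 0.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;

public class TokenDump {
  public static void main(String[] args) throws Exception {
    try (Analyzer analyzer = new WhitespaceAnalyzer();
         TokenStream ts = analyzer.tokenStream("", "hello world")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // prints each term with its flags as a hex value
        System.out.println(termAtt + " flags=0x" + Integer.toHexString(flagsAtt.getFlags()));
      }
      ts.end();
    }
  }
}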
Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.
The class SimplePreAnalyzedParser, method toFormattedString:
@Override
public String toFormattedString(Field f) throws IOException {
  StringBuilder sb = new StringBuilder();
  sb.append(VERSION + " ");
  if (f.fieldType().stored()) {
    String s = f.stringValue();
    if (s != null) {
      // encode the equals sign, then wrap the stored value in '=' markers
      s = s.replaceAll("=", "\\\\=");
      sb.append('=');
      sb.append(s);
      sb.append('=');
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    StringBuilder tok = new StringBuilder();
    boolean next = false;
    while (ts.incrementToken()) {
      if (next) {
        sb.append(' ');
      } else {
        next = true;
      }
      tok.setLength(0);
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = escape(catt.buffer(), catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
          tTerm = escape(tTermChars, tTermChars.length);
        } else {
          if (tok.length() > 0) {
            tok.append(',');
          }
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
            } else if (tok.length() > 0) {
              // remove the last comma
              tok.setLength(tok.length() - 1);
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.append("y=" + escape(((TypeAttribute) att).type()));
          } else {
            tok.append(cl.getName() + "=" + escape(att.toString()));
          }
        }
      }
      // prefer the char term over the bytes-derived term when both are present
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        if (tok.length() > 0) {
          tok.insert(0, term + ",");
        } else {
          tok.insert(0, term);
        }
      }
      sb.append(tok);
    }
  }
  return sb.toString();
}
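Read off the code, the serialized form is: the VERSION marker and a space; an optional stored value wrapped in '=' markers (inner equals signs escaped as '\='); then one space-separated entry per token, each a term followed by comma-separated key=value pairs (f= flags in hex, s=/e= start and end offset, p= payload in hex, i= position increment, y= type). A hedged illustration, assuming VERSION is the string "1" and that the attributes happen to serialize in offset, increment, type order (the actual order follows the stream's attribute registration order):

  1 =one two= one,s=0,e=3,i=1,y=word two,s=4,e=7,i=1,y=word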
Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.
The class JsonPreAnalyzedParser, method parse:
@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
  ParseResult res = new ParseResult();
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  String val = sb.toString();
  // empty string - accept even without version number
  if (val.length() == 0) {
    return res;
  }
  Object o = ObjectBuilder.fromJSON(val);
  if (!(o instanceof Map)) {
    throw new IOException("Invalid JSON type " + o.getClass().getName() + ", expected Map");
  }
  Map<String, Object> map = (Map<String, Object>) o;
  // check version
  String version = (String) map.get(VERSION_KEY);
  if (version == null) {
    throw new IOException("Missing VERSION key");
  }
  if (!VERSION.equals(version)) {
    throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
  }
  if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
    throw new IOException("Field cannot have both stringValue and binaryValue");
  }
  res.str = (String) map.get(STRING_KEY);
  String bin = (String) map.get(BINARY_KEY);
  if (bin != null) {
    byte[] data = Base64.base64ToByteArray(bin);
    res.bin = data;
  }
  List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
  if (tokens == null) {
    return res;
  }
  int tokenStart = 0;
  int tokenEnd = 0;
  parent.clearAttributes();
  for (Object ot : tokens) {
    // automatic increment by 1 separator
    tokenStart = tokenEnd + 1;
    Map<String, Object> tok = (Map<String, Object>) ot;
    boolean hasOffsetStart = false;
    boolean hasOffsetEnd = false;
    int len = -1;
    for (Entry<String, Object> e : tok.entrySet()) {
      String key = e.getKey();
      if (key.equals(TOKEN_KEY)) {
        CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
        String str = String.valueOf(e.getValue());
        catt.append(str);
        len = str.length();
      } else if (key.equals(OFFSET_START_KEY)) {
        Object obj = e.getValue();
        hasOffsetStart = true;
        if (obj instanceof Number) {
          tokenStart = ((Number) obj).intValue();
        } else {
          try {
            tokenStart = Integer.parseInt(String.valueOf(obj));
          } catch (NumberFormatException nfe) {
            LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
            hasOffsetStart = false;
          }
        }
      } else if (key.equals(OFFSET_END_KEY)) {
        hasOffsetEnd = true;
        Object obj = e.getValue();
        if (obj instanceof Number) {
          tokenEnd = ((Number) obj).intValue();
        } else {
          try {
            tokenEnd = Integer.parseInt(String.valueOf(obj));
          } catch (NumberFormatException nfe) {
            LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
            hasOffsetEnd = false;
          }
        }
      } else if (key.equals(POSINCR_KEY)) {
        Object obj = e.getValue();
        int posIncr = 1;
        if (obj instanceof Number) {
          posIncr = ((Number) obj).intValue();
        } else {
          try {
            posIncr = Integer.parseInt(String.valueOf(obj));
          } catch (NumberFormatException nfe) {
            LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
          }
        }
        PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
        patt.setPositionIncrement(posIncr);
      } else if (key.equals(PAYLOAD_KEY)) {
        String str = String.valueOf(e.getValue());
        if (str.length() > 0) {
          byte[] data = Base64.base64ToByteArray(str);
          PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
          if (data != null && data.length > 0) {
            p.setPayload(new BytesRef(data));
          }
        }
      } else if (key.equals(FLAGS_KEY)) {
        try {
          // flags are serialized as a hex string
          int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
          FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
          flags.setFlags(f);
        } catch (NumberFormatException nfe) {
          LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
        }
      } else if (key.equals(TYPE_KEY)) {
        TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
        tattr.setType(String.valueOf(e.getValue()));
      } else {
        LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
      }
    }
    // handle offset attr
    OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
    if (!hasOffsetEnd && len > -1) {
      tokenEnd = tokenStart + len;
    }
    offset.setOffset(tokenStart, tokenEnd);
    if (!hasOffsetStart) {
      tokenStart = tokenEnd + 1;
    }
    // capture state and add to result
    State state = parent.captureState();
    res.states.add(state.clone());
    // reset for reuse
    parent.clearAttributes();
  }
  return res;
}
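To make the expected input concrete, a hedged sketch of a driver for this parser. Assumptions: the key constants resolve to the short names "v", "str", "tokens", "t", "s", "e", "i", "f", "y" (as in Solr's documented pre-analyzed JSON format), VERSION is "1", the flags value "4d" is hex for 77, and the import paths match the lucene-solr tree.

import java.io.StringReader;
import org.apache.lucene.util.AttributeSource;
import org.apache.solr.schema.JsonPreAnalyzedParser;
import org.apache.solr.schema.PreAnalyzedField.ParseResult;

public class ParseDemo {
  public static void main(String[] args) throws Exception {
    String json = "{\"v\":\"1\",\"str\":\"one two\",\"tokens\":["
        + "{\"t\":\"one\",\"s\":0,\"e\":3,\"i\":1,\"f\":\"4d\",\"y\":\"word\"},"
        + "{\"t\":\"two\",\"s\":4,\"e\":7,\"i\":1,\"y\":\"word\"}]}";
    // each parsed token becomes one captured AttributeSource.State in res.states
    ParseResult res = new JsonPreAnalyzedParser()
        .parse(new StringReader(json), new AttributeSource());
    System.out.println(res.states.size() + " token states parsed");
  }
}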
Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.
The class TestSnowball, method testFilterTokens:
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
  filter.incrementToken();
  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.getPayload());
}
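The inner class TestTokenStream is not part of this excerpt. A minimal stand-in consistent with the assertions above, an assumption rather than the test's actual code (imports as in the surrounding test class), would emit a single token whose English Snowball stem is "accent":

final class TestTokenStream extends TokenStream {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private boolean done = false;

  @Override
  public boolean incrementToken() {
    if (done) {
      return false;
    }
    clearAttributes();
    termAtt.setEmpty().append("accents"); // the English stemmer reduces this to "accent"
    offsetAtt.setOffset(2, 7);
    typeAtt.setType("wrd");
    posIncAtt.setPositionIncrement(3);
    payloadAtt.setPayload(new BytesRef(new byte[] { 0, 1, 2, 3 }));
    flagsAtt.setFlags(77);
    done = true;
    return true;
  }
}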
Use of org.apache.lucene.analysis.tokenattributes.FlagsAttribute in project lucene-solr by apache.
The class WikipediaTokenizerTest, method testBoth:
public void testBoth() throws Exception {
  Set<String> untoks = new HashSet<>();
  untoks.add(WikipediaTokenizer.CATEGORY);
  untoks.add(WikipediaTokenizer.ITALICS);
  String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
  // should output all the individual tokens plus the untokenized tokens as well
  WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
  tf.setReader(new StringReader(test));
  assertTokenStreamContents(tf,
      new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here",
          "link", "there", "italics here", "italics", "here", "something", "more italics",
          "more", "italics", "h i j", "h", "i", "j" },
      // start offsets
      new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
      // end offsets
      new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
      // position increments
      new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 });
  // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
  tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
  tf.setReader(new StringReader(test));
  int[] expectedFlags = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
  FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
  tf.reset();
  for (int i = 0; i < expectedFlags.length; i++) {
    assertTrue(tf.incrementToken());
    assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
  }
  assertFalse(tf.incrementToken());
  tf.close();
}
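As a closing illustration, a hedged sketch (not part of the test; it reuses the test's untoks and test variables) of how a consumer can use FlagsAttribute to drop the untokenized duplicates that BOTH mode emits:

  WikipediaTokenizer plain = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
  plain.setReader(new StringReader(test));
  CharTermAttribute termAtt = plain.addAttribute(CharTermAttribute.class);
  FlagsAttribute flags = plain.addAttribute(FlagsAttribute.class);
  plain.reset();
  while (plain.incrementToken()) {
    // UNTOKENIZED_TOKEN_FLAG marks the concatenated "[[...]]" / italics variants
    if ((flags.getFlags() & UNTOKENIZED_TOKEN_FLAG) == 0) {
      System.out.println(termAtt.toString());
    }
  }
  plain.end();
  plain.close();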