Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project elasticsearch by elastic.
From the class TransportAnalyzeAction, method simpleAnalyze:
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);
            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                    lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), null));
            }
            stream.end();
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();
            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}
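The method above shows the canonical TokenStream consumption pattern: register the attributes, reset(), loop on incrementToken(), then end(). A self-contained sketch of the same pattern follows; StandardAnalyzer, the field name, and the sample text are assumptions used only for illustration, not part of the Elasticsearch code.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TypeAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "Lucene 7 rocks")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // StandardTokenizer reports types such as <ALPHANUM> and <NUM>
                System.out.println(term + " [" + offset.startOffset() + "-" + offset.endOffset() + "] " + type.type());
            }
            stream.end();
        }
    }
}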
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project languagetool by languagetool-org.
From the class LanguageToolFilterTest, method displayTokensWithFullDetails:
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    System.out.println();
}
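Note that the helper neither resets nor closes the stream, so the caller has to do both. A possible driver, assuming it lives in the same test class and that a plain WhitespaceTokenizer (org.apache.lucene.analysis.core) is an acceptable stand-in for the LanguageTool filter chain:
public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the quick brown fox"));
    tokenizer.reset();                        // required before the first incrementToken()
    displayTokensWithFullDetails(tokenizer);  // prints one line per token position
    tokenizer.end();
    tokenizer.close();
}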
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project lucene-solr by apache.
From the class SpellCheckComponent, method getTokens:
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<>();
    assert analyzer != null;
    try (TokenStream ts = analyzer.tokenStream("", q)) {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    }
}
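The copied Token objects keep every attribute the analysis chain produced. A rough illustration of reading them back, assumed to run inside SpellCheckComponent (where getTokens is visible); the query text and the StandardAnalyzer are placeholders:
for (Token t : getTokens("lucene spellcheck 42", new StandardAnalyzer())) {
    // with StandardAnalyzer the type is typically <ALPHANUM> or <NUM>, and flags default to 0
    System.out.println(t + " type=" + t.type()
        + " posInc=" + t.getPositionIncrement()
        + " offsets=" + t.startOffset() + "-" + t.endOffset());
}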
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project lucene-solr by apache.
From the class SimplePreAnalyzedParser, method toFormattedString:
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0) {
                        tok.append(',');
                    }
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            // remove the last comma
                            tok.setLength(tok.length() - 1);
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
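A sketch of driving the serializer above; the field setup (WhitespaceTokenizer, TextField.TYPE_NOT_STORED) is an assumption, and because toFormattedString() pulls tokens with incrementToken() itself, the stream is reset beforehand. Exactly which key=value pairs appear per token depends on the attributes the tokenizer registers.
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);   // indexed, tokenized, not stored
Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("hello world"));
tok.reset();                                                // the serializer consumes the stream directly
Field f = new Field("content", tok, ft);
String formatted = new SimplePreAnalyzedParser().toFormattedString(f);
// Illustrative shape: "1 hello,s=0,e=5 world,s=6,e=11" (version first, then one entry per token)
System.out.println(formatted);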
Use of org.apache.lucene.analysis.tokenattributes.TypeAttribute in project lucene-solr by apache.
From the class JsonPreAnalyzedParser, method parse:
@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    Object o = ObjectBuilder.fromJSON(val);
    if (!(o instanceof Map)) {
        throw new IOException("Invalid JSON type " + o.getClass().getName() + ", expected Map");
    }
    Map<String, Object> map = (Map<String, Object>) o;
    // check version
    String version = (String) map.get(VERSION_KEY);
    if (version == null) {
        throw new IOException("Missing VERSION key");
    }
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
    }
    if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
        throw new IOException("Field cannot have both stringValue and binaryValue");
    }
    res.str = (String) map.get(STRING_KEY);
    String bin = (String) map.get(BINARY_KEY);
    if (bin != null) {
        byte[] data = Base64.base64ToByteArray(bin);
        res.bin = data;
    }
    List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
    if (tokens == null) {
        return res;
    }
    int tokenStart = 0;
    int tokenEnd = 0;
    parent.clearAttributes();
    for (Object ot : tokens) {
        // automatic increment by 1 separator
        tokenStart = tokenEnd + 1;
        Map<String, Object> tok = (Map<String, Object>) ot;
        boolean hasOffsetStart = false;
        boolean hasOffsetEnd = false;
        int len = -1;
        for (Entry<String, Object> e : tok.entrySet()) {
            String key = e.getKey();
            if (key.equals(TOKEN_KEY)) {
                CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
                String str = String.valueOf(e.getValue());
                catt.append(str);
                len = str.length();
            } else if (key.equals(OFFSET_START_KEY)) {
                Object obj = e.getValue();
                hasOffsetStart = true;
                if (obj instanceof Number) {
                    tokenStart = ((Number) obj).intValue();
                } else {
                    try {
                        tokenStart = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetStart = false;
                    }
                }
            } else if (key.equals(OFFSET_END_KEY)) {
                hasOffsetEnd = true;
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenEnd = ((Number) obj).intValue();
                } else {
                    try {
                        tokenEnd = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetEnd = false;
                    }
                }
            } else if (key.equals(POSINCR_KEY)) {
                Object obj = e.getValue();
                int posIncr = 1;
                if (obj instanceof Number) {
                    posIncr = ((Number) obj).intValue();
                } else {
                    try {
                        posIncr = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
                PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
                patt.setPositionIncrement(posIncr);
            } else if (key.equals(PAYLOAD_KEY)) {
                String str = String.valueOf(e.getValue());
                if (str.length() > 0) {
                    byte[] data = Base64.base64ToByteArray(str);
                    PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
                    if (data != null && data.length > 0) {
                        p.setPayload(new BytesRef(data));
                    }
                }
            } else if (key.equals(FLAGS_KEY)) {
                try {
                    int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
                    FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
                    flags.setFlags(f);
                } catch (NumberFormatException nfe) {
                    LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
                }
            } else if (key.equals(TYPE_KEY)) {
                TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
                tattr.setType(String.valueOf(e.getValue()));
            } else {
                LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
            }
        }
        // handle offset attr
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        if (!hasOffsetEnd && len > -1) {
            tokenEnd = tokenStart + len;
        }
        offset.setOffset(tokenStart, tokenEnd);
        if (!hasOffsetStart) {
            tokenStart = tokenEnd + 1;
        }
        // capture state and add to result
        State state = parent.captureState();
        res.states.add(state.clone());
        // reset for reuse
        parent.clearAttributes();
    }
    return res;
}
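A sketch of the JSON shape this parser accepts, pushed through parse() directly; the key names (v, str, tokens, t, s, e, i, y) follow Solr's pre-analyzed JSON format, while the sample values and the bare AttributeSource harness are assumptions. ParseResult is the nested result type returned above.
String json =
      "{\"v\":\"1\",\"str\":\"hello world\",\"tokens\":["
    + "{\"t\":\"hello\",\"s\":0,\"e\":5,\"i\":1,\"y\":\"word\"},"
    + "{\"t\":\"world\",\"s\":6,\"e\":11,\"i\":1,\"y\":\"word\"}]}";
JsonPreAnalyzedParser parser = new JsonPreAnalyzedParser();
AttributeSource attrs = new AttributeSource();
ParseResult result = parser.parse(new StringReader(json), attrs);
// result.str carries the stored value; result.states holds one captured state per token,
// each with CharTerm, Offset, PositionIncrement and Type attributes set.
System.out.println(result.str + " / " + result.states.size() + " tokens");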