Usage example of org.apache.lucene.util.AttributeSource.State in the Apache lucene-solr project:
the parse method of class JsonPreAnalyzedParser.
@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
  // Deserializes the JSON form of a pre-analyzed field. The input must be a JSON
  // map with a mandatory version marker, an optional stored value (plain string
  // or Base64-encoded binary, but never both), and an optional list of token
  // maps. Each token map is materialized as attributes on 'parent' and captured
  // as an AttributeSource.State appended to the returned ParseResult.
  ParseResult res = new ParseResult();
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  // Read the whole stream into memory before handing it to the JSON parser.
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  String val = sb.toString();
  // empty string - accept even without version number
  if (val.length() == 0) {
    return res;
  }
  Object o = ObjectBuilder.fromJSON(val);
  if (!(o instanceof Map)) {
    throw new IOException("Invalid JSON type " + o.getClass().getName() + ", expected Map");
  }
  Map<String, Object> map = (Map<String, Object>) o;
  // check version
  String version = (String) map.get(VERSION_KEY);
  if (version == null) {
    throw new IOException("Missing VERSION key");
  }
  if (!VERSION.equals(version)) {
    throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
  }
  // The stored value may be textual or binary, never both at once.
  if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
    throw new IOException("Field cannot have both stringValue and binaryValue");
  }
  res.str = (String) map.get(STRING_KEY);
  String bin = (String) map.get(BINARY_KEY);
  if (bin != null) {
    byte[] data = Base64.base64ToByteArray(bin);
    res.bin = data;
  }
  List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
  if (tokens == null) {
    // No token stream supplied - return only the stored value (if any).
    return res;
  }
  int tokenStart = 0;
  int tokenEnd = 0;
  parent.clearAttributes();
  for (Object ot : tokens) {
    // automatic increment by 1 separator: default start offset is one past the
    // previous token's end, unless an explicit start offset is supplied below
    tokenStart = tokenEnd + 1;
    Map<String, Object> tok = (Map<String, Object>) ot;
    boolean hasOffsetStart = false;
    boolean hasOffsetEnd = false;
    int len = -1; // term text length; used to derive the end offset when absent
    for (Entry<String, Object> e : tok.entrySet()) {
      String key = e.getKey();
      if (key.equals(TOKEN_KEY)) {
        // Term text.
        CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
        String str = String.valueOf(e.getValue());
        catt.append(str);
        len = str.length();
      } else if (key.equals(OFFSET_START_KEY)) {
        // Explicit start offset; accepts a JSON number or a numeric string.
        Object obj = e.getValue();
        hasOffsetStart = true;
        if (obj instanceof Number) {
          tokenStart = ((Number) obj).intValue();
        } else {
          try {
            tokenStart = Integer.parseInt(String.valueOf(obj));
          } catch (NumberFormatException nfe) {
            // Malformed value: warn and keep the computed default offset.
            LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
            hasOffsetStart = false;
          }
        }
      } else if (key.equals(OFFSET_END_KEY)) {
        // Explicit end offset; same number-or-string handling as the start.
        hasOffsetEnd = true;
        Object obj = e.getValue();
        if (obj instanceof Number) {
          tokenEnd = ((Number) obj).intValue();
        } else {
          try {
            tokenEnd = Integer.parseInt(String.valueOf(obj));
          } catch (NumberFormatException nfe) {
            LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
            hasOffsetEnd = false;
          }
        }
      } else if (key.equals(POSINCR_KEY)) {
        // Position increment; defaults to 1 if the value is malformed.
        Object obj = e.getValue();
        int posIncr = 1;
        if (obj instanceof Number) {
          posIncr = ((Number) obj).intValue();
        } else {
          try {
            posIncr = Integer.parseInt(String.valueOf(obj));
          } catch (NumberFormatException nfe) {
            LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
          }
        }
        PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
        patt.setPositionIncrement(posIncr);
      } else if (key.equals(PAYLOAD_KEY)) {
        // Base64-encoded payload; empty or undecodable payloads are dropped.
        String str = String.valueOf(e.getValue());
        if (str.length() > 0) {
          byte[] data = Base64.base64ToByteArray(str);
          PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
          if (data != null && data.length > 0) {
            p.setPayload(new BytesRef(data));
          }
        }
      } else if (key.equals(FLAGS_KEY)) {
        // Flags encoded as a hexadecimal string.
        try {
          int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
          FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
          flags.setFlags(f);
        } catch (NumberFormatException nfe) {
          LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
        }
      } else if (key.equals(TYPE_KEY)) {
        TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
        tattr.setType(String.valueOf(e.getValue()));
      } else {
        LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
      }
    }
    // handle offset attr
    OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
    if (!hasOffsetEnd && len > -1) {
      // No explicit end offset: derive it from the start offset and term length.
      tokenEnd = tokenStart + len;
    }
    offset.setOffset(tokenStart, tokenEnd);
    if (!hasOffsetStart) {
      // NOTE(review): this assignment looks redundant - tokenStart is
      // recomputed from tokenEnd at the top of the loop; confirm before removing.
      tokenStart = tokenEnd + 1;
    }
    // capture state and add to result
    State state = parent.captureState();
    res.states.add(state.clone());
    // reset for reuse
    parent.clearAttributes();
  }
  return res;
}
Usage example of org.apache.lucene.util.AttributeSource.State in the Twitter commons project:
the getTokenGroupStream method of class TokenGroupAttributeImpl.
@Override
public TokenGroupStream getTokenGroupStream() {
  // Lazily turn the tokenized sequence into a list of captured states; this
  // work is deferred until the stream is actually requested.
  if (seq != null && (attributeClasses == null || states.isEmpty())) {
    TokenizedCharSequenceStream tokenStream = new TokenizedCharSequenceStream();
    tokenStream.reset(seq);
    //TODO(alewis) This could probably be lazier. Make a new extension of TokenGroupStream?
    ImmutableList.Builder<State> captured = ImmutableList.builder();
    while (tokenStream.incrementToken()) {
      captured.add(tokenStream.captureState());
    }
    setAttributeSource(tokenStream);
    setStates(captured.build());
  }
  // The wrapping TokenGroupStream itself is also created on first use.
  if (tokenGroupStream == null) {
    tokenGroupStream = new TokenGroupStream(attributeClasses);
  }
  tokenGroupStream.setStates(states);
  return tokenGroupStream;
}
Usage example of org.apache.lucene.util.AttributeSource.State in the Twitter commons project:
the clone method of class TokenGroupAttributeImpl.
@Override
public AttributeImpl clone() {
  // Produce a copy with its own deep-copied state list; immutable members are
  // shared, and the cached stream is deliberately not carried over.
  TokenGroupAttributeImpl copy = new TokenGroupAttributeImpl();
  // attributeClasses is immutable, so sharing the reference is safe.
  copy.attributeClasses = attributeClasses;
  // TokenizedCharSequence is likewise immutable - no copy needed.
  copy.seq = seq;
  ImmutableList.Builder<State> clonedStates = ImmutableList.builder();
  for (State original : states) {
    clonedStates.add(original.clone());
  }
  copy.states = clonedStates.build();
  copy.tokenGroupStream = null;
  return copy;
}
Usage example of org.apache.lucene.util.AttributeSource.State in the Apache lucene-solr project:
the createState method of class SimplePreAnalyzedParser.
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
  // Converts one parsed token ('state') into attributes on the scratch
  // AttributeSource 'a' and returns a captured snapshot of them. 'tokenEnd' is
  // the end offset computed by the caller; the start offset is derived from it
  // and the term length unless explicit "s"/"e" attributes override them.
  a.clearAttributes();
  CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
  char[] tokChars = state.token.toString().toCharArray();
  termAtt.copyBuffer(tokChars, 0, tokChars.length);
  int tokenStart = tokenEnd - state.token.length();
  for (Entry<String, String> e : state.attr.entrySet()) {
    String k = e.getKey();
    if (k.equals("i")) {
      // position increment
      // NOTE(review): unlike JsonPreAnalyzedParser, a malformed number here
      // propagates NumberFormatException to the caller - confirm intended.
      int incr = Integer.parseInt(e.getValue());
      PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
      posIncr.setPositionIncrement(incr);
    } else if (k.equals("s")) {
      // explicit start offset overrides the derived one
      tokenStart = Integer.parseInt(e.getValue());
    } else if (k.equals("e")) {
      // explicit end offset
      tokenEnd = Integer.parseInt(e.getValue());
    } else if (k.equals("y")) {
      // token type
      TypeAttribute type = a.addAttribute(TypeAttribute.class);
      type.setType(e.getValue());
    } else if (k.equals("f")) {
      // flags encoded as a hexadecimal string
      FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
      int f = Integer.parseInt(e.getValue(), 16);
      flags.setFlags(f);
    } else if (k.equals("p")) {
      // hex-encoded payload; empty or undecodable payloads are dropped
      PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
      byte[] data = hexToBytes(e.getValue());
      if (data != null && data.length > 0) {
        p.setPayload(new BytesRef(data));
      }
    } else {
      // unknown attribute - silently ignored
    }
  }
  // handle offset attr
  OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
  offset.setOffset(tokenStart, tokenEnd);
  State resState = a.captureState();
  a.clearAttributes();
  return resState;
}
Usage example of org.apache.lucene.util.AttributeSource.State in the Apache lucene-solr project:
the parse method of class SimplePreAnalyzedParser.
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
  // Parses the compact text format of a pre-analyzed field:
  //   VERSION [=stored text=] token[,name=value,...] token ...
  // Tokens are separated by spaces and '\' escapes special characters. Each
  // completed token is converted into a captured AttributeSource.State via
  // createState() and appended to the returned ParseResult.
  ParseResult res = new ParseResult();
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  // Read the entire input up-front.
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  String val = sb.toString();
  // empty string - accept even without version number
  if (val.length() == 0) {
    return res;
  }
  // first consume the version
  int idx = val.indexOf(' ');
  if (idx == -1) {
    throw new IOException("Missing VERSION token");
  }
  String version = val.substring(0, idx);
  if (!VERSION.equals(version)) {
    throw new IOException("Unknown VERSION " + version);
  }
  val = val.substring(idx + 1);
  // then consume the optional stored part
  int tsStart = 0; // index in 'val' where the token stream begins
  boolean hasStored = false;
  StringBuilder storedBuf = new StringBuilder();
  if (val.charAt(0) == '=') {
    hasStored = true;
    if (val.length() > 1) {
      for (int i = 1; i < val.length(); i++) {
        char c = val.charAt(i);
        if (c == '\\') {
          if (i < val.length() - 1) {
            c = val.charAt(++i);
            if (c == '=') {
              // we recognize only \= escape in the stored part
              storedBuf.append('=');
            } else {
              // any other escape is preserved verbatim, backslash included
              storedBuf.append('\\');
              storedBuf.append(c);
              continue;
            }
          } else {
            // trailing lone backslash - kept as-is
            storedBuf.append(c);
            continue;
          }
        } else if (c == '=') {
          // end of stored text
          tsStart = i + 1;
          break;
        } else {
          storedBuf.append(c);
        }
      }
      if (tsStart == 0) {
        // missing end-of-stored marker
        throw new IOException("Missing end marker of stored part");
      }
    } else {
      throw new IOException("Unexpected end of stored field");
    }
  }
  if (hasStored) {
    res.str = storedBuf.toString();
  }
  Tok tok = new Tok(); // accumulates the current token's text and attributes
  StringBuilder attName = new StringBuilder();
  StringBuilder attVal = new StringBuilder();
  // parser state
  S s = S.UNDEF;
  int lastPos = 0; // running end offset of the current token
  for (int i = tsStart; i < val.length(); i++) {
    char c = val.charAt(i);
    if (c == ' ') {
      // token separator: collect leftovers, then emit the finished token
      switch(s) {
        case VALUE:
          if (attVal.length() == 0) {
            throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
          }
          if (attName.length() > 0) {
            tok.attr.put(attName.toString(), attVal.toString());
          }
          break;
        case NAME: // attr name without a value ?
          if (attName.length() > 0) {
            throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
          } else {
            // accept missing att name and value
          }
          break;
        case TOKEN:
        case UNDEF:
          // nothing pending to flush for these states
      }
      attName.setLength(0);
      attVal.setLength(0);
      if (!tok.isEmpty() || s == S.NAME) {
        AttributeSource.State state = createState(parent, tok, lastPos);
        if (state != null)
          res.states.add(state.clone());
      }
      // reset tok
      s = S.UNDEF;
      tok.reset();
      // skip; the separator itself advances the offset by one
      lastPos++;
      continue;
    }
    // pick the buffer the current character belongs to
    StringBuilder tgt = null;
    switch(s) {
      case TOKEN:
        tgt = tok.token;
        break;
      case NAME:
        tgt = attName;
        break;
      case VALUE:
        tgt = attVal;
        break;
      case UNDEF:
        // first character of a new token
        tgt = tok.token;
        s = S.TOKEN;
    }
    if (c == '\\') {
      // escape sequence; only token text counts toward the offset
      if (s == S.TOKEN)
        lastPos++;
      if (i >= val.length() - 1) {
        // end of input: keep the lone trailing backslash
        tgt.append(c);
        continue;
      } else {
        c = val.charAt(++i);
        switch(c) {
          case '\\':
          case '=':
          case ',':
          case ' ':
            tgt.append(c);
            break;
          case 'n':
            tgt.append('\n');
            break;
          case 'r':
            tgt.append('\r');
            break;
          case 't':
            tgt.append('\t');
            break;
          default:
            // unknown escape: keep both characters verbatim
            tgt.append('\\');
            tgt.append(c);
            lastPos++;
        }
      }
    } else {
      // state switch
      if (c == ',') {
        if (s == S.TOKEN) {
          // end of token text, attributes follow
          s = S.NAME;
        } else if (s == S.VALUE) {
          // end of value, start of next attr
          if (attVal.length() == 0) {
            throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
          }
          if (attName.length() > 0 && attVal.length() > 0) {
            tok.attr.put(attName.toString(), attVal.toString());
          }
          // reset
          attName.setLength(0);
          attVal.setLength(0);
          s = S.NAME;
        } else {
          throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
        }
      } else if (c == '=') {
        // '=' separates an attribute name from its value
        if (s == S.NAME) {
          s = S.VALUE;
        } else {
          throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
        }
      } else {
        // ordinary character: append; token text advances the offset
        tgt.append(c);
        if (s == S.TOKEN)
          lastPos++;
      }
    }
  }
  // collect leftovers: the input need not end with a separator
  if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
    // remaining attrib?
    if (s == S.VALUE) {
      if (attName.length() > 0 && attVal.length() > 0) {
        tok.attr.put(attName.toString(), attVal.toString());
      }
    }
    AttributeSource.State state = createState(parent, tok, lastPos);
    if (state != null)
      res.states.add(state.clone());
  }
  return res;
}
Aggregations