use of org.apache.lucene.util.AttributeSource in project commons by twitter.
the class TokenTypeAttributeSerializerTest method deserialize.
/**
 * Decodes the given bytes with a freshly initialized {@code TokenTypeAttributeSerializer}
 * and reports the token type that ends up on the attribute.
 *
 * @param serialized the serialized attribute bytes to decode
 * @return the type held by the {@code TokenTypeAttribute} after deserialization
 * @throws IOException declared by the signature; propagated from the calls below
 */
private TokenType deserialize(byte[] serialized) throws IOException {
  // The attribute is registered on a standalone source so the serializer has something to write to.
  AttributeSource source = new AttributeSource();
  TokenTypeAttribute typeAttribute = source.addAttribute(TokenTypeAttribute.class);

  TokenTypeAttributeSerializer attributeSerializer = new TokenTypeAttributeSerializer();
  attributeSerializer.initialize(source, TokenStreamSerializer.CURRENT_VERSION);

  // Wrap the raw bytes in the serializer's own input stream type and decode them.
  attributeSerializer.deserialize(
      new TokenStreamSerializer.AttributeInputStream(new ByteArrayInputStream(serialized)),
      null);

  return typeAttribute.getType();
}
use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
the class AnalysisRequestHandlerBase method analyzeTokenStream.
/**
 * Consumes the given TokenStream and gathers a cloned attribute snapshot for every token it
 * emits. Each snapshot is taken after the running position (the sum of position increments
 * seen so far) has been written to the stream's {@code TokenTrackingAttribute}.
 *
 * <p>The stream is always handed to {@code IOUtils.closeWhileHandlingException} when this
 * method finishes, whether it completes normally or not.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> collected = new ArrayList<>();
  final PositionIncrementAttribute increments =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute tracker = tokenStream.addAttribute(TokenTrackingAttribute.class);

  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);

  try {
    tokenStream.reset();
    for (int position = 0; tokenStream.incrementToken(); ) {
      position += increments.getPositionIncrement();
      tracker.setActPosition(position);
      collected.add(tokenStream.cloneAttributes());
    }
    // TODO should we capture?
    tokenStream.end();
  } catch (IOException ioe) {
    // Callers see an unchecked exception; the original failure is kept as the cause.
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return collected;
}
use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
the class MockSynonymFilter method addSynonymAndRestoreOrigToken.
/**
 * Adds a synonym via {@code addSynonym} and afterwards copies the attribute values that were
 * present before the call back onto this filter.
 *
 * @param synonymText forwarded unchanged to {@code addSynonym}
 * @param posLen forwarded unchanged to {@code addSynonym}
 * @param endOffset forwarded unchanged to {@code addSynonym}
 */
private void addSynonymAndRestoreOrigToken(String synonymText, int posLen, int endOffset) {
  // Snapshot first: the snapshot is what gets copied back once the synonym has been added.
  final AttributeSource snapshot = cloneAttributes();
  addSynonym(synonymText, posLen, endOffset);
  snapshot.copyTo(this);
}
use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
the class FuzzyLikeThisQuery method addTerms.
/**
 * Analyzes the query string of one field and, for every distinct analyzed term, enumerates
 * its fuzzy variants, keeps the best {@code MAX_VARIANTS_PER_TERM} of them, rescores those
 * with an IDF factor and offers them to the shared queue {@code q}.
 *
 * <p>Does nothing when the field has no query string or when the reader has no terms for
 * the field.
 *
 * @param reader index reader used for term lookup, document frequencies and document count
 * @param f the field name, query string and fuzzy settings to expand
 * @param q queue that receives the rescored variants
 * @throws IOException declared by the signature; propagated from the index and analysis calls
 */
private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
  if (f.queryString == null) {
    return;
  }
  final Terms fieldTerms = MultiFields.getTerms(reader, f.fieldName);
  if (fieldTerms == null) {
    return;
  }

  try (TokenStream stream = analyzer.tokenStream(f.fieldName, f.queryString)) {
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    int corpusNumDocs = reader.numDocs();
    HashSet<String> seenTerms = new HashSet<>();
    stream.reset();

    while (stream.incrementToken()) {
      String text = termAttribute.toString();
      // add() returns false for a term we already expanded; each term is handled once.
      if (!seenTerms.add(text)) {
        continue;
      }

      //maxNum variants considered for any one term
      ScoreTermQueue variants = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
      float lowestKeptScore = 0;
      Term origin = new Term(f.fieldName, text);
      AttributeSource enumAtts = new AttributeSource();
      MaxNonCompetitiveBoostAttribute boostFloor =
          enumAtts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
      FuzzyTermsEnum fuzzyEnum =
          new FuzzyTermsEnum(fieldTerms, enumAtts, origin, f.maxEdits, f.prefixLength, true);

      //store the df so all variants use same idf
      int df = reader.docFreq(origin);
      int variantCount = 0;
      int variantDocFreqSum = 0;
      BoostAttribute boost = fuzzyEnum.attributes().addAttribute(BoostAttribute.class);

      for (BytesRef candidate = fuzzyEnum.next(); candidate != null; candidate = fuzzyEnum.next()) {
        variantCount++;
        variantDocFreqSum += fuzzyEnum.docFreq();
        float score = boost.getBoost();

        boolean queueHasRoom = variants.size() < MAX_VARIANTS_PER_TERM;
        if (queueHasRoom || score > lowestKeptScore) {
          Term variantTerm = new Term(origin.field(), BytesRef.deepCopyOf(candidate));
          variants.insertWithOverflow(new ScoreTerm(variantTerm, score, origin));
          // maintain minScore
          lowestKeptScore = variants.top().score;
        }

        // Until the queue is full there is no floor to report to the enum.
        if (variants.size() >= MAX_VARIANTS_PER_TERM) {
          boostFloor.setMaxNonCompetitiveBoost(lowestKeptScore);
        } else {
          boostFloor.setMaxNonCompetitiveBoost(Float.NEGATIVE_INFINITY);
        }
      }

      if (variantCount == 0) {
        continue;
      }

      // no direct match we can use as df for all variants: use avg df of all variants
      if (df == 0) {
        df = variantDocFreqSum / variantCount;
      }

      // take the top variants (scored by edit distance) and reset the score
      // to include an IDF factor then add to the global queue for ranking
      // overall top query terms
      for (int remaining = variants.size(); remaining > 0; remaining--) {
        ScoreTerm variant = variants.pop();
        variant.score = (variant.score * variant.score) * sim.idf(df, corpusNumDocs);
        q.insertWithOverflow(variant);
      }
    }

    stream.end();
  }
}
use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
the class SimplePreAnalyzedParser method parse.
/**
 * Parses the simple pre-analyzed text format read from {@code reader}.
 *
 * <p>Layout handled by this method, in order:
 * <ol>
 *   <li>a version token terminated by the first space; it must equal {@code VERSION};</li>
 *   <li>an optional stored part enclosed in {@code =...=}, in which {@code \=} stands for a
 *       literal {@code =};</li>
 *   <li>space-separated tokens, each optionally followed by {@code ,name=value} attribute
 *       pairs. A backslash escapes {@code \\}, {@code =}, {@code ,} and space, and
 *       {@code \n}, {@code \r}, {@code \t} produce the corresponding control characters.</li>
 * </ol>
 *
 * <p>A completely empty input yields an empty result. An input that contains only the version
 * token followed by a space (nothing after it) is likewise accepted and yields an empty
 * result, rather than failing with an unchecked index exception.
 *
 * @param reader source of the serialized text; it is read until {@code read} returns a
 *     non-positive count
 * @param parent passed through to {@code createState} for every token
 * @return a result whose {@code str} is set when a stored part was present and whose
 *     {@code states} holds a clone of each non-null state returned by {@code createState}
 * @throws IOException if the version token is missing or unknown, the stored part is
 *     unterminated, an attribute is malformed, or reading fails
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
  ParseResult res = new ParseResult();
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  String val = sb.toString();
  // empty string - accept even without version number
  if (val.length() == 0) {
    return res;
  }
  // first consume the version
  int idx = val.indexOf(' ');
  if (idx == -1) {
    throw new IOException("Missing VERSION token");
  }
  String version = val.substring(0, idx);
  if (!VERSION.equals(version)) {
    throw new IOException("Unknown VERSION " + version);
  }
  val = val.substring(idx + 1);
  // then consume the optional stored part
  int tsStart = 0;
  boolean hasStored = false;
  StringBuilder storedBuf = new StringBuilder();
  // The length check matters: for an input such as "1 " nothing follows the version,
  // and charAt(0) on the empty remainder would throw StringIndexOutOfBoundsException.
  if (val.length() > 0 && val.charAt(0) == '=') {
    hasStored = true;
    if (val.length() > 1) {
      for (int i = 1; i < val.length(); i++) {
        char c = val.charAt(i);
        if (c == '\\') {
          if (i < val.length() - 1) {
            c = val.charAt(++i);
            if (c == '=') {
              // we recognize only \= escape in the stored part
              storedBuf.append('=');
            } else {
              // any other escape is kept verbatim, backslash included
              storedBuf.append('\\');
              storedBuf.append(c);
              continue;
            }
          } else {
            // trailing backslash: keep it as a literal character
            storedBuf.append(c);
            continue;
          }
        } else if (c == '=') {
          // end of stored text
          tsStart = i + 1;
          break;
        } else {
          storedBuf.append(c);
        }
      }
      if (tsStart == 0) {
        // missing end-of-stored marker
        throw new IOException("Missing end marker of stored part");
      }
    } else {
      throw new IOException("Unexpected end of stored field");
    }
  }
  if (hasStored) {
    res.str = storedBuf.toString();
  }
  Tok tok = new Tok();
  StringBuilder attName = new StringBuilder();
  StringBuilder attVal = new StringBuilder();
  // parser state
  S s = S.UNDEF;
  // running counter handed to createState; advanced on token characters, escapes and
  // separating spaces (presumably a character offset - confirm against createState)
  int lastPos = 0;
  for (int i = tsStart; i < val.length(); i++) {
    char c = val.charAt(i);
    if (c == ' ') {
      // an unescaped space ends the current token
      // collect leftovers
      switch (s) {
        case VALUE:
          if (attVal.length() == 0) {
            throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
          }
          if (attName.length() > 0) {
            tok.attr.put(attName.toString(), attVal.toString());
          }
          break;
        case NAME: // attr name without a value ?
          if (attName.length() > 0) {
            throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
          } else {
            // accept missing att name and value
          }
          break;
        case TOKEN:
        case UNDEF:
      }
      attName.setLength(0);
      attVal.setLength(0);
      if (!tok.isEmpty() || s == S.NAME) {
        AttributeSource.State state = createState(parent, tok, lastPos);
        if (state != null)
          res.states.add(state.clone());
      }
      // reset tok
      s = S.UNDEF;
      tok.reset();
      // skip
      lastPos++;
      continue;
    }
    // pick the buffer the current character belongs to, based on the parser state
    StringBuilder tgt = null;
    switch (s) {
      case TOKEN:
        tgt = tok.token;
        break;
      case NAME:
        tgt = attName;
        break;
      case VALUE:
        tgt = attVal;
        break;
      case UNDEF:
        // first character of a new token
        tgt = tok.token;
        s = S.TOKEN;
    }
    if (c == '\\') {
      if (s == S.TOKEN)
        lastPos++;
      if (i >= val.length() - 1) {
        // end
        tgt.append(c);
        continue;
      } else {
        c = val.charAt(++i);
        switch (c) {
          case '\\':
          case '=':
          case ',':
          case ' ':
            tgt.append(c);
            break;
          case 'n':
            tgt.append('\n');
            break;
          case 'r':
            tgt.append('\r');
            break;
          case 't':
            tgt.append('\t');
            break;
          default:
            // unknown escape: keep both characters as-is
            tgt.append('\\');
            tgt.append(c);
            lastPos++;
        }
      }
    } else {
      // state switch
      if (c == ',') {
        if (s == S.TOKEN) {
          s = S.NAME;
        } else if (s == S.VALUE) {
          // end of value, start of next attr
          if (attVal.length() == 0) {
            throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
          }
          if (attName.length() > 0 && attVal.length() > 0) {
            tok.attr.put(attName.toString(), attVal.toString());
          }
          // reset
          attName.setLength(0);
          attVal.setLength(0);
          s = S.NAME;
        } else {
          throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
        }
      } else if (c == '=') {
        if (s == S.NAME) {
          s = S.VALUE;
        } else {
          throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
        }
      } else {
        tgt.append(c);
        if (s == S.TOKEN)
          lastPos++;
      }
    }
  }
  // collect leftovers
  if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
    // remaining attrib?
    if (s == S.VALUE) {
      if (attName.length() > 0 && attVal.length() > 0) {
        tok.attr.put(attName.toString(), attVal.toString());
      }
    }
    AttributeSource.State state = createState(parent, tok, lastPos);
    if (state != null)
      res.states.add(state.clone());
  }
  return res;
}
Aggregations