Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
Class DirectSpellChecker, method suggestSimilar.
/**
* Provide spelling corrections based on several parameters.
*
* @param term The term to suggest spelling corrections for
* @param numSug The maximum number of spelling corrections
* @param ir The index reader to fetch the candidate spelling corrections from
* @param docfreq The minimum document frequency a potential suggestion needs to have in order to be included
* @param editDistance The maximum edit distance candidates are allowed to have
* @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
* @param spare A scratch CharsRefBuilder reused for UTF-8-to-char conversion
* @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
* @throws IOException If I/O related errors occur
*/
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException {
  AttributeSource atts = new AttributeSource();
  MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
  Terms terms = MultiFields.getTerms(ir, term.field());
  if (terms == null) {
    return Collections.emptyList();
  }
  FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
  final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
  BytesRef queryTerm = new BytesRef(term.text());
  BytesRef candidateTerm;
  ScoreTerm st = new ScoreTerm();
  BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class);
  while ((candidateTerm = e.next()) != null) {
    // For FuzzyQuery, boost is the score:
    float score = boostAtt.getBoost();
    // ignore uncompetitive hits
    if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
      continue;
    }
    // ignore exact match of the same term
    if (queryTerm.bytesEquals(candidateTerm)) {
      continue;
    }
    int df = e.docFreq();
    // check docFreq if required
    if (df <= docfreq) {
      continue;
    }
    final String termAsString;
    if (distance == INTERNAL_LEVENSHTEIN) {
      // delay creating strings until the end
      termAsString = null;
    } else {
      spare.copyUTF8Bytes(candidateTerm);
      termAsString = spare.toString();
      score = distance.getDistance(term.text(), termAsString);
    }
    if (score < accuracy) {
      continue;
    }
    // add new entry in PQ
    st.term = BytesRef.deepCopyOf(candidateTerm);
    st.boost = score;
    st.docfreq = df;
    st.termAsString = termAsString;
    st.score = score;
    stQueue.offer(st);
    // possibly drop entries from queue
    st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
    maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
  }
  return stQueue;
}
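The AttributeSource here is a two-way channel between the spell checker and the enum: FuzzyTermsEnum reports each candidate's similarity through BoostAttribute, and the spell checker pushes the current competitive threshold back through MaxNonCompetitiveBoostAttribute so the enum can skip terms that cannot make the queue. A minimal sketch of that feedback loop in isolation, assuming a hypothetical reader, field name, and query term (none of these values come from the original):

AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
Terms terms = MultiFields.getTerms(reader, "body"); // "reader" and "body" are assumptions
if (terms != null) {
  // same constructor arguments as above: maxEdits = 2, prefix = 1, transpositions = true
  FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, atts, new Term("body", "lucene"), 2, 1, true);
  BoostAttribute boostAtt = fuzzy.attributes().addAttribute(BoostAttribute.class);
  float best = Float.NEGATIVE_INFINITY;
  BytesRef candidate;
  while ((candidate = fuzzy.next()) != null) {
    best = Math.max(best, boostAtt.getBoost());  // similarity the enum computed for this term
    maxBoostAtt.setMaxNonCompetitiveBoost(best); // terms that cannot beat this may be skipped
  }
}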
Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
Class AnalysisRequestHandlerBase, method convertTokensToNamedLists.
/**
* Converts the list of Tokens to a list of NamedLists representing the tokens.
*
* @param tokenList Tokens to convert
* @param context The analysis context
*
* @return List of NamedLists containing the relevant information taken from the tokens
*/
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
  final List<NamedList> tokensNamedLists = new ArrayList<>();
  final FieldType fieldType = context.getFieldType();
  final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);
  // sort the tokens by absolute position
  ArrayUtil.timSort(tokens, new Comparator<AttributeSource>() {
    @Override
    public int compare(AttributeSource a, AttributeSource b) {
      return arrayCompare(a.getAttribute(TokenTrackingAttribute.class).getPositions(), b.getAttribute(TokenTrackingAttribute.class).getPositions());
    }

    private int arrayCompare(int[] a, int[] b) {
      int p = 0;
      final int stop = Math.min(a.length, b.length);
      while (p < stop) {
        int diff = a[p] - b[p];
        if (diff != 0) return diff;
        p++;
      }
      // One is a prefix of the other, or, they are equal:
      return a.length - b.length;
    }
  });
  for (int i = 0; i < tokens.length; i++) {
    AttributeSource token = tokens[i];
    final NamedList<Object> tokenNamedList = new SimpleOrderedMap<>();
    final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
    BytesRef rawBytes = termAtt.getBytesRef();
    final String text = fieldType.indexedToReadable(rawBytes, new CharsRefBuilder()).toString();
    tokenNamedList.add("text", text);
    if (token.hasAttribute(CharTermAttribute.class)) {
      final String rawText = token.getAttribute(CharTermAttribute.class).toString();
      if (!rawText.equals(text)) {
        tokenNamedList.add("raw_text", rawText);
      }
    }
    tokenNamedList.add("raw_bytes", rawBytes.toString());
    if (context.getTermsToMatch().contains(rawBytes)) {
      tokenNamedList.add("match", true);
    }
    token.reflectWith(new AttributeReflector() {
      @Override
      public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
        // leave out position and bytes term
        if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
        if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
        if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;
        String k = attClass.getName() + '#' + key;
        // map keys for "standard attributes":
        if (ATTRIBUTE_MAPPING.containsKey(k)) {
          k = ATTRIBUTE_MAPPING.get(k);
        }
        if (value instanceof BytesRef) {
          final BytesRef p = (BytesRef) value;
          value = p.toString();
        }
        tokenNamedList.add(k, value);
      }
    });
    tokensNamedLists.add(tokenNamedList);
  }
  return tokensNamedLists;
}
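Two AttributeSource facilities carry this method: typed lookups via getAttribute/hasAttribute, and generic enumeration of every attribute/key/value triple via reflectWith. A self-contained sketch of reflectWith, using an arbitrary example token (the text and offsets are illustrative only):

AttributeSource src = new AttributeSource();
CharTermAttribute termAtt = src.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = src.addAttribute(OffsetAttribute.class);
termAtt.append("example");
offsetAtt.setOffset(0, 7);
// prints one line per attribute property, e.g. "...CharTermAttribute#term = example"
src.reflectWith(new AttributeReflector() {
  @Override
  public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
    System.out.println(attClass.getName() + '#' + key + " = " + value);
  }
});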
Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
Class AnalysisRequestHandlerBase, method analyzeValue.
/**
* Analyzes the given value using the given Analyzer.
*
* @param value Value to analyze
* @param context The {@link AnalysisContext analysis context}.
*
* @return NamedList containing the tokens produced by analyzing the given value
*/
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {
  Analyzer analyzer = context.getAnalyzer();
  if (!TokenizerChain.class.isInstance(analyzer)) {
    try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
      NamedList<List<NamedList>> namedList = new NamedList<>();
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
      return namedList;
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
    }
  }
  TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
  CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
  TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
  TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
  NamedList<Object> namedList = new NamedList<>();
  if (0 < cfiltfacs.length) {
    String source = value;
    for (CharFilterFactory cfiltfac : cfiltfacs) {
      Reader reader = new StringReader(source);
      reader = cfiltfac.create(reader);
      source = writeCharStream(namedList, reader);
    }
  }
  TokenStream tokenStream = tfac.create();
  ((Tokenizer) tokenStream).setReader(tokenizerChain.initReader(null, new StringReader(value)));
  List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
  namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
  ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, tokens);
  for (TokenFilterFactory tokenFilterFactory : filtfacs) {
    for (final AttributeSource tok : tokens) {
      tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
    }
    // overwrite the vars "tokenStream", "tokens", and "listBasedTokenStream"
    tokenStream = tokenFilterFactory.create(listBasedTokenStream);
    tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, tokens);
  }
  return namedList;
}
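analyzeTokenStream (not shown here) materializes each emitted token as its own AttributeSource snapshot, which is what lets every stage's output be listed and replayed through ListBasedTokenStream. cloneAttributes() is the standard way to take such a snapshot; a minimal sketch of the pattern, assuming a plain WhitespaceTokenizer instead of the request's configured chain:

List<AttributeSource> tokens = new ArrayList<>();
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("hello attribute source"));
tokenizer.reset();
while (tokenizer.incrementToken()) {
  tokens.add(tokenizer.cloneAttributes()); // independent copy of the stream's current attribute state
}
tokenizer.end();
tokenizer.close();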
Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.
Class ShingleFilter, method getNextToken.
/**
* <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>,
* <code>positionIncrement - 1</code> {@link #fillerToken}s are
* inserted first.
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
* @throws IOException if the input stream has a problem
*/
private InputWindowToken getNextToken(InputWindowToken target) throws IOException {
  InputWindowToken newTarget = target;
  if (numFillerTokensToInsert > 0) {
    if (null == target) {
      newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
    } else {
      nextInputStreamToken.copyTo(target.attSource);
    }
    // A filler token occupies no space
    newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
    newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
    newTarget.isFiller = true;
    --numFillerTokensToInsert;
  } else if (isNextInputStreamToken) {
    if (null == target) {
      newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
    } else {
      nextInputStreamToken.copyTo(target.attSource);
    }
    isNextInputStreamToken = false;
    newTarget.isFiller = false;
  } else if (!exhausted) {
    if (input.incrementToken()) {
      if (null == target) {
        newTarget = new InputWindowToken(cloneAttributes());
      } else {
        this.copyTo(target.attSource);
      }
      if (posIncrAtt.getPositionIncrement() > 1) {
        // Each output shingle must contain at least one input token,
        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
        numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
        // Save the current token as the next input stream token
        if (null == nextInputStreamToken) {
          nextInputStreamToken = cloneAttributes();
        } else {
          this.copyTo(nextInputStreamToken);
        }
        isNextInputStreamToken = true;
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
      } else {
        newTarget.isFiller = false;
      }
    } else {
      exhausted = true;
      input.end();
      endState = captureState();
      numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
      if (numFillerTokensToInsert > 0) {
        nextInputStreamToken = new AttributeSource(getAttributeFactory());
        nextInputStreamToken.addAttribute(CharTermAttribute.class);
        OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
        newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
        // Recurse/loop just once:
        return getNextToken(target);
      } else {
        newTarget = null;
      }
    }
  } else {
    newTarget = null;
  }
  return newTarget;
}
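Every branch above moves token state with the same two AttributeSource primitives: cloneAttributes() to allocate an independent copy, and copyTo(target) to overwrite an existing target in place (the target must hold identical attribute implementations, which cloneAttributes() guarantees). A small sketch of the pair, with an arbitrary term text:

AttributeSource source = new AttributeSource();
CharTermAttribute term = source.addAttribute(CharTermAttribute.class);
term.append("shingle");
AttributeSource target = source.cloneAttributes(); // independent copy, identical attribute impls
term.setEmpty().append("_");                       // mutate the source only
source.copyTo(target);                             // overwrite target in place; its term is now "_" too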
Use of org.apache.lucene.util.AttributeSource in project commons by twitter.
Class TokenTypeAttributeSerializerTest, method serialize.
private byte[] serialize(TokenType tokenType) throws IOException {
  AttributeSource attributeSource = new AttributeSource();
  TokenTypeAttribute tokenTypeAttribute = attributeSource.addAttribute(TokenTypeAttribute.class);
  tokenTypeAttribute.setType(tokenType);
  TokenTypeAttributeSerializer serializer = new TokenTypeAttributeSerializer();
  serializer.initialize(attributeSource, TokenStreamSerializer.CURRENT_VERSION);
  ByteArrayOutputStream output = new ByteArrayOutputStream();
  TokenStreamSerializer.AttributeOutputStream outputStream = new TokenStreamSerializer.AttributeOutputStream(output);
  serializer.serialize(outputStream);
  return output.toByteArray();
}
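The fixture leans on a core AttributeSource guarantee: addAttribute returns the single shared instance per attribute interface, so state set through that reference is exactly what the serializer later reads from the same source. A minimal sketch of that contract (the assertion is illustrative):

AttributeSource src = new AttributeSource();
TokenTypeAttribute added = src.addAttribute(TokenTypeAttribute.class);
added.setType(tokenType); // mutate through the returned reference
// a later lookup returns the same instance, hence the same state:
assert src.getAttribute(TokenTypeAttribute.class) == added;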