Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
Class AnalysisRequestHandlerBase, method convertTokensToNamedLists.
/**
 * Converts the list of Tokens to a list of NamedLists representing the tokens.
 *
 * @param tokenList Tokens to convert
 * @param context The analysis context
 *
 * @return List of NamedLists containing the relevant information taken from the tokens
 */
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
  final List<NamedList> tokensNamedLists = new ArrayList<>();
  final FieldType fieldType = context.getFieldType();
  final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);

  // sort the tokens by absolute position
  ArrayUtil.timSort(tokens, new Comparator<AttributeSource>() {
    @Override
    public int compare(AttributeSource a, AttributeSource b) {
      return arrayCompare(a.getAttribute(TokenTrackingAttribute.class).getPositions(),
                          b.getAttribute(TokenTrackingAttribute.class).getPositions());
    }

    private int arrayCompare(int[] a, int[] b) {
      int p = 0;
      final int stop = Math.min(a.length, b.length);
      while (p < stop) {
        int diff = a[p] - b[p];
        if (diff != 0) return diff;
        p++;
      }
      // One is a prefix of the other, or they are equal:
      return a.length - b.length;
    }
  });

  for (int i = 0; i < tokens.length; i++) {
    AttributeSource token = tokens[i];
    final NamedList<Object> tokenNamedList = new SimpleOrderedMap<>();
    final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
    BytesRef rawBytes = termAtt.getBytesRef();
    final String text = fieldType.indexedToReadable(rawBytes, new CharsRefBuilder()).toString();
    tokenNamedList.add("text", text);

    if (token.hasAttribute(CharTermAttribute.class)) {
      final String rawText = token.getAttribute(CharTermAttribute.class).toString();
      if (!rawText.equals(text)) {
        tokenNamedList.add("raw_text", rawText);
      }
    }

    tokenNamedList.add("raw_bytes", rawBytes.toString());

    if (context.getTermsToMatch().contains(rawBytes)) {
      tokenNamedList.add("match", true);
    }

    token.reflectWith(new AttributeReflector() {
      @Override
      public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
        // leave out position and bytes term
        if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
        if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
        if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;

        String k = attClass.getName() + '#' + key;
        // map keys for "standard attributes":
        if (ATTRIBUTE_MAPPING.containsKey(k)) {
          k = ATTRIBUTE_MAPPING.get(k);
        }
        if (value instanceof BytesRef) {
          final BytesRef p = (BytesRef) value;
          value = p.toString();
        }
        tokenNamedList.add(k, value);
      }
    });

    tokensNamedLists.add(tokenNamedList);
  }
  return tokensNamedLists;
}
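For context, each element of tokenList is a per-token snapshot of a stream's attributes. A hedged sketch of how such a list can be captured (the helper name collectTokens is an assumption; Solr's own analyzeTokenStream does something similar, and additionally tracks token positions for the sort above):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeSource;

static List<AttributeSource> collectTokens(TokenStream stream) throws IOException {
  List<AttributeSource> tokens = new ArrayList<>();
  stream.reset();
  while (stream.incrementToken()) {
    // cloneAttributes() deep-copies the current attribute state, so later
    // tokens do not overwrite earlier snapshots
    tokens.add(stream.cloneAttributes());
  }
  stream.end();
  return tokens;
}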
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
Class PayloadUtils, method createSpanQuery.
/**
 * The generated SpanQuery will be either a SpanTermQuery or an ordered, zero slop SpanNearQuery, depending
 * on how many tokens are emitted.
 */
public static SpanQuery createSpanQuery(String field, String value, Analyzer analyzer) throws IOException {
  // adapted this from QueryBuilder.createSpanQuery (which isn't currently public) and added reset(), end(), and close() calls
  List<SpanTermQuery> terms = new ArrayList<>();
  try (TokenStream in = analyzer.tokenStream(field, value)) {
    in.reset();
    TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
    while (in.incrementToken()) {
      terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
    }
    in.end();
  }

  SpanQuery query;
  if (terms.isEmpty()) {
    query = null;
  } else if (terms.size() == 1) {
    query = terms.get(0);
  } else {
    query = new SpanNearQuery(terms.toArray(new SpanTermQuery[terms.size()]), 0, true);
  }
  return query;
}
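A hedged call-site sketch for the method above (the field name, text, and analyzer choice are assumptions; note the three possible shapes of the result):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.spans.SpanQuery;

static SpanQuery spanFor(String text) throws IOException {
  Analyzer analyzer = new StandardAnalyzer();
  // result is null when analysis emits no tokens (e.g. only stopwords),
  // a SpanTermQuery for one token, or an ordered zero-slop SpanNearQuery
  // otherwise; callers must null-check
  return createSpanQuery("body", text, analyzer);
}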
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
Class MemoryIndex, method storeTerms.
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
  int pos = -1;
  int offset = 0;
  if (info.numTokens > 0) {
    pos = info.lastPosition + positionIncrementGap;
    offset = info.lastOffset + offsetGap;
  }

  try (TokenStream stream = tokenStream) {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
    stream.reset();

    while (stream.incrementToken()) {
      // if (DEBUG) System.err.println("token='" + term + "'");
      info.numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) {
        info.numOverlapTokens++;
      }
      pos += posIncr;
      int ord = info.terms.add(termAtt.getBytesRef());
      if (ord < 0) {
        ord = (-ord) - 1;
        postingsWriter.reset(info.sliceArray.end[ord]);
      } else {
        info.sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      info.sliceArray.freq[ord]++;
      info.sumTotalTermFreq++;
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();

    if (info.numTokens > 0) {
      info.lastPosition = pos;
      info.lastOffset = offsetAtt.endOffset() + offset;
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
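Since storeTerms is private, callers reach it through MemoryIndex's public addField API. A minimal usage sketch (field name, text, and analyzer are assumptions):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

static float demo() {
  // storeOffsets = true exercises the offset ints written in storeTerms above
  MemoryIndex index = new MemoryIndex(true);
  index.addField("content", "the quick brown fox", new StandardAnalyzer());
  // search the single-document in-memory index; returns a score > 0 on a match
  return index.search(new TermQuery(new Term("content", "fox")));
}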
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
Class ICUCollationField, method getCollationKey.
/**
 * Analyze the text with the analyzer, instead of the collator.
 * Because ICU collators are not thread-safe, this keeps things
 * simple (we already have a thread-local clone in the reused TS).
 */
private BytesRef getCollationKey(String field, String text) {
  try (TokenStream source = analyzer.tokenStream(field, text)) {
    source.reset();
    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);

    // we control the analyzer here: most errors are impossible
    if (!source.incrementToken()) {
      throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
    }
    BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
    assert !source.incrementToken();

    source.end();
    return bytes;
  } catch (IOException e) {
    throw new RuntimeException("Unable to analyze text: " + text, e);
  }
}
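The same pattern generalizes beyond collation: whenever an analyzer is expected to emit exactly one token, the BytesRef must be deep-copied because the attribute reuses its buffer across incrementToken() calls. A hedged generic sketch (the helper name analyzeSingleToken is hypothetical):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

static BytesRef analyzeSingleToken(Analyzer analyzer, String field, String text) throws IOException {
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    ts.reset();
    TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
    if (!ts.incrementToken()) {
      throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
    }
    // deep copy: the attribute's BytesRef is overwritten by the next incrementToken()
    BytesRef token = BytesRef.deepCopyOf(termAtt.getBytesRef());
    if (ts.incrementToken()) {
      throw new IllegalArgumentException("analyzer returned more than one term for text: " + text);
    }
    ts.end();
    return token;
  }
}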
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
Class QueryBuilder, method analyzeMultiPhrase.
/**
 * Creates a complex phrase query from the cached tokenstream contents.
 */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException {
  MultiPhraseQuery.Builder mpqb = newMultiPhraseQueryBuilder();
  mpqb.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;
  List<Term> multiTerms = new ArrayList<>();
  stream.reset();

  while (stream.incrementToken()) {
    int positionIncrement = posIncrAtt.getPositionIncrement();
    if (positionIncrement > 0 && multiTerms.size() > 0) {
      if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
      } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
      }
      multiTerms.clear();
    }
    position += positionIncrement;
    multiTerms.add(new Term(field, termAtt.getBytesRef()));
  }

  if (enablePositionIncrements) {
    mpqb.add(multiTerms.toArray(new Term[0]), position);
  } else {
    mpqb.add(multiTerms.toArray(new Term[0]));
  }
  return mpqb.build();
}
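analyzeMultiPhrase is protected and is reached through QueryBuilder's public entry points; createPhraseQuery routes here when the analyzed text puts several terms at one position (e.g. synonyms). An illustrative driver (analyzer, field, and text are assumptions):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

static Query phrase() {
  QueryBuilder builder = new QueryBuilder(new StandardAnalyzer());
  // with a synonym-injecting analyzer, multiple terms can share a position,
  // and the phrase is built as a MultiPhraseQuery via analyzeMultiPhrase
  return builder.createPhraseQuery("title", "fast car", 2); // slop = 2
}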