use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project elasticsearch by elastic.
the class TransportAnalyzeAction method simpleAnalyze.
/**
 * Analyzes every text value of the request with the given analyzer and returns all emitted
 * tokens as one flat list.
 *
 * <p>Positions and offsets are cumulative across text values. After a value has been consumed,
 * the running position is advanced by the position increment reported after {@code end()} plus
 * the analyzer's position increment gap, and the running offset by the end offset reported
 * after {@code end()} plus the analyzer's offset gap.
 *
 * @param request  supplies the text values to analyze
 * @param analyzer creates one token stream per text value
 * @param field    field name handed to the analyzer
 * @return the collected tokens, in the order they were emitted
 * @throws ElasticsearchException wrapping any {@link IOException} raised while consuming a stream
 */
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    final List<AnalyzeResponse.AnalyzeToken> collected = new ArrayList<>();
    // Running position / offset base carried over from one text value to the next.
    int position = -1;
    int offsetBase = 0;
    for (final String value : request.text()) {
        try (TokenStream ts = analyzer.tokenStream(field, value)) {
            ts.reset();
            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
            final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            final TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
            final PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);

            while (ts.incrementToken()) {
                final int step = posIncAtt.getPositionIncrement();
                // A zero increment means the token shares the previous token's position.
                if (step > 0) {
                    position += step;
                }
                final int start = offsetBase + offsetAtt.startOffset();
                final int end = offsetBase + offsetAtt.endOffset();
                final AnalyzeResponse.AnalyzeToken token = new AnalyzeResponse.AnalyzeToken(
                    termAtt.toString(), position, start, end, posLenAtt.getPositionLength(), typeAtt.type(), null);
                collected.add(token);
            }

            ts.end();
            // After end() the attributes carry the stream's final offset and trailing increment.
            final int finalOffset = offsetAtt.endOffset();
            final int trailingIncrement = posIncAtt.getPositionIncrement();
            position += trailingIncrement + analyzer.getPositionIncrementGap(field);
            offsetBase += finalOffset + analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return collected;
}
use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project lucene-solr by apache.
the class TokenStreamToTermAutomatonQuery method toQuery.
/**
 * Pulls the graph (including {@link PositionLengthAttribute}) from the provided
 * {@link TokenStream} and builds the corresponding {@link TermAutomatonQuery}.
 *
 * <p>Every token becomes one transition, labelled with the whole term, from the state of the
 * token's position to the state of its end position (position + position length). A token
 * whose term is the single byte {@code '*'} becomes an "any term" transition instead.
 *
 * <p>Holes are not supported: a position increment greater than 1 is rejected. When
 * {@code preservePositionIncrements} is {@code false}, such increments are collapsed to 1
 * before that check.
 *
 * @param field field the resulting query is created for
 * @param in    token stream to consume; {@code reset()} and {@code end()} are called here,
 *              the stream is not closed
 * @return the finished query; its accept state is the last state that was created
 * @throws IllegalArgumentException if a token has a position increment greater than 1
 * @throws IOException declared for the token stream calls
 */
public TermAutomatonQuery toQuery(String field, TokenStream in) throws IOException {
  final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);

  in.reset();

  final TermAutomatonQuery query = new TermAutomatonQuery(field);

  // Position of the current token; used directly as the source state of its transition.
  int pos = -1;
  // Most recently created state (-1 until one exists).
  int state = -1;

  while (in.incrementToken()) {
    int posInc = posIncAtt.getPositionIncrement();
    if (preservePositionIncrements == false && posInc > 1) {
      posInc = 1;
    }
    // The very first token must advance the position.
    assert pos > -1 || posInc > 0;

    if (posInc > 1) {
      throw new IllegalArgumentException("cannot handle holes; to accept any term, use '*' term");
    }

    if (posInc > 0) {
      // New node:
      pos += posInc;
    }

    final int endPos = pos + posLengthAtt.getPositionLength();
    // Make sure every state up to and including the end position exists.
    while (state < endPos) {
      state = query.createState();
    }

    final BytesRef term = termBytesAtt.getBytesRef();
    if (term.length == 1 && term.bytes[term.offset] == (byte) '*') {
      query.addAnyTransition(pos, endPos);
    } else {
      query.addTransition(pos, endPos, term);
    }
  }

  in.end();

  // TODO: look at endOffset? ts2a did...
  // TODO: this (setting "last" state as the only accept state) may be too simplistic?
  // NOTE(review): if the stream produced no tokens, state is still -1 here; confirm how
  // setAccept handles that.
  query.setAccept(state, true);
  query.finish();

  return query;
}
use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project lucene-solr by apache.
the class FreeTextSuggester method lookup.
/**
 * Retrieve suggestions.
 *
 * <p>Analyzes {@code key} with {@code queryAnalyzer}, remembers the last token seen for each
 * gram count, and then looks those tokens up in the FST, starting with the highest-order gram
 * and backing off to lower orders. Each step down multiplies the score scale by {@code ALPHA}.
 * A completion's last token is reported only once, by the highest-order model that produced it.
 *
 * @param key      text to complete
 * @param contexts must be {@code null}; contexts are not supported
 * @param num      maximum number of results to return
 * @return at most {@code num} results, ordered by descending value, ties broken by key
 * @throws IllegalArgumentException if {@code contexts} is non-null, if a token's position
 *         length disagrees with its recalculated gram count, or if the analyzer produced no
 *         non-empty token
 * @throws IllegalStateException if {@code fst} is {@code null}
 * @throws IOException declared for the token stream calls
 */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
throw new IllegalStateException("Lookup not supported at this time");
}
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
// lastTokens[i] holds the most recent token whose gram count is i + 1:
BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
//System.out.println("lookup: key='" + key + "'");
// Run full analysis, but save only the
// last 1gram, last 2gram, etc.:
int maxEndOffset = -1;
boolean sawRealToken = false;
while (ts.incrementToken()) {
BytesRef tokenBytes = termBytesAtt.getBytesRef();
sawRealToken |= tokenBytes.length > 0;
// TODO: this is somewhat iffy; today, ShingleFilter
// sets posLen to the gram count; maybe we should make
// a separate dedicated att for this?
int gramCount = posLenAtt.getPositionLength();
assert gramCount <= grams;
// Safety: make sure the recalculated count "agrees":
if (countGrams(tokenBytes) != gramCount) {
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
}
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
// Copy the bytes into a fresh builder before storing them:
BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount - 1] = b;
}
ts.end();
if (!sawRealToken) {
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
}
// Carefully fill last tokens with _ tokens;
// ShingleFilter apparently won't emit "only hole"
// tokens:
// Position increment reported after end(), i.e. trailing holes:
int endPosInc = posIncAtt.getPositionIncrement();
// Note this will also be true if input is the empty
// string (in which case we saw no tokens and
// maxEndOffset is still -1), which in fact works out OK
// because we fill the unigram with an empty BytesRef
// below:
// NOTE(review): the sawRealToken check above already throws when no
// non-empty token was seen, so the empty-input case described here
// looks unreachable -- confirm.
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
if (lastTokenEnded) {
// Shift every saved token up one gram order, appending the separator,
// and start a new, empty unigram; e.g.
// starting with "foo":
for (int i = grams - 1; i > 0; i--) {
BytesRefBuilder token = lastTokens[i - 1];
if (token == null) {
continue;
}
token.append(separator);
lastTokens[i] = token;
}
lastTokens[0] = new BytesRefBuilder();
}
Arc<Long> arc = new Arc<>();
BytesReader bytesReader = fst.getBytesReader();
// Try highest order models first, and if they return
// results, return that; else, fallback:
double backoff = 1.0;
List<LookupResult> results = new ArrayList<>(num);
// We only add a given suffix once, from the highest
// order model that saw it; for subsequent lower order
// models we skip it:
final Set<BytesRef> seen = new HashSet<>();
for (int gram = grams - 1; gram >= 0; gram--) {
BytesRefBuilder token = lastTokens[gram];
// Don't make unigram predictions from empty string:
if (token == null || (token.length() == 0 && key.length() > 0)) {
//System.out.println(" gram=" + gram + ": skip: not enough input");
continue;
}
// Stop backing off once the remaining lower-order contexts would be holes only:
if (endPosInc > 0 && gram <= endPosInc) {
//System.out.println(" break: only holes now");
break;
}
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
// TODO: we could add fuzziness here
// match the prefix portion exactly
//Pair<Long,BytesRef> prefixOutput = null;
Long prefixOutput = null;
try {
prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
if (prefixOutput == null) {
// This model never saw this prefix, e.g. the
// trigram model never saw context "purple mushroom"
backoff *= ALPHA;
continue;
}
// TODO: we could do this division at build time, and
// bake it into the FST?
// Denominator for computing scores from current
// model's predictions:
long contextCount = totTokens;
BytesRef lastTokenFragment = null;
// Scan backwards for the last separator: the bytes before it are the
// context (whose weight becomes the denominator), the bytes after it
// are the partially typed last token:
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
BytesRef context = new BytesRef(token.bytes(), 0, i);
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
assert output != null;
contextCount = decodeWeight(output);
lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
// Prefix of the last token; acceptResult() below appends each candidate
// suffix to it temporarily to test against the seen set:
final BytesRefBuilder finalLastToken = new BytesRefBuilder();
if (lastTokenFragment == null) {
// No separator found: the whole token is the last token.
finalLastToken.copyBytes(token.get());
} else {
finalLastToken.copyBytes(lastTokenFragment);
}
CharsRefBuilder spare = new CharsRefBuilder();
// complete top-N
TopResults<Long> completions = null;
try {
// Because we store multiple models in one FST
// (1gram, 2gram, 3gram), we must restrict the
// search so that it only considers the current
// model. For highest order model, this is not
// necessary since all completions in the FST
// must be from this model, but for lower order
// models we have to filter out the higher order
// ones:
// Must do num+seen.size() for queue depth because we may
// reject up to seen.size() paths in acceptResult():
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {
BytesRefBuilder scratchBytes = new BytesRefBuilder();
@Override
protected void addIfCompetitive(Util.FSTPath<Long> path) {
// Paths that continue over a separator arc are dropped:
if (path.arc.label != separator) {
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
super.addIfCompetitive(path);
} else {
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
}
}
@Override
protected boolean acceptResult(IntsRef input, Long output) {
// Temporarily extend finalLastToken with the candidate suffix, check
// whether that full last token was already seen, then restore its length:
Util.toBytesRef(input, scratchBytes);
finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
int lenSav = finalLastToken.length();
finalLastToken.append(scratchBytes);
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
boolean ret = seen.contains(finalLastToken.get()) == false;
finalLastToken.setLength(lenSav);
return ret;
}
};
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
completions = searcher.search();
assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
// token is reused for every completion: truncate back to the prefix, then append the suffix.
int prefixLength = token.length();
BytesRefBuilder suffix = new BytesRefBuilder();
nextCompletion: for (Result<Long> completion : completions) {
token.setLength(prefixLength);
// append suffix
Util.toBytesRef(completion.input, suffix);
token.append(suffix);
//System.out.println(" completion " + token.utf8ToString());
// Skip this path if a higher-order model already
// saw/predicted its last token:
BytesRef lastToken = token.get();
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
assert token.length() - i - 1 > 0;
lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
if (seen.contains(lastToken)) {
//System.out.println(" skip dup " + lastToken.utf8ToString());
continue nextCompletion;
}
// Deep copy: lastToken may share token's bytes, which are overwritten on the next iteration.
seen.add(BytesRef.deepCopyOf(lastToken));
spare.copyUTF8Bytes(token.get());
// Score: Long.MAX_VALUE * backoff * (completion weight / context count).
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
results.add(result);
assert results.size() == seen.size();
//System.out.println(" add result=" + result);
}
backoff *= ALPHA;
}
// Highest value first:
Collections.sort(results, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult a, LookupResult b) {
if (a.value > b.value) {
return -1;
} else if (a.value < b.value) {
return 1;
} else {
// Tie break by UTF16 sort order:
return ((String) a.key).compareTo((String) b.key);
}
}
});
// Several models may each have contributed results; keep only the top num:
if (results.size() > num) {
results.subList(num, results.size()).clear();
}
return results;
}
}
use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project lucene-solr by apache.
the class GraphTokenStreamFiniteStrings method build.
/**
 * Build an automaton from the provided {@link TokenStream}.
 *
 * <p>Each token contributes one transition from its position to its end position (position +
 * position length), labelled with the id returned by {@code getTermID}. While building, any
 * position increment of 1 or more advances the position by exactly one; the token's original
 * increment is still handed to {@code getTermID}, together with the last positive increment
 * seen on an earlier token (1 if there was none).
 *
 * @param in token stream to consume; it is reset and ended here
 * @return the finished automaton; the last created state, if any, is marked as accepting
 * @throws IllegalStateException if the first token has a position increment below 1
 * @throws IOException declared for the token stream calls
 */
private Automaton build(final TokenStream in) throws IOException {
  final Automaton.Builder automaton = new Automaton.Builder();
  final TermToBytesRefAttribute termAtt = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute incAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute lenAtt = in.addAttribute(PositionLengthAttribute.class);

  in.reset();

  int position = -1;
  int lastPositiveIncrement = 1;
  int newestState = -1;

  while (in.incrementToken()) {
    final int originalIncrement = incAtt.getPositionIncrement();
    final boolean advances = originalIncrement >= 1;
    if (!advances && position == -1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }

    // Holes are flattened: whatever the positive increment is, move forward one position.
    if (advances) {
      position++;
    }

    final int endPosition = position + lenAtt.getPositionLength();
    // Create states until the end position has one.
    for (; newestState < endPosition; ) {
      newestState = automaton.createState();
    }

    final BytesRef termBytes = termAtt.getBytesRef();
    final int termId = getTermID(originalIncrement, lastPositiveIncrement, termBytes);
    automaton.addTransition(position, endPosition, termId);

    // Stacked tokens (increment 0) must not overwrite the remembered increment.
    if (originalIncrement > 0) {
      lastPositiveIncrement = originalIncrement;
    }
  }

  in.end();

  final boolean sawAnyToken = newestState != -1;
  if (sawAnyToken) {
    automaton.setAccept(newestState, true);
  }
  return automaton.finish();
}
use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project lucene-solr by apache.
the class TokenStreamToAutomaton method toAutomaton.
/** Pulls the graph (including {@link
 * PositionLengthAttribute}) from the provided {@link
 * TokenStream}, and creates the corresponding
 * automaton where arcs are bytes (or Unicode code points
 * if unicodeArcs = true) from each term.
 *
 * <p>Consecutive positions are joined by a {@code POS_SEP} transition, and positions that no
 * token arrives at are handed to {@code addHoles}. Trailing holes reported after
 * {@code end()} are appended as {@code HOLE} transitions.
 *
 * @param in token stream to consume; {@code reset()} and {@code end()} are called here, the
 *           stream is not closed
 * @return the finished automaton
 * @throws IOException declared for the token stream calls
 */
public Automaton toAutomaton(TokenStream in) throws IOException {
final Automaton.Builder builder = new Automaton.Builder();
// Create the first state up front; the first token leaves from state 0 (see below).
builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
in.reset();
// Only temporarily holds states ahead of our current
// position:
final RollingBuffer<Position> positions = new Positions();
// Position of the current token (-1 before the first token):
int pos = -1;
// Next position that is a candidate for being freed from the rolling buffer:
int freedPos = 0;
// Per-position data (arriving / leaving states) of the current position:
Position posData = null;
// Largest token end offset seen; compared with the final offset after end():
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
if (preservePositionIncrements == false && posInc > 1) {
posInc = 1;
}
// The very first token must advance the position:
assert pos > -1 || posInc > 0;
if (posInc > 0) {
// New node:
pos += posInc;
posData = positions.get(pos);
assert posData.leaving == -1;
if (posData.arriving == -1) {
// No token ever arrived to this position
if (pos == 0) {
// OK: this is the first token
posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
posData.leaving = builder.createState();
addHoles(builder, positions, pos);
}
} else {
// Some earlier token ends here: connect its arriving state to a new
// leaving state with a position separator:
posData.leaving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
addHoles(builder, positions, pos);
}
}
// Release buffered positions we are done with:
while (freedPos <= pos) {
Position freePosData = positions.get(freedPos);
// don't free this position yet if we may still need to fill holes over it:
if (freePosData.arriving == -1 || freePosData.leaving == -1) {
break;
}
positions.freeBefore(freedPos);
freedPos++;
}
}
final int endPos = pos + posLengthAtt.getPositionLength();
final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
// Lazily create the state that all tokens ending at endPos arrive at:
if (endPosData.arriving == -1) {
endPosData.arriving = builder.createState();
}
// Number of arc labels for this term: code points or UTF-8 bytes.
int termLen;
if (unicodeArcs) {
// Decode the term into an array of Unicode code points:
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
}
} else {
termLen = termUTF8.length;
}
// Chain one transition per label from this position's leaving state; the
// last label lands on the end position's arriving state:
int state = posData.leaving;
for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
final int nextState = byteIDX == termLen - 1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
// Mask so the byte is used as an unsigned label (0..255):
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
builder.addTransition(state, nextState, c);
state = nextState;
}
maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
}
in.end();
int endState = -1;
// Position increment reported after end(), i.e. holes after the last token:
int endPosInc = posIncAtt.getPositionIncrement();
// Optionally treat a final offset beyond every token's end offset as one trailing hole:
if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
endPosInc = 1;
}
if (endPosInc > 0) {
// there were hole(s) after the last token
endState = builder.createState();
// add trailing holes now:
int lastState = endState;
// One HOLE transition per trailing hole, separated by POS_SEP; the state
// after the last HOLE is accepting:
while (true) {
int state1 = builder.createState();
builder.addTransition(lastState, state1, HOLE);
endPosInc--;
if (endPosInc == 0) {
builder.setAccept(state1, true);
break;
}
int state2 = builder.createState();
builder.addTransition(state1, state2, POS_SEP);
lastState = state2;
}
} else {
endState = -1;
}
// Close every position after the last token's position that some token arrives at:
// lead it into the trailing holes if there are any, otherwise mark it accepting.
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
if (posData.arriving != -1) {
if (endState != -1) {
builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
builder.setAccept(posData.arriving, true);
}
}
pos++;
}
return builder.finish();
}
Aggregations