Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by Apache.
The class GraphTokenStreamFiniteStrings, method build().
/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
  Automaton.Builder builder = new Automaton.Builder();
  final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  in.reset();
  int pos = -1;
  int prevIncr = 1;
  int state = -1;
  while (in.incrementToken()) {
    int currentIncr = posIncAtt.getPositionIncrement();
    if (pos == -1 && currentIncr < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }
    // always use inc 1 while building, but save original increment
    int incr = Math.min(1, currentIncr);
    if (incr > 0) {
      pos += incr;
    }
    int endPos = pos + posLengthAtt.getPositionLength();
    while (state < endPos) {
      state = builder.createState();
    }
    BytesRef term = termBytesAtt.getBytesRef();
    int id = getTermID(currentIncr, prevIncr, term);
    builder.addTransition(pos, endPos, id);
    // only save last increment on non-zero increment in case we have multiple stacked tokens
    if (currentIncr > 0) {
      prevIncr = currentIncr;
    }
  }
  in.end();
  if (state != -1) {
    builder.setAccept(state, true);
  }
  return builder.finish();
}
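As a usage sketch, the same attribute pattern can be driven end to end with any Analyzer. The snippet below is illustrative, not from the original source: the class name DumpTokenGraph, the field name "body", and the choice of StandardAnalyzer are all assumptions.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;

public class DumpTokenGraph {
  public static void main(String[] args) throws IOException {
    // Hypothetical driver: prints each term with its [start, end) position span,
    // using the same consume loop as build() above.
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      ts.reset();
      int pos = -1;
      while (ts.incrementToken()) {
        pos += posIncAtt.getPositionIncrement();
        int endPos = pos + posLenAtt.getPositionLength();
        System.out.println(termAtt.getBytesRef().utf8ToString() + " [" + pos + "," + endPos + ")");
      }
      ts.end();
    }
  }
}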
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by Apache.
The class ReadTokensTask, method doLogic().
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for (final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE || field.fieldType().tokenized() == false) {
      continue;
    }
    final TokenStream stream = field.tokenStream(analyzer, null);
    // reset the TokenStream to the first token
    stream.reset();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    while (stream.incrementToken()) {
      termAtt.getBytesRef();
      tokenCount++;
    }
    stream.end();
    stream.close();
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
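Outside the benchmark harness, the same counting loop reduces to a small helper. This is a sketch: the method name countTokens and the field name "f" are invented for illustration, and it takes a plain String instead of a Document's IndexableFields.

static int countTokens(Analyzer analyzer, String text) throws IOException {
  int tokenCount = 0;
  try (TokenStream stream = analyzer.tokenStream("f", text)) {
    // getAttribute (rather than addAttribute) mirrors the task above: the
    // analysis chain is expected to have registered the attribute already.
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      termAtt.getBytesRef(); // materialize the term bytes, as doLogic() does
      tokenCount++;
    }
    stream.end();
  }
  return tokenCount;
}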
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by Apache.
The class TestLongPostings, method getRandomTerm().
// Produces a realistic unicode random string that
// survives MockAnalyzer unchanged:
private String getRandomTerm(String other) throws IOException {
  Analyzer a = new MockAnalyzer(random());
  while (true) {
    String s = TestUtil.randomRealisticUnicodeString(random());
    if (other != null && s.equals(other)) {
      continue;
    }
    try (TokenStream ts = a.tokenStream("foo", s)) {
      final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
      ts.reset();
      int count = 0;
      boolean changed = false;
      while (ts.incrementToken()) {
        final BytesRef termBytes = termAtt.getBytesRef();
        if (count == 0 && !termBytes.utf8ToString().equals(s)) {
          // The value was changed during analysis. Keep iterating so the
          // tokenStream is exhausted.
          changed = true;
        }
        count++;
      }
      ts.end();
      // Did we iterate just once and the value was unchanged?
      if (!changed && count == 1) {
        return s;
      }
    }
  }
}
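The accept/reject test inside that loop can be read as a standalone predicate. A sketch under the same assumptions (the helper name survivesAnalysisUnchanged is invented; the "foo" field name is carried over from the test):

static boolean survivesAnalysisUnchanged(Analyzer a, String s) throws IOException {
  try (TokenStream ts = a.tokenStream("foo", s)) {
    TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
    ts.reset();
    int count = 0;
    boolean changed = false;
    while (ts.incrementToken()) {
      if (count == 0 && !termAtt.getBytesRef().utf8ToString().equals(s)) {
        changed = true; // keep iterating so the stream is exhausted
      }
      count++;
    }
    ts.end();
    // Accept only strings that analyze to exactly one unchanged token.
    return !changed && count == 1;
  }
}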
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project janusgraph by JanusGraph.
The class LuceneIndex, method customTokenize().
// adapted from SolrIndex
private List<String> customTokenize(Analyzer analyzer, String fieldName, String value) {
  final List<String> terms = new ArrayList<>();
  try (CachingTokenFilter stream = new CachingTokenFilter(analyzer.tokenStream(fieldName, value))) {
    final TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      terms.add(termAtt.getBytesRef().utf8ToString());
    }
    return terms;
  } catch (IOException e) {
    throw new IllegalArgumentException(e.getMessage(), e);
  }
}
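A plausible call site might look as follows; the choice of StandardAnalyzer and the inputs are illustrative, not from LuceneIndex:

// Hypothetical usage of customTokenize.
Analyzer analyzer = new StandardAnalyzer();
List<String> terms = customTokenize(analyzer, "name", "Hello JanusGraph World");
// With StandardAnalyzer's lowercasing tokenization this should yield
// ["hello", "janusgraph", "world"].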
Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by Apache.
The class Analyzer, method normalize().
/**
 * Normalize a string down to the representation that it would have in the
 * index.
 * <p>
 * This is typically used by query parsers in order to generate a query on
 * a given term, without tokenizing or stemming, which are undesirable if
 * the string to analyze is a partial word (e.g. in case of a wildcard or
 * fuzzy query).
 * <p>
 * This method uses {@link #initReaderForNormalization(String, Reader)} in
 * order to apply necessary character-level normalization and then
 * {@link #normalize(String, TokenStream)} in order to apply the normalizing
 * token filters.
 */
public final BytesRef normalize(final String fieldName, final String text) {
  try {
    // apply char filters
    final String filteredText;
    try (Reader reader = new StringReader(text)) {
      Reader filterReader = initReaderForNormalization(fieldName, reader);
      char[] buffer = new char[64];
      StringBuilder builder = new StringBuilder();
      for (;;) {
        final int read = filterReader.read(buffer, 0, buffer.length);
        if (read == -1) {
          break;
        }
        builder.append(buffer, 0, read);
      }
      filteredText = builder.toString();
    } catch (IOException e) {
      throw new IllegalStateException("Normalization threw an unexpected exception", e);
    }
    final AttributeFactory attributeFactory = attributeFactory(fieldName);
    try (TokenStream ts = normalize(fieldName,
        new StringTokenStream(attributeFactory, filteredText, text.length()))) {
      final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      ts.reset();
      if (ts.incrementToken() == false) {
        throw new IllegalStateException("The normalization token stream is "
            + "expected to produce exactly 1 token, but got 0 for analyzer "
            + this + " and input \"" + text + "\"");
      }
      final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
      if (ts.incrementToken()) {
        throw new IllegalStateException("The normalization token stream is "
            + "expected to produce exactly 1 token, but got 2+ for analyzer "
            + this + " and input \"" + text + "\"");
      }
      ts.end();
      return term;
    }
  } catch (IOException e) {
    throw new IllegalStateException("Normalization threw an unexpected exception", e);
  }
}
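Typical usage is in a query parser that needs to lowercase a partial word without tokenizing it. The snippet below is a sketch: the field name "title" and the use of StandardAnalyzer (whose normalization chain applies lowercasing) are assumptions, not taken from the source above.

// Hypothetical call: normalize a fragment before building a wildcard query.
Analyzer analyzer = new StandardAnalyzer();
BytesRef normalized = analyzer.normalize("title", "QuIck");
// With a lowercasing normalization chain this should print "quick".
System.out.println(normalized.utf8ToString());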