Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project jackrabbit-oak by apache.
The class LuceneIndex, method tokenize.
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the raw fulltext query text
 * @param analyzer the analyzer used to tokenize the text
 * @return the list of tokens with wildcard fragments merged back, or
 *         null if tokenization fails
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
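The offset bookkeeping above is easiest to see on a concrete stream. The stand-alone sketch below is only an illustration, not part of Oak: the input "hel*lo wor?d" is made up, StandardAnalyzer stands in for the index's configured analyzer, and the field name "fulltext" stands in for FieldNames.FULLTEXT. Most analyzers drop the wildcard characters, so they fall into the gaps between one token's endOffset and the next token's startOffset, which is exactly where tokenize() re-appends '*' and '?'.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetGapDemo {
    public static void main(String[] args) throws Exception {
        String text = "hel*lo wor?d"; // hypothetical wildcard query text
        try (Analyzer analyzer = new StandardAnalyzer()) { // stand-in analyzer
            TokenStream stream = analyzer.tokenStream("fulltext", new StringReader(text));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // The dropped wildcard characters sit in the gaps between
                // endOffset of one token and startOffset of the next.
                System.out.println(termAtt.toString() + " ["
                        + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
            }
            stream.end();
            stream.close();
        }
    }
}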
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class ShingleFilter, method getNextToken.
/**
 * <p>Get the next token from the input stream.
 * <p>If the next token has <code>positionIncrement > 1</code>,
 * <code>positionIncrement - 1</code> {@link #fillerToken}s are
 * inserted first.
 *
 * @param target Where to put the new token; if null, a new instance is created.
 * @return On success, the populated token; null otherwise
 * @throws IOException if the input stream has a problem
 */
private InputWindowToken getNextToken(InputWindowToken target) throws IOException {
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
    } else if (isNextInputStreamToken) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        isNextInputStreamToken = false;
        newTarget.isFiller = false;
    } else if (!exhausted) {
        if (input.incrementToken()) {
            if (null == target) {
                newTarget = new InputWindowToken(cloneAttributes());
            } else {
                this.copyTo(target.attSource);
            }
            if (posIncrAtt.getPositionIncrement() > 1) {
                // Each output shingle must contain at least one input token,
                // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
                // Save the current token as the next input stream token
                if (null == nextInputStreamToken) {
                    nextInputStreamToken = cloneAttributes();
                } else {
                    this.copyTo(nextInputStreamToken);
                }
                isNextInputStreamToken = true;
                // A filler token occupies no space
                newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
                newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            } else {
                newTarget.isFiller = false;
            }
        } else {
            exhausted = true;
            input.end();
            endState = captureState();
            numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
            if (numFillerTokensToInsert > 0) {
                nextInputStreamToken = new AttributeSource(getAttributeFactory());
                nextInputStreamToken.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
                newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
                // Recurse/loop just once:
                return getNextToken(target);
            } else {
                newTarget = null;
            }
        }
    } else {
        newTarget = null;
    }
    return newTarget;
}
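The filler-token branch is easiest to observe with an upstream stream that has position gaps. The sketch below is a stand-alone illustration, not part of ShingleFilter's own code: the input sentence and the stopped word "divide" are made up, and the import paths assume the Lucene 6.x/7.x line used by the snippets on this page. Removing "divide" leaves a position increment of 2, so getNextToken() emits the default filler token "_" in its place, and the resulting bigrams include "please _" and "_ this" (with default settings the unigrams are emitted as well).

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ShingleFillerDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("please divide this sentence"));
        // Removing "divide" leaves a position gap for getNextToken() to fill.
        TokenStream stream = new StopFilter(source, StopFilter.makeStopSet("divide"));
        stream = new ShingleFilter(stream, 2, 2);
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Shingles containing "_" mark the position of the removed token.
            System.out.println(termAtt.toString() + " ["
                    + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
        }
        stream.end();
        stream.close();
    }
}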
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class EdgeNGramTokenFilterTest, method testSupplementaryCharacters.
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
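The test deliberately counts code points rather than UTF-16 code units: a supplementary character occupies two Java chars, so gram boundaries must be translated with Character.offsetByCodePoints instead of plain string indices. A tiny stand-alone illustration (the emoji input is made up) shows the difference:

public class CodePointDemo {
    public static void main(String[] args) {
        String s = "a\uD83D\uDE00b"; // "a" + U+1F600 (surrogate pair) + "b"
        System.out.println(s.length());                       // 4 UTF-16 code units
        System.out.println(s.codePointCount(0, s.length()));  // 3 code points
        // The first two code points end at char index 3, not 2:
        System.out.println(Character.offsetByCodePoints(s, 0, 2));              // 3
        System.out.println(s.substring(0, Character.offsetByCodePoints(s, 0, 2))); // "a" + emoji
    }
}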
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class NGramTokenFilterTest, method testSupplementaryCharacters.
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
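The nested loops encode the order in which NGramTokenFilter emits grams in the Lucene line these tests come from: grouped by start position, then by increasing length, with every gram keeping the offsets of the whole original token (hence the assertions on 0 and s.length()). A minimal stand-alone sketch on the made-up input "abc" with minGram=1 and maxGram=2 would print a, ab, b, bc, c, each with offsets [0,3):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class NGramOrderDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader("abc"));
        TokenStream stream = new NGramTokenFilter(source, 1, 2); // same ctor as the test
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Offsets always cover the original token, as the test asserts.
            System.out.println(termAtt.toString() + " ["
                    + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
        }
        stream.end();
        stream.close();
    }
}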
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class TestCachingTokenFilter, method testCaching.
public void testCaching() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    AtomicInteger resetCount = new AtomicInteger(0);
    TokenStream stream = new TokenStream() {
        private int index = 0;
        private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public void reset() throws IOException {
            super.reset();
            resetCount.incrementAndGet();
        }

        @Override
        public boolean incrementToken() {
            if (index == tokens.length) {
                return false;
            } else {
                clearAttributes();
                termAtt.append(tokens[index++]);
                offsetAtt.setOffset(0, 0);
                return true;
            }
        }
    };
    stream = new CachingTokenFilter(stream);
    doc.add(new TextField("preanalyzed", stream));
    // 1) we consume all tokens twice before we add the doc to the index
    assertFalse(((CachingTokenFilter) stream).isCached());
    stream.reset();
    assertFalse(((CachingTokenFilter) stream).isCached());
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);
    assertTrue(((CachingTokenFilter) stream).isCached());
    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
    reader.close();
    writer.close();
    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
    assertEquals(1, resetCount.get());
    dir.close();
}
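Outside of this test, CachingTokenFilter is used the same way whenever a single token stream has to be consumed more than once (for example highlighting or query building over pre-analyzed text). The stand-alone sketch below uses a made-up input string: reset() is called before the first pass, the first full pass fills the cache, and later reset()/incrementToken() passes replay the cached tokens instead of re-tokenizing.

import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CachingDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("term1 term2 term2 term3"));
        CachingTokenFilter cached = new CachingTokenFilter(source);
        CharTermAttribute termAtt = cached.addAttribute(CharTermAttribute.class);
        for (int pass = 0; pass < 2; pass++) {
            cached.reset(); // from the second call on, this rewinds the cache
            while (cached.incrementToken()) {
                System.out.println(pass + ": " + termAtt.toString());
            }
            cached.end();
            System.out.println("cached after pass " + pass + ": " + cached.isCached());
        }
        cached.close();
    }
}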