Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestJapaneseTokenizer, method testSurrogates2:
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(10000);
  for (int i = 0; i < numIterations; i++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + i);
    }
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
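The loop above is the standard TokenStream consumption contract: reset(), incrementToken() until it returns false, end(), then close. A minimal standalone sketch of the same contract, using WhitespaceAnalyzer purely for illustration (not part of the test above):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeTokenStream {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("body", "hello token stream")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                      // mandatory before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(termAtt);   // CharTermAttribute implements CharSequence
      }
      ts.end();                        // records the final offset state
    }                                  // try-with-resources closes the stream
  }
}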
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestJapaneseTokenizer, method makeTokenList:
private ArrayList<String> makeTokenList(Analyzer a, String in) throws Exception {
  ArrayList<String> list = new ArrayList<>();
  TokenStream ts = a.tokenStream("dummy", in);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    list.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  return list;
}
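A hypothetical call site for this helper (the input string, the analyzer field, and the expected segmentation are illustrative assumptions, not taken from the test class):

// Hypothetical usage; the expected segmentation is an assumption.
// Assumes java.util.Arrays is imported and JUnit assertions are in scope.
ArrayList<String> tokens = makeTokenList(analyzer, "これは本ではない");
assertEquals(Arrays.asList("これ", "は", "本", "で", "は", "ない"), tokens);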
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestCachingTokenFilter, method checkTokens:
private void checkTokens(TokenStream stream) throws IOException {
  int count = 0;
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  while (stream.incrementToken()) {
    assertTrue(count < tokens.length);
    assertEquals(tokens[count], termAtt.toString());
    count++;
  }
  assertEquals(tokens.length, count);
}
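This helper compares the stream against a tokens fixture declared on the test class but not shown in this excerpt. The position assertions in testCaching below pin it down (term1 at position 0, term2 at positions 1 and 3, term3 at position 2), so the fixture is presumably equivalent to the following reconstruction:

// Reconstructed from the assertions in testCaching; not the verbatim source.
private String[] tokens = new String[] { "term1", "term2", "term3", "term2" };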
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestCachingTokenFilter, method testCaching:
public void testCaching() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  AtomicInteger resetCount = new AtomicInteger(0);
  TokenStream stream = new TokenStream() {
    private int index = 0;
    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    @Override
    public void reset() throws IOException {
      super.reset();
      resetCount.incrementAndGet();
    }

    @Override
    public boolean incrementToken() {
      if (index == tokens.length) {
        return false;
      } else {
        clearAttributes();
        termAtt.append(tokens[index++]);
        offsetAtt.setOffset(0, 0);
        return true;
      }
    }
  };
  stream = new CachingTokenFilter(stream);
  doc.add(new TextField("preanalyzed", stream));

  // 1) we consume all tokens twice before we add the doc to the index
  assertFalse(((CachingTokenFilter) stream).isCached());
  stream.reset();
  assertFalse(((CachingTokenFilter) stream).isCached());
  checkTokens(stream);
  stream.reset();
  checkTokens(stream);
  assertTrue(((CachingTokenFilter) stream).isCached());

  // 2) now add the document to the index and verify that all tokens are indexed;
  // don't reset the stream here, the DocumentsWriter should do that implicitly
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(0, termPositions.nextPosition());
  termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, termPositions.freq());
  assertEquals(1, termPositions.nextPosition());
  assertEquals(3, termPositions.nextPosition());
  termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(2, termPositions.nextPosition());
  reader.close();
  writer.close();

  // 3) reset the stream and consume the tokens again
  stream.reset();
  checkTokens(stream);
  assertEquals(1, resetCount.get());
  dir.close();
}
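The final assertEquals(1, resetCount.get()) is the heart of the test: the inner stream's reset() ran exactly once, because once the cache is filled, every later reset() on the CachingTokenFilter replays cached tokens without touching its input. A minimal standalone sketch of that replay behavior, using WhitespaceTokenizer purely for illustration:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CachingReplay {
  public static void main(String[] args) throws IOException {
    Tokenizer inner = new WhitespaceTokenizer();
    inner.setReader(new StringReader("one two three"));
    try (TokenStream cached = new CachingTokenFilter(inner)) {
      CharTermAttribute term = cached.addAttribute(CharTermAttribute.class);
      cached.reset();                    // first pass: consumes and caches inner
      while (cached.incrementToken()) {
        System.out.println(term);        // one, two, three
      }
      cached.reset();                    // second pass: replayed from the cache
      while (cached.incrementToken()) {
        System.out.println(term);        // one, two, three again
      }
      cached.end();
    }
  }
}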
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestPositionIncrement, method testSetPosition:
public void testSetPosition() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new Tokenizer() {
        // TODO: use CannedTokenStream
        private final String[] TOKENS = { "1", "2", "3", "4", "5" };
        private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };
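        // Positions start at -1 and advance by each increment, so these
        // increments place the tokens at absolute positions 0, 2, 3, 3, 4:
        // a gap after "1", and "3"/"4" sharing a position. The phrase-query
        // assertions below all follow from this layout.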
        private int i = 0;

        PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public boolean incrementToken() {
          if (i == TOKENS.length)
            return false;
          clearAttributes();
          termAtt.append(TOKENS[i]);
          offsetAtt.setOffset(i, i);
          posIncrAtt.setPositionIncrement(INCREMENTS[i]);
          i++;
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          this.i = 0;
        }
      });
    }
  };
  Directory store = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
  Document d = new Document();
  d.add(newTextField("field", "bogus", Field.Store.YES));
  writer.addDocument(d);
  IndexReader reader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(reader);

  PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
  pos.nextDoc();
  // first token should be at position 0
  assertEquals(0, pos.nextPosition());
  pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
  pos.nextDoc();
  // second token should be at position 2
  assertEquals(2, pos.nextPosition());

  PhraseQuery q;
  ScoreDoc[] hits;
  q = new PhraseQuery("field", "1", "2");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // same as previous, using the builder with implicit positions
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"));
  builder.add(new Term("field", "2"));
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // same as previous, just specifying the positions explicitly
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 1);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // specifying the correct positions should find the phrase
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 2);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);

  q = new PhraseQuery("field", "2", "3");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);

  q = new PhraseQuery("field", "3", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // a phrase query finds it when the correct positions are specified
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "4"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);

  // a phrase query should fail for a non-existent term, even if another
  // of the searched terms exists at the same position
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "9"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // a multi-phrase query should succeed for a non-existent term, because
  // another of the searched terms exists at the same position
  MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
  mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
  hits = searcher.search(mqb.build(), 1000).scoreDocs;
  assertEquals(1, hits.length);

  q = new PhraseQuery("field", "2", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);

  q = new PhraseQuery("field", "3", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);

  q = new PhraseQuery("field", "4", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);

  q = new PhraseQuery("field", "2", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);

  reader.close();
  store.close();
}
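All of the examples above read CharTermAttribute; for completeness, the write side of the API is typically a TokenFilter that mutates the term buffer in place. A minimal sketch (class name illustrative; the per-char toUpperCase deliberately ignores surrogate pairs for brevity, which is exactly the kind of hazard testSurrogates2 guards against):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class UpperCaseSketchFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  UpperCaseSketchFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Mutate the shared term buffer in place; length() bounds the valid chars.
    char[] buffer = termAtt.buffer();
    int length = termAtt.length();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toUpperCase(buffer[i]); // naive: per-char, not codepoint-aware
    }
    return true;
  }
}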