use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class FastVectorHighlighterTest method token.
/**
 * Creates a {@link Token} with the given term text and offsets, then sets its position
 * increment.
 *
 * @param term the token text
 * @param posInc the position increment to set on the token
 * @param startOffset the start offset passed to the {@code Token} constructor
 * @param endOffset the end offset passed to the {@code Token} constructor
 * @return the configured token
 */
private static Token token(String term, int posInc, int startOffset, int endOffset) {
  final Token result = new Token(term, startOffset, endOffset);
  result.setPositionIncrement(posInc);
  return result;
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class FastVectorHighlighterTest method testBooleanPhraseWithSynonym.
/**
 * Highlights a document whose token stream contains a synonym token
 * ({@code "httpwwwfacebookcom"}, position increment 0, offsets 6-29) stacked on the
 * {@code "http"} token, and checks that the whole stored value is highlighted for:
 * <ol>
 *   <li>a phrase query through the original tokens,</li>
 *   <li>a phrase query that goes through the synonym token,</li>
 *   <li>a boolean query OR-ing both phrases together.</li>
 * </ol>
 *
 * <p>The directory, writer and reader are managed with try-with-resources so that they are
 * released even when an assertion fails; they are closed in the order reader, writer,
 * directory.
 *
 * @throws IOException if indexing or reading fails
 */
public void testBooleanPhraseWithSynonym() throws IOException {
  final String expected = "<b>Test: http://www.facebook.com</b>";
  try (Directory dir = newDirectory();
      IndexWriter writer =
          new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())))) {
    Document doc = new Document();
    // Term vectors with positions and offsets are enabled on the indexed field.
    FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
    type.setStoreTermVectorOffsets(true);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectors(true);
    type.freeze();
    // The synonym shares the position of "http" (increment 0) and spans offsets 6-29.
    Token syn = new Token("httpwwwfacebookcom", 6, 29);
    syn.setPositionIncrement(0);
    CannedTokenStream ts =
        new CannedTokenStream(
            new Token("test", 0, 4),
            new Token("http", 6, 10),
            syn,
            new Token("www", 13, 16),
            new Token("facebook", 17, 25),
            new Token("com", 26, 29));
    Field field = new Field("field", ts, type);
    doc.add(field);
    // The stored value is added separately under the same field name.
    doc.add(new StoredField("field", "Test: http://www.facebook.com"));
    writer.addDocument(doc);

    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    try (IndexReader reader = DirectoryReader.open(writer)) {
      int docId = 0;

      // query1: match
      PhraseQuery pq = new PhraseQuery("field", "test", "http", "www", "facebook", "com");
      FieldQuery fieldQuery = highlighter.getFieldQuery(pq, reader);
      String[] bestFragments =
          highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
      assertEquals(expected, bestFragments[0]);

      // query2: match
      PhraseQuery pq2 =
          new PhraseQuery("field", "test", "httpwwwfacebookcom", "www", "facebook", "com");
      fieldQuery = highlighter.getFieldQuery(pq2, reader);
      bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
      assertEquals(expected, bestFragments[0]);

      // query3: OR query1 and query2 together
      BooleanQuery.Builder bq = new BooleanQuery.Builder();
      bq.add(pq, BooleanClause.Occur.SHOULD);
      bq.add(pq2, BooleanClause.Occur.SHOULD);
      fieldQuery = highlighter.getFieldQuery(bq.build(), reader);
      bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
      assertEquals(expected, bestFragments[0]);
    }
  }
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class BaseTermVectorsFormatTestCase method testPostingsEnumPayloads.
/**
 * Exercises the PostingsEnum instances returned by the term vector of a single document
 * whose field "foo" contains the term "bar" twice, with payloads "pay1" and "pay2".
 *
 * The field type below enables term vectors, positions and payloads; it does not enable
 * offsets, and every assertion in this test expects -1 for start and end offsets.
 *
 * For each requested flag set (the default, NONE, POSITIONS, PAYLOADS, OFFSETS, ALL) the
 * test first pulls a fresh enum (passing null as the reuse argument) and then pulls a
 * second one passing the previous enum for reuse, running the same assertions on both.
 */
public void testPostingsEnumPayloads() throws Exception {
Directory dir = newDirectory();
// No analyzer is configured (null); the only field is fed from a canned token stream.
IndexWriterConfig iwc = new IndexWriterConfig(null);
IndexWriter iw = new IndexWriter(dir, iwc);
Document doc = new Document();
// Two occurrences of the same term, each carrying a distinct payload.
Token token1 = new Token("bar", 0, 3);
token1.setPayload(new BytesRef("pay1"));
Token token2 = new Token("bar", 4, 7);
token2.setPayload(new BytesRef("pay2"));
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorPayloads(true);
doc.add(new Field("foo", new CannedTokenStream(token1, token2), ft));
iw.addDocument(doc);
DirectoryReader reader = DirectoryReader.open(iw);
// Term vector of document 0 for field "foo"; "bar" is expected to be its first term.
Terms terms = getOnlyLeafReader(reader).getTermVector(0, "foo");
TermsEnum termsEnum = terms.iterator();
assertNotNull(termsEnum);
assertEquals(new BytesRef("bar"), termsEnum.next());
// sugar method (FREQS)
PostingsEnum postings = termsEnum.postings(null);
assertEquals(-1, postings.docID());
assertEquals(0, postings.nextDoc());
assertEquals(2, postings.freq());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
// termsenum reuse (FREQS)
PostingsEnum postings2 = termsEnum.postings(postings);
assertNotNull(postings2);
// and it had better work
assertEquals(-1, postings2.docID());
assertEquals(0, postings2.nextDoc());
assertEquals(2, postings2.freq());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings2.nextDoc());
// asking for docs only: ok
PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);
assertEquals(-1, docsOnly.docID());
assertEquals(0, docsOnly.nextDoc());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsOnly.freq() == 1 || docsOnly.freq() == 2);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly.nextDoc());
// reuse that too
PostingsEnum docsOnly2 = termsEnum.postings(docsOnly, PostingsEnum.NONE);
assertNotNull(docsOnly2);
// and it had better work
assertEquals(-1, docsOnly2.docID());
assertEquals(0, docsOnly2.nextDoc());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsOnly2.freq() == 1 || docsOnly2.freq() == 2);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly2.nextDoc());
// asking for positions, ok
PostingsEnum docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.POSITIONS);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// now reuse the positions
PostingsEnum docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.POSITIONS);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
// payloads
// (here the payload assertions are strict: each position must return its exact payload)
docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.PAYLOADS);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.PAYLOADS);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
// offsets: requested here, but the field type above never enables them, so the
// assertions still expect -1 for both offsets; payloads may or may not be returned
docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
// all: payload assertions are strict again, offsets are still expected to be -1
docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.ALL);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
// release the writer, the reader and the directory
iw.close();
reader.close();
dir.close();
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TokenGroup method addToken.
/**
 * Records the current token, as exposed by {@code offsetAtt} and {@code termAtt}, in this
 * group together with its score.
 *
 * <p>The call does nothing once the group already holds {@code MAX_NUM_TOKENS_PER_GROUP}
 * tokens. Otherwise the group's overall offsets are widened to include the token, and the
 * match offsets and total score are updated: always for the first token, and only for tokens
 * with a score greater than zero afterwards.
 *
 * @param score the score of the current token
 */
void addToken(float score) {
  if (numTokens >= MAX_NUM_TOKENS_PER_GROUP) {
    return; // group is full, silently drop the token
  }

  final int tokenStart = offsetAtt.startOffset();
  final int tokenEnd = offsetAtt.endOffset();

  if (numTokens == 0) {
    // First token: it defines both the group span and the match span, whatever its score.
    matchStartOffset = tokenStart;
    startOffset = tokenStart;
    matchEndOffset = tokenEnd;
    endOffset = tokenEnd;
    tot += score;
  } else {
    // Later tokens always widen the group span.
    startOffset = Math.min(startOffset, tokenStart);
    endOffset = Math.max(endOffset, tokenEnd);
    if (score > 0) {
      if (tot == 0) {
        // No score accumulated so far: the match span restarts at this token.
        matchStartOffset = tokenStart;
        matchEndOffset = tokenEnd;
      } else {
        matchStartOffset = Math.min(matchStartOffset, tokenStart);
        matchEndOffset = Math.max(matchEndOffset, tokenEnd);
      }
      tot += score;
    }
  }

  // Keep a copy of the token's offsets and term text alongside its score.
  final Token copy = new Token();
  copy.setOffset(tokenStart, tokenEnd);
  copy.setEmpty().append(termAtt);

  final int slot = numTokens;
  tokens[slot] = copy;
  scores[slot] = score;
  numTokens = slot + 1;
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestMemoryIndexAgainstRAMDir method testEmptyString.
// LUCENE-4880
// Adds a single empty-string token to field "foo" of a MemoryIndex, expects a TermQuery on
// the empty term to report exactly one hit, and then runs TestUtil.checkReader on the
// searcher's reader.
public void testEmptyString() throws IOException {
  final Token emptyTerm = new Token("", 0, 5);
  final MemoryIndex index = new MemoryIndex();
  index.addField("foo", new CannedTokenStream(emptyTerm));

  final IndexSearcher searcher = index.createSearcher();
  final TermQuery emptyTermQuery = new TermQuery(new Term("foo", ""));
  final TopDocs hits = searcher.search(emptyTermQuery, 10);

  assertEquals(1, hits.totalHits);
  TestUtil.checkReader(searcher.getIndexReader());
}
Aggregations