use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPostingsOffsets method testBasic.
/**
 * Indexes one document whose "content" field is fed by four canned tokens
 * (a, b, a, c) carrying explicit offsets, then verifies for each term that the
 * postings report the expected frequency, positions and start/end offsets.
 *
 * <p>Term vectors (and their positions/offsets) are switched on at random.
 * NOTE(review): {@code iwc} is not declared in this method; it is presumably a
 * fixture field of the enclosing test class.
 */
public void testBasic() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
  Document document = new Document();

  FieldType offsetsType = new FieldType(TextField.TYPE_NOT_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  final boolean withTermVectors = random().nextBoolean();
  if (withTermVectors) {
    offsetsType.setStoreTermVectors(true);
    offsetsType.setStoreTermVectorPositions(random().nextBoolean());
    offsetsType.setStoreTermVectorOffsets(random().nextBoolean());
  }

  // Every token advances the position by one, so positions are 0, 1, 2, 3.
  Token[] cannedTokens = new Token[] {
    makeToken("a", 1, 0, 6),
    makeToken("b", 1, 8, 9),
    makeToken("a", 1, 9, 17),
    makeToken("c", 1, 19, 50)
  };
  document.add(new Field("content", new CannedTokenStream(cannedTokens), offsetsType));
  writer.addDocument(document);

  IndexReader reader = writer.getReader();
  writer.close();

  // Term "a": occurs twice, at positions 0 and 2.
  PostingsEnum postings = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("a"));
  assertNotNull(postings);
  assertEquals(0, postings.nextDoc());
  assertEquals(2, postings.freq());
  assertNextPositionAndOffsets(postings, 0, 0, 6);
  assertNextPositionAndOffsets(postings, 2, 9, 17);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

  // Term "b": occurs once, at position 1.
  postings = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("b"));
  assertNotNull(postings);
  assertEquals(0, postings.nextDoc());
  assertEquals(1, postings.freq());
  assertNextPositionAndOffsets(postings, 1, 8, 9);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

  // Term "c": occurs once, at position 3.
  postings = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("c"));
  assertNotNull(postings);
  assertEquals(0, postings.nextDoc());
  assertEquals(1, postings.freq());
  assertNextPositionAndOffsets(postings, 3, 19, 50);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

  reader.close();
  directory.close();
}

/**
 * Advances {@code postings} to its next position and asserts that position
 * together with the start and end offsets reported for it.
 */
private void assertNextPositionAndOffsets(PostingsEnum postings, int position, int startOffset, int endOffset) throws Exception {
  assertEquals(position, postings.nextPosition());
  assertEquals(startOffset, postings.startOffset());
  assertEquals(endOffset, postings.endOffset());
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPostingsOffsets method makeToken.
/**
 * Builds a {@link Token} from the given term text, position increment and
 * start/end offsets.
 *
 * @param text term text appended to the new token
 * @param posIncr position increment set on the token
 * @param startOffset start offset set on the token
 * @param endOffset end offset set on the token
 * @return the newly created token
 */
private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
  Token token = new Token();
  token.append(text);
  token.setPositionIncrement(posIncr);
  token.setOffset(startOffset, endOffset);
  return token;
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPostingsOffsets method testLegalbutVeryLargeOffsets.
/**
 * Indexes a document whose "foo" field holds two tokens with offsets reaching
 * up to {@link Integer#MAX_VALUE}; the first token carries a payload at random.
 *
 * <p>The method contains no explicit assertions: it passes as long as adding
 * the document and closing the writer and directory do not throw.
 */
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(null));
  Document document = new Document();

  // The first token ends, and the second begins, 500 short of the int maximum.
  final int boundary = Integer.MAX_VALUE - 500;
  Token first = new Token("foo", 0, boundary);
  if (random().nextBoolean()) {
    first.setPayload(new BytesRef("test"));
  }
  Token second = new Token("foo", boundary, Integer.MAX_VALUE);
  TokenStream stream = new CannedTokenStream(first, second);

  FieldType offsetsType = new FieldType(TextField.TYPE_NOT_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  offsetsType.setStoreTermVectors(true);
  offsetsType.setStoreTermVectorPositions(true);
  offsetsType.setStoreTermVectorOffsets(true);

  document.add(new Field("foo", stream, offsetsType));
  writer.addDocument(document);

  writer.close();
  directory.close();
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPayloadsOnVectors method testMixupMultiValued.
/**
 * Some instances of a multi-valued field have a payload attribute, some do not.
 *
 * <p>Adds three instances of "field" to one document: a whitespace-tokenized
 * "here we go", a canned token "withPayload" carrying the payload "test", and a
 * whitespace-tokenized "nopayload". It then reads the document's term vector
 * and asserts that "withPayload" is found at position 3 with payload "test".
 */
public void testMixupMultiValued() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();

  // Term vectors with positions and payloads always on; offsets chosen at random.
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorPayloads(true);
  customType.setStoreTermVectorOffsets(random().nextBoolean());

  // First instance: tokenizer-backed stream.
  Field field = new Field("field", "", customType);
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader("here we go"));
  field.setTokenStream(tokenizer);
  doc.add(field);

  // Second instance: canned stream that does expose a payload attribute.
  Field field2 = new Field("field", "", customType);
  Token withPayload = new Token("withPayload", 0, 11);
  withPayload.setPayload(new BytesRef("test"));
  TokenStream ts = new CannedTokenStream(withPayload);
  assertTrue(ts.hasAttribute(PayloadAttribute.class));
  field2.setTokenStream(ts);
  doc.add(field2);

  // Third instance: tokenizer-backed stream again.
  Field field3 = new Field("field", "", customType);
  tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader("nopayload"));
  field3.setTokenStream(tokenizer);
  doc.add(field3);

  writer.addDocument(doc);
  DirectoryReader reader = writer.getReader();
  Terms terms = reader.getTermVector(0, "field");
  // Use a JUnit assertion rather than the Java `assert` keyword so the check
  // still runs (and fails clearly) when the JVM has assertions disabled.
  assertNotNull(terms);
  TermsEnum termsEnum = terms.iterator();
  assertTrue(termsEnum.seekExact(new BytesRef("withPayload")));
  PostingsEnum de = termsEnum.postings(null, PostingsEnum.ALL);
  assertEquals(0, de.nextDoc());
  assertEquals(3, de.nextPosition());
  assertEquals(new BytesRef("test"), de.getPayload());

  writer.close();
  reader.close();
  dir.close();
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPayloads method testMixupDocs.
/**
 * Some documents have a payload attribute, some do not.
 *
 * <p>Re-uses one document/field and adds it three times with different token
 * streams: whitespace-tokenized "here we go", a canned token "withPayload"
 * carrying the payload "test", and whitespace-tokenized "another". It then
 * asserts that the postings for "withPayload" expose the payload "test".
 */
public void testMixupDocs() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(null);
  config.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, config);

  // First document: tokenizer-backed stream.
  Document document = new Document();
  Field reusedField = new TextField("field", "", Field.Store.NO);
  MockTokenizer firstTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  firstTokenizer.setReader(new StringReader("here we go"));
  reusedField.setTokenStream(firstTokenizer);
  document.add(reusedField);
  indexWriter.addDocument(document);

  // Second document: canned stream that exposes a payload attribute.
  Token payloadToken = new Token("withPayload", 0, 11);
  payloadToken.setPayload(new BytesRef("test"));
  TokenStream cannedStream = new CannedTokenStream(payloadToken);
  assertTrue(cannedStream.hasAttribute(PayloadAttribute.class));
  reusedField.setTokenStream(cannedStream);
  indexWriter.addDocument(document);

  // Third document: tokenizer-backed stream again.
  MockTokenizer lastTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  lastTokenizer.setReader(new StringReader("another"));
  reusedField.setTokenStream(lastTokenizer);
  indexWriter.addDocument(document);

  DirectoryReader indexReader = indexWriter.getReader();
  TermsEnum termsEnum = MultiFields.getFields(indexReader).terms("field").iterator();
  assertTrue(termsEnum.seekExact(new BytesRef("withPayload")));
  PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS);
  // Step to the first document and its first position before reading the payload.
  postings.nextDoc();
  postings.nextPosition();
  assertEquals(new BytesRef("test"), postings.getPayload());

  indexWriter.close();
  indexReader.close();
  directory.close();
}
Aggregations