Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
The class TestRemoveDuplicatesTokenFilterFactory, method testDups.
public void testDups(final String expected, final Token... tokens) throws Exception {
  TokenStream stream = new CannedTokenStream(tokens);
  stream = tokenFilterFactory("RemoveDuplicates").create(stream);
  assertTokenStreamContents(stream, expected.split("\\s"));
}
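For context, RemoveDuplicatesTokenFilter drops a token only when it has the same term text as the previous token and sits at the same position, i.e. has a position increment of 0. A minimal sketch of how such a stacked token can be built for CannedTokenStream; the tok helper and the commented call are illustrative, not part of the Lucene test:

import org.apache.lucene.analysis.Token;

public class DupTokenSketch {
  // Hypothetical helper: builds a token at offsets [0, term.length()) with an
  // explicit position increment; posInc == 0 stacks it on the previous position.
  static Token tok(String term, int posInc) {
    Token t = new Token(term, 0, term.length());
    t.setPositionIncrement(posInc);
    return t;
  }
  // Example: the second "two" occupies the same position and is removed:
  // testDups("one two", tok("one", 1), tok("two", 1), tok("two", 0));
}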
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
The class TestFixBrokenOffsetsFilter, method testBogusTermVectors.
public void testBogusTermVectors() throws IOException {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", "", ft);
  // The canned offsets go backwards (5..10, then 1..4); FixBrokenOffsetsFilter repairs them.
  field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
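Backwards offsets like the pair above are illegal for an indexed field, so the filter rewrites them to be non-decreasing. A hedged sketch of the standard TokenStream consumer loop, handy for printing what the filter actually emits; the dump helper is an assumption, not Lucene API:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamDump {
  static void dump(TokenStream ts) throws IOException {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute off = ts.addAttribute(OffsetAttribute.class);
    ts.reset();                         // required before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term + " [" + off.startOffset() + ", " + off.endOffset() + ")");
    }
    ts.end();                           // records end-of-stream offset state
    ts.close();
  }
}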
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
The class TestPostingsOffsets, method testBasic.
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  // iwc is an IndexWriterConfig field initialized in the test's setUp() (not shown in this excerpt).
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(random().nextBoolean());
    ft.setStoreTermVectorOffsets(random().nextBoolean());
  }
  Token[] tokens = new Token[] { makeToken("a", 1, 0, 6), makeToken("b", 1, 8, 9), makeToken("a", 1, 9, 17), makeToken("c", 1, 19, 50) };
  doc.add(new Field("content", new CannedTokenStream(tokens), ft));
  w.addDocument(doc);
  IndexReader r = w.getReader();
  w.close();
  PostingsEnum dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("a"));
  assertNotNull(dp);
  assertEquals(0, dp.nextDoc());
  assertEquals(2, dp.freq());
  assertEquals(0, dp.nextPosition());
  assertEquals(0, dp.startOffset());
  assertEquals(6, dp.endOffset());
  assertEquals(2, dp.nextPosition());
  assertEquals(9, dp.startOffset());
  assertEquals(17, dp.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
  dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("b"));
  assertNotNull(dp);
  assertEquals(0, dp.nextDoc());
  assertEquals(1, dp.freq());
  assertEquals(1, dp.nextPosition());
  assertEquals(8, dp.startOffset());
  assertEquals(9, dp.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
  dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("c"));
  assertNotNull(dp);
  assertEquals(0, dp.nextDoc());
  assertEquals(1, dp.freq());
  assertEquals(3, dp.nextPosition());
  assertEquals(19, dp.startOffset());
  assertEquals(50, dp.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
  r.close();
  dir.close();
}
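The makeToken calls above come from a private helper in TestPostingsOffsets; a reconstruction consistent with the four-argument calls used here (term text, position increment, start offset, end offset) might look like:

import org.apache.lucene.analysis.Token;

public class MakeTokenSketch {
  // Reconstructed helper, an assumption based on the call sites above.
  static Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
    Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    t.setOffset(startOffset, endOffset);
    return t;
  }
}

With increments of 1 throughout, the tokens land at positions 0, 1, 2 and 3, which is why the assertions expect positions 0 and 2 for "a", 1 for "b", and 3 for "c".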
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
The class TestPostingsOffsets, method testLegalbutVeryLargeOffsets.
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef("test"));
  }
  Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
  TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", tokenStream, ft);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
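Offsets are stored as Java ints, so any non-negative, non-decreasing pair with the end at most Integer.MAX_VALUE is legal, which is exactly what this test exercises. A hedged sketch of reading those offsets back, callable between iw.close() and dir.close(); the checkOffsets helper is illustrative, not part of the test:

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class LargeOffsetCheck {
  static void checkOffsets(Directory dir) throws Exception {
    try (IndexReader r = DirectoryReader.open(dir)) {
      PostingsEnum dp = MultiFields.getTermPositionsEnum(r, "foo", new BytesRef("foo"));
      dp.nextDoc();                               // advance to the single document
      dp.nextPosition();                          // first "foo"
      System.out.println(dp.startOffset());       // 0
      dp.nextPosition();                          // second "foo"
      System.out.println(dp.endOffset());         // Integer.MAX_VALUE
    }
  }
}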
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
The class TestPayloadsOnVectors, method testMixupMultiValued.
/** Some field instances have a payload attribute, some not. */
public void testMixupMultiValued() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorPayloads(true);
  customType.setStoreTermVectorOffsets(random().nextBoolean());
  Field field = new Field("field", "", customType);
  TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  ((Tokenizer) ts).setReader(new StringReader("here we go"));
  field.setTokenStream(ts);
  doc.add(field);
  Field field2 = new Field("field", "", customType);
  Token withPayload = new Token("withPayload", 0, 11);
  withPayload.setPayload(new BytesRef("test"));
  ts = new CannedTokenStream(withPayload);
  assertTrue(ts.hasAttribute(PayloadAttribute.class));
  field2.setTokenStream(ts);
  doc.add(field2);
  Field field3 = new Field("field", "", customType);
  ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  ((Tokenizer) ts).setReader(new StringReader("nopayload"));
  field3.setTokenStream(ts);
  doc.add(field3);
  writer.addDocument(doc);
  DirectoryReader reader = writer.getReader();
  Terms terms = reader.getTermVector(0, "field");
  assert terms != null;
  TermsEnum termsEnum = terms.iterator();
  assertTrue(termsEnum.seekExact(new BytesRef("withPayload")));
  PostingsEnum de = termsEnum.postings(null, PostingsEnum.ALL);
  assertEquals(0, de.nextDoc());
  assertEquals(3, de.nextPosition());
  assertEquals(new BytesRef("test"), de.getPayload());
  writer.close();
  reader.close();
  dir.close();
}
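A hedged sketch of walking the whole merged term vector rather than seeking a single term, useful to confirm that only the CannedTokenStream token carries a payload; the dumpPayloads helper is illustrative, not part of the test:

import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermVectorPayloads {
  static void dumpPayloads(Terms vector) throws IOException {
    TermsEnum te = vector.iterator();
    for (BytesRef term = te.next(); term != null; term = te.next()) {
      PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
      pe.nextDoc();                     // a term vector exposes one pseudo-document
      for (int i = 0; i < pe.freq(); i++) {
        int pos = pe.nextPosition();
        // getPayload() returns null for the MockTokenizer terms, non-null only for "withPayload".
        System.out.println(term.utf8ToString() + " pos=" + pos + " payload=" + pe.getPayload());
      }
    }
  }
}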