Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class FastVectorHighlighterTest, method matchedFieldsTestCase.
private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
Document doc = new Document();
FieldType stored = new FieldType(TextField.TYPE_STORED);
stored.setStoreTermVectorOffsets(true);
stored.setStoreTermVectorPositions(true);
stored.setStoreTermVectors(true);
stored.freeze();
FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
matched.setStoreTermVectorOffsets(true);
matched.setStoreTermVectorPositions(true);
matched.setStoreTermVectors(true);
matched.freeze();
// Whitespace tokenized with English stop words
doc.add(new Field("field", fieldValue, stored));
// Whitespace tokenized without stop words
doc.add(new Field("field_exact", fieldValue, matched));
// Whitespace tokenized without toLower
doc.add(new Field("field_super_exact", fieldValue, matched));
// Each letter is a token
doc.add(new Field("field_characters", fieldValue, matched));
// Every three letters is a token
doc.add(new Field("field_tripples", fieldValue, matched));
doc.add(new Field("field_sliced", // Sliced at 10 chars then analyzed just like field
fieldValue.substring(// Sliced at 10 chars then analyzed just like field
0, Math.min(fieldValue.length() - 1, 10)), matched));
doc.add(new Field("field_der_red", new // Hacky field containing "der" and "red" at pos = 0
CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
// This is required even though we provide a token stream
fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
@Override
public Analyzer getWrappedAnalyzer(String fieldName) {
return fieldAnalyzers.get(fieldName);
}
};
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
writer.addDocument(doc);
FastVectorHighlighter highlighter = new FastVectorHighlighter();
FragListBuilder fragListBuilder = new SimpleFragListBuilder();
FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
IndexReader reader = DirectoryReader.open(writer);
String[] preTags = new String[] { "<b>" };
String[] postTags = new String[] { "</b>" };
Encoder encoder = new DefaultEncoder();
int docId = 0;
BooleanQuery.Builder query = new BooleanQuery.Builder();
for (Query clause : queryClauses) {
query.add(clause, Occur.MUST);
}
FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
String[] bestFragments;
if (useMatchedFields) {
Set<String> matchedFields = new HashSet<>();
matchedFields.add("field");
matchedFields.add("field_exact");
matchedFields.add("field_super_exact");
matchedFields.add("field_characters");
matchedFields.add("field_tripples");
matchedFields.add("field_sliced");
matchedFields.add("field_der_red");
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
} else {
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
}
assertEquals(expected, bestFragments[0]);
reader.close();
writer.close();
dir.close();
}
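The token(...) helper called above is not shown on this page. A minimal sketch consistent with its call sites (term text, position increment, start offset, end offset), matching the private helper in FastVectorHighlighterTest, would be:
private Token token(String term, int posInc, int startOffset, int endOffset) {
  Token t = new Token(term, startOffset, endOffset); // term text plus character offsets
  t.setPositionIncrement(posInc); // 0 stacks this token on the previous position, as in field_der_red
  return t;
}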
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class FastVectorHighlighterTest, method testBooleanPhraseWithSynonym.
public void testBooleanPhraseWithSynonym() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Token syn = new Token("httpwwwfacebookcom", 6, 29);
syn.setPositionIncrement(0);
CannedTokenStream ts = new CannedTokenStream(new Token("test", 0, 4), new Token("http", 6, 10), syn, new Token("www", 13, 16), new Token("facebook", 17, 25), new Token("com", 26, 29));
Field field = new Field("field", ts, type);
doc.add(field);
doc.add(new StoredField("field", "Test: http://www.facebook.com"));
writer.addDocument(doc);
FastVectorHighlighter highlighter = new FastVectorHighlighter();
IndexReader reader = DirectoryReader.open(writer);
int docId = 0;
// query1: match
PhraseQuery pq = new PhraseQuery("field", "test", "http", "www", "facebook", "com");
FieldQuery fieldQuery = highlighter.getFieldQuery(pq, reader);
String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
// query2: match
PhraseQuery pq2 = new PhraseQuery("field", "test", "httpwwwfacebookcom", "www", "facebook", "com");
fieldQuery = highlighter.getFieldQuery(pq2, reader);
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
// query3: OR query1 and query2 together
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(pq, BooleanClause.Occur.SHOULD);
bq.add(pq2, BooleanClause.Occur.SHOULD);
fieldQuery = highlighter.getFieldQuery(bq.build(), reader);
bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
reader.close();
writer.close();
dir.close();
}
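The synonym trick in this test hinges on setPositionIncrement(0): "httpwwwfacebookcom" occupies the same position as "http", so both phrase queries match the same document. A hedged sketch (not part of the test; imports from org.apache.lucene.analysis.tokenattributes assumed) that walks a canned stream and prints each token's absolute position makes the stacking visible:
// Rebuild a small canned stream and track positions by summing increments.
try (TokenStream debug = new CannedTokenStream(new Token("test", 0, 4), new Token("http", 6, 10), syn)) {
  CharTermAttribute termAtt = debug.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = debug.addAttribute(PositionIncrementAttribute.class);
  debug.reset();
  int pos = -1;
  while (debug.incrementToken()) {
    pos += posIncAtt.getPositionIncrement(); // an increment of 0 leaves the synonym at the same position
    System.out.println(termAtt + " @ position " + pos); // http and httpwwwfacebookcom both print position 1
  }
  debug.end();
}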
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class BaseTermVectorsFormatTestCase, method testPostingsEnumPayloads.
public void testPostingsEnumPayloads() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(null); // no analyzer needed: the field below supplies a ready-made TokenStream
IndexWriter iw = new IndexWriter(dir, iwc);
Document doc = new Document();
Token token1 = new Token("bar", 0, 3);
token1.setPayload(new BytesRef("pay1"));
Token token2 = new Token("bar", 4, 7);
token2.setPayload(new BytesRef("pay2"));
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorPayloads(true);
doc.add(new Field("foo", new CannedTokenStream(token1, token2), ft));
iw.addDocument(doc);
DirectoryReader reader = DirectoryReader.open(iw);
Terms terms = getOnlyLeafReader(reader).getTermVector(0, "foo");
TermsEnum termsEnum = terms.iterator();
assertNotNull(termsEnum);
assertEquals(new BytesRef("bar"), termsEnum.next());
// sugar method (FREQS)
PostingsEnum postings = termsEnum.postings(null);
assertEquals(-1, postings.docID());
assertEquals(0, postings.nextDoc());
assertEquals(2, postings.freq());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
// TermsEnum reuse (FREQS)
PostingsEnum postings2 = termsEnum.postings(postings);
assertNotNull(postings2);
// and it had better work
assertEquals(-1, postings2.docID());
assertEquals(0, postings2.nextDoc());
assertEquals(2, postings2.freq());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings2.nextDoc());
// asking for docs only: ok
PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);
assertEquals(-1, docsOnly.docID());
assertEquals(0, docsOnly.nextDoc());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsOnly.freq() == 1 || docsOnly.freq() == 2);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly.nextDoc());
// reuse that too
PostingsEnum docsOnly2 = termsEnum.postings(docsOnly, PostingsEnum.NONE);
assertNotNull(docsOnly2);
// and it had better work
assertEquals(-1, docsOnly2.docID());
assertEquals(0, docsOnly2.nextDoc());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsOnly2.freq() == 1 || docsOnly2.freq() == 2);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly2.nextDoc());
// asking for positions, ok
PostingsEnum docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.POSITIONS);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// now reuse the positions
PostingsEnum docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.POSITIONS);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
// payloads
docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.PAYLOADS);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.PAYLOADS);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
// we don't define what it is, but if it's something else we should look into it
assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
docsAndPositionsEnum = termsEnum.postings(null, PostingsEnum.ALL);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
iw.close();
reader.close();
dir.close();
}
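Every assertion block above follows the same PostingsEnum consumption contract. A minimal sketch of that pattern (illustrative, not part of the test):
PostingsEnum pe = termsEnum.postings(null, PostingsEnum.ALL);
while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  for (int i = 0; i < pe.freq(); i++) {
    int position = pe.nextPosition(); // must be called exactly freq() times per document
    BytesRef payload = pe.getPayload(); // only valid after nextPosition(); "pay1" then "pay2" here
    // startOffset()/endOffset() return -1 in this test because offsets were not stored
  }
}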
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestMemoryIndexAgainstRAMDir, method testEmptyString.
// LUCENE-4880
public void testEmptyString() throws IOException {
MemoryIndex memory = new MemoryIndex();
memory.addField("foo", new CannedTokenStream(new Token("", 0, 5)));
IndexSearcher searcher = memory.createSearcher();
TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10);
assertEquals(1, docs.totalHits);
TestUtil.checkReader(searcher.getIndexReader());
}
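This LUCENE-4880 regression test checks that a zero-length term can be indexed and found in a MemoryIndex. For comparison, a hedged sketch of ordinary MemoryIndex usage with an analyzer instead of a canned stream (field name and text are illustrative):
MemoryIndex mi = new MemoryIndex();
mi.addField("body", "here comes the sun", new MockAnalyzer(random()));
// search(Query) scores the single in-memory document; a score > 0 means a match
float score = mi.search(new TermQuery(new Term("body", "sun")));
assertTrue(score > 0.0f);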
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestTermAutomatonQuery, method testAnyFromTokenStream.
public void testAnyFromTokenStream() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "here comes the sun", Field.Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(newTextField("field", "here comes the moon", Field.Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(newTextField("field", "here comes sun", Field.Store.NO));
w.addDocument(doc);
// Should not match:
doc = new Document();
doc.add(newTextField("field", "here comes the other sun", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r);
TokenStream ts = new CannedTokenStream(new Token[] { token("comes", 1, 1), token("comes", 0, 2), token("*", 1, 1), token("sun", 1, 1), token("moon", 0, 1) });
TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts);
// System.out.println("DOT: " + q.toDot());
assertEquals(3, s.search(q, 1).totalHits);
w.close();
r.close();
dir.close();
}
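Note that this test's token(...) helper differs from the highlighter test's: its third argument is a position length, which TokenStreamToTermAutomatonQuery uses when building the automaton (a length of 2 lets the second "comes" span the following position, and the "*" term becomes an any-term transition). A sketch matching the private helper in TestTermAutomatonQuery:
private static Token token(String term, int posInc, int posLength) {
  final Token t = new Token(term, 0, term.length()); // character offsets are irrelevant to the automaton
  t.setPositionIncrement(posInc); // 0 adds an alternative at the same position
  t.setPositionLength(posLength); // how many positions this token spans
  return t;
}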