Use of org.apache.lucene.analysis.MockPayloadAnalyzer in project lucene-solr by apache.
Class TestMemoryIndex, method testToStringDebug.
public void testToStringDebug() {
  MemoryIndex mi = new MemoryIndex(true, true);
  Analyzer analyzer = new MockPayloadAnalyzer();
  mi.addField("analyzedField", "aa bb aa", analyzer);
  FieldType type = new FieldType();
  type.setDimensions(1, 4);
  type.setDocValuesType(DocValuesType.BINARY);
  type.freeze();
  mi.addField(new BinaryPoint("pointAndDvField", "term".getBytes(StandardCharsets.UTF_8), type), analyzer);
  assertEquals("analyzedField:\n" +
      "\t'[61 61]':2: [(0, 0, 2, [70 6f 73 3a 20 30]), (1, 6, 8, [70 6f 73 3a 20 32])]\n" +
      "\t'[62 62]':1: [(1, 3, 5, [70 6f 73 3a 20 31])]\n" +
      "\tterms=2, positions=3\n" +
      "pointAndDvField:\n" +
      "\tterms=0, positions=0\n" +
      "\n" +
      "fields=2, terms=2, positions=3",
      mi.toStringDebug());
}
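The hex byte groups in the expected string are the payloads MockPayloadAnalyzer attaches to each token: [70 6f 73 3a 20 30] is the UTF-8 encoding of "pos: 0", [70 6f 73 3a 20 31] is "pos: 1", and so on. A minimal sketch, not part of the test and with an arbitrary class name, that dumps the terms, position increments, and payloads the analyzer produces for the same text:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class DumpMockPayloads {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new MockPayloadAnalyzer();
         TokenStream ts = analyzer.tokenStream("analyzedField", "aa bb aa")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
      PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // MockPayloadAnalyzer stores "pos: <n>" as each token's payload,
        // which is what the hex byte groups in the expected string decode to.
        System.out.println(term.toString() + " (posIncr=" + posIncr.getPositionIncrement()
            + ") -> " + payload.getPayload().utf8ToString());
      }
      ts.end();
    }
  }
}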
Use of org.apache.lucene.analysis.MockPayloadAnalyzer in project lucene-solr by apache.
Class TestMemoryIndex, method testReaderConsistency.
public void testReaderConsistency() throws IOException {
  Analyzer analyzer = new MockPayloadAnalyzer();
  // defaults
  MemoryIndex mi = new MemoryIndex();
  mi.addField("field", "some terms be here", analyzer);
  TestUtil.checkReader(mi.createSearcher().getIndexReader());
  // all combinations of offsets/payloads options
  mi = new MemoryIndex(true, true);
  mi.addField("field", "some terms be here", analyzer);
  TestUtil.checkReader(mi.createSearcher().getIndexReader());
  mi = new MemoryIndex(true, false);
  mi.addField("field", "some terms be here", analyzer);
  TestUtil.checkReader(mi.createSearcher().getIndexReader());
  mi = new MemoryIndex(false, true);
  mi.addField("field", "some terms be here", analyzer);
  TestUtil.checkReader(mi.createSearcher().getIndexReader());
  mi = new MemoryIndex(false, false);
  mi.addField("field", "some terms be here", analyzer);
  TestUtil.checkReader(mi.createSearcher().getIndexReader());
  analyzer.close();
}
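The two boolean constructor arguments select whether the in-memory index stores offsets and payloads; the test exercises all four combinations. For context, a minimal sketch of the typical MemoryIndex use case, indexing a single document in memory and scoring a query against it (the class name and query term are arbitrary, not taken from the test):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class MemoryIndexSketch {
  public static void main(String[] args) throws Exception {
    // Store both offsets and payloads, as in the (true, true) case above.
    MemoryIndex mi = new MemoryIndex(true, true);
    try (Analyzer analyzer = new MockPayloadAnalyzer()) {
      mi.addField("field", "some terms be here", analyzer);
      // MemoryIndex scores a query directly against its single document;
      // a score of 0.0f means the document does not match.
      float score = mi.search(new TermQuery(new Term("field", "terms")));
      System.out.println("score = " + score);
    }
  }
}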
Use of org.apache.lucene.analysis.MockPayloadAnalyzer in project lucene-solr by apache.
Class TestPostingsOffsets, method doTestNumbers.
public void doTestNumbers(boolean withPayloads) throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  // will rely on docids a bit for skipping
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  FieldType ft = new FieldType(TextField.TYPE_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(random().nextBoolean());
    ft.setStoreTermVectorPositions(random().nextBoolean());
  }
  int numDocs = atLeast(500);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(new Field("numbers", English.intToEnglish(i), ft));
    doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
    doc.add(new StringField("id", "" + i, Field.Store.NO));
    w.addDocument(doc);
  }
  IndexReader reader = w.getReader();
  w.close();
  String[] terms = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
  for (String term : terms) {
    PostingsEnum dp = MultiFields.getTermPositionsEnum(reader, "numbers", new BytesRef(term));
    int doc;
    while ((doc = dp.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      String storedNumbers = reader.document(doc).get("numbers");
      int freq = dp.freq();
      for (int i = 0; i < freq; i++) {
        dp.nextPosition();
        int start = dp.startOffset();
        assert start >= 0;
        int end = dp.endOffset();
        assert end >= 0 && end >= start;
        // check that the offsets correspond to the term in the src text
        assertTrue(storedNumbers.substring(start, end).equals(term));
        if (withPayloads) {
          // check that we have a payload and it starts with "pos"
          assertNotNull(dp.getPayload());
          BytesRef payload = dp.getPayload();
          assertTrue(payload.utf8ToString().startsWith("pos:"));
        }
        // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
      }
    }
  }
  // check we can skip correctly
  int numSkippingTests = atLeast(50);
  for (int j = 0; j < numSkippingTests; j++) {
    int num = TestUtil.nextInt(random(), 100, Math.min(numDocs - 1, 999));
    PostingsEnum dp = MultiFields.getTermPositionsEnum(reader, "numbers", new BytesRef("hundred"));
    int doc = dp.advance(num);
    assertEquals(num, doc);
    int freq = dp.freq();
    for (int i = 0; i < freq; i++) {
      String storedNumbers = reader.document(doc).get("numbers");
      dp.nextPosition();
      int start = dp.startOffset();
      assert start >= 0;
      int end = dp.endOffset();
      assert end >= 0 && end >= start;
      // check that the offsets correspond to the term in the src text
      assertTrue(storedNumbers.substring(start, end).equals("hundred"));
      if (withPayloads) {
        // check that we have a payload and it starts with "pos"
        assertNotNull(dp.getPayload());
        BytesRef payload = dp.getPayload();
        assertTrue(payload.utf8ToString().startsWith("pos:"));
      }
      // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
    }
  }
  for (int i = 0; i < numDocs; i++) {
    PostingsEnum dp = MultiFields.getTermDocsEnum(reader, "id", new BytesRef("" + i), 0);
    assertEquals(i, dp.nextDoc());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
  }
  reader.close();
  dir.close();
}
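The test relies on the randomized test-framework helpers (newDirectory, newIndexWriterConfig, RandomIndexWriter). A minimal sketch of the equivalent plain-Lucene setup for a field whose postings carry character offsets, so that PostingsEnum.startOffset()/endOffset() are available at search time; the directory, analyzer, and field values here are arbitrary choices, not taken from the test:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class OffsetsSetupSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory();  // any Directory works; this one keeps the index in memory
         IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      FieldType ft = new FieldType(TextField.TYPE_STORED);
      // Index positions and character offsets, as the test above does.
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
      ft.freeze();
      Document doc = new Document();
      doc.add(new Field("numbers", "one two three", ft));
      w.addDocument(doc);
    }
  }
}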
Use of org.apache.lucene.analysis.MockPayloadAnalyzer in project lucene-solr by apache.
Class TestPositionIncrement, method testPayloadsPos0.
public void testPayloadsPos0() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer());
  Document doc = new Document();
  doc.add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
  writer.addDocument(doc);
  final IndexReader readerFromWriter = writer.getReader();
  LeafReader r = getOnlyLeafReader(readerFromWriter);
  PostingsEnum tp = r.postings(new Term("content", "a"), PostingsEnum.ALL);
  int count = 0;
  assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  // "a" occurs 4 times
  assertEquals(4, tp.freq());
  assertEquals(0, tp.nextPosition());
  assertEquals(1, tp.nextPosition());
  assertEquals(3, tp.nextPosition());
  assertEquals(6, tp.nextPosition());
  // only one doc has "a"
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc());
  IndexSearcher is = newSearcher(getOnlyLeafReader(readerFromWriter));
  SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
  SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
  SpanQuery[] sqs = { stq1, stq2 };
  SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
  count = 0;
  boolean sawZero = false;
  if (VERBOSE) {
    System.out.println("\ngetPayloadSpans test");
  }
  PayloadSpanCollector collector = new PayloadSpanCollector();
  Spans pspans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.PAYLOADS);
  while (pspans.nextDoc() != Spans.NO_MORE_DOCS) {
    while (pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
      if (VERBOSE) {
        System.out.println("doc " + pspans.docID() + ": span " + pspans.startPosition() + " to " + pspans.endPosition());
      }
      collector.reset();
      pspans.collect(collector);
      sawZero |= pspans.startPosition() == 0;
      for (BytesRef payload : collector.payloads) {
        count++;
        if (VERBOSE) {
          System.out.println(" payload: " + Term.toString(payload));
        }
      }
    }
  }
  assertTrue(sawZero);
  assertEquals(8, count);
  // System.out.println("\ngetSpans test");
  Spans spans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
  count = 0;
  sawZero = false;
  while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
    while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
      count++;
      sawZero |= spans.startPosition() == 0;
      // System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
    }
  }
  assertEquals(4, count);
  assertTrue(sawZero);
  writer.close();
  is.getIndexReader().close();
  dir.close();
}
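The test drives the SpanNearQuery through the low-level getSpans API so it can inspect individual span positions and their payloads. When only document-level hits are needed, the same query can go through the ordinary search API; a small sketch of that alternative (the helper class and method names are hypothetical, and the searcher is assumed to wrap an index like the one built above):

import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

final class SpanNearSearchSketch {
  // Counts documents where "a" and "k" occur within 30 positions of each
  // other (unordered), using the plain search API instead of getSpans().
  static int countNearMatches(IndexSearcher searcher) throws IOException {
    SpanQuery[] clauses = {
        new SpanTermQuery(new Term("content", "a")),
        new SpanTermQuery(new Term("content", "k"))
    };
    SpanNearQuery snq = new SpanNearQuery(clauses, 30, false); // slop 30, unordered
    TopDocs hits = searcher.search(snq, 10);
    return hits.scoreDocs.length;
  }
}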
Use of org.apache.lucene.analysis.MockPayloadAnalyzer in project lucene-solr by apache.
Class SynonymTokenizer, method testPayloadQuery.
/** We can highlight based on payloads. It's supported both via term vectors and MemoryIndex since Lucene 5. */
public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
  // "words" at positions 1 & 4
  final String text = "random words and words";
  // sets payload to "pos: X" (where X is position #)
  Analyzer analyzer = new MockPayloadAnalyzer();
  try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
    writer.deleteAll();
    Document doc = new Document();
    doc.add(new Field(FIELD_NAME, text, fieldType));
    writer.addDocument(doc);
    writer.commit();
  }
  try (IndexReader reader = DirectoryReader.open(dir)) {
    // just match the first "words" occurrence
    Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
        Collections.singletonList(new BytesRef("pos: 1")));
    IndexSearcher searcher = newSearcher(reader);
    QueryScorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
    scorer.setUsePayloads(true);
    Highlighter h = new Highlighter(scorer);
    TopDocs hits = searcher.search(query, 10);
    assertEquals(1, hits.scoreDocs.length);
    TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
    if (random().nextBoolean()) {
      // conceals detection of TokenStreamFromTermVector
      stream = new CachingTokenFilter(stream);
    }
    String result = h.getBestFragment(stream, text);
    // only highlight the first "words"
    assertEquals("random <B>words</B> and words", result);
  }
}
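SpanPayloadCheckQuery only accepts span matches whose payloads equal the supplied list, which is how the query above isolates the first "words" occurrence. A small sketch of a helper, hypothetical and not part of the test, that builds the same kind of query for any payload value; per the comments in the test, "pos: 1" selects the first occurrence and "pos: 4" the second:

import java.util.Collections;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.payloads.SpanPayloadCheckQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;

final class PayloadCheckQueries {
  // Matches "words" only where its MockPayloadAnalyzer payload equals the
  // given value, e.g. "pos: 1" or "pos: 4" for the text used above.
  static Query wordsWithPayload(String field, String payloadValue) {
    return new SpanPayloadCheckQuery(
        new SpanTermQuery(new Term(field, "words")),
        Collections.singletonList(new BytesRef(payloadValue)));
  }
}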