use of org.apache.lucene.document.TextField in project lucene-solr by apache.
the class TestMemoryIndex method testBuildFromDocument.
@Test
public void testBuildFromDocument() {
Document doc = new Document();
doc.add(new TextField("field1", "some text", Field.Store.NO));
doc.add(new TextField("field1", "some more text", Field.Store.NO));
doc.add(new StringField("field2", "untokenized text", Field.Store.NO));
analyzer.setPositionIncrementGap(100);
MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
assertThat(mi.search(new TermQuery(new Term("field1", "text"))), not(0.0f));
assertThat(mi.search(new TermQuery(new Term("field2", "text"))), is(0.0f));
assertThat(mi.search(new TermQuery(new Term("field2", "untokenized text"))), not(0.0f));
assertThat(mi.search(new PhraseQuery("field1", "some", "more", "text")), not(0.0f));
assertThat(mi.search(new PhraseQuery("field1", "some", "text")), not(0.0f));
assertThat(mi.search(new PhraseQuery("field1", "text", "some")), is(0.0f));
}
use of org.apache.lucene.document.TextField in project lucene-solr by apache.
the class TestMemoryIndex method testFieldsOnlyReturnsIndexedFields.
public void testFieldsOnlyReturnsIndexedFields() throws IOException {
Document doc = new Document();
doc.add(new NumericDocValuesField("numeric", 29L));
doc.add(new TextField("text", "some text", Field.Store.NO));
MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
IndexSearcher searcher = mi.createSearcher();
IndexReader reader = searcher.getIndexReader();
assertEquals(reader.getTermVectors(0).size(), 1);
}
use of org.apache.lucene.document.TextField in project lucene-solr by apache.
the class DocumentDictionaryTest method testWithOptionalPayload.
@Test
public void testWithOptionalPayload() throws IOException {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
// Create a document that is missing the payload field
Document doc = new Document();
Field field = new TextField(FIELD_NAME, "some field", Field.Store.YES);
doc.add(field);
// do not store the payload or the contexts
Field weight = new NumericDocValuesField(WEIGHT_FIELD_NAME, 100);
doc.add(weight);
writer.addDocument(doc);
writer.commit();
writer.close();
IndexReader ir = DirectoryReader.open(dir);
// Even though the payload field is missing, the dictionary iterator should not skip the document
// because the payload field is optional.
Dictionary dictionaryOptionalPayload = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME);
InputIterator inputIterator = dictionaryOptionalPayload.getEntryIterator();
BytesRef f = inputIterator.next();
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
IndexableField weightField = doc.getField(WEIGHT_FIELD_NAME);
assertEquals(inputIterator.weight(), weightField.numericValue().longValue());
IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME);
assertNull(payloadField);
assertTrue(inputIterator.payload().length == 0);
IOUtils.close(ir, analyzer, dir);
}
use of org.apache.lucene.document.TextField in project lucene-solr by apache.
the class DocumentDictionaryTest method generateIndexDocuments.
/** Returns Pair(list of invalid document terms, Map of document term -> document) */
private Map.Entry<List<String>, Map<String, Document>> generateIndexDocuments(int ndocs, boolean requiresContexts) {
Map<String, Document> docs = new HashMap<>();
List<String> invalidDocTerms = new ArrayList<>();
for (int i = 0; i < ndocs; i++) {
Document doc = new Document();
boolean invalidDoc = false;
Field field = null;
// usually have valid term field in document
if (usually()) {
field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
doc.add(field);
} else {
invalidDoc = true;
}
// even if payload is not required usually have it
if (usually()) {
Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
doc.add(payload);
}
if (requiresContexts || usually()) {
if (usually()) {
for (int j = 0; j < atLeast(2); j++) {
doc.add(new StoredField(CONTEXT_FIELD_NAME, new BytesRef("context_" + i + "_" + j)));
}
}
// we should allow entries without context
}
// usually have valid weight field in document
if (usually()) {
Field weight = (rarely()) ? new StoredField(WEIGHT_FIELD_NAME, 100d + i) : new NumericDocValuesField(WEIGHT_FIELD_NAME, 100 + i);
doc.add(weight);
}
String term = null;
if (invalidDoc) {
term = (field != null) ? field.stringValue() : "invalid_" + i;
invalidDocTerms.add(term);
} else {
term = field.stringValue();
}
docs.put(term, doc);
}
return new SimpleEntry<>(invalidDocTerms, docs);
}
use of org.apache.lucene.document.TextField in project lucene-solr by apache.
the class BasePostingsFormatTestCase method testPostingsEnumPositions.
public void testPostingsEnumPositions() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer());
}
});
IndexWriter iw = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new TextField("foo", "bar bar", Field.Store.NO));
iw.addDocument(doc);
DirectoryReader reader = DirectoryReader.open(iw);
// sugar method (FREQS)
PostingsEnum postings = getOnlyLeafReader(reader).postings(new Term("foo", "bar"));
assertEquals(-1, postings.docID());
assertEquals(0, postings.nextDoc());
assertEquals(2, postings.freq());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
// termsenum reuse (FREQS)
TermsEnum termsEnum = getOnlyLeafReader(reader).terms("foo").iterator();
termsEnum.seekExact(new BytesRef("bar"));
PostingsEnum postings2 = termsEnum.postings(postings);
assertNotNull(postings2);
assertReused("foo", postings, postings2);
// and it had better work
assertEquals(-1, postings2.docID());
assertEquals(0, postings2.nextDoc());
assertEquals(2, postings2.freq());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings2.nextDoc());
// asking for docs only: ok
PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);
assertEquals(-1, docsOnly.docID());
assertEquals(0, docsOnly.nextDoc());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsOnly.freq() == 1 || docsOnly.freq() == 2);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly.nextDoc());
// reuse that too
PostingsEnum docsOnly2 = termsEnum.postings(docsOnly, PostingsEnum.NONE);
assertNotNull(docsOnly2);
assertReused("foo", docsOnly, docsOnly2);
// and it had better work
assertEquals(-1, docsOnly2.docID());
assertEquals(0, docsOnly2.nextDoc());
// we don't define what it is, but if its something else, we should look into it?
assertTrue(docsOnly2.freq() == 1 || docsOnly2.freq() == 2);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly2.nextDoc());
// asking for positions, ok
PostingsEnum docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.POSITIONS);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// now reuse the positions
PostingsEnum docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.POSITIONS);
assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
// payloads, offsets, etc don't cause an error if they aren't there
docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.PAYLOADS);
assertNotNull(docsAndPositionsEnum);
// but make sure they work
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.PAYLOADS);
assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.OFFSETS);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
// reuse
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS);
assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.ALL);
assertNotNull(docsAndPositionsEnum);
assertEquals(-1, docsAndPositionsEnum.docID());
assertEquals(0, docsAndPositionsEnum.nextDoc());
assertEquals(2, docsAndPositionsEnum.freq());
assertEquals(0, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(1, docsAndPositionsEnum.nextPosition());
assertEquals(-1, docsAndPositionsEnum.startOffset());
assertEquals(-1, docsAndPositionsEnum.endOffset());
assertNull(docsAndPositionsEnum.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
assertEquals(-1, docsAndPositionsEnum2.docID());
assertEquals(0, docsAndPositionsEnum2.nextDoc());
assertEquals(2, docsAndPositionsEnum2.freq());
assertEquals(0, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(1, docsAndPositionsEnum2.nextPosition());
assertEquals(-1, docsAndPositionsEnum2.startOffset());
assertEquals(-1, docsAndPositionsEnum2.endOffset());
assertNull(docsAndPositionsEnum2.getPayload());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());
iw.close();
reader.close();
dir.close();
}
Aggregations