Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
Example from the class TestMemoryIndex, method testOmitNorms.
@Test
public void testOmitNorms() throws IOException {
  MemoryIndex mi = new MemoryIndex();
  FieldType ft = new FieldType();
  ft.setTokenized(true);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  ft.setOmitNorms(true); // norms are dropped for this field
  mi.addField(new Field("f1", "some text in here", ft), analyzer);
  mi.freeze();
  LeafReader reader = (LeafReader) mi.createSearcher().getIndexReader();
  // with norms omitted, the reader must report no norm values
  NumericDocValues norms = reader.getNormValues("f1");
  assertNull(norms);
}
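The analyzer used above is a field of the test class, not shown in this excerpt. A minimal sketch of the assumed fixture (MockAnalyzer is what this suite typically uses; treat the exact setup as an assumption):

// Assumed test fixture, not part of the excerpt above: the test class
// needs an Analyzer field; a MockAnalyzer seeded from the test framework's
// random() is the usual choice in this suite.
private MockAnalyzer analyzer;

@Before
public void setup() {
  analyzer = new MockAnalyzer(random());
}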
Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
Example from the class TestMemoryIndex, method testToStringDebug.
public void testToStringDebug() {
  MemoryIndex mi = new MemoryIndex(true, true);
  Analyzer analyzer = new MockPayloadAnalyzer();
  mi.addField("analyzedField", "aa bb aa", analyzer);
  FieldType type = new FieldType();
  type.setDimensions(1, 4);
  type.setDocValuesType(DocValuesType.BINARY);
  type.freeze();
  // point + doc-values field: contributes no inverted terms or positions
  mi.addField(new BinaryPoint("pointAndDvField", "term".getBytes(StandardCharsets.UTF_8), type), analyzer);
  assertEquals("analyzedField:\n" +
      "\t'[61 61]':2: [(0, 0, 2, [70 6f 73 3a 20 30]), (1, 6, 8, [70 6f 73 3a 20 32])]\n" +
      "\t'[62 62]':1: [(1, 3, 5, [70 6f 73 3a 20 31])]\n" +
      "\tterms=2, positions=3\n" +
      "pointAndDvField:\n" +
      "\tterms=0, positions=0\n" +
      "\n" +
      "fields=2, terms=2, positions=3", mi.toStringDebug());
}
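The bracketed hex dumps in the expected string are UTF-8 bytes: [61 61] is the term "aa", [62 62] is "bb", and each payload emitted by MockPayloadAnalyzer spells out its position. A quick decoding sketch (illustration only):

// Decode the hex dumps shown in the expected toStringDebug() output.
byte[] term = new byte[] {0x61, 0x61};                            // "aa"
byte[] payload = new byte[] {0x70, 0x6f, 0x73, 0x3a, 0x20, 0x30}; // "pos: 0"
System.out.println(new String(term, StandardCharsets.UTF_8));
System.out.println(new String(payload, StandardCharsets.UTF_8));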
Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
Example from the class BasePostingsFormatTestCase, method testPostingsEnumAll.
public void testPostingsEnumAll() throws Exception {
  Directory dir = newDirectory();
  // null analyzer: the field below supplies a pre-analyzed TokenStream
  IndexWriterConfig iwc = new IndexWriterConfig(null);
  IndexWriter iw = new IndexWriter(dir, iwc);
  Document doc = new Document();
  Token token1 = new Token("bar", 0, 3);
  token1.setPayload(new BytesRef("pay1"));
  Token token2 = new Token("bar", 4, 7);
  token2.setPayload(new BytesRef("pay2"));
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  doc.add(new Field("foo", new CannedTokenStream(token1, token2), ft));
  iw.addDocument(doc);
  DirectoryReader reader = DirectoryReader.open(iw);

  // sugar method (FREQS)
  PostingsEnum postings = getOnlyLeafReader(reader).postings(new Term("foo", "bar"));
  assertEquals(-1, postings.docID());
  assertEquals(0, postings.nextDoc());
  assertEquals(2, postings.freq());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

  // termsenum reuse (FREQS)
  TermsEnum termsEnum = getOnlyLeafReader(reader).terms("foo").iterator();
  termsEnum.seekExact(new BytesRef("bar"));
  PostingsEnum postings2 = termsEnum.postings(postings);
  assertNotNull(postings2);
  assertReused("foo", postings, postings2);
  // and it had better work
  assertEquals(-1, postings2.docID());
  assertEquals(0, postings2.nextDoc());
  assertEquals(2, postings2.freq());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings2.nextDoc());

  // asking for docs only: ok
  PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);
  assertEquals(-1, docsOnly.docID());
  assertEquals(0, docsOnly.nextDoc());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsOnly.freq() == 1 || docsOnly.freq() == 2);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly.nextDoc());
  // reuse that too
  PostingsEnum docsOnly2 = termsEnum.postings(docsOnly, PostingsEnum.NONE);
  assertNotNull(docsOnly2);
  assertReused("foo", docsOnly, docsOnly2);
  // and it had better work
  assertEquals(-1, docsOnly2.docID());
  assertEquals(0, docsOnly2.nextDoc());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsOnly2.freq() == 1 || docsOnly2.freq() == 2);
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsOnly2.nextDoc());

  // asking for positions, ok
  PostingsEnum docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.POSITIONS);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 0);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 3);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 4);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 7);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // now reuse the positions
  PostingsEnum docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.POSITIONS);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 0);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 3);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 4);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 7);
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());

  // payloads
  docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.PAYLOADS);
  assertNotNull(docsAndPositionsEnum);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 0);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 3);
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.startOffset() == -1 || docsAndPositionsEnum.startOffset() == 4);
  assertTrue(docsAndPositionsEnum.endOffset() == -1 || docsAndPositionsEnum.endOffset() == 7);
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // reuse
  docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.PAYLOADS);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 0);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 3);
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.startOffset() == -1 || docsAndPositionsEnum2.startOffset() == 4);
  assertTrue(docsAndPositionsEnum2.endOffset() == -1 || docsAndPositionsEnum2.endOffset() == 7);
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());

  // offsets
  docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.OFFSETS);
  assertNotNull(docsAndPositionsEnum);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  assertEquals(0, docsAndPositionsEnum.startOffset());
  assertEquals(3, docsAndPositionsEnum.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  assertEquals(4, docsAndPositionsEnum.startOffset());
  assertEquals(7, docsAndPositionsEnum.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // reuse
  docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  assertEquals(0, docsAndPositionsEnum2.startOffset());
  assertEquals(3, docsAndPositionsEnum2.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay1").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  assertEquals(4, docsAndPositionsEnum2.startOffset());
  assertEquals(7, docsAndPositionsEnum2.endOffset());
  // we don't define what it is, but if it's something else, we should look into it?
  assertTrue(docsAndPositionsEnum2.getPayload() == null || new BytesRef("pay2").equals(docsAndPositionsEnum2.getPayload()));
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());

  // everything (ALL): freqs, positions, offsets, and payloads
  docsAndPositionsEnum = getOnlyLeafReader(reader).postings(new Term("foo", "bar"), PostingsEnum.ALL);
  assertNotNull(docsAndPositionsEnum);
  assertEquals(-1, docsAndPositionsEnum.docID());
  assertEquals(0, docsAndPositionsEnum.nextDoc());
  assertEquals(2, docsAndPositionsEnum.freq());
  assertEquals(0, docsAndPositionsEnum.nextPosition());
  assertEquals(0, docsAndPositionsEnum.startOffset());
  assertEquals(3, docsAndPositionsEnum.endOffset());
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum.getPayload());
  assertEquals(1, docsAndPositionsEnum.nextPosition());
  assertEquals(4, docsAndPositionsEnum.startOffset());
  assertEquals(7, docsAndPositionsEnum.endOffset());
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
  // reuse (ALL)
  docsAndPositionsEnum2 = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
  assertReused("foo", docsAndPositionsEnum, docsAndPositionsEnum2);
  assertEquals(-1, docsAndPositionsEnum2.docID());
  assertEquals(0, docsAndPositionsEnum2.nextDoc());
  assertEquals(2, docsAndPositionsEnum2.freq());
  assertEquals(0, docsAndPositionsEnum2.nextPosition());
  assertEquals(0, docsAndPositionsEnum2.startOffset());
  assertEquals(3, docsAndPositionsEnum2.endOffset());
  assertEquals(new BytesRef("pay1"), docsAndPositionsEnum2.getPayload());
  assertEquals(1, docsAndPositionsEnum2.nextPosition());
  assertEquals(4, docsAndPositionsEnum2.startOffset());
  assertEquals(7, docsAndPositionsEnum2.endOffset());
  assertEquals(new BytesRef("pay2"), docsAndPositionsEnum2.getPayload());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsAndPositionsEnum2.nextDoc());

  iw.close();
  reader.close();
  dir.close();
}
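assertReused is a helper of BasePostingsFormatTestCase that this excerpt does not show. Roughly, it asserts that the enum instance really was reused, except for postings formats (such as Direct) that build their own enums; a sketch under that assumption, not the verbatim helper:

// Sketch of the reuse assertion used above. The "Direct" postings format
// is assumed to wrap its own enums, so identity is only asserted for
// other formats.
protected static void assertReused(String field, PostingsEnum p1, PostingsEnum p2) {
  if (!"Direct".equals(TestUtil.getPostingsFormat(field))) {
    assertSame(p1, p2);
  }
}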
Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
Example from the class BaseStoredFieldsFormatTestCase, method testWriteReadMerge.
public void testWriteReadMerge() throws IOException {
  // pick a codec other than the default, so that merged segments cross codecs
  final Codec otherCodec;
  if ("SimpleText".equals(Codec.getDefault().getName())) {
    otherCodec = TestUtil.getDefaultCodec();
  } else {
    otherCodec = new SimpleTextCodec();
  }
  Directory dir = newDirectory();
  IndexWriterConfig iwConf = newIndexWriterConfig(new MockAnalyzer(random()));
  iwConf.setMaxBufferedDocs(RandomNumbers.randomIntBetween(random(), 2, 30));
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
  final int docCount = atLeast(200);
  final byte[][][] data = new byte[docCount][][];
  for (int i = 0; i < docCount; ++i) {
    final int fieldCount = rarely()
        ? RandomNumbers.randomIntBetween(random(), 1, 500)
        : RandomNumbers.randomIntBetween(random(), 1, 5);
    data[i] = new byte[fieldCount][];
    for (int j = 0; j < fieldCount; ++j) {
      final int length = rarely() ? random().nextInt(1000) : random().nextInt(10);
      final int max = rarely() ? 256 : 2;
      data[i][j] = randomByteArray(length, max);
    }
  }
  // stored-only type: indexing disabled
  final FieldType type = new FieldType(StringField.TYPE_STORED);
  type.setIndexOptions(IndexOptions.NONE);
  type.freeze();
  IntPoint id = new IntPoint("id", 0);
  StoredField idStored = new StoredField("id", 0);
  for (int i = 0; i < data.length; ++i) {
    Document doc = new Document();
    doc.add(id);
    doc.add(idStored);
    id.setIntValue(i);
    idStored.setIntValue(i);
    for (int j = 0; j < data[i].length; ++j) {
      Field f = new Field("bytes" + j, data[i][j], type);
      doc.add(f);
    }
    iw.w.addDocument(doc);
    if (random().nextBoolean() && (i % (data.length / 10) == 0)) {
      iw.w.close();
      IndexWriterConfig iwConfNew = newIndexWriterConfig(new MockAnalyzer(random()));
      // test merging against a non-compressing codec
      if (iwConf.getCodec() == otherCodec) {
        iwConfNew.setCodec(Codec.getDefault());
      } else {
        iwConfNew.setCodec(otherCodec);
      }
      iwConf = iwConfNew;
      iw = new RandomIndexWriter(random(), dir, iwConf);
    }
  }
  for (int i = 0; i < 10; ++i) {
    final int min = random().nextInt(data.length);
    final int max = min + random().nextInt(20);
    iw.deleteDocuments(IntPoint.newRangeQuery("id", min, max - 1));
  }
  // force merges with deletions
  iw.forceMerge(2);
  iw.commit();
  final DirectoryReader ir = DirectoryReader.open(dir);
  assertTrue(ir.numDocs() > 0);
  int numDocs = 0;
  for (int i = 0; i < ir.maxDoc(); ++i) {
    final Document doc = ir.document(i);
    if (doc == null) {
      continue;
    }
    ++numDocs;
    final int docId = doc.getField("id").numericValue().intValue();
    assertEquals(data[docId].length + 1, doc.getFields().size());
    for (int j = 0; j < data[docId].length; ++j) {
      final byte[] arr = data[docId][j];
      final BytesRef arr2Ref = doc.getBinaryValue("bytes" + j);
      final byte[] arr2 = Arrays.copyOfRange(arr2Ref.bytes, arr2Ref.offset, arr2Ref.offset + arr2Ref.length);
      assertArrayEquals(arr, arr2);
    }
  }
  assertTrue(ir.numDocs() <= numDocs);
  ir.close();
  iw.deleteAll();
  iw.commit();
  iw.forceMerge(1);
  iw.close();
  dir.close();
}
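randomByteArray is a private helper of the test class, not shown on this page. A plausible sketch, assuming it returns length random bytes each below max (small max values make the data highly compressible, which is what the stored-fields formats under test care about):

// Assumed helper: `length` random bytes, each in [0, max).
private byte[] randomByteArray(int length, int max) {
  final byte[] result = new byte[length];
  for (int i = 0; i < length; ++i) {
    result[i] = (byte) random().nextInt(max);
  }
  return result;
}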
Use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
Example from the class BaseStoredFieldsFormatTestCase, method testBigDocuments.
@Nightly
public void testBigDocuments() throws IOException {
  assumeWorkingMMapOnWindows();
  // "big" here means "much bigger than the chunk size".
  // For this test we force an FS dir: we can't just use newFSDirectory,
  // because this test doesn't really index anything, so if we got
  // NRTCachingDir+SimpleText we would build massive stored fields and
  // OOM (LUCENE-4484).
  Directory dir = new MockDirectoryWrapper(random(), new MMapDirectory(createTempDir("testBigDocuments")));
  IndexWriterConfig iwConf = newIndexWriterConfig(new MockAnalyzer(random()));
  iwConf.setMaxBufferedDocs(RandomNumbers.randomIntBetween(random(), 2, 30));
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
  if (dir instanceof MockDirectoryWrapper) {
    ((MockDirectoryWrapper) dir).setThrottling(Throttling.NEVER);
  }
  // no fields at all
  final Document emptyDoc = new Document();
  // lots of small fields
  final Document bigDoc1 = new Document();
  // one very big field
  final Document bigDoc2 = new Document();
  final Field idField = new StringField("id", "", Store.NO);
  emptyDoc.add(idField);
  bigDoc1.add(idField);
  bigDoc2.add(idField);
  final FieldType onlyStored = new FieldType(StringField.TYPE_STORED);
  onlyStored.setIndexOptions(IndexOptions.NONE);
  final Field smallField = new Field("fld", randomByteArray(random().nextInt(10), 256), onlyStored);
  final int numFields = RandomNumbers.randomIntBetween(random(), 500000, 1000000);
  for (int i = 0; i < numFields; ++i) {
    bigDoc1.add(smallField);
  }
  final Field bigField = new Field("fld", randomByteArray(RandomNumbers.randomIntBetween(random(), 1000000, 5000000), 2), onlyStored);
  bigDoc2.add(bigField);
  final int numDocs = atLeast(5);
  final Document[] docs = new Document[numDocs];
  for (int i = 0; i < numDocs; ++i) {
    docs[i] = RandomPicks.randomFrom(random(), Arrays.asList(emptyDoc, bigDoc1, bigDoc2));
  }
  for (int i = 0; i < numDocs; ++i) {
    idField.setStringValue("" + i);
    iw.addDocument(docs[i]);
    if (random().nextInt(numDocs) == 0) {
      iw.commit();
    }
  }
  iw.commit();
  // look at what happens when big docs are merged
  iw.forceMerge(1);
  final DirectoryReader rd = DirectoryReader.open(dir);
  final IndexSearcher searcher = new IndexSearcher(rd);
  for (int i = 0; i < numDocs; ++i) {
    final Query query = new TermQuery(new Term("id", "" + i));
    final TopDocs topDocs = searcher.search(query, 1);
    assertEquals("" + i, 1, topDocs.totalHits);
    final Document doc = rd.document(topDocs.scoreDocs[0].doc);
    assertNotNull(doc);
    final IndexableField[] fieldValues = doc.getFields("fld");
    assertEquals(docs[i].getFields("fld").length, fieldValues.length);
    if (fieldValues.length > 0) {
      assertEquals(docs[i].getFields("fld")[0].binaryValue(), fieldValues[0].binaryValue());
    }
  }
  rd.close();
  iw.close();
  dir.close();
}
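A note on the onlyStored pattern shared by the last two tests: starting from StringField.TYPE_STORED and disabling IndexOptions leaves a store-only type, which lets a single FieldType instance (frozen in testWriteReadMerge) back many binary fields. When no shared type is needed, StoredField gives the same effect directly; a minimal equivalent sketch:

// Equivalent store-only binary field without a custom FieldType.
StoredField direct = new StoredField("fld", new byte[] {1, 2, 3});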