Use of org.apache.lucene.document.NumericDocValuesField in project lucene-solr by apache.
From the class TestPostingsOffsets, method testRandom:
public void testRandom() throws Exception {
  // token -> docID -> tokens
  final Map<String, Map<Integer, List<Token>>> actualTokens = new HashMap<>();
  Directory dir = newDirectory();
  // iwc is an IndexWriterConfig fixture field defined outside this excerpt
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  final int numDocs = atLeast(20);
  //final int numDocs = atLeast(5);
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // TODO: randomize what IndexOptions we use; also test
  // changing this up in one IW buffered segment...:
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(random().nextBoolean());
    ft.setStoreTermVectorPositions(random().nextBoolean());
  }
  for (int docCount = 0; docCount < numDocs; docCount++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("id", docCount));
    List<Token> tokens = new ArrayList<>();
    final int numTokens = atLeast(100);
    //final int numTokens = atLeast(20);
    int pos = -1;
    int offset = 0;
    //System.out.println("doc id=" + docCount);
    for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) {
      final String text;
      if (random().nextBoolean()) {
        text = "a";
      } else if (random().nextBoolean()) {
        text = "b";
      } else if (random().nextBoolean()) {
        text = "c";
      } else {
        text = "d";
      }
      int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
      if (tokenCount == 0 && posIncr == 0) {
        posIncr = 1;
      }
      final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
      final int tokenOffset = random().nextInt(5);
      final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
      if (!actualTokens.containsKey(text)) {
        actualTokens.put(text, new HashMap<Integer, List<Token>>());
      }
      final Map<Integer, List<Token>> postingsByDoc = actualTokens.get(text);
      if (!postingsByDoc.containsKey(docCount)) {
        postingsByDoc.put(docCount, new ArrayList<Token>());
      }
      postingsByDoc.get(docCount).add(token);
      tokens.add(token);
      pos += posIncr;
      // stuff abs position into type:
      token.setType("" + pos);
      offset += offIncr + tokenOffset;
      //System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
    }
    doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
    w.addDocument(doc);
  }
  final DirectoryReader r = w.getReader();
  w.close();
  final String[] terms = new String[] { "a", "b", "c", "d" };
  for (LeafReaderContext ctx : r.leaves()) {
    // TODO: improve this
    LeafReader sub = ctx.reader();
    //System.out.println("\nsub=" + sub);
    final TermsEnum termsEnum = sub.fields().terms("content").iterator();
    PostingsEnum docs = null;
    PostingsEnum docsAndPositions = null;
    PostingsEnum docsAndPositionsAndOffsets = null;
    int[] docIDToID = new int[sub.maxDoc()];
    NumericDocValues values = DocValues.getNumeric(sub, "id");
    for (int i = 0; i < sub.maxDoc(); i++) {
      assertEquals(i, values.nextDoc());
      docIDToID[i] = (int) values.longValue();
    }
    for (String term : terms) {
      //System.out.println(" term=" + term);
      if (termsEnum.seekExact(new BytesRef(term))) {
        docs = termsEnum.postings(docs);
        assertNotNull(docs);
        int doc;
        //System.out.println(" doc/freq");
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docs.freq());
        }
        // explicitly exclude offsets here: POSITIONS requests positions but not offsets
        docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.POSITIONS);
        assertNotNull(docsAndPositions);
        //System.out.println(" doc/freq/pos");
        while ((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositions.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            //System.out.println(" pos=" + pos);
            assertEquals(pos, docsAndPositions.nextPosition());
          }
        }
        docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
        assertNotNull(docsAndPositionsAndOffsets);
        //System.out.println(" doc/freq/pos/offs");
        while ((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            //System.out.println(" pos=" + pos);
            assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
            assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
            assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
          }
        }
      }
    }
    // TODO: test advance:
  }
  r.close();
  dir.close();
}
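
A minimal standalone sketch of the idiom this test relies on: index a per-document id as a NumericDocValuesField, then rebuild a docID-to-id map per segment via the iterator-style doc values API (Lucene 7.x). The directory, analyzer, and class name here are illustrative assumptions, not taken from the test.

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class DocIdToIdSketch {

  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
      for (int id = 0; id < 10; id++) {
        Document doc = new Document();
        // doc values only: stored column-wise, neither inverted nor kept as a stored field
        doc.add(new NumericDocValuesField("id", id));
        w.addDocument(doc);
      }
    }
    try (DirectoryReader r = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : r.leaves()) {
        LeafReader sub = ctx.reader();
        int[] docIDToID = new int[sub.maxDoc()];
        NumericDocValues values = DocValues.getNumeric(sub, "id");
        // the field is dense here (every doc has a value), so nextDoc() visits each docID in order
        for (int docID = values.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = values.nextDoc()) {
          docIDToID[docID] = (int) values.longValue();
        }
      }
    }
  }
}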
Use of org.apache.lucene.document.NumericDocValuesField in project lucene-solr by apache.
From the class TestPointQueries, method verifyLongs:
// verify for long values
private static void verifyLongs(long[] values, int[] ids) throws Exception {
  IndexWriterConfig iwc = newIndexWriterConfig();
  // Else we can get O(N^2) merging:
  int mbd = iwc.getMaxBufferedDocs();
  if (mbd != -1 && mbd < values.length / 100) {
    iwc.setMaxBufferedDocs(values.length / 100);
  }
  iwc.setCodec(getCodec());
  Directory dir;
  if (values.length > 100000) {
    dir = newMaybeVirusCheckingFSDirectory(createTempDir("TestRangeTree"));
  } else {
    dir = newMaybeVirusCheckingDirectory();
  }
  int missingPct = random().nextInt(100);
  int deletedPct = random().nextInt(100);
  if (VERBOSE) {
    System.out.println(" missingPct=" + missingPct);
    System.out.println(" deletedPct=" + deletedPct);
  }
  BitSet missing = new BitSet();
  BitSet deleted = new BitSet();
  Document doc = null;
  int lastID = -1;
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  for (int ord = 0; ord < values.length; ord++) {
    int id;
    if (ids == null) {
      id = ord;
    } else {
      id = ids[ord];
    }
    if (id != lastID) {
      if (random().nextInt(100) < missingPct) {
        missing.set(id);
        if (VERBOSE) {
          System.out.println(" missing id=" + id);
        }
      }
      if (doc != null) {
        w.addDocument(doc);
        if (random().nextInt(100) < deletedPct) {
          int idToDelete = random().nextInt(id);
          w.deleteDocuments(new Term("id", "" + idToDelete));
          deleted.set(idToDelete);
          if (VERBOSE) {
            System.out.println(" delete id=" + idToDelete);
          }
        }
      }
      doc = new Document();
      doc.add(newStringField("id", "" + id, Field.Store.NO));
      doc.add(new NumericDocValuesField("id", id));
      lastID = id;
    }
    if (missing.get(id) == false) {
      doc.add(new LongPoint("sn_value", values[id]));
      byte[] bytes = new byte[8];
      NumericUtils.longToSortableBytes(values[id], bytes, 0);
      doc.add(new BinaryPoint("ss_value", bytes));
    }
  }
  w.addDocument(doc);
  if (random().nextBoolean()) {
    if (VERBOSE) {
      System.out.println(" forceMerge(1)");
    }
    w.forceMerge(1);
  }
  final IndexReader r = w.getReader();
  w.close();
  IndexSearcher s = newSearcher(r, false);
  int numThreads = TestUtil.nextInt(random(), 2, 5);
  if (VERBOSE) {
    System.out.println("TEST: use " + numThreads + " query threads; searcher=" + s);
  }
  List<Thread> threads = new ArrayList<>();
  final int iters = atLeast(100);
  final CountDownLatch startingGun = new CountDownLatch(1);
  final AtomicBoolean failed = new AtomicBoolean();
  for (int i = 0; i < numThreads; i++) {
    Thread thread = new Thread() {

      @Override
      public void run() {
        try {
          _run();
        } catch (Exception e) {
          failed.set(true);
          throw new RuntimeException(e);
        }
      }

      private void _run() throws Exception {
        startingGun.await();
        for (int iter = 0; iter < iters && failed.get() == false; iter++) {
          Long lower = randomValue();
          Long upper = randomValue();
          if (upper < lower) {
            long x = lower;
            lower = upper;
            upper = x;
          }
          Query query;
          if (VERBOSE) {
            System.out.println("\n" + Thread.currentThread().getName() + ": TEST: iter=" + iter + " value=" + lower + " TO " + upper);
            byte[] tmp = new byte[8];
            if (lower != null) {
              NumericUtils.longToSortableBytes(lower, tmp, 0);
              System.out.println(" lower bytes=" + Arrays.toString(tmp));
            }
            if (upper != null) {
              NumericUtils.longToSortableBytes(upper, tmp, 0);
              System.out.println(" upper bytes=" + Arrays.toString(tmp));
            }
          }
          if (random().nextBoolean()) {
            query = LongPoint.newRangeQuery("sn_value", lower, upper);
          } else {
            byte[] lowerBytes = new byte[8];
            NumericUtils.longToSortableBytes(lower, lowerBytes, 0);
            byte[] upperBytes = new byte[8];
            NumericUtils.longToSortableBytes(upper, upperBytes, 0);
            query = BinaryPoint.newRangeQuery("ss_value", lowerBytes, upperBytes);
          }
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": using query: " + query);
          }
          final BitSet hits = new BitSet();
          s.search(query, new SimpleCollector() {

            private int docBase;

            @Override
            public boolean needsScores() {
              return false;
            }

            @Override
            protected void doSetNextReader(LeafReaderContext context) throws IOException {
              docBase = context.docBase;
            }

            @Override
            public void collect(int doc) {
              hits.set(docBase + doc);
            }
          });
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": hitCount: " + hits.cardinality());
          }
          NumericDocValues docIDToID = MultiDocValues.getNumericValues(r, "id");
          for (int docID = 0; docID < r.maxDoc(); docID++) {
            assertEquals(docID, docIDToID.nextDoc());
            int id = (int) docIDToID.longValue();
            boolean expected = missing.get(id) == false && deleted.get(id) == false && values[id] >= lower && values[id] <= upper;
            if (hits.get(docID) != expected) {
              // We do an exact comparison, so the range query should never disagree:
              fail(Thread.currentThread().getName() + ": iter=" + iter + " id=" + id + " docID=" + docID + " value=" + values[id] + " (range: " + lower + " TO " + upper + ") expected " + expected + " but got: " + hits.get(docID) + " deleted?=" + deleted.get(id) + " query=" + query);
            }
          }
        }
      }
    };
    thread.setName("T" + i);
    thread.start();
    threads.add(thread);
  }
  startingGun.countDown();
  for (Thread thread : threads) {
    thread.join();
  }
  IOUtils.close(r, dir);
}
Use of org.apache.lucene.document.NumericDocValuesField in project lucene-solr by apache.
From the class TestLongValuesSource, method setUp:
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  int numDocs = TestUtil.nextInt(random(), 2049, 4000);
  for (int i = 0; i < numDocs; i++) {
    Document document = new Document();
    document.add(newTextField("english", English.intToEnglish(i), Field.Store.NO));
    document.add(newTextField("oddeven", (i % 2 == 0) ? "even" : "odd", Field.Store.NO));
    document.add(new NumericDocValuesField("int", random().nextInt()));
    document.add(new NumericDocValuesField("long", random().nextLong()));
    if (i == 545) {
      document.add(new NumericDocValuesField("onefield", 45));
    }
    iw.addDocument(document);
  }
  reader = iw.getReader();
  iw.close();
  searcher = newSearcher(reader);
}
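
A hedged sketch of how a fixture like this is typically consumed, assuming the reader/searcher fields built in setUp above: sort by the "long" doc values field directly, or wrap it in a LongValuesSource, the abstraction this test class exercises. The query and hit count below are arbitrary choices for illustration.

// SortField reads the "long" NumericDocValuesField added in setUp
Sort sort = new Sort(new SortField("long", SortField.Type.LONG));
TopDocs hits = searcher.search(new TermQuery(new Term("oddeven", "even")), 10, sort);

// the same field exposed through LongValuesSource:
LongValuesSource source = LongValuesSource.fromLongField("long");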
Use of org.apache.lucene.document.NumericDocValuesField in project lucene-solr by apache.
From the class TestDemoParallelLeafReader, method getReindexerNewDVFields:
/** Schema change by adding a new number_<schemaGen> DV field each time. */
private ReindexingReader getReindexerNewDVFields(Path root, final AtomicLong currentSchemaGen) throws IOException {
  return new ReindexingReader(root) {

    @Override
    protected IndexWriterConfig getIndexWriterConfig() throws IOException {
      IndexWriterConfig iwc = newIndexWriterConfig();
      TieredMergePolicy tmp = new TieredMergePolicy();
      // We write tiny docs, so we need tiny floor to avoid O(N^2) merging:
      tmp.setFloorSegmentMB(.01);
      iwc.setMergePolicy(tmp);
      return iwc;
    }

    @Override
    protected Directory openDirectory(Path path) throws IOException {
      MockDirectoryWrapper dir = newMockFSDirectory(path);
      dir.setUseSlowOpenClosers(false);
      dir.setThrottling(Throttling.NEVER);
      return dir;
    }

    @Override
    protected void reindex(long oldSchemaGen, long newSchemaGen, LeafReader reader, Directory parallelDir) throws IOException {
      IndexWriterConfig iwc = newIndexWriterConfig();
      // The order of our docIDs must precisely match the incoming reader:
      iwc.setMergePolicy(new LogByteSizeMergePolicy());
      IndexWriter w = new IndexWriter(parallelDir, iwc);
      int maxDoc = reader.maxDoc();
      if (oldSchemaGen <= 0) {
        // Must slowly parse the stored field into a new doc values field:
        for (int i = 0; i < maxDoc; i++) {
          // TODO: is this still O(blockSize^2)?
          Document oldDoc = reader.document(i);
          Document newDoc = new Document();
          long value = Long.parseLong(oldDoc.get("text").split(" ")[1]);
          newDoc.add(new NumericDocValuesField("number_" + newSchemaGen, value));
          newDoc.add(new LongPoint("number", value));
          w.addDocument(newDoc);
        }
      } else {
        // Just carry over doc values from previous field:
        NumericDocValues oldValues = reader.getNumericDocValues("number_" + oldSchemaGen);
        assertNotNull("oldSchemaGen=" + oldSchemaGen, oldValues);
        for (int i = 0; i < maxDoc; i++) {
          // TODO: is this still O(blockSize^2)?
          assertEquals(i, oldValues.nextDoc());
          Document oldDoc = reader.document(i);
          Document newDoc = new Document();
          newDoc.add(new NumericDocValuesField("number_" + newSchemaGen, oldValues.longValue()));
          w.addDocument(newDoc);
        }
      }
      w.forceMerge(1);
      w.close();
    }

    @Override
    protected long getCurrentSchemaGen() {
      return currentSchemaGen.get();
    }

    @Override
    protected void checkParallelReader(LeafReader r, LeafReader parR, long schemaGen) throws IOException {
      String fieldName = "number_" + schemaGen;
      if (DEBUG) {
        System.out.println(Thread.currentThread().getName() + ": TEST: now check parallel number DVs field=" + fieldName + " r=" + r + " parR=" + parR);
      }
      NumericDocValues numbers = parR.getNumericDocValues(fieldName);
      if (numbers == null) {
        return;
      }
      int maxDoc = r.maxDoc();
      boolean failed = false;
      for (int i = 0; i < maxDoc; i++) {
        Document oldDoc = r.document(i);
        long value = Long.parseLong(oldDoc.get("text").split(" ")[1]);
        assertEquals(i, numbers.nextDoc());
        if (value != numbers.longValue()) {
          if (DEBUG) {
            System.out.println("FAIL: docID=" + i + " " + oldDoc + " value=" + value + " number=" + numbers.longValue() + " numbers=" + numbers);
          }
          failed = true;
        } else if (failed) {
          if (DEBUG) {
            System.out.println("OK: docID=" + i + " " + oldDoc + " value=" + value + " number=" + numbers.longValue());
          }
        }
      }
      assertFalse("FAILED field=" + fieldName + " r=" + r, failed);
    }
  };
}
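
Both reindex branches above iterate with assertEquals(i, oldValues.nextDoc()), which holds only because every document in the segment has a value for the field. If the field could be sparse, advanceExact is the safer idiom in the Lucene 7.x iterator API. This is a hypothetical helper, not part of ReindexingReader:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;

// hypothetical helper: read a possibly-sparse numeric DV field docID by docID
static void visitNumericField(LeafReader reader, String field) throws IOException {
  NumericDocValues dv = reader.getNumericDocValues(field);
  for (int docID = 0; docID < reader.maxDoc(); docID++) {
    if (dv != null && dv.advanceExact(docID)) {
      long value = dv.longValue(); // this doc has a value
    } else {
      // this doc has no value for the field; supply a default or skip it
    }
  }
}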
Use of org.apache.lucene.document.NumericDocValuesField in project lucene-solr by apache.
From the class TestBinaryDocValuesUpdates, method testMultipleDocValuesTypes:
public void testMultipleDocValuesTypes() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  // prevent merges
  conf.setMaxBufferedDocs(10);
  IndexWriter writer = new IndexWriter(dir, conf);
  for (int i = 0; i < 4; i++) {
    Document doc = new Document();
    doc.add(new StringField("dvUpdateKey", "dv", Store.NO));
    doc.add(new NumericDocValuesField("ndv", i));
    doc.add(new BinaryDocValuesField("bdv", new BytesRef(Integer.toString(i))));
    doc.add(new SortedDocValuesField("sdv", new BytesRef(Integer.toString(i))));
    doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(Integer.toString(i))));
    doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(Integer.toString(i * 2))));
    writer.addDocument(doc);
  }
  writer.commit();
  // update all docs' bdv field
  writer.updateBinaryDocValue(new Term("dvUpdateKey", "dv"), "bdv", toBytes(17L));
  writer.close();
  final DirectoryReader reader = DirectoryReader.open(dir);
  LeafReader r = reader.leaves().get(0).reader();
  NumericDocValues ndv = r.getNumericDocValues("ndv");
  BinaryDocValues bdv = r.getBinaryDocValues("bdv");
  SortedDocValues sdv = r.getSortedDocValues("sdv");
  SortedSetDocValues ssdv = r.getSortedSetDocValues("ssdv");
  for (int i = 0; i < r.maxDoc(); i++) {
    assertEquals(i, ndv.nextDoc());
    assertEquals(i, ndv.longValue());
    assertEquals(i, bdv.nextDoc());
    assertEquals(17, getValue(bdv));
    assertEquals(i, sdv.nextDoc());
    BytesRef term = sdv.binaryValue();
    assertEquals(new BytesRef(Integer.toString(i)), term);
    assertEquals(i, ssdv.nextDoc());
    long ord = ssdv.nextOrd();
    term = ssdv.lookupOrd(ord);
    assertEquals(i, Integer.parseInt(term.utf8ToString()));
    // For the i=0 case, we added the same value twice, which was dedup'd by IndexWriter so it has only one value:
    if (i != 0) {
      ord = ssdv.nextOrd();
      term = ssdv.lookupOrd(ord);
      assertEquals(i * 2, Integer.parseInt(term.utf8ToString()));
    }
    assertEquals(SortedSetDocValues.NO_MORE_ORDS, ssdv.nextOrd());
  }
  reader.close();
  dir.close();
}
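
The test only updates the binary field; the numeric analogue is IndexWriter.updateNumericDocValue, which rewrites a NumericDocValuesField for every document matching the key term without re-indexing those documents. A one-line sketch against the writer and fields above (it would go before writer.close(); the value 42 is arbitrary):

// update all matching docs' "ndv" field in place
writer.updateNumericDocValue(new Term("dvUpdateKey", "dv"), "ndv", 42L);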