Use of org.apache.lucene.util.LineFileDocs in project lucene-solr by apache.
In the class TestMemoryIndexAgainstRAMDir, the method testDuellMemIndex:
public void testDuellMemIndex() throws IOException {
  LineFileDocs lineFileDocs = new LineFileDocs(random());
  int numDocs = atLeast(10);
  MemoryIndex memory = randomMemoryIndex();
  for (int i = 0; i < numDocs; i++) {
    Directory dir = newDirectory();
    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
    mockAnalyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
    Document nextDoc = lineFileDocs.nextDoc();
    Document doc = new Document();
    for (IndexableField field : nextDoc.getFields()) {
      if (field.fieldType().indexOptions() != IndexOptions.NONE) {
        doc.add(field);
        if (random().nextInt(3) == 0) {
          // randomly add the same field twice
          doc.add(field);
        }
      }
    }
    writer.addDocument(doc);
    writer.close();
    for (IndexableField field : doc) {
      memory.addField(field.name(), ((Field) field).stringValue(), mockAnalyzer);
    }
    DirectoryReader competitor = DirectoryReader.open(dir);
    LeafReader memIndexReader = (LeafReader) memory.createSearcher().getIndexReader();
    TestUtil.checkReader(memIndexReader);
    duellReaders(competitor, memIndexReader);
    IOUtils.close(competitor, memIndexReader);
    memory.reset();
    dir.close();
  }
  lineFileDocs.close();
}
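The duel above drives MemoryIndex through its public API: addField analyzes and indexes one document's text entirely in RAM, createSearcher exposes it as a single-document index, and reset makes the instance reusable for the next document. Below is a minimal standalone sketch of that API, separate from the test harness; it assumes the lucene-memory and lucene-analyzers-common modules are on the classpath, and the class name and sample text are made up for illustration.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class MemoryIndexSketch {
  public static void main(String[] args) throws Exception {
    MemoryIndex mi = new MemoryIndex();
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
      // Analyze and index one document's field entirely in memory.
      mi.addField("body", "the quick brown fox", analyzer);
      // search(query) returns a relevance score, or 0.0f when nothing matches.
      float score = mi.search(new TermQuery(new Term("body", "fox")));
      System.out.println("score=" + score);
      // The same MemoryIndex instance can be reused for the next document.
      mi.reset();
    }
  }
}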
Use of org.apache.lucene.util.LineFileDocs in project lucene-solr by apache.
In the class BasePostingsFormatTestCase, the method testInvertedWrite:
// LUCENE-5123: make sure we can visit postings twice
// during flush/merge
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();
  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();
  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()
  iwc.setCodec(new FilterCodec(getCodec().getName(), getCodec()) {
    @Override
    public PostingsFormat postingsFormat() {
      final PostingsFormat defaultPostingsFormat = delegate.postingsFormat();
      final Thread mainThread = Thread.currentThread();
      return new PostingsFormat(defaultPostingsFormat.getName()) {
        @Override
        public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
          final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);
          return new FieldsConsumer() {
            @Override
            public void write(Fields fields) throws IOException {
              fieldsConsumer.write(fields);
              boolean isMerge = state.context.context == IOContext.Context.MERGE;
              // in this test:
              assert isMerge || Thread.currentThread() == mainThread;
              // We iterate the provided TermsEnum
              // twice, so we exercise this new freedom
              // with the inverted API; if
              // addOnSecondPass is true, we add up
              // term stats on the 2nd iteration:
              boolean addOnSecondPass = random().nextBoolean();
              //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);
              // Gather our own stats:
              Terms terms = fields.terms("body");
              assert terms != null;
              TermsEnum termsEnum = terms.iterator();
              PostingsEnum docs = null;
              while (termsEnum.next() != null) {
                BytesRef term = termsEnum.term();
                // TODO: also sometimes ask for payloads/offsets?
                boolean noPositions = random().nextBoolean();
                if (noPositions) {
                  docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                } else {
                  docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                }
                int docFreq = 0;
                long totalTermFreq = 0;
                while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                  docFreq++;
                  totalTermFreq += docs.freq();
                  int limit = TestUtil.nextInt(random(), 1, docs.freq());
                  if (!noPositions) {
                    for (int i = 0; i < limit; i++) {
                      docs.nextPosition();
                    }
                  }
                }
                String termString = term.utf8ToString();
                // During merge we should only see terms
                // we had already seen during a
                // previous flush:
                assertTrue(isMerge == false || termFreqs.containsKey(termString));
                if (isMerge == false) {
                  if (addOnSecondPass == false) {
                    TermFreqs tf = termFreqs.get(termString);
                    if (tf == null) {
                      tf = new TermFreqs();
                      termFreqs.put(termString, tf);
                    }
                    tf.docFreq += docFreq;
                    tf.totalTermFreq += totalTermFreq;
                    sumDocFreq.addAndGet(docFreq);
                    sumTotalTermFreq.addAndGet(totalTermFreq);
                  } else if (termFreqs.containsKey(termString) == false) {
                    // Add placeholder (2nd pass will
                    // set its counts):
                    termFreqs.put(termString, new TermFreqs());
                  }
                }
              }
              // Also test seeking the TermsEnum:
              for (String term : termFreqs.keySet()) {
                if (termsEnum.seekExact(new BytesRef(term))) {
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }
                  if (isMerge == false && addOnSecondPass) {
                    TermFreqs tf = termFreqs.get(term);
                    assert tf != null;
                    tf.docFreq += docFreq;
                    tf.totalTermFreq += totalTermFreq;
                    sumDocFreq.addAndGet(docFreq);
                    sumTotalTermFreq.addAndGet(totalTermFreq);
                  }
                  //System.out.println(" term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                  assertTrue(docFreq <= termFreqs.get(term).docFreq);
                  assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                }
              }
              // Also test seekCeil
              for (int iter = 0; iter < 10; iter++) {
                BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                SeekStatus status = termsEnum.seekCeil(term);
                if (status == SeekStatus.NOT_FOUND) {
                  assertTrue(term.compareTo(termsEnum.term()) < 0);
                }
              }
            }

            @Override
            public void close() throws IOException {
              fieldsConsumer.close();
            }
          };
        }

        @Override
        public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
          return defaultPostingsFormat.fieldsProducer(state);
        }
      };
    }
  });
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    Document justBodyDoc = new Document();
    justBodyDoc.add(doc.getField("body"));
    w.addDocument(justBodyDoc);
    bytesIndexed += RamUsageTester.sizeOf(justBodyDoc);
  }
  IndexReader r = w.getReader();
  w.close();
  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());
  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);
  r.close();
  dir.close();
}
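Stripped of the FilterCodec spy and the randomization, what the write() override above does is walk a field's TermsEnum and recount docFreq and totalTermFreq from the postings. A minimal sketch of that walk is below; it targets the same era of the API as the test (MultiFields.getTerms was later replaced by MultiTerms.getTerms), and the class and method names are illustrative.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class TermStatsSketch {
  /** Recomputes docFreq/totalTermFreq for every term of a field by walking its postings. */
  public static void printStats(IndexReader reader, String field) throws IOException {
    Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null) {
      return; // field is not indexed
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postings = null;
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      // Reuse the previous PostingsEnum where possible, as the test does.
      postings = termsEnum.postings(postings, PostingsEnum.FREQS);
      int docFreq = 0;
      long totalTermFreq = 0;
      while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        docFreq++;
        totalTermFreq += postings.freq();
      }
      // The recomputed counts should agree with the statistics the codec stored.
      assert docFreq == termsEnum.docFreq();
      assert totalTermFreq == termsEnum.totalTermFreq();
      System.out.println(term.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq);
    }
  }
}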
Use of org.apache.lucene.util.LineFileDocs in project lucene-solr by apache.
In the class TestFSTs, the method testRealTerms:
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
  final LineFileDocs docs = new LineFileDocs(random());
  final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
  final Path tempDir = createTempDir("fstlines");
  final Directory dir = newFSDirectory(tempDir);
  final IndexWriter writer = new IndexWriter(dir, conf);
  Document doc;
  int docCount = 0;
  while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
    writer.addDocument(doc);
    docCount++;
  }
  IndexReader r = DirectoryReader.open(writer);
  writer.close();
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
  boolean storeOrd = random().nextBoolean();
  if (VERBOSE) {
    if (storeOrd) {
      System.out.println("FST stores ord");
    } else {
      System.out.println("FST stores docFreq");
    }
  }
  Terms terms = MultiFields.getTerms(r, "body");
  if (terms != null) {
    final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
    final TermsEnum termsEnum = terms.iterator();
    if (VERBOSE) {
      System.out.println("TEST: got termsEnum=" + termsEnum);
    }
    BytesRef term;
    int ord = 0;
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
    while ((term = termsEnum.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertNotNull(term2);
      assertEquals(term, term2);
      assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
      assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termsEnum.docFreq();
      }
      builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
      ord++;
      if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
    }
    FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
    }
    if (ord > 0) {
      final Random random = new Random(random().nextLong());
      // Now confirm BytesRefFSTEnum and TermsEnum act the
      // same:
      final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
      int num = atLeast(1000);
      for (int iter = 0; iter < num; iter++) {
        final BytesRef randomTerm = new BytesRef(getRandomString(random));
        if (VERBOSE) {
          System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
        }
        final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
        final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
        if (seekResult == TermsEnum.SeekStatus.END) {
          assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
        } else {
          assertSame(termsEnum, fstEnum, storeOrd);
          for (int nextIter = 0; nextIter < 10; nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              if (storeOrd) {
                System.out.println(" ord=" + termsEnum.ord());
              }
            }
            if (termsEnum.next() != null) {
              if (VERBOSE) {
                System.out.println(" term=" + termsEnum.term().utf8ToString());
              }
              assertNotNull(fstEnum.next());
              assertSame(termsEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
                System.out.println(" end!");
              }
              BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
              if (nextResult != null) {
                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                fail();
              }
              break;
            }
          }
        }
      }
    }
  }
  r.close();
  dir.close();
}
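The FST-building part of the test reduces to a small pattern: feed sorted byte sequences plus a long output into a Builder with PositiveIntOutputs, then query the finished FST. A minimal standalone sketch of that pattern follows; it uses the same era of the API as the test (the Builder class was renamed FSTCompiler in later Lucene versions), and the class name and sample terms are made up.

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstSketch {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    // Inputs must arrive in (unsigned byte) sorted order, which is exactly what a TermsEnum provides.
    List<String> sortedTerms = Arrays.asList("apple", "banana", "cherry");
    long ord = 0;
    for (String term : sortedTerms) {
      builder.add(Util.toIntsRef(new BytesRef(term), scratch), ord++);
    }
    FST<Long> fst = builder.finish();
    // Exact-match lookup: returns the stored output, or null if the key is absent.
    Long value = Util.get(fst, new BytesRef("banana"));
    System.out.println("banana -> " + value); // prints 1
  }
}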
Use of org.apache.lucene.util.LineFileDocs in project lucene-solr by apache.
In the class TestAllFilesCheckIndexHeader, the method test:
public void test() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setCodec(TestUtil.getDefaultCodec());
  // Disable CFS 80% of the time so we can truncate individual files, but the other 20% of the time we test truncation of .cfs/.cfe too:
  if (random().nextInt(5) != 1) {
    conf.setUseCompoundFile(false);
    conf.getMergePolicy().setNoCFSRatio(0.0);
  }
  RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf);
  // Use LineFileDocs so we (hopefully) get most Lucene features
  // tested, e.g. IntPoint was recently added to it:
  LineFileDocs docs = new LineFileDocs(random());
  for (int i = 0; i < 100; i++) {
    riw.addDocument(docs.nextDoc());
    if (random().nextInt(7) == 0) {
      riw.commit();
    }
    if (random().nextInt(20) == 0) {
      riw.deleteDocuments(new Term("docid", Integer.toString(i)));
    }
    if (random().nextInt(15) == 0) {
      riw.updateNumericDocValue(new Term("docid", Integer.toString(i)), "docid_intDV", Long.valueOf(i));
    }
  }
  if (TEST_NIGHTLY == false) {
    riw.forceMerge(1);
  }
  riw.close();
  checkIndexHeader(dir);
  dir.close();
}
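All of these examples lean on the same LineFileDocs idiom: construct it with a Random (or null for sequential reading), call nextDoc() repeatedly, and close it when done. A minimal sketch of just that idiom is below; it assumes lucene-test-framework is on the classpath and runs as a LuceneTestCase subclass (the class and method names are illustrative).

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;

public class TestLineFileDocsBasics extends LuceneTestCase {

  public void testBasicPattern() throws Exception {
    // Reads the line-docs file bundled with the test framework; a non-null Random
    // starts at a random offset, null starts at the beginning of the file.
    LineFileDocs docs = new LineFileDocs(random());
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    for (int i = 0; i < 10; i++) {
      // Each call returns a Document with fields such as "title", "body", "docid" and "date".
      writer.addDocument(docs.nextDoc());
    }
    writer.close();
    dir.close();
    docs.close();
  }
}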
Use of org.apache.lucene.util.LineFileDocs in project lucene-solr by apache.
In the class TestBackwardsCompatibility, the method testCreateMoreTermsIndex:
public void testCreateMoreTermsIndex() throws Exception {
  Path indexDir = getIndexDir().resolve("moreterms");
  Files.deleteIfExists(indexDir);
  Directory dir = newFSDirectory(indexDir);
  LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
  mp.setNoCFSRatio(1.0);
  mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  // TODO: remove randomness
  IndexWriterConfig conf = new IndexWriterConfig(analyzer).setMergePolicy(mp).setUseCompoundFile(false);
  IndexWriter writer = new IndexWriter(dir, conf);
  LineFileDocs docs = new LineFileDocs(null);
  for (int i = 0; i < 50; i++) {
    writer.addDocument(docs.nextDoc());
  }
  docs.close();
  writer.close();
  dir.close();
  // Gives you time to copy the index out!: (there is also
  // a test option to not remove temp dir...):
  Thread.sleep(100000);
}
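A small hypothetical follow-up, since the sleep at the end exists only to give you time to copy the index out: open the freshly written "moreterms" directory with a plain DirectoryReader and print a couple of sanity stats. The class name and default path are illustrative, not part of the test.

import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class MoreTermsIndexCheck {
  public static void main(String[] args) throws Exception {
    Path indexDir = Paths.get(args.length > 0 ? args[0] : "moreterms");
    try (Directory dir = FSDirectory.open(indexDir);
         DirectoryReader reader = DirectoryReader.open(dir)) {
      // Basic sanity checks on the index produced by testCreateMoreTermsIndex.
      System.out.println("numDocs=" + reader.numDocs() + ", segments=" + reader.leaves().size());
    }
  }
}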