Use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.
Class FreqProxTermsWriter, method flush.
@Override
public void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
  super.flush(fieldsToFlush, state, sortMap);
  // Gather all fields that saw any postings:
  List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();
  for (TermsHashPerField f : fieldsToFlush.values()) {
    final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
    if (perField.bytesHash.size() > 0) {
      perField.sortPostings();
      assert perField.fieldInfo.getIndexOptions() != IndexOptions.NONE;
      allFields.add(perField);
    }
  }
  // Sort by field name
  CollectionUtil.introSort(allFields);
  Fields fields = new FreqProxFields(allFields);
  applyDeletes(state, fields);
  if (sortMap != null) {
    fields = new SortingLeafReader.SortingFields(fields, state.fieldInfos, sortMap);
  }
  FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
  boolean success = false;
  try {
    consumer.write(fields);
    success = true;
  } finally {
    if (success) {
      IOUtils.close(consumer);
    } else {
      IOUtils.closeWhileHandlingException(consumer);
    }
  }
}
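The flush above hands the in-memory postings to whatever FieldsConsumer the segment's codec provides, then closes it with IOUtils.close on success or IOUtils.closeWhileHandlingException on failure. Below is a minimal, hypothetical sketch (not part of lucene-solr) of a delegating FieldsConsumer that counts the fields passed through write; the class name and counter are assumptions added for illustration.

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.Fields;

// Hypothetical helper, not present in lucene-solr: wraps another FieldsConsumer
// and counts how many fields each write(Fields) call passes through.
class CountingFieldsConsumer extends FieldsConsumer {
  private final FieldsConsumer delegate;
  private int fieldCount;

  CountingFieldsConsumer(FieldsConsumer delegate) {
    this.delegate = delegate;
  }

  @Override
  public void write(Fields fields) throws IOException {
    // Fields is Iterable<String> over field names:
    for (String field : fields) {
      fieldCount++;
    }
    delegate.write(fields);
  }

  @Override
  public void close() throws IOException {
    delegate.close();
  }

  int fieldCount() {
    return fieldCount;
  }
}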
Use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.
Class FSTOrdPostingsFormat, method fieldsConsumer.
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
  boolean success = false;
  try {
    FieldsConsumer ret = new FSTOrdTermsWriter(state, postingsWriter);
    success = true;
    return ret;
  } finally {
    if (!success) {
      IOUtils.closeWhileHandlingException(postingsWriter);
    }
  }
}
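The success/finally guard ensures the PostingsWriterBase is released if the FSTOrdTermsWriter constructor throws. To actually exercise this fieldsConsumer, a test would install the postings format on an IndexWriterConfig; the sketch below is a hypothetical wiring example (class and method names are assumptions), using Lucene's test-framework helper TestUtil.alwaysPostingsFormat to route every field through FSTOrdPostingsFormat.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.memory.FSTOrdPostingsFormat;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.TestUtil;

// Hypothetical wiring sketch, not lucene-solr code: every new segment's postings
// are then written by FSTOrdPostingsFormat.fieldsConsumer shown above.
public final class FSTOrdWiringSketch {
  public static IndexWriterConfig newConfig() {
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    // Route every field's postings through FSTOrdPostingsFormat:
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new FSTOrdPostingsFormat()));
    return iwc;
  }
}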
Use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.
Class RandomPostingsTester, method buildIndex.
// maxAllowed = the "highest" we can index, but we will still
// randomly index at lower IndexOption
public FieldsProducer buildIndex(Codec codec, Directory dir, IndexOptions maxAllowed, boolean allowPayloads, boolean alwaysTestMax) throws IOException {
  SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "_0", maxDoc, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null);
  int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
  if (LuceneTestCase.VERBOSE) {
    System.out.println("\nTEST: now build index");
  }
  // TODO use allowPayloads
  FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
  for (int fieldUpto = 0; fieldUpto < fields.size(); fieldUpto++) {
    FieldInfo oldFieldInfo = fieldInfos.fieldInfo(fieldUpto);
    // Randomly pick the IndexOptions to index this
    // field with:
    IndexOptions indexOptions = IndexOptions.values()[alwaysTestMax ? maxIndexOption : TestUtil.nextInt(random, 1, maxIndexOption)];
    boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
    newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name, fieldUpto, false, false, doPayloads, indexOptions, DocValuesType.NONE, -1, new HashMap<>(), 0, 0);
  }
  FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
  // Estimate that flushed segment size will be 25% of
  // what we use in RAM:
  long bytes = totalPostings * 8 + totalPayloadBytes;
  SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, null, new IOContext(new FlushInfo(maxDoc, bytes)));
  Fields seedFields = new SeedFields(fields, newFieldInfos, maxAllowed, allowPayloads);
  FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(writeState);
  boolean success = false;
  try {
    consumer.write(seedFields);
    success = true;
  } finally {
    if (success) {
      IOUtils.close(consumer);
    } else {
      IOUtils.closeWhileHandlingException(consumer);
    }
  }
  if (LuceneTestCase.VERBOSE) {
    System.out.println("TEST: after indexing: files=");
    for (String file : dir.listAll()) {
      System.out.println(" " + file + ": " + dir.fileLength(file) + " bytes");
    }
  }
  currentFieldInfos = newFieldInfos;
  SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ);
  return codec.postingsFormat().fieldsProducer(readState);
}
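buildIndex writes the randomly generated postings through the codec's FieldsConsumer and then reopens them through the matching FieldsProducer for the same segment. A caller can walk the returned producer like any Fields instance; the helper below is a hypothetical sketch (not part of RandomPostingsTester) showing one way to enumerate every field and term it exposes.

import java.io.IOException;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper: dump every field/term the FieldsProducer exposes, then close it.
final class PostingsDump {
  static void dump(FieldsProducer producer) throws IOException {
    try {
      // FieldsProducer extends Fields, which iterates field names:
      for (String field : producer) {
        Terms terms = producer.terms(field);
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
          System.out.println(field + " " + term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
        }
      }
    } finally {
      producer.close();
    }
  }
}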
Use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.
Class BasePostingsFormatTestCase, method testInvertedWrite.
// LUCENE-5123: make sure we can visit postings twice
// during flush/merge
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();
  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();
  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()
  iwc.setCodec(new FilterCodec(getCodec().getName(), getCodec()) {
    @Override
    public PostingsFormat postingsFormat() {
      final PostingsFormat defaultPostingsFormat = delegate.postingsFormat();
      final Thread mainThread = Thread.currentThread();
      return new PostingsFormat(defaultPostingsFormat.getName()) {
        @Override
        public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
          final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);
          return new FieldsConsumer() {
            @Override
            public void write(Fields fields) throws IOException {
              fieldsConsumer.write(fields);
              boolean isMerge = state.context.context == IOContext.Context.MERGE;
              // in this test:
              assert isMerge || Thread.currentThread() == mainThread;
              // We iterate the provided TermsEnum
              // twice, so we exercise this new freedom
              // with the inverted API; if
              // addOnSecondPass is true, we add up
              // term stats on the 2nd iteration:
              boolean addOnSecondPass = random().nextBoolean();
              //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);
              // Gather our own stats:
              Terms terms = fields.terms("body");
              assert terms != null;
              TermsEnum termsEnum = terms.iterator();
              PostingsEnum docs = null;
              while (termsEnum.next() != null) {
                BytesRef term = termsEnum.term();
                // TODO: also sometimes ask for payloads/offsets?
                boolean noPositions = random().nextBoolean();
                if (noPositions) {
                  docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                } else {
                  docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                }
                int docFreq = 0;
                long totalTermFreq = 0;
                while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                  docFreq++;
                  totalTermFreq += docs.freq();
                  int limit = TestUtil.nextInt(random(), 1, docs.freq());
                  if (!noPositions) {
                    for (int i = 0; i < limit; i++) {
                      docs.nextPosition();
                    }
                  }
                }
                String termString = term.utf8ToString();
                // During merge we should only see terms
                // we had already seen during a
                // previous flush:
                assertTrue(isMerge == false || termFreqs.containsKey(termString));
                if (isMerge == false) {
                  if (addOnSecondPass == false) {
                    TermFreqs tf = termFreqs.get(termString);
                    if (tf == null) {
                      tf = new TermFreqs();
                      termFreqs.put(termString, tf);
                    }
                    tf.docFreq += docFreq;
                    tf.totalTermFreq += totalTermFreq;
                    sumDocFreq.addAndGet(docFreq);
                    sumTotalTermFreq.addAndGet(totalTermFreq);
                  } else if (termFreqs.containsKey(termString) == false) {
                    // Add placeholder (2nd pass will
                    // set its counts):
                    termFreqs.put(termString, new TermFreqs());
                  }
                }
              }
              // Also test seeking the TermsEnum:
              for (String term : termFreqs.keySet()) {
                if (termsEnum.seekExact(new BytesRef(term))) {
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }
                  if (isMerge == false && addOnSecondPass) {
                    TermFreqs tf = termFreqs.get(term);
                    assert tf != null;
                    tf.docFreq += docFreq;
                    tf.totalTermFreq += totalTermFreq;
                    sumDocFreq.addAndGet(docFreq);
                    sumTotalTermFreq.addAndGet(totalTermFreq);
                  }
                  //System.out.println(" term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                  assertTrue(docFreq <= termFreqs.get(term).docFreq);
                  assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                }
              }
              // Also test seekCeil
              for (int iter = 0; iter < 10; iter++) {
                BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                SeekStatus status = termsEnum.seekCeil(term);
                if (status == SeekStatus.NOT_FOUND) {
                  assertTrue(term.compareTo(termsEnum.term()) < 0);
                }
              }
            }

            @Override
            public void close() throws IOException {
              fieldsConsumer.close();
            }
          };
        }

        @Override
        public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
          return defaultPostingsFormat.fieldsProducer(state);
        }
      };
    }
  });
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    Document justBodyDoc = new Document();
    justBodyDoc.add(doc.getField("body"));
    w.addDocument(justBodyDoc);
    bytesIndexed += RamUsageTester.sizeOf(justBodyDoc);
  }
  IndexReader r = w.getReader();
  w.close();
  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());
  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);
  r.close();
  dir.close();
}
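The test above accumulates its expected statistics in a TermFreqs helper that the snippet does not show. In BasePostingsFormatTestCase it is essentially a mutable pair of per-term counters; a rough sketch follows (the exact field types in the real class may differ).

// Rough sketch of the TermFreqs helper referenced above; field types are assumed.
static class TermFreqs {
  long docFreq;
  long totalTermFreq;
}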
Use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.
Class LuceneFixedGap, method fieldsConsumer.
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  PostingsWriterBase docs = new Lucene50PostingsWriter(state);
  // TODO: should we make the terms index more easily
  // pluggable? Ie so that this codec would record which
  // index impl was used, and switch on loading?
  // Or... you must make a new Codec for this?
  TermsIndexWriterBase indexWriter;
  boolean success = false;
  try {
    indexWriter = new FixedGapTermsIndexWriter(state, termIndexInterval);
    success = true;
  } finally {
    if (!success) {
      docs.close();
    }
  }
  success = false;
  try {
    // Must use BlockTermsWriter (not BlockTree) because
    // BlockTree doesn't support ords (yet)...
    FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
    success = true;
    return ret;
  } finally {
    if (!success) {
      try {
        docs.close();
      } finally {
        indexWriter.close();
      }
    }
  }
}
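LuceneFixedGap guards each construction step separately: if the FixedGapTermsIndexWriter constructor fails, only the postings writer is closed; if the BlockTermsWriter constructor fails, both writers are closed. The same discipline can be expressed with a single guard using IOUtils.closeWhileHandlingException, which ignores null arguments; the sketch below is a hypothetical reformulation (class and method names are assumptions, not lucene-solr code).

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blockterms.BlockTermsWriter;
import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

// Hypothetical sketch of the same resource-guard idiom with one try/finally.
public final class FixedGapConsumerSketch {
  public static FieldsConsumer open(SegmentWriteState state, int termIndexInterval) throws IOException {
    PostingsWriterBase docs = null;
    TermsIndexWriterBase indexWriter = null;
    boolean success = false;
    try {
      docs = new Lucene50PostingsWriter(state);
      indexWriter = new FixedGapTermsIndexWriter(state, termIndexInterval);
      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
      success = true;
      return ret;
    } finally {
      if (!success) {
        // Close whatever was opened so far; nulls are skipped.
        IOUtils.closeWhileHandlingException(docs, indexWriter);
      }
    }
  }
}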