
Example 1 with FieldsConsumer

use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.

From the class FreqProxTermsWriter, method flush:

@Override
public void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
    super.flush(fieldsToFlush, state, sortMap);
    // Gather all fields that saw any postings:
    List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();
    for (TermsHashPerField f : fieldsToFlush.values()) {
        final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
        if (perField.bytesHash.size() > 0) {
            perField.sortPostings();
            assert perField.fieldInfo.getIndexOptions() != IndexOptions.NONE;
            allFields.add(perField);
        }
    }
    // Sort by field name
    CollectionUtil.introSort(allFields);
    Fields fields = new FreqProxFields(allFields);
    applyDeletes(state, fields);
    if (sortMap != null) {
        fields = new SortingLeafReader.SortingFields(fields, state.fieldInfos, sortMap);
    }
    FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
    boolean success = false;
    try {
        consumer.write(fields);
        success = true;
    } finally {
        if (success) {
            IOUtils.close(consumer);
        } else {
            IOUtils.closeWhileHandlingException(consumer);
        }
    }
}
Also used: ArrayList(java.util.ArrayList) FieldsConsumer(org.apache.lucene.codecs.FieldsConsumer)
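
The important part of this flush is the write-then-close idiom around the FieldsConsumer. Below is a minimal sketch of that idiom factored into a standalone helper; the class and method names are illustrative, and it assumes a fully built SegmentWriteState plus the Fields to write are already available (the same pattern appears again in Example 3).

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

final class PostingsWriteHelper {

    private PostingsWriteHelper() {
    }

    // Writes all postings in `fields` through the codec configured on the segment.
    static void writePostings(SegmentWriteState state, Fields fields) throws IOException {
        FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
        boolean success = false;
        try {
            // Single pass over every field's terms and postings:
            consumer.write(fields);
            success = true;
        } finally {
            if (success) {
                // Normal path: close cleanly so the codec can finish its files.
                IOUtils.close(consumer);
            } else {
                // Failure path: close without masking the original exception.
                IOUtils.closeWhileHandlingException(consumer);
            }
        }
    }
}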

Example 2 with FieldsConsumer

use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.

From the class FSTOrdPostingsFormat, method fieldsConsumer:

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
    boolean success = false;
    try {
        FieldsConsumer ret = new FSTOrdTermsWriter(state, postingsWriter);
        success = true;
        return ret;
    } finally {
        if (!success) {
            IOUtils.closeWhileHandlingException(postingsWriter);
        }
    }
}
Also used: PostingsWriterBase(org.apache.lucene.codecs.PostingsWriterBase) FieldsConsumer(org.apache.lucene.codecs.FieldsConsumer) Lucene50PostingsWriter(org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter)
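
The same construct-or-clean-up shape works with any terms dictionary. Here is a hedged sketch using the default block-tree terms writer instead of FSTOrdTermsWriter; the class and the format name "MyPostings" are illustrative and not part of lucene-solr, and registering the format (via SPI or a custom Codec) is omitted.

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

public class MyPostingsFormat extends PostingsFormat {

    public MyPostingsFormat() {
        super("MyPostings");
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
        boolean success = false;
        try {
            // The terms writer takes ownership of postingsWriter and closes it later:
            FieldsConsumer ret = new BlockTreeTermsWriter(state, postingsWriter,
                    BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
                    BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
            success = true;
            return ret;
        } finally {
            if (!success) {
                // The terms writer was never built, so close the postings writer here.
                IOUtils.closeWhileHandlingException(postingsWriter);
            }
        }
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        // Write-only sketch; a real format must return a matching reader here.
        throw new UnsupportedOperationException("write-only sketch");
    }
}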

Example 3 with FieldsConsumer

use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.

From the class RandomPostingsTester, method buildIndex:

// maxAllowed = the "highest" we can index, but we will still
// randomly index at lower IndexOption
public FieldsProducer buildIndex(Codec codec, Directory dir, IndexOptions maxAllowed, boolean allowPayloads, boolean alwaysTestMax) throws IOException {
    SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "_0", maxDoc, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null);
    int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed);
    if (LuceneTestCase.VERBOSE) {
        System.out.println("\nTEST: now build index");
    }
    // TODO use allowPayloads
    FieldInfo[] newFieldInfoArray = new FieldInfo[fields.size()];
    for (int fieldUpto = 0; fieldUpto < fields.size(); fieldUpto++) {
        FieldInfo oldFieldInfo = fieldInfos.fieldInfo(fieldUpto);
        // Randomly picked the IndexOptions to index this
        // field with:
        IndexOptions indexOptions = IndexOptions.values()[alwaysTestMax ? maxIndexOption : TestUtil.nextInt(random, 1, maxIndexOption)];
        boolean doPayloads = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;
        newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.name, fieldUpto, false, false, doPayloads, indexOptions, DocValuesType.NONE, -1, new HashMap<>(), 0, 0);
    }
    FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
    // Estimate that flushed segment size will be 25% of
    // what we use in RAM:
    long bytes = totalPostings * 8 + totalPayloadBytes;
    SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, null, new IOContext(new FlushInfo(maxDoc, bytes)));
    Fields seedFields = new SeedFields(fields, newFieldInfos, maxAllowed, allowPayloads);
    FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(writeState);
    boolean success = false;
    try {
        consumer.write(seedFields);
        success = true;
    } finally {
        if (success) {
            IOUtils.close(consumer);
        } else {
            IOUtils.closeWhileHandlingException(consumer);
        }
    }
    if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: after indexing: files=");
        for (String file : dir.listAll()) {
            System.out.println("  " + file + ": " + dir.fileLength(file) + " bytes");
        }
    }
    currentFieldInfos = newFieldInfos;
    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ);
    return codec.postingsFormat().fieldsProducer(readState);
}
Also used: HashMap(java.util.HashMap) FieldsConsumer(org.apache.lucene.codecs.FieldsConsumer) IOContext(org.apache.lucene.store.IOContext) FlushInfo(org.apache.lucene.store.FlushInfo)
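
The FieldsProducer returned at the end is the read side of the same codec. A minimal sketch of consuming it follows; the class and method names are illustrative, and the producer is assumed to be the one returned by buildIndex above.

import java.io.IOException;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class TermDumper {

    // Prints every term of every indexed field, then closes the producer.
    static void dumpTerms(FieldsProducer producer) throws IOException {
        try {
            // FieldsProducer extends Fields, which iterates field names:
            for (String field : producer) {
                Terms terms = producer.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    System.out.println(field + " " + term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
                }
            }
        } finally {
            producer.close();
        }
    }
}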

Example 4 with FieldsConsumer

use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.

From the class BasePostingsFormatTestCase, method testInvertedWrite:

// LUCENE-5123: make sure we can visit postings twice
// during flush/merge
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();
    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();
    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()
    iwc.setCodec(new FilterCodec(getCodec().getName(), getCodec()) {

        @Override
        public PostingsFormat postingsFormat() {
            final PostingsFormat defaultPostingsFormat = delegate.postingsFormat();
            final Thread mainThread = Thread.currentThread();
            return new PostingsFormat(defaultPostingsFormat.getName()) {

                @Override
                public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
                    final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);
                    return new FieldsConsumer() {

                        @Override
                        public void write(Fields fields) throws IOException {
                            fieldsConsumer.write(fields);
                            boolean isMerge = state.context.context == IOContext.Context.MERGE;
                            // only the main thread flushes in this test, so any other thread must be merging:
                            assert isMerge || Thread.currentThread() == mainThread;
                            // We iterate the provided TermsEnum
                            // twice, so we exercise this new freedom
                            // with the inverted API; if
                            // addOnSecondPass is true, we add up
                            // term stats on the 2nd iteration:
                            boolean addOnSecondPass = random().nextBoolean();
                            //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);
                            // Gather our own stats:
                            Terms terms = fields.terms("body");
                            assert terms != null;
                            TermsEnum termsEnum = terms.iterator();
                            PostingsEnum docs = null;
                            while (termsEnum.next() != null) {
                                BytesRef term = termsEnum.term();
                                // TODO: also sometimes ask for payloads/offsets?
                                boolean noPositions = random().nextBoolean();
                                if (noPositions) {
                                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                } else {
                                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                }
                                int docFreq = 0;
                                long totalTermFreq = 0;
                                while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                    docFreq++;
                                    totalTermFreq += docs.freq();
                                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                    if (!noPositions) {
                                        for (int i = 0; i < limit; i++) {
                                            docs.nextPosition();
                                        }
                                    }
                                }
                                String termString = term.utf8ToString();
                                // During merge we should only see terms
                                // we had already seen during a
                                // previous flush:
                                assertTrue(isMerge == false || termFreqs.containsKey(termString));
                                if (isMerge == false) {
                                    if (addOnSecondPass == false) {
                                        TermFreqs tf = termFreqs.get(termString);
                                        if (tf == null) {
                                            tf = new TermFreqs();
                                            termFreqs.put(termString, tf);
                                        }
                                        tf.docFreq += docFreq;
                                        tf.totalTermFreq += totalTermFreq;
                                        sumDocFreq.addAndGet(docFreq);
                                        sumTotalTermFreq.addAndGet(totalTermFreq);
                                    } else if (termFreqs.containsKey(termString) == false) {
                                        // Add placeholder (2nd pass will
                                        // set its counts):
                                        termFreqs.put(termString, new TermFreqs());
                                    }
                                }
                            }
                            // Also test seeking the TermsEnum:
                            for (String term : termFreqs.keySet()) {
                                if (termsEnum.seekExact(new BytesRef(term))) {
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }
                                    if (isMerge == false && addOnSecondPass) {
                                        TermFreqs tf = termFreqs.get(term);
                                        assert tf != null;
                                        tf.docFreq += docFreq;
                                        tf.totalTermFreq += totalTermFreq;
                                        sumDocFreq.addAndGet(docFreq);
                                        sumTotalTermFreq.addAndGet(totalTermFreq);
                                    }
                                    //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                }
                            }
                            // Also test seekCeil
                            for (int iter = 0; iter < 10; iter++) {
                                BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                                SeekStatus status = termsEnum.seekCeil(term);
                                if (status == SeekStatus.NOT_FOUND) {
                                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                                }
                            }
                        }

                        @Override
                        public void close() throws IOException {
                            fieldsConsumer.close();
                        }
                    };
                }

                @Override
                public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                    return defaultPostingsFormat.fieldsProducer(state);
                }
            };
        }
    });
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        Document justBodyDoc = new Document();
        justBodyDoc.add(doc.getField("body"));
        w.addDocument(justBodyDoc);
        bytesIndexed += RamUsageTester.sizeOf(justBodyDoc);
    }
    IndexReader r = w.getReader();
    w.close();
    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());
    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);
    r.close();
    dir.close();
}
Also used: FieldsConsumer(org.apache.lucene.codecs.FieldsConsumer) Document(org.apache.lucene.document.Document) FilterCodec(org.apache.lucene.codecs.FilterCodec) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) LineFileDocs(org.apache.lucene.util.LineFileDocs) FieldsProducer(org.apache.lucene.codecs.FieldsProducer) IOException(java.io.IOException) AtomicLong(java.util.concurrent.atomic.AtomicLong) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) SeekStatus(org.apache.lucene.index.TermsEnum.SeekStatus)
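
Stripped of the test's bookkeeping and assertions, the trick above reduces to a FieldsConsumer that delegates to the codec's real consumer and then walks the same Fields a second time, which LUCENE-5123 made legal. A minimal sketch follows; the class name is illustrative, and only write and close are overridden here.

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.Fields;

public class ObservingFieldsConsumer extends FieldsConsumer {

    private final FieldsConsumer delegate;

    public ObservingFieldsConsumer(FieldsConsumer delegate) {
        this.delegate = delegate;
    }

    @Override
    public void write(Fields fields) throws IOException {
        // Let the real consumer write the segment's postings first...
        delegate.write(fields);
        // ...then iterate the same Fields again for our own bookkeeping.
        for (String field : fields) {
            System.out.println("wrote postings for field: " + field);
        }
    }

    @Override
    public void close() throws IOException {
        delegate.close();
    }
}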

Example 5 with FieldsConsumer

use of org.apache.lucene.codecs.FieldsConsumer in project lucene-solr by apache.

From the class LuceneFixedGap, method fieldsConsumer:

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docs = new Lucene50PostingsWriter(state);
    // TODO: should we make the terms index more easily
    // pluggable?  I.e., so that this codec would record which
    // index impl was used, and switch on loading?
    // Or... you must make a new Codec for this?
    TermsIndexWriterBase indexWriter;
    boolean success = false;
    try {
        indexWriter = new FixedGapTermsIndexWriter(state, termIndexInterval);
        success = true;
    } finally {
        if (!success) {
            docs.close();
        }
    }
    success = false;
    try {
        // Must use BlockTermsWriter (not BlockTree) because
        // BlockTree doesn't support ords (yet)...
        FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
        success = true;
        return ret;
    } finally {
        if (!success) {
            try {
                docs.close();
            } finally {
                indexWriter.close();
            }
        }
    }
}
Also used: PostingsWriterBase(org.apache.lucene.codecs.PostingsWriterBase) FixedGapTermsIndexWriter(org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter) BlockTermsWriter(org.apache.lucene.codecs.blockterms.BlockTermsWriter) TermsIndexWriterBase(org.apache.lucene.codecs.blockterms.TermsIndexWriterBase) FieldsConsumer(org.apache.lucene.codecs.FieldsConsumer) Lucene50PostingsWriter(org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter)
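
The nested try/finally blocks exist so that docs is closed even if indexWriter.close() throws. Below is a hedged rewrite of the same method with a flat failure path, relying on IOUtils.closeWhileHandlingException accepting several Closeables and skipping nulls; it assumes the surrounding LuceneFixedGap class and its termIndexInterval field, and is meant to be behaviorally equivalent rather than the project's actual code.

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docs = new Lucene50PostingsWriter(state);
    TermsIndexWriterBase indexWriter = null;
    boolean success = false;
    try {
        indexWriter = new FixedGapTermsIndexWriter(state, termIndexInterval);
        // On success, closing the returned BlockTermsWriter closes both writers.
        FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
        success = true;
        return ret;
    } finally {
        if (!success) {
            // Close whatever was constructed; a null indexWriter is simply skipped.
            IOUtils.closeWhileHandlingException(docs, indexWriter);
        }
    }
}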

Aggregations

FieldsConsumer (org.apache.lucene.codecs.FieldsConsumer): 14
PostingsWriterBase (org.apache.lucene.codecs.PostingsWriterBase): 9
Lucene50PostingsWriter (org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter): 7
BlockTermsWriter (org.apache.lucene.codecs.blockterms.BlockTermsWriter): 4
TermsIndexWriterBase (org.apache.lucene.codecs.blockterms.TermsIndexWriterBase): 4
VariableGapTermsIndexWriter (org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter): 3
BytesRef (org.apache.lucene.util.BytesRef): 3
IOException (java.io.IOException): 2
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2
Codec (org.apache.lucene.codecs.Codec): 2
FieldsProducer (org.apache.lucene.codecs.FieldsProducer): 2
FixedGapTermsIndexWriter (org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter): 2
BlockTreeTermsWriter (org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter): 2
Document (org.apache.lucene.document.Document): 2
Directory (org.apache.lucene.store.Directory): 2
FlushInfo (org.apache.lucene.store.FlushInfo): 2
IOContext (org.apache.lucene.store.IOContext): 2
ArrayList (java.util.ArrayList): 1
HashMap (java.util.HashMap): 1
Random (java.util.Random): 1