
Example 56 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

From the class SimpleLuceneTests, method testNumericTermDocsFreqs.

/**
     * A test to verify that term frequencies are not stored for numeric fields by default:
     * <tt>int1</tt> does not store term frequencies, while <tt>int2</tt> does.
     */
public void testNumericTermDocsFreqs() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
    Document doc = new Document();
    // default numeric field type: index options are DOCS only, so term frequencies are not recorded
    FieldType type = LegacyIntField.TYPE_NOT_STORED;
    LegacyIntField field = new LegacyIntField("int1", 1, type);
    doc.add(field);
    // a second field type that also records term frequencies
    type = new FieldType(LegacyIntField.TYPE_NOT_STORED);
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    type.freeze();
    // int1 is added a second time, and int2 twice, using the freq-enabled type
    field = new LegacyIntField("int1", 1, type);
    doc.add(field);
    field = new LegacyIntField("int2", 1, type);
    doc.add(field);
    field = new LegacyIntField("int2", 1, type);
    doc.add(field);
    indexWriter.addDocument(doc);
    IndexReader reader = DirectoryReader.open(indexWriter);
    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(reader);
    Terms terms = atomicReader.terms("int1");
    TermsEnum termsEnum = terms.iterator();
    termsEnum.next();
    PostingsEnum termDocs = termsEnum.postings(null);
    assertThat(termDocs.nextDoc(), equalTo(0));
    assertThat(termDocs.docID(), equalTo(0));
    // freqs were never indexed for int1, so freq() reports 1 even though the value was added twice
    assertThat(termDocs.freq(), equalTo(1));
    terms = atomicReader.terms("int2");
    termsEnum = terms.iterator();
    termsEnum.next();
    // pass the previous enum back in so Lucene can reuse it instead of allocating a new one
    termDocs = termsEnum.postings(termDocs);
    assertThat(termDocs.nextDoc(), equalTo(0));
    assertThat(termDocs.docID(), equalTo(0));
    // int2 stores term frequencies, so both occurrences are counted
    assertThat(termDocs.freq(), equalTo(2));
    reader.close();
    indexWriter.close();
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) PostingsEnum(org.apache.lucene.index.PostingsEnum) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) FieldType(org.apache.lucene.document.FieldType) LegacyIntField(org.apache.lucene.document.LegacyIntField) TermsEnum(org.apache.lucene.index.TermsEnum)
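
The second lookup above passes the previous enum back into termsEnum.postings(termDocs) so Lucene can recycle it. A minimal, self-contained sketch of that walk-and-reuse pattern over a small index (the class name, field name, and document text below are illustrative, not taken from the test):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class PostingsReuseSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        Document doc = new Document();
        doc.add(new TextField("body", "the quick brown fox", Field.Store.NO));
        writer.addDocument(doc);
        try (IndexReader reader = DirectoryReader.open(writer)) {
            for (LeafReaderContext ctx : reader.leaves()) {
                Terms terms = ctx.reader().terms("body");
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null; // reused across terms
                while (termsEnum.next() != null) {
                    // passing the previous enum lets the codec recycle it when possible
                    postings = termsEnum.postings(postings);
                    for (int docId = postings.nextDoc();
                         docId != DocIdSetIterator.NO_MORE_DOCS;
                         docId = postings.nextDoc()) {
                        System.out.println(termsEnum.term().utf8ToString()
                                + " -> doc " + docId + ", freq " + postings.freq());
                    }
                }
            }
        }
        writer.close();
        dir.close();
    }
}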

Example 57 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

From the class AbstractTermVectorsTestCase, method validateResponse.

protected void validateResponse(TermVectorsResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException {
    assertThat(esResponse.getIndex(), equalTo(testConfig.doc.index));
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields = testConfig.selectedFields == null ? null : new HashSet<>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
        Terms esTerms = esTermVectorFields.terms(field.name);
        if (selectedFields != null && !selectedFields.contains(field.name)) {
            assertNull(esTerms);
            continue;
        }
        assertNotNull(esTerms);
        Terms luceneTerms = luceneFields.terms(field.name);
        TermsEnum esTermEnum = esTerms.iterator();
        TermsEnum luceneTermEnum = luceneTerms.iterator();
        while (esTermEnum.next() != null) {
            assertNotNull(luceneTermEnum.next());
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            PostingsEnum esDocsPosEnum = esTermEnum.postings(null, PostingsEnum.POSITIONS);
            PostingsEnum luceneDocsPosEnum = luceneTermEnum.postings(null, PostingsEnum.POSITIONS);
            if (luceneDocsPosEnum == null) {
                // a null positions enum is only expected when the field stores neither offsets, payloads, nor positions
                assertFalse(field.storedOffset);
                assertFalse(field.storedPayloads);
                assertFalse(field.storedPositions);
                continue;
            }
            String currentTerm = esTermEnum.term().utf8ToString();
            assertThat("Token mismatch for field: " + field.name, currentTerm, equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();
            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {
                String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (field.storedPositions && testConfig.requestPositions) {
                    assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
                } else {
                    assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
                }
                if (field.storedOffset && testConfig.requestOffsets) {
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (field.storedPayloads && testConfig.requestPayloads) {
                    assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
                }
            }
        }
        assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
    }
}
Also used : Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) TermsEnum(org.apache.lucene.index.TermsEnum)
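
Here the flags argument (PostingsEnum.POSITIONS) decides how much per-position data each enum exposes, and the assertions rely on the sentinel values the test checks for (-1 positions and offsets, null payloads) when something was not stored or requested. A hedged sketch of reading those per-position values from an already obtained enum (the class and method names are illustrative; the enum is assumed to have been requested with PostingsEnum.ALL over a field indexed with positions, offsets, and payloads):

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

final class PositionDumper {
    /** Prints every position, offset, and payload of every document in the enum. */
    static void dump(PostingsEnum postings) throws java.io.IOException {
        for (int doc = postings.nextDoc();
             doc != DocIdSetIterator.NO_MORE_DOCS;
             doc = postings.nextDoc()) {
            int freq = postings.freq();
            for (int i = 0; i < freq; i++) {
                int position = postings.nextPosition();   // -1 if positions were not indexed/requested
                int start = postings.startOffset();       // -1 if offsets were not indexed/requested
                int end = postings.endOffset();
                BytesRef payload = postings.getPayload(); // null if there is no payload
                System.out.println("doc=" + doc + " pos=" + position
                        + " offsets=[" + start + "," + end + "]"
                        + " payload=" + (payload == null ? "none" : payload.utf8ToString()));
            }
        }
    }
}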

Example 58 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

From the class GetTermVectorsIT, method checkBrownFoxTermVector.

private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, notNullValue());
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
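
The Fields walked here come from a term-vectors API response, but the same iteration works against Lucene's stored term vectors directly. A small sketch under that assumption (the reader, docId, and field name are placeholders):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class TermVectorWalker {
    /** Prints term, freq, positions, and offsets from the stored term vector of one document. */
    static void walk(IndexReader reader, int docId, String field) throws java.io.IOException {
        Terms vector = reader.getTermVector(docId, field); // null if no term vector was stored
        if (vector == null) {
            return;
        }
        TermsEnum termsEnum = vector.iterator();
        PostingsEnum postings = null;
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            postings = termsEnum.postings(postings, PostingsEnum.ALL);
            postings.nextDoc(); // a term vector behaves like a one-document index
            int freq = postings.freq();
            System.out.print(term.utf8ToString() + " freq=" + freq + " ->");
            for (int i = 0; i < freq; i++) {
                int pos = postings.nextPosition();
                System.out.print(" " + pos + "[" + postings.startOffset() + "," + postings.endOffset() + "]");
            }
            System.out.println();
        }
    }
}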

Example 59 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

From the class GetTermVectorsIT, method testRandomPayloadWithDelimitedPayloadTokenFilter.

public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
    //create the test document
    int encoding = randomIntBetween(0, 2);
    String encodingString = "";
    if (encoding == 0) {
        encodingString = "float";
    }
    if (encoding == 1) {
        encodingString = "int";
    }
    if (encoding == 2) {
        encodingString = "identity";
    }
    String[] tokens = crateRandomTokens();
    Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
    String delimiter = createRandomDelimiter(tokens);
    String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
    //create the mapping
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads").field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(Settings.builder().put(indexSettings()).put("index.analysis.analyzer.payload_test.tokenizer", "whitespace").putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter").put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter).put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString).put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
    client().prepareIndex("test", "type1", Integer.toString(1)).setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
    refresh();
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    TermsEnum iterator = terms.iterator();
    while (iterator.next() != null) {
        String term = iterator.term().utf8ToString();
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        List<BytesRef> curPayloads = payloads.get(term);
        assertThat(term, curPayloads, notNullValue());
        assertNotNull(docsAndPositions);
        for (int k = 0; k < docsAndPositions.freq(); k++) {
            docsAndPositions.nextPosition();
            if (docsAndPositions.getPayload() != null) {
                String infoString = "\nterm: " + term + " has payload \n" + docsAndPositions.getPayload().toString() + "\n but should have payload \n" + curPayloads.get(k).toString();
                assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
            } else {
                String infoString = "\nterm: " + term + " has no payload but should have payload \n" + curPayloads.get(k).toString();
                assertThat(infoString, curPayloads.get(k).length, equalTo(0));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
Also used : Terms(org.apache.lucene.index.Terms) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) ArrayList(java.util.ArrayList) List(java.util.List) PostingsEnum(org.apache.lucene.index.PostingsEnum) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) BytesRef(org.apache.lucene.util.BytesRef)
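
The delimited payload filter keeps whatever follows the delimiter as the token's payload, encoded as float, int, or identity (raw bytes). A hedged sketch of decoding a payload pulled from a PostingsEnum with Lucene's PayloadHelper (the class and method names are illustrative; the enum is assumed to be positioned on a term that carries payloads):

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;

final class PayloadDecoder {
    /** Decodes the payload at the current position according to the filter's encoding. */
    static String decode(PostingsEnum postings, String encoding) throws java.io.IOException {
        BytesRef payload = postings.getPayload();
        if (payload == null) {
            return "<no payload>";
        }
        byte[] bytes = BytesRef.deepCopyOf(payload).bytes; // copy so offset starts at 0
        switch (encoding) {
            case "float":
                return Float.toString(PayloadHelper.decodeFloat(bytes, 0));
            case "int":
                return Integer.toString(PayloadHelper.decodeInt(bytes, 0));
            default: // "identity": the payload is the raw UTF-8 bytes after the delimiter
                return payload.utf8ToString();
        }
    }
}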

Example 60 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

From the class GetTermVectorsIT, method testRandomSingleTermVectors.

public void testRandomSingleTermVectors() throws IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch(config) {
        case 0:
            {
                // do nothing
                break;
            }
        case 1:
            {
                storeTermVectors = true;
                break;
            }
        case 2:
            {
                storeTermVectors = true;
                storePositions = true;
                break;
            }
        case 3:
            {
                storeTermVectors = true;
                storeOffsets = true;
                break;
            }
        case 4:
            {
                storeTermVectors = true;
                storePositions = true;
                storeOffsets = true;
                break;
            }
        case 5:
            {
                storeTermVectors = true;
                storePositions = true;
                storePayloads = true;
                break;
            }
        case 6:
            {
                storeTermVectors = true;
                storePositions = true;
                storeOffsets = true;
                storePayloads = true;
                break;
            }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = FieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("field").field("type", "text").field("term_vector", optionString).field("analyzer", "tv_test").endObject().endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(Settings.builder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace").putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i)).setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog").endObject()).execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(isPayloadRequested).setOffsets(isOffsetRequested).setPositions(isPositionsRequested).setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                assertThat(infoString, next, notNullValue());
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
                // the positions enum only returns something useful if positions,
                // payloads, or offsets are stored and requested; otherwise a plain
                // docs-only enum would be enough
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only return something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    // only return something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(null));
                    }
                    // only return something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), nullValue());
        }
    }
}
Also used : Terms(org.apache.lucene.index.Terms) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) PostingsEnum(org.apache.lucene.index.PostingsEnum) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) BytesRef(org.apache.lucene.util.BytesRef)
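
The switch above only toggles the four term-vector options on a Lucene FieldType before FieldMapper.termVectorOptionsToString turns them into a mapping string. For comparison, a minimal sketch of an equivalent FieldType when indexing with Lucene directly (the class name, field name, and text are placeholders):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

final class TermVectorFieldTypes {
    /** Builds a text field type that stores term vectors with positions and offsets. */
    static FieldType withPositionsAndOffsets() {
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        ft.setStoreTermVectors(true);         // master switch; the next three only matter when this is on
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPayloads(false); // payloads additionally require term-vector positions
        ft.freeze();
        return ft;
    }

    static Document exampleDoc() {
        Document doc = new Document();
        doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", withPositionsAndOffsets()));
        return doc;
    }
}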

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum): 80
BytesRef (org.apache.lucene.util.BytesRef): 59
TermsEnum (org.apache.lucene.index.TermsEnum): 56
Terms (org.apache.lucene.index.Terms): 47
Fields (org.apache.lucene.index.Fields): 18
LeafReader (org.apache.lucene.index.LeafReader): 17
Term (org.apache.lucene.index.Term): 17
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 15
Document (org.apache.lucene.document.Document): 13
ArrayList (java.util.ArrayList): 12
Bits (org.apache.lucene.util.Bits): 11
IndexReader (org.apache.lucene.index.IndexReader): 10
TextField (org.apache.lucene.document.TextField): 9
Directory (org.apache.lucene.store.Directory): 9
IOException (java.io.IOException): 8
DirectoryReader (org.apache.lucene.index.DirectoryReader): 7
IndexWriter (org.apache.lucene.index.IndexWriter): 6
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 6
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5
XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder): 5