Examples with TermsEnum - org.apache.lucene.index.TermsEnum

Example 11 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

the class GetTermVectorsCheckDocFreqIT method checkAllInfo.

private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true).setPositions(true).setFieldStatistics(true).setTermStatistics(true).setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        if (string.equals("the")) {
            assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
        } else {
            assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
        }
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(numDocs));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
    String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");
    ;
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}

Also used : Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 12 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

the class GetTermVectorsCheckDocFreqIT method checkWithoutTermStatistics.

private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false).setFieldStatistics(true).setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, Matchers.notNullValue());
        assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, null);
    String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");
    ;
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}

Example 13 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

the class GetTermVectorsIT method compareTermVectors.

private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms terms0 = fields0.terms(fieldName);
    Terms terms1 = fields1.terms(fieldName);
    assertThat(terms0, notNullValue());
    assertThat(terms1, notNullValue());
    assertThat(terms0.size(), equalTo(terms1.size()));
    TermsEnum iter0 = terms0.iterator();
    TermsEnum iter1 = terms1.iterator();
    for (int i = 0; i < terms0.size(); i++) {
        BytesRef next0 = iter0.next();
        assertThat(next0, notNullValue());
        BytesRef next1 = iter1.next();
        assertThat(next1, notNullValue());
        // compare field value
        String string0 = next0.utf8ToString();
        String string1 = next1.utf8ToString();
        assertThat("expected: " + string0, string0, equalTo(string1));
        // compare df and ttf
        assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
        assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));
        // compare freq and docs
        PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL);
        PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL);
        assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
        assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));
        // compare position, start offsets and end offsets
        for (int j = 0; j < docsAndPositions0.freq(); j++) {
            assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition()));
            assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset()));
            assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset()));
        }
    }
    assertThat(iter0.next(), nullValue());
    assertThat(iter1.next(), nullValue());
}

Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 14 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

the class GetTermVectorsIT method testArtificialDocWithPreference.

public void testArtificialDocWithPreference() throws ExecutionException, InterruptedException, IOException {
    // setup indices
    Settings.Builder settings = Settings.builder().put(indexSettings()).put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test").setSettings(settings).addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets"));
    ensureGreen();
    // index document
    indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("field1", "random permutation"));
    // Get search shards
    ClusterSearchShardsResponse searchShardsResponse = client().admin().cluster().prepareSearchShards("test").get();
    List<Integer> shardIds = Arrays.stream(searchShardsResponse.getGroups()).map(s -> s.getShardId().id()).collect(Collectors.toList());
    // request termvectors of artificial document from each shard
    int sumTotalTermFreq = 0;
    int sumDocFreq = 0;
    for (Integer shardId : shardIds) {
        TermVectorsResponse tvResponse = client().prepareTermVectors().setIndex("test").setType("type1").setPreference("_shards:" + shardId).setDoc(jsonBuilder().startObject().field("field1", "random permutation").endObject()).setFieldStatistics(true).setTermStatistics(true).get();
        Fields fields = tvResponse.getFields();
        Terms terms = fields.terms("field1");
        assertNotNull(terms);
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            sumTotalTermFreq += termsEnum.totalTermFreq();
            sumDocFreq += termsEnum.docFreq();
        }
    }
    assertEquals("expected to find term statistics in exactly one shard!", 2, sumTotalTermFreq);
    assertEquals("expected to find term statistics in exactly one shard!", 2, sumDocFreq);
}

Also used : ClusterSearchShardsResponse(org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse) ElasticsearchException(org.elasticsearch.ElasticsearchException) Versions(org.elasticsearch.common.lucene.uid.Versions) Arrays(java.util.Arrays) FieldType(org.apache.lucene.document.FieldType) Alias(org.elasticsearch.action.admin.indices.alias.Alias) Fields(org.apache.lucene.index.Fields) HashMap(java.util.HashMap) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) PayloadHelper(org.apache.lucene.analysis.payloads.PayloadHelper) ActionFuture(org.elasticsearch.action.ActionFuture) Strings(org.elasticsearch.common.Strings) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Settings(org.elasticsearch.common.settings.Settings) ObjectIntHashMap(com.carrotsearch.hppc.ObjectIntHashMap) TermsEnum(org.apache.lucene.index.TermsEnum) Map(java.util.Map) Matchers.nullValue(org.hamcrest.Matchers.nullValue) XContentFactory.jsonBuilder(org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder) VersionConflictEngineException(org.elasticsearch.index.engine.VersionConflictEngineException) ElasticsearchAssertions.assertThrows(org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows) FieldMapper(org.elasticsearch.index.mapper.FieldMapper) PostingsEnum(org.apache.lucene.index.PostingsEnum) Terms(org.apache.lucene.index.Terms) Matchers.notNullValue(org.hamcrest.Matchers.notNullValue) BytesRef(org.apache.lucene.util.BytesRef) DirectoryReader(org.apache.lucene.index.DirectoryReader) ToXContent(org.elasticsearch.common.xcontent.ToXContent) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) ClusterSearchShardsResponse(org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse) ExecutionException(java.util.concurrent.ExecutionException) List(java.util.List) IndexRequestBuilder(org.elasticsearch.action.index.IndexRequestBuilder) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Collections(java.util.Collections) ElasticsearchAssertions.assertAcked(org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked) Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) Settings(org.elasticsearch.common.settings.Settings) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 15 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

the class GetTermVectorsIT method testArtificialNoDoc.

public void testArtificialNoDoc() throws IOException {
    // setup indices
    Settings.Builder settings = Settings.builder().put(indexSettings()).put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test").setSettings(settings).addMapping("type1", "field1", "type=text"));
    ensureGreen();
    // request tvs from artificial document
    String text = "the quick brown fox jumps over the lazy dog";
    TermVectorsResponse resp = client().prepareTermVectors().setIndex("test").setType("type1").setDoc(jsonBuilder().startObject().field("field1", text).endObject()).setOffsets(true).setPositions(true).setFieldStatistics(true).setTermStatistics(true).get();
    assertThat(resp.isExists(), equalTo(true));
    checkBrownFoxTermVector(resp.getFields(), "field1", false);
    // Since the index is empty, all of artificial document's "term_statistics" should be 0/absent
    Terms terms = resp.getFields().terms("field1");
    assertEquals("sumDocFreq should be 0 for a non-existing field!", 0, terms.getSumDocFreq());
    assertEquals("sumTotalTermFreq should be 0 for a non-existing field!", 0, terms.getSumTotalTermFreq());
    // we're guaranteed to receive terms for that field
    TermsEnum termsEnum = terms.iterator();
    while (termsEnum.next() != null) {
        String term = termsEnum.term().utf8ToString();
        assertEquals("term [" + term + "] does not exist in the index; ttf should be 0!", 0, termsEnum.totalTermFreq());
    }
}

Also used : Terms(org.apache.lucene.index.Terms) Settings(org.elasticsearch.common.settings.Settings) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)153 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)101 PostingsEnum (org.apache.lucene.index.PostingsEnum)51 Term (org.apache.lucene.index.Term)31 ArrayList (java.util.ArrayList)30 IndexReader (org.apache.lucene.index.IndexReader)28 LeafReader (org.apache.lucene.index.LeafReader)28 Fields (org.apache.lucene.index.Fields)26 IOException (java.io.IOException)25 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10