Search in sources:

Example 96 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the elastic/elasticsearch project.

From class GetTermVectorsIT, method testRandomSingleTermVectors.

/**
 * Indexes ten identical documents with a randomly chosen term-vector configuration
 * (any valid combination of positions/offsets/payloads), then requests term vectors
 * with randomly chosen request flags and verifies that positions, offsets and
 * payloads are only returned when they were both stored and requested.
 */
public void testRandomSingleTermVectors() throws IOException {
    FieldType ft = new FieldType();
    // Randomly pick one of the 7 valid term-vector storage combinations.
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch(config) {
        case 0:
            // store nothing
            break;
        case 1:
            storeTermVectors = true;
            break;
        case 2:
            storeTermVectors = true;
            storePositions = true;
            break;
        case 3:
            storeTermVectors = true;
            storeOffsets = true;
            break;
        case 4:
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            break;
        case 5:
            storeTermVectors = true;
            storePositions = true;
            storePayloads = true;
            break;
        case 6:
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            storePayloads = true;
            break;
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = FieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("field").field("type", "text").field("term_vector", optionString).field("analyzer", "tv_test").endObject().endObject().endObject().endObject();
    // The tv_test analyzer stores each token's type as its payload ("word").
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(Settings.builder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace").putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i)).setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog").endObject()).execute().actionGet();
        refresh();
    }
    // Expected terms (sorted), frequencies, positions and offsets for the fixed sentence.
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(isPayloadRequested).setOffsets(isOffsetRequested).setPositions(isPositionsRequested).setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
                // docs and pos only returns something if positions or
                // payloads or offsets are stored / requested Otherwise use
                // DocsEnum?
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only return something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    // only return something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(null));
                    }
                    // only return something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            // the enum must be exhausted after the 8 expected terms
            assertThat(iterator.next(), nullValue());
        }
    }
}
Also used : Terms(org.apache.lucene.index.Terms) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) PostingsEnum(org.apache.lucene.index.PostingsEnum) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 97 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the elastic/elasticsearch project.

From class GetTermVectorsIT, method checkAnalyzedFields.

/**
 * Verifies the fields returned in a term vectors response: names starting with
 * "non_existing" must be absent, every other requested field must be present, and
 * a field listed in {@code perFieldAnalyzer} (overridden with the keyword analyzer)
 * must contain the whole value "some text here" as a single un-tokenized term.
 * Finally asserts that no fields beyond the requested ones were returned.
 *
 * @param fieldsObject     the fields of the term vectors response
 * @param fieldNames       the field names that were requested
 * @param perFieldAnalyzer per-field analyzer overrides used in the request
 */
private void checkAnalyzedFields(Fields fieldsObject, Set<String> fieldNames, Map<String, String> perFieldAnalyzer) throws IOException {
    Set<String> validFields = new HashSet<>();
    for (String fieldName : fieldNames) {
        if (fieldName.startsWith("non_existing")) {
            assertThat("Non existing field \"" + fieldName + "\" should not be returned!", fieldsObject.terms(fieldName), nullValue());
            continue;
        }
        Terms terms = fieldsObject.terms(fieldName);
        assertThat("Existing field " + fieldName + " should have been returned", terms, notNullValue());
        // check overridden by keyword analyzer: the whole value is one term
        if (perFieldAnalyzer.containsKey(fieldName)) {
            TermsEnum iterator = terms.iterator();
            assertThat("Analyzer for " + fieldName + " should have been overridden!", iterator.next().utf8ToString(), equalTo("some text here"));
            assertThat(iterator.next(), nullValue());
        }
        validFields.add(fieldName);
    }
    // ensure no other fields are returned
    assertThat("More fields than expected are returned!", fieldsObject.size(), equalTo(validFields.size()));
}
Also used : Terms(org.apache.lucene.index.Terms) HashSet(java.util.HashSet) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 98 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the elastic/elasticsearch project.

From class AbstractStringFieldDataTestCase, method testTermsEnum.

/**
 * Checks TermsEnum navigation over the ordinals of the "value" field after a force
 * merge to a single segment: a full scan sees all 12 distinct terms, seeking by
 * term ("10", "08") positions the enum correctly, and seeking by ordinal (8 -> "07")
 * leaves the expected number of terms remaining.
 */
public void testTermsEnum() throws Exception {
    fillExtendedMvSet();
    // Merge to one segment so every leaf sees the complete set of terms.
    writer.forceMerge(1);
    List<LeafReaderContext> atomicReaderContexts = refreshReader();
    IndexOrdinalsFieldData ifd = getForField("value");
    for (LeafReaderContext readerContext : atomicReaderContexts) {
        AtomicOrdinalsFieldData fieldData = ifd.load(readerContext);
        TermsEnum te = fieldData.getOrdinalsValues().termsEnum();
        // A full scan must visit every distinct term exactly once.
        int remaining = 0;
        for (BytesRef term = te.next(); term != null; term = te.next()) {
            remaining++;
        }
        assertThat(remaining, equalTo(12));
        // "10" is the greatest term, so the enum is exhausted right after it.
        assertThat(te.seekExact(new BytesRef("10")), is(true));
        assertThat(te.term().utf8ToString(), equalTo("10"));
        assertThat(te.next(), nullValue());
        // After seeking to "08", two terms ("09", "10") are still to come.
        assertThat(te.seekExact(new BytesRef("08")), is(true));
        assertThat(te.term().utf8ToString(), equalTo("08"));
        remaining = 0;
        for (BytesRef term = te.next(); term != null; term = te.next()) {
            remaining++;
        }
        assertThat(remaining, equalTo(2));
        // Seeking by ordinal: ordinal 8 corresponds to "07", with three terms after it.
        te.seekExact(8);
        assertThat(te.term().utf8ToString(), equalTo("07"));
        remaining = 0;
        for (BytesRef term = te.next(); term != null; term = te.next()) {
            remaining++;
        }
        assertThat(remaining, equalTo(3));
    }
}
Also used : LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 99 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the languagetool-org/languagetool project.

From class HomophoneOccurrenceDumper, method dumpOccurrences.

/**
 * Scans every trigram term in the index and, for each trigram whose middle word is
 * one of the given tokens, prints "token&lt;TAB&gt;count&lt;TAB&gt;trigram" to stdout
 * when the occurrence count is at least {@code MIN_COUNT}. Progress is reported to
 * stderr every 10,000 terms.
 *
 * @param tokens the (middle) words to look for; must not be null
 */
private void dumpOccurrences(Set<String> tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
        // Lucene stores term bytes as UTF-8; decode explicitly instead of using
        // new String(bytes, offset, length), which applies the platform default
        // charset and can garble non-ASCII terms.
        String term = byteRef.utf8ToString();
        String[] split = term.split(" ");
        if (split.length == 3) {
            String token = split[1];
            if (tokens.contains(token)) {
                long count = getCount(Arrays.asList(split));
                if (count >= MIN_COUNT) {
                    System.out.println(token + "\t" + count + "\t" + split[0] + " " + split[1] + " " + split[2]);
                }
            }
        }
        if (i % 10_000 == 0) {
            System.err.println(i + "...");
        }
        i++;
    }
}
Also used : BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 100 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the languagetool-org/languagetool project.

From class HomophoneOccurrenceDumper, method getContext.

/**
   * Get the context (left and right words) for the given word(s). This is slow,
   * as it needs to scan the whole index.
   *
   * @param tokens the words whose trigram contexts to collect; must not be null
   * @return map from each matching trigram term to its occurrence count
   */
Map<String, Long> getContext(String... tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    Map<String, Long> result = new HashMap<>();
    BytesRef byteRef;
    while ((byteRef = iterator.next()) != null) {
        // Lucene stores term bytes as UTF-8; decode explicitly instead of using
        // new String(bytes, offset, length) with the platform default charset.
        String term = byteRef.utf8ToString();
        for (String token : tokens) {
            // The surrounding spaces restrict the match to the middle word of a trigram.
            if (term.contains(" " + token + " ")) {
                String[] split = term.split(" ");
                if (split.length == 3) {
                    result.put(term, getCount(Arrays.asList(split)));
                }
            }
        }
    }
    return result;
}
Also used : BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)153 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)101 PostingsEnum (org.apache.lucene.index.PostingsEnum)51 Term (org.apache.lucene.index.Term)31 ArrayList (java.util.ArrayList)30 IndexReader (org.apache.lucene.index.IndexReader)28 LeafReader (org.apache.lucene.index.LeafReader)28 Fields (org.apache.lucene.index.Fields)26 IOException (java.io.IOException)25 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10