Search in sources :

Example 31 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class GetTermVectorsCheckDocFreqIT method checkWithoutTermStatistics.

/**
 * Requests the term vectors of document {@code i} with term statistics disabled and field
 * statistics enabled, then verifies the field statistics, every term with its positions,
 * offsets and payload, and finally the JSON rendering of the response.
 *
 * @param numDocs     number of documents the field statistics are expected to reflect
 * @param values      expected terms, in the order the terms enum yields them
 * @param freq        expected in-document frequency for each entry of {@code values}
 * @param pos         expected positions, one array per term
 * @param startOffset expected start offsets, one array per term
 * @param endOffset   expected end offsets, one array per term
 * @param i           id of the document whose term vectors are checked
 * @throws IOException declared because the Lucene enums and the XContent builder used here can throw it
 */
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false).setFieldStatistics(true).setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    // field statistics were requested, so they must be populated
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        // actual value first, expected value in the matcher, so failure messages read correctly
        assertThat("expected " + string, next.utf8ToString(), equalTo(string));
        // term statistics were not requested: ttf and doc freq are expected to be -1
        assertThat("expected ttf of " + string, (int) iterator.totalTermFreq(), equalTo(-1));
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(docsAndPositions.freq(), equalTo(freq[j]));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        // the expectation arrays must describe exactly freq[j] occurrences
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    // no terms beyond the expected ones
    assertThat(iterator.next(), Matchers.nullValue());
    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, null);
    // drop the variable "took" entry before comparing against the fixed expectation
    String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");
    // NOTE(review): the field_statistics literals below (120 / 15 / 135) agree with the assertions
    // above only when numDocs == 15 and values.length == 8 - confirm callers always use that data set.
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
}
Also used : Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 32 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class GetTermVectorsIT method compareTermVectors.

/**
 * Asserts that {@code fields0} and {@code fields1} carry the same term vector for
 * {@code fieldName}: same number of terms, and for each term the same text, doc freq,
 * total term freq, first doc, in-doc frequency, positions and start/end offsets.
 *
 * @param fieldName field whose terms are compared
 * @param fields0   first set of fields
 * @param fields1   second set of fields
 * @throws IOException declared because the Lucene enums used here can throw it
 */
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    final Terms leftTerms = fields0.terms(fieldName);
    final Terms rightTerms = fields1.terms(fieldName);
    assertThat(leftTerms, notNullValue());
    assertThat(rightTerms, notNullValue());
    assertThat(leftTerms.size(), equalTo(rightTerms.size()));

    final TermsEnum leftEnum = leftTerms.iterator();
    final TermsEnum rightEnum = rightTerms.iterator();
    int termIndex = 0;
    while (termIndex < leftTerms.size()) {
        final BytesRef leftTerm = leftEnum.next();
        assertThat(leftTerm, notNullValue());
        final BytesRef rightTerm = rightEnum.next();
        assertThat(rightTerm, notNullValue());

        // the term text itself
        final String leftText = leftTerm.utf8ToString();
        final String rightText = rightTerm.utf8ToString();
        assertThat("expected: " + leftText, leftText, equalTo(rightText));
        final String label = "term: " + leftText;

        // document frequency and total term frequency
        assertThat(label, leftEnum.docFreq(), equalTo(rightEnum.docFreq()));
        assertThat(label, leftEnum.totalTermFreq(), equalTo(rightEnum.totalTermFreq()));

        // first document and the frequency inside it
        final PostingsEnum leftPostings = leftEnum.postings(null, PostingsEnum.ALL);
        final PostingsEnum rightPostings = rightEnum.postings(null, PostingsEnum.ALL);
        assertThat(label, leftPostings.nextDoc(), equalTo(rightPostings.nextDoc()));
        assertThat(label, leftPostings.freq(), equalTo(rightPostings.freq()));

        // every occurrence: position, start offset, end offset
        int occurrence = 0;
        while (occurrence < leftPostings.freq()) {
            assertThat(label, leftPostings.nextPosition(), equalTo(rightPostings.nextPosition()));
            assertThat(label, leftPostings.startOffset(), equalTo(rightPostings.startOffset()));
            assertThat(label, leftPostings.endOffset(), equalTo(rightPostings.endOffset()));
            occurrence++;
        }
        termIndex++;
    }

    // both enums must be exhausted at the same time
    assertThat(leftEnum.next(), nullValue());
    assertThat(rightEnum.next(), nullValue());
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 33 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class CustomUnifiedHighlighter method getFieldHighlighter.

/**
 * Builds the {@link CustomFieldHighlighter} for {@code field}.
 * <p>
 * The terms extracted from the query are filtered with the field's matcher, the flags, phrase
 * helper and automata are resolved for the field, an offset source and offset strategy are
 * chosen from them, and the field's break iterator is wrapped in a
 * {@link SplittingBreakIterator} that splits on {@code UnifiedHighlighter.MULTIVAL_SEP_CHAR}.
 *
 * @param field       name of the field to highlight
 * @param query       query passed on to the phrase helper and automata lookups
 * @param allTerms    terms to be filtered for this field
 * @param maxPassages forwarded unchanged to the created highlighter
 * @return the highlighter configured for {@code field}
 */
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
    final BytesRef[] fieldTerms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    final Set<HighlightFlag> flags = getFlags(field);
    final PhraseHelper phrases = getPhraseHelper(field, query, flags);
    final CharacterRunAutomaton[] runAutomata = getAutomata(field, query, flags);
    final OffsetSource source = getOptimizedOffsetSource(field, fieldTerms, phrases, runAutomata);
    final BreakIterator splittingIterator = new SplittingBreakIterator(
        getBreakIterator(field),
        UnifiedHighlighter.MULTIVAL_SEP_CHAR);
    final FieldOffsetStrategy offsetStrategy = getOffsetStrategy(source, field, fieldTerms, phrases, runAutomata, flags);
    return new CustomFieldHighlighter(
        field,
        offsetStrategy,
        breakIteratorLocale,
        splittingIterator,
        getScorer(field),
        maxPassages,
        (noMatchSize > 0 ? 1 : 0),
        getFormatter(field),
        noMatchSize,
        fieldValue);
}
Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) BytesRef(org.apache.lucene.util.BytesRef) BreakIterator(java.text.BreakIterator)

Example 34 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class TransportAnalyzeAction method extractExtendedAttributes.

/**
 * Collects the attributes that the token stream reports through {@code reflectWith}, keyed by
 * the reported attribute key.
 * <p>
 * Attributes whose class is assignable to {@link CharTermAttribute},
 * {@link PositionIncrementAttribute}, {@link OffsetAttribute} or {@link TypeAttribute} are
 * always skipped. {@link BytesRef} values are stored as their {@code toString()} form.
 *
 * @param stream current TokenStream
 * @param includeAttributes keys to keep, matched against the attribute key lower-cased with
 *                          {@link Locale#ROOT}; {@code null} or empty keeps every attribute
 * @return a {@link TreeMap} from attribute key to attribute value
 */
private static Map<String, Object> extractExtendedAttributes(TokenStream stream, final Set<String> includeAttributes) {
    final Map<String, Object> collected = new TreeMap<>();
    stream.reflectWith((attClass, key, value) -> {
        final boolean skippedType = CharTermAttribute.class.isAssignableFrom(attClass)
            || PositionIncrementAttribute.class.isAssignableFrom(attClass)
            || OffsetAttribute.class.isAssignableFrom(attClass)
            || TypeAttribute.class.isAssignableFrom(attClass);
        if (skippedType) {
            return;
        }
        final boolean keepEverything = includeAttributes == null || includeAttributes.isEmpty();
        if (!keepEverything && !includeAttributes.contains(key.toLowerCase(Locale.ROOT))) {
            return;
        }
        final Object stored = (value instanceof BytesRef) ? ((BytesRef) value).toString() : value;
        collected.put(key, stored);
    });
    return collected;
}
Also used : TreeMap(java.util.TreeMap) BytesRef(org.apache.lucene.util.BytesRef)

Example 35 with BytesRef

use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.

the class BytesReference method compareIterators.

/**
 * Compares the content of two references slice by slice using the given int function.
 * <p>
 * If the first slice of each reference already spans the whole reference, {@code f} is applied
 * once to those two slices and its result is returned as is. Otherwise the slices of both
 * references are walked in lock-step: each round temporarily trims both current slices to the
 * shorter of the two lengths, applies {@code f}, and returns immediately on a non-zero result.
 * When no round finds a difference, or either reference yields no slice at all, the result is
 * {@code a.length() - b.length()}.
 *
 * @param a first reference
 * @param b second reference
 * @param f function applied to the whole slices or to equal-length pairs of slices
 * @return the value of {@code f} as described above, otherwise the difference of the two lengths
 * @throws AssertionError if iterating throws an {@link IOException}, which this method treats as impossible
 */
private static int compareIterators(final BytesReference a, final BytesReference b, final ToIntBiFunction<BytesRef, BytesRef> f) {
    try {
        // we use the iterators since it's a 0-copy comparison where possible!
        final long lengthToCompare = Math.min(a.length(), b.length());
        final BytesRefIterator aIter = a.iterator();
        final BytesRefIterator bIter = b.iterator();
        BytesRef aRef = aIter.next();
        BytesRef bRef = bIter.next();
        if (aRef != null && bRef != null) {
            // do we have any data?
            // we clone since we modify the offsets and length in the iteration below
            aRef = aRef.clone();
            bRef = bRef.clone();
            if (aRef.length == a.length() && bRef.length == b.length()) {
                // is it only one array slice we are comparing?
                return f.applyAsInt(aRef, bRef);
            } else {
                // i counts the bytes compared so far; it is advanced at the bottom of the body
                for (int i = 0; i < lengthToCompare; ) {
                    // a slice with length 0 has been fully consumed: fetch the next one
                    if (aRef.length == 0) {
                        // must be non null otherwise we have a bug
                        aRef = aIter.next().clone();
                    }
                    if (bRef.length == 0) {
                        // must be non null otherwise we have a bug
                        bRef = bIter.next().clone();
                    }
                    // remember the real lengths so they can be restored after the comparison
                    final int aLength = aRef.length;
                    final int bLength = bRef.length;
                    // shrink to the same length and use the fast compare in lucene
                    final int length = Math.min(aLength, bLength);
                    aRef.length = bRef.length = length;
                    // now we move to the fast comparison - this is the hot part of the loop
                    int diff = f.applyAsInt(aRef, bRef);
                    aRef.length = aLength;
                    bRef.length = bLength;
                    if (diff != 0) {
                        return diff;
                    }
                    // advance(...) is defined outside this block - presumably it skips the bytes
                    // just compared by moving the offset and shrinking the length; verify there
                    advance(aRef, length);
                    advance(bRef, length);
                    i += length;
                }
            }
        }
        // One is a prefix of the other, or, they are equal:
        return a.length() - b.length();
    } catch (IOException ex) {
        // the iterators are not expected to fail here, so surface it as a programming error
        throw new AssertionError("can not happen", ex);
    }
}
Also used : BytesRefIterator(org.apache.lucene.util.BytesRefIterator) IOException(java.io.IOException) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

BytesRef (org.apache.lucene.util.BytesRef)1449 Document (org.apache.lucene.document.Document)410 Directory (org.apache.lucene.store.Directory)370 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)266 ArrayList (java.util.ArrayList)186 Test (org.junit.Test)182 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)164 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)152 Term (org.apache.lucene.index.Term)124 Analyzer (org.apache.lucene.analysis.Analyzer)121 IndexReader (org.apache.lucene.index.IndexReader)121 TermsEnum (org.apache.lucene.index.TermsEnum)116 SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField)110 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)105 IOException (java.io.IOException)104 Field (org.apache.lucene.document.Field)101 StringField (org.apache.lucene.document.StringField)101 CrateUnitTest (io.crate.test.integration.CrateUnitTest)95 TextField (org.apache.lucene.document.TextField)95 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)87