use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.
the class GetTermVectorsCheckDocFreqIT method checkWithoutTermStatistics.
/**
 * Requests the term vectors of document {@code i} with term statistics switched off and field
 * statistics switched on, then checks the response three ways: the field-level statistics, every
 * term together with its postings, and the JSON rendering of the whole response.
 *
 * @param numDocs     number of documents; the expected field statistics are derived from it
 * @param values      expected terms, in the order the terms enum yields them
 * @param freq        expected term frequency for each entry of {@code values}
 * @param pos         expected positions for each entry of {@code values}
 * @param startOffset expected start offsets for each entry of {@code values}
 * @param endOffset   expected end offsets for each entry of {@code values}
 * @param i           id of the document whose term vectors are fetched
 * @throws IOException propagated from the term-vector iteration and XContent calls
 */
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException {
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
        .setPayloads(true)
        .setOffsets(true)
        .setPositions(true)
        .setTermStatistics(false)
        .setFieldStatistics(true)
        .setSelectedFields();
    assertThat(resp.request().termStatistics(), equalTo(false));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));

    // Field statistics are requested, so they must be present. The same expected values are
    // reused for the JSON comparison at the end so both checks cannot drift apart.
    final long expectedSumTtf = 9L * numDocs;
    final long expectedSumDocFreq = (long) numDocs * values.length;
    assertThat(terms.getSumTotalTermFreq(), equalTo(expectedSumTtf));
    assertThat(terms.getDocCount(), equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo(expectedSumDocFreq));

    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String expectedTerm = values[j];
        BytesRef next = iterator.next();
        assertThat(next, Matchers.notNullValue());
        assertThat("expected " + expectedTerm, next.utf8ToString(), equalTo(expectedTerm));
        // Term statistics were not requested: ttf and doc freq are both reported as -1.
        assertThat("expected ttf of " + expectedTerm, iterator.totalTermFreq(), equalTo(-1L));
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(docsAndPositions.freq(), equalTo(freq[j]));
        assertThat(iterator.docFreq(), equalTo(-1));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        // Guard the fixture itself: one position/offset pair is needed per occurrence.
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + expectedTerm, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + expectedTerm, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + expectedTerm, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            assertThat("term: " + expectedTerm, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
        }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    response.toXContent(xBuilder, null);
    // "took" varies from run to run, so it is removed before comparing.
    String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");
    // The per-term part is fixed text; only the document id and the field statistics vary.
    String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{"
        + "\"field_statistics\":{\"sum_doc_freq\":" + expectedSumDocFreq + ",\"doc_count\":" + numDocs + ",\"sum_ttf\":" + expectedSumTtf + "},"
        + "\"terms\":{"
        + "\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},"
        + "\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},"
        + "\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},"
        + "\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},"
        + "\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},"
        + "\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},"
        + "\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},"
        + "\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}"
        + "}}}}";
    assertThat(utf8, equalTo(expectedString));
}
use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.
the class GetTermVectorsIT method compareTermVectors.
/**
 * Asserts that two {@link Fields} instances carry the same term vector for one field.
 * <p>
 * Both sides must have terms for {@code fieldName} and report the same size. The terms are then
 * walked in lock-step, comparing the term text, document frequency, total term frequency, the
 * first document returned by the postings, the within-document frequency, and every position
 * with its start and end offset. Finally both enums must be exhausted.
 *
 * @param fieldName field whose terms are compared
 * @param fields0   first side of the comparison
 * @param fields1   second side of the comparison
 * @throws IOException propagated from the terms and postings iteration
 */
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    final Terms leftTerms = fields0.terms(fieldName);
    final Terms rightTerms = fields1.terms(fieldName);
    assertThat(leftTerms, notNullValue());
    assertThat(rightTerms, notNullValue());
    assertThat(leftTerms.size(), equalTo(rightTerms.size()));

    final TermsEnum leftEnum = leftTerms.iterator();
    final TermsEnum rightEnum = rightTerms.iterator();
    int visited = 0;
    while (visited < leftTerms.size()) {
        final BytesRef leftTerm = leftEnum.next();
        assertThat(leftTerm, notNullValue());
        final BytesRef rightTerm = rightEnum.next();
        assertThat(rightTerm, notNullValue());

        // term text
        final String leftText = leftTerm.utf8ToString();
        final String rightText = rightTerm.utf8ToString();
        assertThat("expected: " + leftText, leftText, equalTo(rightText));

        // document frequency and total term frequency
        final String label = "term: " + leftText;
        assertThat(label, leftEnum.docFreq(), equalTo(rightEnum.docFreq()));
        assertThat(label, leftEnum.totalTermFreq(), equalTo(rightEnum.totalTermFreq()));

        // first document and within-document frequency
        final PostingsEnum leftPostings = leftEnum.postings(null, PostingsEnum.ALL);
        final PostingsEnum rightPostings = rightEnum.postings(null, PostingsEnum.ALL);
        assertThat(label, leftPostings.nextDoc(), equalTo(rightPostings.nextDoc()));
        assertThat(label, leftPostings.freq(), equalTo(rightPostings.freq()));

        // positions with their start and end offsets
        int occurrence = 0;
        while (occurrence < leftPostings.freq()) {
            assertThat(label, leftPostings.nextPosition(), equalTo(rightPostings.nextPosition()));
            assertThat(label, leftPostings.startOffset(), equalTo(rightPostings.startOffset()));
            assertThat(label, leftPostings.endOffset(), equalTo(rightPostings.endOffset()));
            occurrence++;
        }
        visited++;
    }

    // neither side may have terms left over
    assertThat(leftEnum.next(), nullValue());
    assertThat(rightEnum.next(), nullValue());
}
use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.
the class CustomUnifiedHighlighter method getFieldHighlighter.
/**
 * Assembles the {@link CustomFieldHighlighter} used for {@code field}.
 * <p>
 * The extracted terms are filtered with the field matcher, the phrase helper and automata are
 * built from the query and the field's highlight flags, and an offset strategy is chosen from
 * those. The field's break iterator is wrapped in a {@link SplittingBreakIterator} constructed
 * with {@code UnifiedHighlighter.MULTIVAL_SEP_CHAR}.
 *
 * @param field       name of the field to highlight
 * @param query       query the phrase helper and automata are derived from
 * @param allTerms    terms extracted from the query, before per-field filtering
 * @param maxPassages passed through to the field highlighter
 * @return the field highlighter configured for {@code field}
 */
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
    final BytesRef[] fieldTerms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    final Set<HighlightFlag> flags = getFlags(field);
    final PhraseHelper phrases = getPhraseHelper(field, query, flags);
    final CharacterRunAutomaton[] patterns = getAutomata(field, query, flags);
    final OffsetSource source = getOptimizedOffsetSource(field, fieldTerms, phrases, patterns);
    final BreakIterator splitter = new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR);
    final FieldOffsetStrategy offsetStrategy = getOffsetStrategy(source, field, fieldTerms, phrases, patterns, flags);

    // 1 when a no-match size is configured, otherwise 0
    final int noMatchIndicator;
    if (noMatchSize > 0) {
        noMatchIndicator = 1;
    } else {
        noMatchIndicator = 0;
    }

    return new CustomFieldHighlighter(
        field,
        offsetStrategy,
        breakIteratorLocale,
        splitter,
        getScorer(field),
        maxPassages,
        noMatchIndicator,
        getFormatter(field),
        noMatchSize,
        fieldValue
    );
}
use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.
the class TransportAnalyzeAction method extractExtendedAttributes.
/**
 * Collects the reflected attributes of a token stream, leaving out the basic ones.
 * <p>
 * Attributes whose class is assignable to {@link CharTermAttribute},
 * {@link PositionIncrementAttribute}, {@link OffsetAttribute} or {@link TypeAttribute} are
 * skipped. Every other attribute is kept when {@code includeAttributes} is {@code null} or
 * empty, or when it contains the lower-cased attribute key. {@link BytesRef} values are stored
 * as their {@code toString()} form; all other values are stored unchanged.
 *
 * @param stream            current TokenStream
 * @param includeAttributes attribute keys to keep, matched against the lower-cased key;
 *                          {@code null} or empty keeps everything
 * @return the kept attributes in a {@link TreeMap} keyed by attribute key
 */
private static Map<String, Object> extractExtendedAttributes(TokenStream stream, final Set<String> includeAttributes) {
    final Map<String, Object> collected = new TreeMap<>();
    stream.reflectWith((attClass, key, value) -> {
        final boolean basicAttribute = CharTermAttribute.class.isAssignableFrom(attClass)
            || PositionIncrementAttribute.class.isAssignableFrom(attClass)
            || OffsetAttribute.class.isAssignableFrom(attClass)
            || TypeAttribute.class.isAssignableFrom(attClass);
        if (basicAttribute) {
            return;
        }

        final boolean filterActive = includeAttributes != null && !includeAttributes.isEmpty();
        if (filterActive && !includeAttributes.contains(key.toLowerCase(Locale.ROOT))) {
            return;
        }

        final Object stored;
        if (value instanceof BytesRef) {
            stored = ((BytesRef) value).toString();
        } else {
            stored = value;
        }
        collected.put(key, stored);
    });
    return collected;
}
use of org.apache.lucene.util.BytesRef in project elasticsearch by elastic.
the class BytesReference method compareIterators.
/**
 * Compares the two references using the given int function.
 * <p>
 * Both references are walked through their {@link BytesRefIterator}s, so no bytes are copied
 * where the underlying implementation can hand out slices directly. When each reference is
 * made of a single slice, {@code f} is applied to those two slices and its result returned.
 * Otherwise the slices are consumed pairwise: both current slices are temporarily shrunk to
 * their common length, {@code f} is applied, and the first non-zero result is returned. If no
 * non-zero result is seen within the first {@code min(a.length(), b.length())} bytes, or if
 * either iterator yields no slice at all, the result is {@code a.length() - b.length()}.
 *
 * @param a first reference
 * @param b second reference
 * @param f function applied to pairs of equal-length slices; a non-zero result ends the
 *          comparison (presumably follows comparator sign conventions; confirm with callers)
 * @return the first non-zero result of {@code f}, or the difference of the two lengths
 * @throws AssertionError if iterating either reference throws an {@link IOException}
 */
private static int compareIterators(final BytesReference a, final BytesReference b, final ToIntBiFunction<BytesRef, BytesRef> f) {
try {
// we use the iterators since it's a 0-copy comparison where possible!
final long lengthToCompare = Math.min(a.length(), b.length());
final BytesRefIterator aIter = a.iterator();
final BytesRefIterator bIter = b.iterator();
BytesRef aRef = aIter.next();
BytesRef bRef = bIter.next();
if (aRef != null && bRef != null) {
// do we have any data?
// we clone since we modify the offsets and length in the iteration below
aRef = aRef.clone();
bRef = bRef.clone();
if (aRef.length == a.length() && bRef.length == b.length()) {
// is it only one array slice we are comparing?
return f.applyAsInt(aRef, bRef);
} else {
// i counts the bytes compared so far; it is advanced inside the body, not in the header
for (int i = 0; i < lengthToCompare; ) {
if (aRef.length == 0) {
// current slice of a is used up: fetch the next one
// must be non null otherwise we have a bug
aRef = aIter.next().clone();
}
if (bRef.length == 0) {
// current slice of b is used up: fetch the next one
// must be non null otherwise we have a bug
bRef = bIter.next().clone();
}
// remember the real lengths so they can be restored after the comparison
final int aLength = aRef.length;
final int bLength = bRef.length;
// shrink to the same length and use the fast compare in lucene
final int length = Math.min(aLength, bLength);
aRef.length = bRef.length = length;
// now we move to the fast comparison - this is the hot part of the loop
int diff = f.applyAsInt(aRef, bRef);
aRef.length = aLength;
bRef.length = bLength;
if (diff != 0) {
return diff;
}
// advance is defined elsewhere; presumably it skips the `length` bytes just compared
// so that a fully consumed slice ends up with length 0 (TODO confirm against the helper)
advance(aRef, length);
advance(bRef, length);
i += length;
}
}
}
// One is a prefix of the other, or, they are equal:
return a.length() - b.length();
} catch (IOException ex) {
// the iterators declare IOException; this method treats it as impossible here
throw new AssertionError("can not happen", ex);
}
}
Aggregations