Use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.
From the class GetTermVectorsIT, method testRandomSingleTermVectors:
public void testRandomSingleTermVectors() throws IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
        case 0: {
            // do nothing
            break;
        }
        case 1: {
            storeTermVectors = true;
            break;
        }
        case 2: {
            storeTermVectors = true;
            storePositions = true;
            break;
        }
        case 3: {
            storeTermVectors = true;
            storeOffsets = true;
            break;
        }
        case 4: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            break;
        }
        case 5: {
            storeTermVectors = true;
            storePositions = true;
            storePayloads = true;
            break;
        }
        case 6: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            storePayloads = true;
            break;
        }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = FieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject()
            .startObject("type1")
                .startObject("properties")
                    .startObject("field")
                        .field("type", "text")
                        .field("term_vector", optionString)
                        .field("analyzer", "tv_test")
                    .endObject()
                .endObject()
            .endObject()
        .endObject();
    assertAcked(prepareCreate("test")
            .addMapping("type1", mapping)
            .setSettings(Settings.builder()
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog").endObject())
                .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested)
                .setOffsets(isOffsetRequested)
                .setPositions(isPositionsRequested)
                .setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exist but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
                // the postings only return positions, payloads and offsets
                // if they were both stored and requested
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only returns something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    // only returns something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(null));
                    }
                    // only returns something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), nullValue());
        }
    }
}
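All of the assertions above hang off the same basic walk: advance the TermsEnum term by term, pull a PostingsEnum per term, and step through its positions. A minimal standalone sketch of that pattern (dumpTerms is a name chosen here for illustration, not part of the test):

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

static void dumpTerms(Terms terms) throws IOException {
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) { // returns null once the enum is exhausted
        PostingsEnum postings = iterator.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
            for (int k = 0; k < postings.freq(); k++) {
                // nextPosition(), startOffset() and endOffset() return -1 when the
                // corresponding data was not stored/requested, as the test asserts
                int position = postings.nextPosition();
                System.out.println(term.utf8ToString() + " pos=" + position
                        + " start=" + postings.startOffset() + " end=" + postings.endOffset());
            }
        }
    }
}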
Use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.
From the class GetTermVectorsIT, method checkAnalyzedFields:
private void checkAnalyzedFields(Fields fieldsObject, Set<String> fieldNames, Map<String, String> perFieldAnalyzer) throws IOException {
    Set<String> validFields = new HashSet<>();
    for (String fieldName : fieldNames) {
        if (fieldName.startsWith("non_existing")) {
            assertThat("Non-existing field \"" + fieldName + "\" should not be returned!", fieldsObject.terms(fieldName), nullValue());
            continue;
        }
        Terms terms = fieldsObject.terms(fieldName);
        assertThat("Existing field " + fieldName + " should have been returned", terms, notNullValue());
        // check overridden by keyword analyzer ...
        if (perFieldAnalyzer.containsKey(fieldName)) {
            TermsEnum iterator = terms.iterator();
            // the keyword analyzer indexes the whole text as a single term
            assertThat("Analyzer for " + fieldName + " should have been overridden!", iterator.next().utf8ToString(), equalTo("some text here"));
            assertThat(iterator.next(), nullValue());
        }
        validFields.add(fieldName);
    }
    // ensure no other fields are returned
    assertThat("More fields than expected are returned!", fieldsObject.size(), equalTo(validFields.size()));
}
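For context, a hedged sketch of how a request that this helper verifies might be built. The index, type, and field names are illustrative; setPerFieldAnalyzer is the request-time analyzer override whose effect the helper asserts:

// Inside the same test class; names are illustrative.
Map<String, String> perFieldAnalyzer = new HashMap<>();
perFieldAnalyzer.put("field1", "keyword"); // request-time override of the index-time analyzer

TermVectorsResponse response = client().prepareTermVectors("test", "type1", "0")
        .setSelectedFields("field1", "non_existing_field")
        .setPerFieldAnalyzer(perFieldAnalyzer)
        .get();
checkAnalyzedFields(response.getFields(),
        new HashSet<>(Arrays.asList("field1", "non_existing_field")), perFieldAnalyzer);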
Use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.
From the class AbstractStringFieldDataTestCase, method testTermsEnum:
public void testTermsEnum() throws Exception {
    fillExtendedMvSet();
    writer.forceMerge(1);
    List<LeafReaderContext> atomicReaderContexts = refreshReader();
    IndexOrdinalsFieldData ifd = getForField("value");
    for (LeafReaderContext atomicReaderContext : atomicReaderContexts) {
        AtomicOrdinalsFieldData afd = ifd.load(atomicReaderContext);
        TermsEnum termsEnum = afd.getOrdinalsValues().termsEnum();
        int size = 0;
        while (termsEnum.next() != null) {
            size++;
        }
        assertThat(size, equalTo(12));
        // seekExact(BytesRef) positions the enum on an exact term
        assertThat(termsEnum.seekExact(new BytesRef("10")), is(true));
        assertThat(termsEnum.term().utf8ToString(), equalTo("10"));
        assertThat(termsEnum.next(), nullValue());
        assertThat(termsEnum.seekExact(new BytesRef("08")), is(true));
        assertThat(termsEnum.term().utf8ToString(), equalTo("08"));
        size = 0;
        while (termsEnum.next() != null) {
            size++;
        }
        assertThat(size, equalTo(2));
        // seekExact(long) seeks by term ordinal instead of by term bytes
        termsEnum.seekExact(8);
        assertThat(termsEnum.term().utf8ToString(), equalTo("07"));
        size = 0;
        while (termsEnum.next() != null) {
            size++;
        }
        assertThat(size, equalTo(3));
    }
}
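The same seek-then-iterate pattern works on any TermsEnum, not just one backed by ordinals field data. A self-contained sketch against a throwaway in-memory index; MultiFields.getTerms is the Lucene 5/6-era accessor, newer Lucene versions expose MultiTerms.getTerms instead:

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

static void seekDemo() throws IOException {
    RAMDirectory dir = new RAMDirectory(); // in-memory index, fine for a demo
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
        for (String value : new String[] { "07", "08", "10" }) {
            Document doc = new Document();
            doc.add(new StringField("value", value, Field.Store.NO));
            writer.addDocument(doc);
        }
    }
    try (IndexReader reader = DirectoryReader.open(dir)) {
        Terms terms = MultiFields.getTerms(reader, "value"); // MultiTerms.getTerms in newer Lucene
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(new BytesRef("08"))) { // position the enum on an exact term
            System.out.println("found: " + termsEnum.term().utf8ToString());
        }
        BytesRef term;
        while ((term = termsEnum.next()) != null) { // iteration continues after the seek position
            System.out.println("next: " + term.utf8ToString());
        }
    }
}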
Use of org.apache.lucene.index.TermsEnum in project languagetool by languagetool-org.
From the class HomophoneOccurrenceDumper, method dumpOccurrences:
private void dumpOccurrences(Set<String> tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
        String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
        // terms are space-separated 3grams; the middle word is the token of interest
        String[] split = term.split(" ");
        if (split.length == 3) {
            String token = split[1];
            if (tokens.contains(token)) {
                long count = getCount(Arrays.asList(split[0], split[1], split[2]));
                if (count >= MIN_COUNT) {
                    System.out.println(token + "\t" + count + "\t" + split[0] + " " + split[1] + " " + split[2]);
                }
            }
        }
        if (i % 10_000 == 0) {
            System.err.println(i + "...");
        }
        i++;
    }
}
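getIterator() is not shown in this snippet. A plausible implementation, assuming the dumper holds an IndexReader over an ngram index whose terms live in a field named "ngram"; both the reader field and the field name are assumptions, not the project's confirmed API:

// `reader` is assumed to be an IndexReader the dumper was opened with,
// and "ngram" an assumed field name (hypothetical, for illustration only)
private TermsEnum getIterator() throws IOException {
    Terms terms = MultiFields.getTerms(reader, "ngram");
    return terms.iterator();
}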
Use of org.apache.lucene.index.TermsEnum in project languagetool by languagetool-org.
From the class HomophoneOccurrenceDumper, method getContext:
/**
 * Get the context (left and right words) for the given word(s). This is slow,
 * as it needs to scan the whole index.
 */
Map<String, Long> getContext(String... tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    Map<String, Long> result = new HashMap<>();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
        String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
        for (String token : tokens) {
            if (term.contains(" " + token + " ")) {
                String[] split = term.split(" ");
                if (split.length == 3) {
                    long count = getCount(Arrays.asList(split[0], split[1], split[2]));
                    result.put(term, count);
                }
            }
        }
        /*if (i++ > 1_000_000) { // comment in for faster testing with subsets of the data
            break;
        }*/
    }
    return result;
}
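A usage sketch; the homophone pair is illustrative, and how the dumper is constructed is out of scope for this snippet:

// `dumper` is an already-initialized HomophoneOccurrenceDumper
Map<String, Long> context = dumper.getContext("there", "their");
for (Map.Entry<String, Long> entry : context.entrySet()) {
    System.out.println(entry.getKey() + "\t" + entry.getValue());
}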