Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.
From the class SimpleLuceneTests, method testNumericTermDocsFreqs.
/**
 * A test to verify that term freqs are not stored for numeric fields: <tt>int1</tt> does not store
 * term freqs while <tt>int2</tt> does.
 */
public void testNumericTermDocsFreqs() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));

    Document doc = new Document();
    FieldType type = LegacyIntField.TYPE_NOT_STORED;
    LegacyIntField field = new LegacyIntField("int1", 1, type);
    doc.add(field);

    type = new FieldType(LegacyIntField.TYPE_NOT_STORED);
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    type.freeze();

    field = new LegacyIntField("int1", 1, type);
    doc.add(field);
    field = new LegacyIntField("int2", 1, type);
    doc.add(field);
    field = new LegacyIntField("int2", 1, type);
    doc.add(field);
    indexWriter.addDocument(doc);

    IndexReader reader = DirectoryReader.open(indexWriter);
    LeafReader atomicReader = SlowCompositeReaderWrapper.wrap(reader);

    // term freqs were not indexed for "int1", so freq() falls back to 1
    Terms terms = atomicReader.terms("int1");
    TermsEnum termsEnum = terms.iterator();
    termsEnum.next();
    PostingsEnum termDocs = termsEnum.postings(null);
    assertThat(termDocs.nextDoc(), equalTo(0));
    assertThat(termDocs.docID(), equalTo(0));
    assertThat(termDocs.freq(), equalTo(1));

    // "int2" was indexed with DOCS_AND_FREQS, so freq() reports the true count of 2
    terms = atomicReader.terms("int2");
    termsEnum = terms.iterator();
    termsEnum.next();
    termDocs = termsEnum.postings(termDocs);
    assertThat(termDocs.nextDoc(), equalTo(0));
    assertThat(termDocs.docID(), equalTo(0));
    assertThat(termDocs.freq(), equalTo(2));

    reader.close();
    indexWriter.close();
}
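The contract under test does not depend on the deprecated LegacyIntField. Below is a minimal, self-contained sketch of the same behavior in isolation, assuming the same Lucene generation as the snippet above and the usual org.apache.lucene.document, org.apache.lucene.index, org.apache.lucene.store and org.apache.lucene.analysis.core imports; the field names and the KeywordAnalyzer choice are illustrative, not from the test:

// Hedged sketch, not part of the Elasticsearch test suite: with IndexOptions.DOCS,
// PostingsEnum.freq() reports 1 no matter how often the term occurs; with
// DOCS_AND_FREQS the real within-document count is kept.
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new KeywordAnalyzer()));

FieldType docsOnly = new FieldType();
docsOnly.setTokenized(false);
docsOnly.setIndexOptions(IndexOptions.DOCS);
docsOnly.freeze();

FieldType docsAndFreqs = new FieldType();
docsAndFreqs.setTokenized(false);
docsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
docsAndFreqs.freeze();

Document doc = new Document();
doc.add(new Field("noFreqs", "x", docsOnly));
doc.add(new Field("noFreqs", "x", docsOnly));
doc.add(new Field("withFreqs", "x", docsAndFreqs));
doc.add(new Field("withFreqs", "x", docsAndFreqs));
writer.addDocument(doc);

IndexReader reader = DirectoryReader.open(writer);
LeafReader leaf = reader.leaves().get(0).reader();

TermsEnum te = leaf.terms("noFreqs").iterator();
te.next();
PostingsEnum pe = te.postings(null);
pe.nextDoc();
assert pe.freq() == 1; // freqs not indexed: collapsed to 1 despite two occurrences

te = leaf.terms("withFreqs").iterator();
te.next();
pe = te.postings(pe);
pe.nextDoc();
assert pe.freq() == 2; // freqs indexed: the real count is reported

reader.close();
writer.close();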
Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.
From the class AbstractTermVectorsTestCase, method validateResponse.
protected void validateResponse(TermVectorsResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException {
    assertThat(esResponse.getIndex(), equalTo(testConfig.doc.index));
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields = testConfig.selectedFields == null ? null : new HashSet<>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
        Terms esTerms = esTermVectorFields.terms(field.name);
        if (selectedFields != null && !selectedFields.contains(field.name)) {
            assertNull(esTerms);
            continue;
        }
        assertNotNull(esTerms);
        Terms luceneTerms = luceneFields.terms(field.name);
        TermsEnum esTermEnum = esTerms.iterator();
        TermsEnum luceneTermEnum = luceneTerms.iterator();
        while (esTermEnum.next() != null) {
            assertNotNull(luceneTermEnum.next());
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            PostingsEnum esDocsPosEnum = esTermEnum.postings(null, PostingsEnum.POSITIONS);
            PostingsEnum luceneDocsPosEnum = luceneTermEnum.postings(null, PostingsEnum.POSITIONS);
            if (luceneDocsPosEnum == null) {
                // a null postings enum is expected only when nothing positional was stored
                assertFalse(field.storedOffset);
                assertFalse(field.storedPayloads);
                assertFalse(field.storedPositions);
                continue;
            }
            String currentTerm = esTermEnum.term().utf8ToString();
            assertThat("Token mismatch for field: " + field.name, currentTerm, equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();
            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {
                String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (field.storedPositions && testConfig.requestPositions) {
                    assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
                } else {
                    assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
                }
                if (field.storedOffset && testConfig.requestOffsets) {
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (field.storedPayloads && testConfig.requestPayloads) {
                    assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
                }
            }
        }
        assertNull("ES TermsEnum is exhausted but the Lucene one still has terms", luceneTermEnum.next());
    }
}
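The null check on luceneDocsPosEnum above is the defensive idiom for flag-based postings: when the requested level of detail (here PostingsEnum.POSITIONS) was not stored for the field, the enum may be unusable, so callers guard before iterating. A minimal sketch of the same idiom with a hypothetical dumpPositions helper, assuming the usual org.apache.lucene.index and org.apache.lucene.search imports:

// Hypothetical helper, not from the test class.
static void dumpPositions(TermsEnum termsEnum) throws IOException {
    // request positions on top of docs and freqs
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.POSITIONS);
    if (postings == null) {
        // as the assertions above show, this is expected when nothing positional was stored
        return;
    }
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        for (int i = 0; i < postings.freq(); i++) {
            // nextPosition() yields -1 when positions are unavailable
            System.out.println("doc=" + postings.docID() + " pos=" + postings.nextPosition());
        }
    }
}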
Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.
From the class GetTermVectorsIT, method checkBrownFoxTermVector.
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
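The hard-coded pos, startOffset and endOffset arrays come straight from whitespace-tokenizing the indexed sentence "the quick brown fox jumps over the lazy dog" (see testRandomSingleTermVectors below): each token's offsets are its character span in the source string. A standalone check of that arithmetic, in plain Java with no Lucene dependency:

String text = "the quick brown fox jumps over the lazy dog";
int position = 0;
int start = 0;
while (start < text.length()) {
    int end = text.indexOf(' ', start);
    if (end == -1) {
        end = text.length();
    }
    // e.g. "brown" prints pos=2 start=10 end=15, matching the expected arrays above
    System.out.println("pos=" + position++ + " start=" + start + " end=" + end
        + " token=" + text.substring(start, end));
    start = end + 1;
}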
Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.
From the class GetTermVectorsIT, method testRandomPayloadWithDelimitedPayloadTokenFilter.
public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
    // create the test document
    int encoding = randomIntBetween(0, 2);
    String encodingString = "";
    if (encoding == 0) {
        encodingString = "float";
    }
    if (encoding == 1) {
        encodingString = "int";
    }
    if (encoding == 2) {
        encodingString = "identity";
    }
    String[] tokens = crateRandomTokens();
    Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
    String delimiter = createRandomDelimiter(tokens);
    String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
    // create the mapping
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
        .startObject("field")
            .field("type", "text")
            .field("term_vector", "with_positions_offsets_payloads")
            .field("analyzer", "payload_test")
        .endObject()
        .endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(Settings.builder()
        .put(indexSettings())
        .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
        .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
        .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
        .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
        .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
    client().prepareIndex("test", "type1", Integer.toString(1))
        .setSource(jsonBuilder().startObject().field("field", queryString).endObject())
        .execute().actionGet();
    refresh();
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1))
        .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id 1 doesn't exist but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    TermsEnum iterator = terms.iterator();
    while (iterator.next() != null) {
        String term = iterator.term().utf8ToString();
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        List<BytesRef> curPayloads = payloads.get(term);
        assertThat(term, curPayloads, notNullValue());
        assertNotNull(docsAndPositions);
        for (int k = 0; k < docsAndPositions.freq(); k++) {
            docsAndPositions.nextPosition();
            if (docsAndPositions.getPayload() != null) {
                String infoString = "\nterm: " + term + " has payload \n" + docsAndPositions.getPayload().toString()
                    + "\n but should have payload \n" + curPayloads.get(k).toString();
                assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
            } else {
                String infoString = "\nterm: " + term + " has no payload but should have payload \n" + curPayloads.get(k).toString();
                assertThat(infoString, curPayloads.get(k).length, equalTo(0));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
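crateRandomTokens, createPayloads, createRandomDelimiter and createString are helpers defined elsewhere in the test class; the field value they build uses the standard delimited-payload format. A hedged illustration (hypothetical values, not produced by the test) of what such a value looks like with delimiter '|' and the "float" encoding, and of the payload bytes the filter attaches; Lucene's org.apache.lucene.analysis.payloads.PayloadHelper handles the float/int encodings:

// hypothetical example input: token text, delimiter, then the payload value
String fieldValue = "the|0.1 quick|2.0 fox|0.5";
// after analysis, the token "quick" carries a 4-byte float payload:
BytesRef quickPayload = new BytesRef(PayloadHelper.encodeFloat(2.0f));
// with the "identity" encoding the raw bytes of "2.0" would be stored instead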
Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.
From the class GetTermVectorsIT, method testRandomSingleTermVectors.
public void testRandomSingleTermVectors() throws IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
        case 0:
            // do nothing
            break;
        case 1:
            storeTermVectors = true;
            break;
        case 2:
            storeTermVectors = true;
            storePositions = true;
            break;
        case 3:
            storeTermVectors = true;
            storeOffsets = true;
            break;
        case 4:
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            break;
        case 5:
            storeTermVectors = true;
            storePositions = true;
            storePayloads = true;
            break;
        case 6:
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            storePayloads = true;
            break;
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);
    String optionString = FieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
        .startObject("field")
            .field("type", "text")
            .field("term_vector", optionString)
            .field("analyzer", "tv_test")
        .endObject()
        .endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(Settings.builder()
        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
            .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog").endObject())
            .execute().actionGet();
        refresh();
    }
    String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
    int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
    int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
    int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
    int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested).setPositions(isPositionsRequested)
            .setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exist but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
                // the enum only carries positions/offsets/payloads if they were
                // stored and requested; otherwise it degrades to docs (and freqs)
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, freq[j], equalTo(docsAndPositions.freq()));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only meaningful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
                    }
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(null));
                    }
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), nullValue());
        }
    }
}
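The seven switch cases enumerate the legal Lucene term vector combinations (offsets and payloads require term vectors, and payloads additionally require positions), which the test translates into an Elasticsearch term_vector mapping value. A hedged sketch of one such translation; the expected string follows the usual ES naming and should be verified against FieldMapper.termVectorOptionsToString:

// corresponds to config == 4 above
FieldType ft = new FieldType();
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
// FieldMapper.termVectorOptionsToString(ft) should yield "with_positions_offsets"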