Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
In class TestLucene70DocValuesFormat, method doTestTermsEnumRandom:
// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can set up
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
  Directory dir = newFSDirectory(createTempDir());
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setMergeScheduler(new SerialMergeScheduler());
  // set to duel against a codec which has ordinals:
  final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
  final DocValuesFormat dv = new Lucene70DocValuesFormat();
  conf.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      return pf;
    }

    @Override
    public DocValuesFormat getDocValuesFormatForField(String field) {
      return dv;
    }
  });
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
  // index some docs
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
    doc.add(idField);
    int numValues = random().nextInt(17);
    // create a random list of strings
    List<String> values = new ArrayList<>();
    for (int v = 0; v < numValues; v++) {
      values.add(valuesProducer.get());
    }
    // add in any order to the indexed field
    ArrayList<String> unordered = new ArrayList<>(values);
    Collections.shuffle(unordered, random());
    for (String v : unordered) {
      doc.add(newStringField("indexed", v, Field.Store.NO));
    }
    // add in any order to the dv field
    ArrayList<String> unordered2 = new ArrayList<>(values);
    Collections.shuffle(unordered2, random());
    for (String v : unordered2) {
      doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
    }
    writer.addDocument(doc);
    if (random().nextInt(31) == 0) {
      writer.commit();
    }
  }
  // delete some docs
  int numDeletions = random().nextInt(numDocs / 10);
  for (int i = 0; i < numDeletions; i++) {
    int id = random().nextInt(numDocs);
    writer.deleteDocuments(new Term("id", Integer.toString(id)));
  }
  // compare per-segment
  DirectoryReader ir = writer.getReader();
  for (LeafReaderContext context : ir.leaves()) {
    LeafReader r = context.reader();
    Terms terms = r.terms("indexed");
    if (terms != null) {
      SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
      assertEquals(terms.size(), ssdv.getValueCount());
      TermsEnum expected = terms.iterator();
      TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
      assertEquals(terms.size(), expected, actual);
      doTestSortedSetEnumAdvanceIndependently(ssdv);
    }
  }
  ir.close();
  writer.forceMerge(1);
  // now compare again after the merge
  ir = writer.getReader();
  LeafReader ar = getOnlyLeafReader(ir);
  Terms terms = ar.terms("indexed");
  if (terms != null) {
    assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
    TermsEnum expected = terms.iterator();
    TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
    assertEquals(terms.size(), expected, actual);
  }
  ir.close();
  writer.close();
  dir.close();
}
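The check hinges on two TermsEnum instances agreeing term-for-term: one from the postings of the "indexed" field, one exposed by the SortedSetDocValues of "dv". Below is a minimal sketch of that lockstep walk; assertSameTerms is a hypothetical helper, not the assertEquals(long, TermsEnum, TermsEnum) overload the test inherits from its base class, and it assumes a LeafReader whose two fields were populated from the same values.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper: walk both enums in parallel, failing at the first divergence.
static void assertSameTerms(LeafReader reader, String indexedField, String dvField) throws IOException {
  Terms terms = reader.terms(indexedField);
  SortedSetDocValues ssdv = reader.getSortedSetDocValues(dvField);
  if (terms == null || ssdv == null) {
    return; // field missing in this segment
  }
  TermsEnum expected = terms.iterator();
  TermsEnum actual = ssdv.termsEnum();
  for (BytesRef e = expected.next(); e != null; e = expected.next()) {
    BytesRef a = actual.next();
    if (a == null || e.compareTo(a) != 0) {
      throw new AssertionError("enums diverge at " + e.utf8ToString());
    }
  }
  if (actual.next() != null) {
    throw new AssertionError("doc-values enum has extra terms");
  }
}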
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
In class TestPhrasePrefixQuery, method testPhrasePrefix:
/** Expands the prefix "pi" against the index and verifies that "blueberry pi*" matches two docs while "strawberry pi*" matches none. */
public void testPhrasePrefix() throws IOException {
  Directory indexStore = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
  Document doc1 = new Document();
  Document doc2 = new Document();
  Document doc3 = new Document();
  Document doc4 = new Document();
  Document doc5 = new Document();
  doc1.add(newTextField("body", "blueberry pie", Field.Store.YES));
  doc2.add(newTextField("body", "blueberry strudel", Field.Store.YES));
  doc3.add(newTextField("body", "blueberry pizza", Field.Store.YES));
  doc4.add(newTextField("body", "blueberry chewing gum", Field.Store.YES));
  doc5.add(newTextField("body", "piccadilly circus", Field.Store.YES));
  writer.addDocument(doc1);
  writer.addDocument(doc2);
  writer.addDocument(doc3);
  writer.addDocument(doc4);
  writer.addDocument(doc5);
  IndexReader reader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(reader);
  // PhrasePrefixQuery query1 = new PhrasePrefixQuery();
  MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
  // PhrasePrefixQuery query2 = new PhrasePrefixQuery();
  MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
  query1builder.add(new Term("body", "blueberry"));
  query2builder.add(new Term("body", "strawberry"));
  LinkedList<Term> termsWithPrefix = new LinkedList<>();
  // this TermsEnum gives "piccadilly", "pie" and "pizza".
  String prefix = "pi";
  TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
  te.seekCeil(new BytesRef(prefix));
  do {
    String s = te.term().utf8ToString();
    if (s.startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", s));
    } else {
      break;
    }
  } while (te.next() != null);
  query1builder.add(termsWithPrefix.toArray(new Term[0]));
  query2builder.add(termsWithPrefix.toArray(new Term[0]));
  ScoreDoc[] result;
  result = searcher.search(query1builder.build(), 1000).scoreDocs;
  assertEquals(2, result.length);
  result = searcher.search(query2builder.build(), 1000).scoreDocs;
  assertEquals(0, result.length);
  reader.close();
  indexStore.close();
}
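The prefix expansion above is the classic seekCeil idiom: position the enum on the first term that sorts at or after the prefix, then collect terms until one no longer starts with it. A reusable sketch follows; the helper name is hypothetical, and unlike the test (which knows matching terms exist) it also handles the case where seekCeil runs off the end of the enum.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

// Hypothetical helper: collect all terms of `field` starting with `prefix`.
static List<Term> termsWithPrefix(IndexReader reader, String field, String prefix) throws IOException {
  List<Term> out = new ArrayList<>();
  Terms terms = MultiFields.getFields(reader).terms(field);
  if (terms == null) {
    return out; // field does not exist
  }
  BytesRef prefixBytes = new BytesRef(prefix);
  TermsEnum te = terms.iterator();
  if (te.seekCeil(prefixBytes) == TermsEnum.SeekStatus.END) {
    return out; // every term sorts before the prefix
  }
  do {
    if (!StringHelper.startsWith(te.term(), prefixBytes)) {
      break; // terms are sorted, so no later term can match
    }
    // te.term() is reused by the enum; copy before keeping it.
    out.add(new Term(field, BytesRef.deepCopyOf(te.term())));
  } while (te.next() != null);
  return out;
}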
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
In class TestMultiPhraseQuery, method testPhrasePrefix:
public void testPhrasePrefix() throws IOException {
  Directory indexStore = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
  add("blueberry pie", writer);
  add("blueberry strudel", writer);
  add("blueberry pizza", writer);
  add("blueberry chewing gum", writer);
  add("bluebird pizza", writer);
  add("bluebird foobar pizza", writer);
  add("piccadilly circus", writer);
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  // search for "blueberry pi*":
  MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
  // search for "strawberry pi*":
  MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
  query1builder.add(new Term("body", "blueberry"));
  query2builder.add(new Term("body", "strawberry"));
  LinkedList<Term> termsWithPrefix = new LinkedList<>();
  // this TermsEnum gives "piccadilly", "pie" and "pizza".
  String prefix = "pi";
  TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
  te.seekCeil(new BytesRef(prefix));
  do {
    String s = te.term().utf8ToString();
    if (s.startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", s));
    } else {
      break;
    }
  } while (te.next() != null);
  query1builder.add(termsWithPrefix.toArray(new Term[0]));
  MultiPhraseQuery query1 = query1builder.build();
  assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
  query2builder.add(termsWithPrefix.toArray(new Term[0]));
  MultiPhraseQuery query2 = query2builder.build();
  assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
  ScoreDoc[] result;
  result = searcher.search(query1, 1000).scoreDocs;
  assertEquals(2, result.length);
  result = searcher.search(query2, 1000).scoreDocs;
  assertEquals(0, result.length);
  // search for "blue* pizza":
  MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
  termsWithPrefix.clear();
  prefix = "blue";
  te.seekCeil(new BytesRef(prefix));
  do {
    if (te.term().utf8ToString().startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", te.term().utf8ToString()));
    }
  } while (te.next() != null);
  query3builder.add(termsWithPrefix.toArray(new Term[0]));
  query3builder.add(new Term("body", "pizza"));
  MultiPhraseQuery query3 = query3builder.build();
  result = searcher.search(query3, 1000).scoreDocs;
  // blueberry pizza, bluebird pizza
  assertEquals(2, result.length);
  assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
  // test slop:
  query3builder.setSlop(1);
  query3 = query3builder.build();
  result = searcher.search(query3, 1000).scoreDocs;
  // just make sure no exc:
  searcher.explain(query3, 0);
  // blueberry pizza, bluebird pizza, bluebird foobar pizza
  assertEquals(3, result.length);
  MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
  expectThrows(IllegalArgumentException.class, () -> {
    query4builder.add(new Term("field1", "foo"));
    query4builder.add(new Term("field2", "foobar"));
  });
  writer.close();
  reader.close();
  indexStore.close();
}
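The slop portion is worth isolating: with the default slop of 0 the two phrase positions must be adjacent, so "bluebird foobar pizza" only starts matching once setSlop(1) allows one intervening position. Below is a condensed restatement of the "blue* pizza" query above, with the expanded terms written out by hand instead of collected from a TermsEnum:

MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder();
// position 0 accepts either expanded term:
builder.add(new Term[] { new Term("body", "blueberry"), new Term("body", "bluebird") });
// position 1 is a single fixed term:
builder.add(new Term("body", "pizza"));
builder.setSlop(1); // also matches "bluebird foobar pizza"
MultiPhraseQuery query = builder.build();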
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
In class SimpleTextDocValuesWriter, method addSortedField:
@Override
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  assert fieldSeen(field.name);
  assert field.getDocValuesType() == DocValuesType.SORTED;
  writeFieldEntry(field, DocValuesType.SORTED);
  int valueCount = 0;
  int maxLength = -1;
  TermsEnum terms = valuesProducer.getSorted(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    maxLength = Math.max(maxLength, value.length);
    valueCount++;
  }
  // write numValues
  SimpleTextUtil.write(data, NUMVALUES);
  SimpleTextUtil.write(data, Integer.toString(valueCount), scratch);
  SimpleTextUtil.writeNewline(data);
  // write maxLength
  SimpleTextUtil.write(data, MAXLENGTH);
  SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
  SimpleTextUtil.writeNewline(data);
  int maxBytesLength = Integer.toString(maxLength).length();
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < maxBytesLength; i++) {
    sb.append('0');
  }
  // write our pattern for encoding lengths
  SimpleTextUtil.write(data, PATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  int maxOrdBytes = Long.toString(valueCount + 1L).length();
  sb.setLength(0);
  for (int i = 0; i < maxOrdBytes; i++) {
    sb.append('0');
  }
  // write our pattern for ords
  SimpleTextUtil.write(data, ORDPATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat ordEncoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  // for asserts:
  int valuesSeen = 0;
  terms = valuesProducer.getSorted(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    // write length
    SimpleTextUtil.write(data, LENGTH);
    SimpleTextUtil.write(data, encoder.format(value.length), scratch);
    SimpleTextUtil.writeNewline(data);
    // write bytes -- don't use SimpleText.write
    // because it escapes:
    data.writeBytes(value.bytes, value.offset, value.length);
    // pad to fit
    for (int i = value.length; i < maxLength; i++) {
      data.writeByte((byte) ' ');
    }
    SimpleTextUtil.writeNewline(data);
    valuesSeen++;
    assert valuesSeen <= valueCount;
  }
  assert valuesSeen == valueCount;
  SortedDocValues values = valuesProducer.getSorted(field);
  for (int i = 0; i < numDocs; ++i) {
    if (values.docID() < i) {
      values.nextDoc();
      assert values.docID() >= i;
    }
    int ord = -1;
    if (values.docID() == i) {
      ord = values.ordValue();
    }
    SimpleTextUtil.write(data, ordEncoder.format(ord + 1L), scratch);
    SimpleTextUtil.writeNewline(data);
  }
}
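The PATTERN trick above deserves a note: the pattern is a run of '0' placeholders as wide as the largest value, and DecimalFormat then zero-pads every number to exactly that width, which lets the SimpleText reader parse each length field at a fixed offset instead of scanning for a delimiter. A standalone sketch of the same idea (the sample numbers are made up for illustration):

import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;

public class FixedWidthDemo {
  public static void main(String[] args) {
    int maxLength = 4217; // hypothetical longest value length
    StringBuilder pattern = new StringBuilder();
    for (int i = 0; i < Integer.toString(maxLength).length(); i++) {
      pattern.append('0'); // builds "0000"
    }
    DecimalFormat encoder = new DecimalFormat(pattern.toString(), new DecimalFormatSymbols(Locale.ROOT));
    System.out.println(encoder.format(7));    // 0007
    System.out.println(encoder.format(381));  // 0381
    System.out.println(encoder.format(4217)); // 4217
  }
}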
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
In class SimpleTextDocValuesWriter, method addSortedSetField:
@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  assert fieldSeen(field.name);
  assert field.getDocValuesType() == DocValuesType.SORTED_SET;
  writeFieldEntry(field, DocValuesType.SORTED_SET);
  long valueCount = 0;
  int maxLength = 0;
  TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    maxLength = Math.max(maxLength, value.length);
    valueCount++;
  }
  // write numValues
  SimpleTextUtil.write(data, NUMVALUES);
  SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
  SimpleTextUtil.writeNewline(data);
  // write maxLength
  SimpleTextUtil.write(data, MAXLENGTH);
  SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
  SimpleTextUtil.writeNewline(data);
  int maxBytesLength = Integer.toString(maxLength).length();
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < maxBytesLength; i++) {
    sb.append('0');
  }
  // write our pattern for encoding lengths
  SimpleTextUtil.write(data, PATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
  int maxOrdListLength = 0;
  StringBuilder sb2 = new StringBuilder();
  SortedSetDocValues values = valuesProducer.getSortedSet(field);
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    sb2.setLength(0);
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      if (sb2.length() > 0) {
        sb2.append(",");
      }
      sb2.append(Long.toString(ord));
    }
    maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
  }
  sb2.setLength(0);
  for (int i = 0; i < maxOrdListLength; i++) {
    sb2.append('X');
  }
  // write our pattern for ord lists
  SimpleTextUtil.write(data, ORDPATTERN);
  SimpleTextUtil.write(data, sb2.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  // for asserts:
  long valuesSeen = 0;
  terms = valuesProducer.getSortedSet(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    // write length
    SimpleTextUtil.write(data, LENGTH);
    SimpleTextUtil.write(data, encoder.format(value.length), scratch);
    SimpleTextUtil.writeNewline(data);
    // write bytes -- don't use SimpleText.write
    // because it escapes:
    data.writeBytes(value.bytes, value.offset, value.length);
    // pad to fit
    for (int i = value.length; i < maxLength; i++) {
      data.writeByte((byte) ' ');
    }
    SimpleTextUtil.writeNewline(data);
    valuesSeen++;
    assert valuesSeen <= valueCount;
  }
  assert valuesSeen == valueCount;
  values = valuesProducer.getSortedSet(field);
  // write the ords for each doc comma-separated
  for (int i = 0; i < numDocs; ++i) {
    if (values.docID() < i) {
      values.nextDoc();
      assert values.docID() >= i;
    }
    sb2.setLength(0);
    if (values.docID() == i) {
      for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
        if (sb2.length() > 0) {
          sb2.append(",");
        }
        sb2.append(Long.toString(ord));
      }
    }
    // now pad to fit: these are numbers so spaces work well. reader calls trim()
    int numPadding = maxOrdListLength - sb2.length();
    for (int j = 0; j < numPadding; j++) {
      sb2.append(' ');
    }
    SimpleTextUtil.write(data, sb2.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
  }
}
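For contrast with the writer side above, here is a minimal sketch of how a consumer walks the same structure back: advance docs with nextDoc(), drain each doc's ords with nextOrd() until NO_MORE_ORDS, and map each ord back to its term with lookupOrd(). The helper name dumpSortedSet is hypothetical.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper: print every doc's values for a SORTED_SET field.
static void dumpSortedSet(LeafReader reader, String field) throws IOException {
  SortedSetDocValues values = reader.getSortedSetDocValues(field);
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    StringBuilder sb = new StringBuilder();
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      if (sb.length() > 0) {
        sb.append(',');
      }
      BytesRef term = values.lookupOrd(ord); // ord -> term bytes
      sb.append(term.utf8ToString());
    }
    System.out.println("doc " + doc + ": " + sb);
  }
}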