Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestMultiPhraseQuery, method testPhrasePrefix:
public void testPhrasePrefix() throws IOException {
  Directory indexStore = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
  add("blueberry pie", writer);
  add("blueberry strudel", writer);
  add("blueberry pizza", writer);
  add("blueberry chewing gum", writer);
  add("bluebird pizza", writer);
  add("bluebird foobar pizza", writer);
  add("piccadilly circus", writer);
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  // search for "blueberry pi*":
  MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
  // search for "strawberry pi*":
  MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
  query1builder.add(new Term("body", "blueberry"));
  query2builder.add(new Term("body", "strawberry"));
  LinkedList<Term> termsWithPrefix = new LinkedList<>();
  // this TermsEnum gives "piccadilly", "pie" and "pizza".
  String prefix = "pi";
  TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
  te.seekCeil(new BytesRef(prefix));
  do {
    String s = te.term().utf8ToString();
    if (s.startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", s));
    } else {
      break;
    }
  } while (te.next() != null);
  query1builder.add(termsWithPrefix.toArray(new Term[0]));
  MultiPhraseQuery query1 = query1builder.build();
  assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
  query2builder.add(termsWithPrefix.toArray(new Term[0]));
  MultiPhraseQuery query2 = query2builder.build();
  assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
  ScoreDoc[] result;
  result = searcher.search(query1, 1000).scoreDocs;
  assertEquals(2, result.length);
  result = searcher.search(query2, 1000).scoreDocs;
  assertEquals(0, result.length);
  // search for "blue* pizza":
  MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
  termsWithPrefix.clear();
  prefix = "blue";
  te.seekCeil(new BytesRef(prefix));
  do {
    if (te.term().utf8ToString().startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", te.term().utf8ToString()));
    }
  } while (te.next() != null);
  query3builder.add(termsWithPrefix.toArray(new Term[0]));
  query3builder.add(new Term("body", "pizza"));
  MultiPhraseQuery query3 = query3builder.build();
  result = searcher.search(query3, 1000).scoreDocs;
  // blueberry pizza, bluebird pizza
  assertEquals(2, result.length);
  assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
  // test slop:
  query3builder.setSlop(1);
  query3 = query3builder.build();
  result = searcher.search(query3, 1000).scoreDocs;
  // just make sure no exc:
  searcher.explain(query3, 0);
  // blueberry pizza, bluebird pizza, bluebird foobar pizza
  assertEquals(3, result.length);
  // adding terms from two different fields must throw
  MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
  expectThrows(IllegalArgumentException.class, () -> {
    query4builder.add(new Term("field1", "foo"));
    query4builder.add(new Term("field2", "foobar"));
  });
  writer.close();
  reader.close();
  indexStore.close();
}
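The core idiom in the test above — seekCeil to the smallest term at or after a prefix, then walk forward until the prefix stops matching — can be factored into a small helper. A sketch under the same APIs the test uses (MultiFields, TermsEnum); the helper name termsWithPrefix and the SeekStatus guard are our additions, and the usual org.apache.lucene.index / org.apache.lucene.util imports are assumed:

static List<Term> termsWithPrefix(IndexReader reader, String field, String prefix) throws IOException {
  List<Term> result = new ArrayList<>();
  Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null) {
    return result; // field is absent from this index
  }
  TermsEnum te = terms.iterator();
  // seekCeil positions the enum on the smallest term >= prefix;
  // END means no such term exists, so there is nothing to collect
  if (te.seekCeil(new BytesRef(prefix)) == TermsEnum.SeekStatus.END) {
    return result;
  }
  do {
    String s = te.term().utf8ToString();
    if (!s.startsWith(prefix)) {
      break; // terms are sorted, so the first non-match ends the run
    }
    result.add(new Term(field, s));
  } while (te.next() != null);
  return result;
}

Because the enum is sorted, stopping at the first non-matching term is safe, which is why the test's first loop breaks instead of scanning the rest of the field.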
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class SimpleTextDocValuesWriter, method addSortedField:
@Override
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  assert fieldSeen(field.name);
  assert field.getDocValuesType() == DocValuesType.SORTED;
  writeFieldEntry(field, DocValuesType.SORTED);
  int valueCount = 0;
  int maxLength = -1;
  TermsEnum terms = valuesProducer.getSorted(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    maxLength = Math.max(maxLength, value.length);
    valueCount++;
  }
  // write numValues
  SimpleTextUtil.write(data, NUMVALUES);
  SimpleTextUtil.write(data, Integer.toString(valueCount), scratch);
  SimpleTextUtil.writeNewline(data);
  // write maxLength
  SimpleTextUtil.write(data, MAXLENGTH);
  SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
  SimpleTextUtil.writeNewline(data);
  int maxBytesLength = Integer.toString(maxLength).length();
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < maxBytesLength; i++) {
    sb.append('0');
  }
  // write our pattern for encoding lengths
  SimpleTextUtil.write(data, PATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  int maxOrdBytes = Long.toString(valueCount + 1L).length();
  sb.setLength(0);
  for (int i = 0; i < maxOrdBytes; i++) {
    sb.append('0');
  }
  // write our pattern for ords
  SimpleTextUtil.write(data, ORDPATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat ordEncoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  // for asserts:
  int valuesSeen = 0;
  terms = valuesProducer.getSorted(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    // write length
    SimpleTextUtil.write(data, LENGTH);
    SimpleTextUtil.write(data, encoder.format(value.length), scratch);
    SimpleTextUtil.writeNewline(data);
    // write bytes -- don't use SimpleText.write
    // because it escapes:
    data.writeBytes(value.bytes, value.offset, value.length);
    // pad to fit
    for (int i = value.length; i < maxLength; i++) {
      data.writeByte((byte) ' ');
    }
    SimpleTextUtil.writeNewline(data);
    valuesSeen++;
    assert valuesSeen <= valueCount;
  }
  assert valuesSeen == valueCount;
  SortedDocValues values = valuesProducer.getSorted(field);
  for (int i = 0; i < numDocs; ++i) {
    if (values.docID() < i) {
      values.nextDoc();
      assert values.docID() >= i;
    }
    int ord = -1;
    if (values.docID() == i) {
      ord = values.ordValue();
    }
    SimpleTextUtil.write(data, ordEncoder.format(ord + 1L), scratch);
    SimpleTextUtil.writeNewline(data);
  }
}
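A note on the two zero-filled patterns built above: a DecimalFormat whose pattern is all zeros left-pads every number to a fixed width, so the SimpleText reader can parse lengths and ords positionally. Note also that the per-document ordinal is written as ord + 1, so a document with no value (ord -1) encodes as 0; that is why the ord pattern is sized from valueCount + 1. A minimal standalone demonstration of the padding (our own example, plain JDK):

import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;

public class PatternDemo {
  public static void main(String[] args) {
    // an all-zeros pattern pads to a fixed width, keeping file offsets stable
    DecimalFormat encoder = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ROOT));
    System.out.println(encoder.format(7));   // prints "007"
    System.out.println(encoder.format(42));  // prints "042"
    System.out.println(encoder.format(123)); // prints "123"
  }
}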
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class SimpleTextDocValuesWriter, method addSortedSetField:
@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  assert fieldSeen(field.name);
  assert field.getDocValuesType() == DocValuesType.SORTED_SET;
  writeFieldEntry(field, DocValuesType.SORTED_SET);
  long valueCount = 0;
  int maxLength = 0;
  TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    maxLength = Math.max(maxLength, value.length);
    valueCount++;
  }
  // write numValues
  SimpleTextUtil.write(data, NUMVALUES);
  SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
  SimpleTextUtil.writeNewline(data);
  // write maxLength
  SimpleTextUtil.write(data, MAXLENGTH);
  SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
  SimpleTextUtil.writeNewline(data);
  int maxBytesLength = Integer.toString(maxLength).length();
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < maxBytesLength; i++) {
    sb.append('0');
  }
  // write our pattern for encoding lengths
  SimpleTextUtil.write(data, PATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
  int maxOrdListLength = 0;
  StringBuilder sb2 = new StringBuilder();
  SortedSetDocValues values = valuesProducer.getSortedSet(field);
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    sb2.setLength(0);
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      if (sb2.length() > 0) {
        sb2.append(",");
      }
      sb2.append(Long.toString(ord));
    }
    maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
  }
  sb2.setLength(0);
  for (int i = 0; i < maxOrdListLength; i++) {
    sb2.append('X');
  }
  // write our pattern for ord lists
  SimpleTextUtil.write(data, ORDPATTERN);
  SimpleTextUtil.write(data, sb2.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  // for asserts:
  long valuesSeen = 0;
  terms = valuesProducer.getSortedSet(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    // write length
    SimpleTextUtil.write(data, LENGTH);
    SimpleTextUtil.write(data, encoder.format(value.length), scratch);
    SimpleTextUtil.writeNewline(data);
    // write bytes -- don't use SimpleText.write
    // because it escapes:
    data.writeBytes(value.bytes, value.offset, value.length);
    // pad to fit
    for (int i = value.length; i < maxLength; i++) {
      data.writeByte((byte) ' ');
    }
    SimpleTextUtil.writeNewline(data);
    valuesSeen++;
    assert valuesSeen <= valueCount;
  }
  assert valuesSeen == valueCount;
  values = valuesProducer.getSortedSet(field);
  // write the ords for each doc comma-separated
  for (int i = 0; i < numDocs; ++i) {
    if (values.docID() < i) {
      values.nextDoc();
      assert values.docID() >= i;
    }
    sb2.setLength(0);
    if (values.docID() == i) {
      for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
        if (sb2.length() > 0) {
          sb2.append(",");
        }
        sb2.append(Long.toString(ord));
      }
    }
    // now pad to fit: these are numbers so spaces work well. reader calls trim()
    int numPadding = maxOrdListLength - sb2.length();
    for (int j = 0; j < numPadding; j++) {
      sb2.append(' ');
    }
    SimpleTextUtil.write(data, sb2.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
  }
}
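For contrast with the writer loop above, this is the standard consumer-side iteration over SortedSetDocValues: advance document by document with nextDoc(), then drain nextOrd() until NO_MORE_ORDS, optionally resolving each ordinal back to its term bytes with lookupOrd. A minimal sketch; leafReader and the field name "field" are placeholders we introduce here:

SortedSetDocValues values = DocValues.getSortedSet(leafReader, "field");
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
  StringBuilder ords = new StringBuilder();
  for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
    if (ords.length() > 0) {
      ords.append(',');
    }
    // ords come back in increasing order; lookupOrd resolves the term bytes
    ords.append(ord).append('=').append(values.lookupOrd(ord).utf8ToString());
  }
  System.out.println("doc " + doc + ": " + ords);
}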
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class DocValuesConsumer, method mergeSortedSetField:
/**
 * Merges the sortedset docvalues from <code>toMerge</code>.
 * <p>
 * The default implementation calls {@link #addSortedSetField}, passing
 * an Iterable that merges ordinals and values and filters deleted documents.
 */
public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
  List<SortedSetDocValues> toMerge = new ArrayList<>();
  for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
    SortedSetDocValues values = null;
    DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
    if (docValuesProducer != null) {
      FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
      if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
        values = docValuesProducer.getSortedSet(fieldInfo);
      }
    }
    if (values == null) {
      values = DocValues.emptySortedSet();
    }
    toMerge.add(values);
  }
  // step 1: iterate thru each sub and mark terms still in use
  TermsEnum[] liveTerms = new TermsEnum[toMerge.size()];
  long[] weights = new long[liveTerms.length];
  for (int sub = 0; sub < liveTerms.length; sub++) {
    SortedSetDocValues dv = toMerge.get(sub);
    Bits liveDocs = mergeState.liveDocs[sub];
    if (liveDocs == null) {
      liveTerms[sub] = dv.termsEnum();
      weights[sub] = dv.getValueCount();
    } else {
      LongBitSet bitset = new LongBitSet(dv.getValueCount());
      int docID;
      while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
        if (liveDocs.get(docID)) {
          long ord;
          while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            bitset.set(ord);
          }
        }
      }
      liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
      weights[sub] = bitset.cardinality();
    }
  }
  // step 2: create ordinal map (this conceptually does the "merging")
  final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
  // step 3: add field
  addSortedSetField(mergeFieldInfo, new EmptyDocValuesProducer() {
    @Override
    public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
      if (fieldInfo != mergeFieldInfo) {
        throw new IllegalArgumentException("wrong FieldInfo");
      }
      // We must make new iterators + DocIDMerger for each iterator:
      List<SortedSetDocValuesSub> subs = new ArrayList<>();
      long cost = 0;
      for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedSetDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
          FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
          if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
            values = docValuesProducer.getSortedSet(readerFieldInfo);
          }
        }
        if (values == null) {
          values = DocValues.emptySortedSet();
        }
        cost += values.cost();
        subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
      }
      final DocIDMerger<SortedSetDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
      final long finalCost = cost;
      return new SortedSetDocValues() {
        private int docID = -1;
        private SortedSetDocValuesSub currentSub;

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public int nextDoc() throws IOException {
          currentSub = docIDMerger.next();
          if (currentSub == null) {
            docID = NO_MORE_DOCS;
          } else {
            docID = currentSub.mappedDocID;
          }
          return docID;
        }

        @Override
        public int advance(int target) throws IOException {
          throw new UnsupportedOperationException();
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
          throw new UnsupportedOperationException();
        }

        @Override
        public long nextOrd() throws IOException {
          long subOrd = currentSub.values.nextOrd();
          if (subOrd == NO_MORE_ORDS) {
            return NO_MORE_ORDS;
          }
          return currentSub.map.get(subOrd);
        }

        @Override
        public long cost() {
          return finalCost;
        }

        @Override
        public BytesRef lookupOrd(long ord) throws IOException {
          int segmentNumber = map.getFirstSegmentNumber(ord);
          long segmentOrd = map.getFirstSegmentOrd(ord);
          return toMerge.get(segmentNumber).lookupOrd(segmentOrd);
        }

        @Override
        public long getValueCount() {
          return map.getValueCount();
        }
      };
    }
  });
}
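The OrdinalMap built in step 2 is what makes step 3 work: it assigns each surviving term a global ordinal and remembers which segment first contributed it. A sketch of the lookups it supports, reusing liveTerms and weights from the method above (building the map consumes those enums, so this would replace the build call there rather than repeat it; the rest is our illustration):

final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
// forward: segment-local ord -> global ord, one LongValues view per segment
LongValues toGlobal = map.getGlobalOrds(0);
long globalOrd = toGlobal.get(0); // global ord of segment 0's first live term
// reverse: which segment first defines a global ord, and under which local ord
int segment = map.getFirstSegmentNumber(globalOrd);
long segmentOrd = map.getFirstSegmentOrd(globalOrd);
long totalTerms = map.getValueCount(); // size of the merged, deduplicated term space

These are exactly the two directions the merged SortedSetDocValues needs: nextOrd() maps sub-ords forward through getGlobalOrds, while lookupOrd() maps a global ord back to the first segment that can resolve its bytes.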
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestOrdsBlockTree, method testBasic:
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "a b c", Field.Store.NO));
  w.addDocument(doc);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator();
  // Test next()
  assertEquals(new BytesRef("a"), te.next());
  assertEquals(0L, te.ord());
  assertEquals(new BytesRef("b"), te.next());
  assertEquals(1L, te.ord());
  assertEquals(new BytesRef("c"), te.next());
  assertEquals(2L, te.ord());
  assertNull(te.next());
  // Test seekExact by term
  assertTrue(te.seekExact(new BytesRef("b")));
  assertEquals(1, te.ord());
  assertTrue(te.seekExact(new BytesRef("a")));
  assertEquals(0, te.ord());
  assertTrue(te.seekExact(new BytesRef("c")));
  assertEquals(2, te.ord());
  // Test seekExact by ord
  te.seekExact(1);
  assertEquals(new BytesRef("b"), te.term());
  te.seekExact(0);
  assertEquals(new BytesRef("a"), te.term());
  te.seekExact(2);
  assertEquals(new BytesRef("c"), te.term());
  r.close();
  w.close();
  dir.close();
}
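The test covers next(), seekExact(BytesRef), and seekExact(long ord). The remaining positioning call, seekCeil, reports how the seek landed through a SeekStatus; a short sketch of handling all three outcomes, assuming an open IndexReader r over the same single-segment index (our example, not part of the test):

TermsEnum te = MultiFields.getTerms(r, "field").iterator();
TermsEnum.SeekStatus status = te.seekCeil(new BytesRef("b"));
if (status == TermsEnum.SeekStatus.FOUND) {
  // positioned exactly on "b"
  System.out.println("found: " + te.term().utf8ToString() + " ord=" + te.ord());
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
  // positioned on the smallest term greater than "b"
  System.out.println("ceiling: " + te.term().utf8ToString());
} else { // TermsEnum.SeekStatus.END
  // no term at or after "b" in this field
  System.out.println("past end of terms");
}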