Search in sources :

Example 61 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

From the class TestMultiPhraseQuery, the method testPhrasePrefix.

/**
 * Verifies MultiPhraseQuery built from a fixed leading term plus every term
 * matching a prefix (emulating e.g. "blueberry pi*"): toString() rendering,
 * hit counts, slop handling, and the single-field restriction.
 */
public void testPhrasePrefix() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("blueberry pie", writer);
    add("blueberry strudel", writer);
    add("blueberry pizza", writer);
    add("blueberry chewing gum", writer);
    add("bluebird pizza", writer);
    add("bluebird foobar pizza", writer);
    add("piccadilly circus", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    // search for "blueberry pi*":
    MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
    // search for "strawberry pi*":
    MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
    query1builder.add(new Term("body", "blueberry"));
    query2builder.add(new Term("body", "strawberry"));
    LinkedList<Term> termsWithPrefix = new LinkedList<>();
    // this TermEnum gives "piccadilly", "pie" and "pizza".
    String prefix = "pi";
    TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
    te.seekCeil(new BytesRef(prefix));
    do {
        String s = te.term().utf8ToString();
        if (s.startsWith(prefix)) {
            termsWithPrefix.add(new Term("body", s));
        } else {
            // terms enumerate in sorted order, so nothing after the first
            // non-matching term can match the prefix
            break;
        }
    } while (te.next() != null);
    query1builder.add(termsWithPrefix.toArray(new Term[0]));
    MultiPhraseQuery query1 = query1builder.build();
    assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
    query2builder.add(termsWithPrefix.toArray(new Term[0]));
    MultiPhraseQuery query2 = query2builder.build();
    assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
    ScoreDoc[] result;
    // "blueberry pie" and "blueberry pizza" match; no doc contains "strawberry"
    result = searcher.search(query1, 1000).scoreDocs;
    assertEquals(2, result.length);
    result = searcher.search(query2, 1000).scoreDocs;
    assertEquals(0, result.length);
    // search for "blue* pizza":
    MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
    termsWithPrefix.clear();
    prefix = "blue";
    te.seekCeil(new BytesRef(prefix));
    do {
        String s = te.term().utf8ToString();
        if (s.startsWith(prefix)) {
            termsWithPrefix.add(new Term("body", s));
        } else {
            // same early exit as above: avoid scanning the remainder of the
            // (sorted) term dictionary once the prefix no longer matches
            break;
        }
    } while (te.next() != null);
    query3builder.add(termsWithPrefix.toArray(new Term[0]));
    query3builder.add(new Term("body", "pizza"));
    MultiPhraseQuery query3 = query3builder.build();
    result = searcher.search(query3, 1000).scoreDocs;
    // blueberry pizza, bluebird pizza
    assertEquals(2, result.length);
    assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
    // test slop: with slop 1 the one-term gap in "bluebird foobar pizza" is allowed
    query3builder.setSlop(1);
    query3 = query3builder.build();
    result = searcher.search(query3, 1000).scoreDocs;
    // just make sure no exc:
    searcher.explain(query3, 0);
    // blueberry pizza, bluebird pizza, bluebird foobar pizza
    assertEquals(3, result.length);
    // mixing terms from different fields in one MultiPhraseQuery is illegal
    MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
    expectThrows(IllegalArgumentException.class, () -> {
        query4builder.add(new Term("field1", "foo"));
        query4builder.add(new Term("field2", "foobar"));
    });
    writer.close();
    reader.close();
    indexStore.close();
}
Also used : Term(org.apache.lucene.index.Term) LinkedList(java.util.LinkedList) TermsEnum(org.apache.lucene.index.TermsEnum) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory)

Example 62 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

From the class SimpleTextDocValuesWriter, the method addSortedField.

@Override
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    // Serializes a SORTED doc-values field in the SimpleText plain-text format:
    // value count, max value length, fixed-width patterns for lengths and ords,
    // every distinct value in sorted order, then one (ord + 1) entry per doc.
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED;
    writeFieldEntry(field, DocValuesType.SORTED);
    // First pass over the terms: count distinct values and find the longest.
    // maxLength stays -1 when the field has no values at all.
    int valueCount = 0;
    int maxLength = -1;
    TermsEnum terms = valuesProducer.getSorted(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        maxLength = Math.max(maxLength, value.length);
        valueCount++;
    }
    // write numValues
    SimpleTextUtil.write(data, NUMVALUES);
    SimpleTextUtil.write(data, Integer.toString(valueCount), scratch);
    SimpleTextUtil.writeNewline(data);
    // write maxLength
    SimpleTextUtil.write(data, MAXLENGTH);
    SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
    SimpleTextUtil.writeNewline(data);
    // Build a zero-padded DecimalFormat pattern ("0", "00", ...) wide enough
    // to print any value length in a fixed number of characters.
    int maxBytesLength = Integer.toString(maxLength).length();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < maxBytesLength; i++) {
        sb.append('0');
    }
    // write our pattern for encoding lengths
    SimpleTextUtil.write(data, PATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    // Locale.ROOT keeps the digit formatting locale-independent.
    final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // Ords are written as ord + 1 (0 encodes "no value"), hence valueCount + 1.
    int maxOrdBytes = Long.toString(valueCount + 1L).length();
    sb.setLength(0);
    for (int i = 0; i < maxOrdBytes; i++) {
        sb.append('0');
    }
    // write our pattern for ords
    SimpleTextUtil.write(data, ORDPATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    final DecimalFormat ordEncoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // for asserts:
    int valuesSeen = 0;
    // Second pass: the first TermsEnum is exhausted, so pull a fresh one and
    // write each value as <fixed-width length><bytes padded to maxLength>.
    terms = valuesProducer.getSorted(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        // write length
        SimpleTextUtil.write(data, LENGTH);
        SimpleTextUtil.write(data, encoder.format(value.length), scratch);
        SimpleTextUtil.writeNewline(data);
        // write bytes -- don't use SimpleText.write
        // because it escapes:
        data.writeBytes(value.bytes, value.offset, value.length);
        // pad to fit
        for (int i = value.length; i < maxLength; i++) {
            data.writeByte((byte) ' ');
        }
        SimpleTextUtil.writeNewline(data);
        valuesSeen++;
        assert valuesSeen <= valueCount;
    }
    assert valuesSeen == valueCount;
    // Per-document pass: values is a forward-only iterator, advanced lazily so
    // that docs without a value are detected by docID() != i.
    SortedDocValues values = valuesProducer.getSorted(field);
    for (int i = 0; i < numDocs; ++i) {
        if (values.docID() < i) {
            values.nextDoc();
            assert values.docID() >= i;
        }
        // Docs without a value keep ord -1, written as 0 by the + 1L below.
        int ord = -1;
        if (values.docID() == i) {
            ord = values.ordValue();
        }
        SimpleTextUtil.write(data, ordEncoder.format(ord + 1L), scratch);
        SimpleTextUtil.writeNewline(data);
    }
}
Also used : DecimalFormatSymbols(java.text.DecimalFormatSymbols) DecimalFormat(java.text.DecimalFormat) BytesRef(org.apache.lucene.util.BytesRef) SortedDocValues(org.apache.lucene.index.SortedDocValues) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 63 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

From the class SimpleTextDocValuesWriter, the method addSortedSetField.

@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    // Serializes a SORTED_SET doc-values field in the SimpleText plain-text
    // format: value count, max value length, a length pattern, an ord-list
    // width pattern, every distinct value in sorted order, then one
    // comma-separated, space-padded ord list per document.
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED_SET;
    writeFieldEntry(field, DocValuesType.SORTED_SET);
    // First pass over the terms: count distinct values and find the longest.
    long valueCount = 0;
    int maxLength = 0;
    TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        maxLength = Math.max(maxLength, value.length);
        valueCount++;
    }
    // write numValues
    SimpleTextUtil.write(data, NUMVALUES);
    SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
    SimpleTextUtil.writeNewline(data);
    // write maxLength
    SimpleTextUtil.write(data, MAXLENGTH);
    SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
    SimpleTextUtil.writeNewline(data);
    // Build a zero-padded DecimalFormat pattern ("0", "00", ...) wide enough
    // to print any value length in a fixed number of characters.
    int maxBytesLength = Integer.toString(maxLength).length();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < maxBytesLength; i++) {
        sb.append('0');
    }
    // write our pattern for encoding lengths
    SimpleTextUtil.write(data, PATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    // Locale.ROOT keeps the digit formatting locale-independent.
    final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
    int maxOrdListLength = 0;
    StringBuilder sb2 = new StringBuilder();
    SortedSetDocValues values = valuesProducer.getSortedSet(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        sb2.setLength(0);
        for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
            if (sb2.length() > 0) {
                sb2.append(",");
            }
            sb2.append(Long.toString(ord));
        }
        maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
    }
    sb2.setLength(0);
    // The ord-list "pattern" is just maxOrdListLength placeholder characters;
    // only its width matters to the reader.
    for (int i = 0; i < maxOrdListLength; i++) {
        sb2.append('X');
    }
    // write our pattern for ord lists
    SimpleTextUtil.write(data, ORDPATTERN);
    SimpleTextUtil.write(data, sb2.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    // for asserts:
    long valuesSeen = 0;
    // Second pass: the first TermsEnum is exhausted, so pull a fresh one and
    // write each value as <fixed-width length><bytes padded to maxLength>.
    terms = valuesProducer.getSortedSet(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        // write length
        SimpleTextUtil.write(data, LENGTH);
        SimpleTextUtil.write(data, encoder.format(value.length), scratch);
        SimpleTextUtil.writeNewline(data);
        // write bytes -- don't use SimpleText.write
        // because it escapes:
        data.writeBytes(value.bytes, value.offset, value.length);
        // pad to fit
        for (int i = value.length; i < maxLength; i++) {
            data.writeByte((byte) ' ');
        }
        SimpleTextUtil.writeNewline(data);
        valuesSeen++;
        assert valuesSeen <= valueCount;
    }
    assert valuesSeen == valueCount;
    // Per-document pass over a fresh iterator (the one above is exhausted).
    values = valuesProducer.getSortedSet(field);
    // write the ords for each doc comma-separated
    for (int i = 0; i < numDocs; ++i) {
        if (values.docID() < i) {
            values.nextDoc();
            assert values.docID() >= i;
        }
        sb2.setLength(0);
        // Docs without values (docID() != i) get an empty, fully padded list.
        if (values.docID() == i) {
            for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
                if (sb2.length() > 0) {
                    sb2.append(",");
                }
                sb2.append(Long.toString(ord));
            }
        }
        // now pad to fit: these are numbers so spaces work well. reader calls trim()
        int numPadding = maxOrdListLength - sb2.length();
        for (int j = 0; j < numPadding; j++) {
            sb2.append(' ');
        }
        SimpleTextUtil.write(data, sb2.toString(), scratch);
        SimpleTextUtil.writeNewline(data);
    }
}
Also used : DecimalFormatSymbols(java.text.DecimalFormatSymbols) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) DecimalFormat(java.text.DecimalFormat) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 64 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

From the class DocValuesConsumer, the method mergeSortedSetField.

/**
   * Merges the sortedset docvalues from <code>toMerge</code>.
   * <p>
   * The default implementation calls {@link #addSortedSetField}, passing
   * an Iterable that merges ordinals and values and filters deleted documents.
   */
public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
    // Collect one SortedSetDocValues per segment being merged; segments that
    // lack the field (or have it with a different DV type) contribute an
    // empty instance so indices line up with mergeState arrays.
    List<SortedSetDocValues> toMerge = new ArrayList<>();
    for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedSetDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
            FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
            if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
                values = docValuesProducer.getSortedSet(fieldInfo);
            }
        }
        if (values == null) {
            values = DocValues.emptySortedSet();
        }
        toMerge.add(values);
    }
    // step 1: iterate thru each sub and mark terms still in use
    TermsEnum[] liveTerms = new TermsEnum[toMerge.size()];
    long[] weights = new long[liveTerms.length];
    for (int sub = 0; sub < liveTerms.length; sub++) {
        SortedSetDocValues dv = toMerge.get(sub);
        Bits liveDocs = mergeState.liveDocs[sub];
        if (liveDocs == null) {
            // No deletions in this segment: every term survives.
            liveTerms[sub] = dv.termsEnum();
            weights[sub] = dv.getValueCount();
        } else {
            // Mark the ords referenced by at least one live doc, then expose
            // only those terms via a filtered enum.
            LongBitSet bitset = new LongBitSet(dv.getValueCount());
            int docID;
            while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
                if (liveDocs.get(docID)) {
                    long ord;
                    while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                        bitset.set(ord);
                    }
                }
            }
            liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
            weights[sub] = bitset.cardinality();
        }
    }
    // step 2: create ordinal map (this conceptually does the "merging")
    final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
    // step 3: add field
    addSortedSetField(mergeFieldInfo, new EmptyDocValuesProducer() {

        @Override
        public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
            if (fieldInfo != mergeFieldInfo) {
                throw new IllegalArgumentException("wrong FieldInfo");
            }
            // We must make new iterators + DocIDMerger for each iterator:
            List<SortedSetDocValuesSub> subs = new ArrayList<>();
            long cost = 0;
            for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
                SortedSetDocValues values = null;
                DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
                if (docValuesProducer != null) {
                    FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
                    if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
                        values = docValuesProducer.getSortedSet(readerFieldInfo);
                    }
                }
                if (values == null) {
                    values = DocValues.emptySortedSet();
                }
                cost += values.cost();
                subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
            }
            final DocIDMerger<SortedSetDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
            final long finalCost = cost;
            // Merged view: doc iteration comes from docIDMerger, ords are
            // remapped from per-segment space to global space via `map`.
            return new SortedSetDocValues() {

                private int docID = -1;

                private SortedSetDocValuesSub currentSub;

                @Override
                public int docID() {
                    return docID;
                }

                @Override
                public int nextDoc() throws IOException {
                    currentSub = docIDMerger.next();
                    if (currentSub == null) {
                        docID = NO_MORE_DOCS;
                    } else {
                        docID = currentSub.mappedDocID;
                    }
                    return docID;
                }

                @Override
                public int advance(int target) throws IOException {
                    // Forward-only consumer API; random access is not needed.
                    throw new UnsupportedOperationException();
                }

                @Override
                public boolean advanceExact(int target) throws IOException {
                    throw new UnsupportedOperationException();
                }

                @Override
                public long nextOrd() throws IOException {
                    // Translate the current sub's segment-local ord to a
                    // global ord; NO_MORE_ORDS passes through unchanged.
                    long subOrd = currentSub.values.nextOrd();
                    if (subOrd == NO_MORE_ORDS) {
                        return NO_MORE_ORDS;
                    }
                    return currentSub.map.get(subOrd);
                }

                @Override
                public long cost() {
                    return finalCost;
                }

                @Override
                public BytesRef lookupOrd(long ord) throws IOException {
                    // Resolve a global ord back to the first segment that
                    // contains it and look the term up there.
                    int segmentNumber = map.getFirstSegmentNumber(ord);
                    long segmentOrd = map.getFirstSegmentOrd(ord);
                    return toMerge.get(segmentNumber).lookupOrd(segmentOrd);
                }

                @Override
                public long getValueCount() {
                    return map.getValueCount();
                }
            };
        }
    });
}
Also used : ArrayList(java.util.ArrayList) EmptyDocValuesProducer(org.apache.lucene.index.EmptyDocValuesProducer) LongBitSet(org.apache.lucene.util.LongBitSet) IOException(java.io.IOException) OrdinalMap(org.apache.lucene.index.MultiDocValues.OrdinalMap) TermsEnum(org.apache.lucene.index.TermsEnum) FilteredTermsEnum(org.apache.lucene.index.FilteredTermsEnum) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) EmptyDocValuesProducer(org.apache.lucene.index.EmptyDocValuesProducer) DocIDMerger(org.apache.lucene.index.DocIDMerger) Bits(org.apache.lucene.util.Bits) ArrayList(java.util.ArrayList) List(java.util.List) FieldInfo(org.apache.lucene.index.FieldInfo) BytesRef(org.apache.lucene.util.BytesRef)

Example 65 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

From the class TestOrdsBlockTree, the method testBasic.

/**
 * Indexes a single document with three terms and verifies TermsEnum
 * iteration, term-based seeking, and ord-based seeking over it.
 */
public void testBasic() throws Exception {
    Directory indexDir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexDir);
    Document doc = new Document();
    doc.add(newTextField("field", "a b c", Field.Store.NO));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    TermsEnum termsEnum = MultiFields.getTerms(reader, "field").iterator();
    final String[] expectedTerms = { "a", "b", "c" };
    // next() must return the terms in sorted order with increasing ords.
    for (int ord = 0; ord < expectedTerms.length; ord++) {
        assertEquals(new BytesRef(expectedTerms[ord]), termsEnum.next());
        assertEquals((long) ord, termsEnum.ord());
    }
    assertNull(termsEnum.next());
    // seekExact(BytesRef) must find each term and report its ord,
    // regardless of seek order.
    for (int ord : new int[] { 1, 0, 2 }) {
        assertTrue(termsEnum.seekExact(new BytesRef(expectedTerms[ord])));
        assertEquals(ord, termsEnum.ord());
    }
    // seekExact(long ord) must position the enum on the matching term.
    for (int ord : new int[] { 1, 0, 2 }) {
        termsEnum.seekExact(ord);
        assertEquals(new BytesRef(expectedTerms[ord]), termsEnum.term());
    }
    reader.close();
    writer.close();
    indexDir.close();
}
Also used : IndexReader(org.apache.lucene.index.IndexReader) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10