Search in sources :

Example 21 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by Apache.

From the class TestLucene70DocValuesFormat, method doTestTermsEnumRandom.

// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
/**
 * Indexes {@code numDocs} documents whose values come from {@code valuesProducer},
 * storing each value both as an indexed StringField and as a SortedSetDocValuesField,
 * then duels the postings TermsEnum ("indexed") against the doc-values TermsEnum ("dv"):
 * per leaf before merging, and again on the single leaf after forceMerge(1).
 *
 * @param numDocs        number of documents to index (some are randomly deleted after)
 * @param valuesProducer supplies one random string per call
 * @throws Exception on any index/IO failure (test method; failures propagate)
 */
private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene70DocValuesFormat();
    conf.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(valuesProducer.get());
        }
        // add in any order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        // FIX: iterate the shuffled copy; the original iterated 'values',
        // leaving the shuffle dead and the "any order" intent unfulfilled.
        for (String v : unordered) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add in any order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    // FIX: Random.nextInt(bound) throws IllegalArgumentException for bound == 0,
    // which happened whenever numDocs < 10; clamp the bound to at least 1.
    int numDeletions = random().nextInt(Math.max(1, numDocs / 10));
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
Also used : Lucene70DocValuesFormat(org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) Lucene70DocValuesFormat(org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat) DocValuesFormat(org.apache.lucene.codecs.DocValuesFormat) TermsEnum(org.apache.lucene.index.TermsEnum) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) AssertingCodec(org.apache.lucene.codecs.asserting.AssertingCodec) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) StringField(org.apache.lucene.document.StringField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 22 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by Apache.

From the class TestPhrasePrefixQuery, method testPhrasePrefix.

/**
 * Verifies that a MultiPhraseQuery built by expanding the prefix "pi" into its
 * concrete terms ("piccadilly", "pie", "pizza") matches the two "blueberry pi*"
 * documents, while the same expansion after a non-matching leading term
 * ("strawberry") matches nothing.
 */
public void testPhrasePrefix() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    Document doc1 = new Document();
    Document doc2 = new Document();
    Document doc3 = new Document();
    Document doc4 = new Document();
    Document doc5 = new Document();
    doc1.add(newTextField("body", "blueberry pie", Field.Store.YES));
    doc2.add(newTextField("body", "blueberry strudel", Field.Store.YES));
    doc3.add(newTextField("body", "blueberry pizza", Field.Store.YES));
    doc4.add(newTextField("body", "blueberry chewing gum", Field.Store.YES));
    doc5.add(newTextField("body", "piccadilly circus", Field.Store.YES));
    writer.addDocument(doc1);
    writer.addDocument(doc2);
    writer.addDocument(doc3);
    writer.addDocument(doc4);
    writer.addDocument(doc5);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    // PhrasePrefixQuery query1 = new PhrasePrefixQuery();
    MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
    // PhrasePrefixQuery query2 = new PhrasePrefixQuery();
    MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
    query1builder.add(new Term("body", "blueberry"));
    query2builder.add(new Term("body", "strawberry"));
    LinkedList<Term> termsWithPrefix = new LinkedList<>();
    // this TermEnum gives "piccadilly", "pie" and "pizza".
    String prefix = "pi";
    TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
    // FIX: te.term() is undefined after seekCeil returns SeekStatus.END, so only
    // enter the collection loop when the enum is positioned on a real term.
    if (te.seekCeil(new BytesRef(prefix)) != TermsEnum.SeekStatus.END) {
        do {
            String s = te.term().utf8ToString();
            if (s.startsWith(prefix)) {
                termsWithPrefix.add(new Term("body", s));
            } else {
                break;
            }
        } while (te.next() != null);
    }
    query1builder.add(termsWithPrefix.toArray(new Term[0]));
    query2builder.add(termsWithPrefix.toArray(new Term[0]));
    ScoreDoc[] result;
    result = searcher.search(query1builder.build(), 1000).scoreDocs;
    assertEquals(2, result.length);
    result = searcher.search(query2builder.build(), 1000).scoreDocs;
    assertEquals(0, result.length);
    reader.close();
    indexStore.close();
}
Also used : Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) LinkedList(java.util.LinkedList) TermsEnum(org.apache.lucene.index.TermsEnum) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Example 23 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by Apache.

From the class TestMultiPhraseQuery, method testPhrasePrefix.

/**
 * Exercises MultiPhraseQuery prefix expansion: "blueberry pi*" and
 * "strawberry pi*" (expanding "pi" to its matching terms), "blue* pizza",
 * slop behavior, and the rejection of terms from different fields in one query.
 */
public void testPhrasePrefix() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("blueberry pie", writer);
    add("blueberry strudel", writer);
    add("blueberry pizza", writer);
    add("blueberry chewing gum", writer);
    add("bluebird pizza", writer);
    add("bluebird foobar pizza", writer);
    add("piccadilly circus", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    // search for "blueberry pi*":
    MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
    // search for "strawberry pi*":
    MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
    query1builder.add(new Term("body", "blueberry"));
    query2builder.add(new Term("body", "strawberry"));
    LinkedList<Term> termsWithPrefix = new LinkedList<>();
    // this TermEnum gives "piccadilly", "pie" and "pizza".
    String prefix = "pi";
    TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
    // FIX: te.term() is undefined after seekCeil returns SeekStatus.END, so only
    // enter the collection loop when the enum is positioned on a real term.
    if (te.seekCeil(new BytesRef(prefix)) != TermsEnum.SeekStatus.END) {
        do {
            String s = te.term().utf8ToString();
            if (s.startsWith(prefix)) {
                termsWithPrefix.add(new Term("body", s));
            } else {
                break;
            }
        } while (te.next() != null);
    }
    query1builder.add(termsWithPrefix.toArray(new Term[0]));
    MultiPhraseQuery query1 = query1builder.build();
    assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
    query2builder.add(termsWithPrefix.toArray(new Term[0]));
    MultiPhraseQuery query2 = query2builder.build();
    assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
    ScoreDoc[] result;
    result = searcher.search(query1, 1000).scoreDocs;
    assertEquals(2, result.length);
    result = searcher.search(query2, 1000).scoreDocs;
    assertEquals(0, result.length);
    // search for "blue* pizza":
    MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
    termsWithPrefix.clear();
    prefix = "blue";
    // Same guard as above for the second expansion pass.
    if (te.seekCeil(new BytesRef(prefix)) != TermsEnum.SeekStatus.END) {
        do {
            if (te.term().utf8ToString().startsWith(prefix)) {
                termsWithPrefix.add(new Term("body", te.term().utf8ToString()));
            }
        } while (te.next() != null);
    }
    query3builder.add(termsWithPrefix.toArray(new Term[0]));
    query3builder.add(new Term("body", "pizza"));
    MultiPhraseQuery query3 = query3builder.build();
    result = searcher.search(query3, 1000).scoreDocs;
    // blueberry pizza, bluebird pizza
    assertEquals(2, result.length);
    assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
    // test slop:
    query3builder.setSlop(1);
    query3 = query3builder.build();
    result = searcher.search(query3, 1000).scoreDocs;
    // just make sure no exc:
    searcher.explain(query3, 0);
    // blueberry pizza, bluebird pizza, bluebird
    assertEquals(3, result.length);
    // foobar pizza
    MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
    expectThrows(IllegalArgumentException.class, () -> {
        query4builder.add(new Term("field1", "foo"));
        query4builder.add(new Term("field2", "foobar"));
    });
    writer.close();
    reader.close();
    indexStore.close();
}
Also used : Term(org.apache.lucene.index.Term) LinkedList(java.util.LinkedList) TermsEnum(org.apache.lucene.index.TermsEnum) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory)

Example 24 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by Apache.

From the class SimpleTextDocValuesWriter, method addSortedField.

/**
 * Writes a SORTED doc-values field in the plain-text format: a header
 * (value count, max value length, a zero-padded length pattern, and an ord
 * pattern), then every distinct value space-padded to maxLength, then one
 * ord line per document. Ords are written as {@code ord + 1} so that 0
 * encodes "no value for this doc". Iterates the sorted values twice via
 * {@code termsEnum()} — once to size the header, once to write the values.
 */
@Override
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED;
    writeFieldEntry(field, DocValuesType.SORTED);
    // first pass over the sorted values: count them and find the longest
    int valueCount = 0;
    int maxLength = -1;
    TermsEnum terms = valuesProducer.getSorted(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        maxLength = Math.max(maxLength, value.length);
        valueCount++;
    }
    // write numValues
    SimpleTextUtil.write(data, NUMVALUES);
    SimpleTextUtil.write(data, Integer.toString(valueCount), scratch);
    SimpleTextUtil.writeNewline(data);
    // write maxLength
    SimpleTextUtil.write(data, MAXLENGTH);
    SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
    SimpleTextUtil.writeNewline(data);
    // build a "000…0" DecimalFormat pattern wide enough for any value length
    int maxBytesLength = Integer.toString(maxLength).length();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < maxBytesLength; i++) {
        sb.append('0');
    }
    // write our pattern for encoding lengths
    SimpleTextUtil.write(data, PATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // ord pattern is sized for valueCount + 1 because ords are stored as ord+1
    int maxOrdBytes = Long.toString(valueCount + 1L).length();
    sb.setLength(0);
    for (int i = 0; i < maxOrdBytes; i++) {
        sb.append('0');
    }
    // write our pattern for ords
    SimpleTextUtil.write(data, ORDPATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    final DecimalFormat ordEncoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // for asserts:
    int valuesSeen = 0;
    // second pass: write each value as <length><bytes padded to maxLength>
    terms = valuesProducer.getSorted(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        // write length
        SimpleTextUtil.write(data, LENGTH);
        SimpleTextUtil.write(data, encoder.format(value.length), scratch);
        SimpleTextUtil.writeNewline(data);
        // write bytes -- don't use SimpleText.write
        // because it escapes:
        data.writeBytes(value.bytes, value.offset, value.length);
        // pad to fit
        for (int i = value.length; i < maxLength; i++) {
            data.writeByte((byte) ' ');
        }
        SimpleTextUtil.writeNewline(data);
        valuesSeen++;
        assert valuesSeen <= valueCount;
    }
    assert valuesSeen == valueCount;
    // per-document ords: advance the iterator lazily; docs the iterator skips
    // have no value and get ord -1, which is written as 0 (ord + 1)
    SortedDocValues values = valuesProducer.getSorted(field);
    for (int i = 0; i < numDocs; ++i) {
        if (values.docID() < i) {
            values.nextDoc();
            assert values.docID() >= i;
        }
        int ord = -1;
        if (values.docID() == i) {
            ord = values.ordValue();
        }
        SimpleTextUtil.write(data, ordEncoder.format(ord + 1L), scratch);
        SimpleTextUtil.writeNewline(data);
    }
}
Also used : DecimalFormatSymbols(java.text.DecimalFormatSymbols) DecimalFormat(java.text.DecimalFormat) BytesRef(org.apache.lucene.util.BytesRef) SortedDocValues(org.apache.lucene.index.SortedDocValues) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 25 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by Apache.

From the class SimpleTextDocValuesWriter, method addSortedSetField.

/**
 * Writes a SORTED_SET doc-values field in the plain-text format: a header
 * (value count, max value length, a length pattern, and an ord-list pattern),
 * then every distinct value space-padded to maxLength, then one
 * comma-separated, space-padded ord list per document. Iterates the value
 * dictionary twice via {@code termsEnum()} and the per-doc values twice —
 * once to size the ord-list pattern, once to write the lists.
 */
@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED_SET;
    writeFieldEntry(field, DocValuesType.SORTED_SET);
    // first pass over the dictionary: count values and find the longest
    long valueCount = 0;
    int maxLength = 0;
    TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        maxLength = Math.max(maxLength, value.length);
        valueCount++;
    }
    // write numValues
    SimpleTextUtil.write(data, NUMVALUES);
    SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
    SimpleTextUtil.writeNewline(data);
    // write maxLength
    SimpleTextUtil.write(data, MAXLENGTH);
    SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
    SimpleTextUtil.writeNewline(data);
    // build a "000…0" DecimalFormat pattern wide enough for any value length
    int maxBytesLength = Integer.toString(maxLength).length();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < maxBytesLength; i++) {
        sb.append('0');
    }
    // write our pattern for encoding lengths
    SimpleTextUtil.write(data, PATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
    int maxOrdListLength = 0;
    StringBuilder sb2 = new StringBuilder();
    SortedSetDocValues values = valuesProducer.getSortedSet(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        sb2.setLength(0);
        for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
            if (sb2.length() > 0) {
                sb2.append(",");
            }
            sb2.append(Long.toString(ord));
        }
        maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
    }
    sb2.setLength(0);
    for (int i = 0; i < maxOrdListLength; i++) {
        sb2.append('X');
    }
    // write our pattern for ord lists
    SimpleTextUtil.write(data, ORDPATTERN);
    SimpleTextUtil.write(data, sb2.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    // for asserts:
    long valuesSeen = 0;
    // second pass over the dictionary: write each value as <length><bytes padded to maxLength>
    terms = valuesProducer.getSortedSet(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
        // write length
        SimpleTextUtil.write(data, LENGTH);
        SimpleTextUtil.write(data, encoder.format(value.length), scratch);
        SimpleTextUtil.writeNewline(data);
        // write bytes -- don't use SimpleText.write
        // because it escapes:
        data.writeBytes(value.bytes, value.offset, value.length);
        // pad to fit
        for (int i = value.length; i < maxLength; i++) {
            data.writeByte((byte) ' ');
        }
        SimpleTextUtil.writeNewline(data);
        valuesSeen++;
        assert valuesSeen <= valueCount;
    }
    assert valuesSeen == valueCount;
    // fresh iterator for the per-doc pass (the first one was consumed above)
    values = valuesProducer.getSortedSet(field);
    // write the ords for each doc comma-separated
    for (int i = 0; i < numDocs; ++i) {
        if (values.docID() < i) {
            values.nextDoc();
            assert values.docID() >= i;
        }
        sb2.setLength(0);
        if (values.docID() == i) {
            for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
                if (sb2.length() > 0) {
                    sb2.append(",");
                }
                sb2.append(Long.toString(ord));
            }
        }
        // now pad to fit: these are numbers so spaces work well. reader calls trim()
        int numPadding = maxOrdListLength - sb2.length();
        for (int j = 0; j < numPadding; j++) {
            sb2.append(' ');
        }
        SimpleTextUtil.write(data, sb2.toString(), scratch);
        SimpleTextUtil.writeNewline(data);
    }
}
Also used : DecimalFormatSymbols(java.text.DecimalFormatSymbols) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) DecimalFormat(java.text.DecimalFormat) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)153 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)101 PostingsEnum (org.apache.lucene.index.PostingsEnum)51 Term (org.apache.lucene.index.Term)31 ArrayList (java.util.ArrayList)30 IndexReader (org.apache.lucene.index.IndexReader)28 LeafReader (org.apache.lucene.index.LeafReader)28 Fields (org.apache.lucene.index.Fields)26 IOException (java.io.IOException)25 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10