Search in sources :

Example 56 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class BooleanPerceptronClassifierTest method testPerformance.

@Test
public void testPerformance() throws Exception {
    MockAnalyzer analyzer = new MockAnalyzer(random());
    LeafReader leafReader = getRandomIndex(analyzer, 100);
    try {
        long trainStart = System.currentTimeMillis();
        BooleanPerceptronClassifier classifier = new BooleanPerceptronClassifier(leafReader, analyzer, null, 1, null, booleanFieldName, textFieldName);
        long trainEnd = System.currentTimeMillis();
        long trainTime = trainEnd - trainStart;
        assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
        long evaluationStart = System.currentTimeMillis();
        ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader, classifier, booleanFieldName, textFieldName, -1);
        assertNotNull(confusionMatrix);
        long evaluationEnd = System.currentTimeMillis();
        long evaluationTime = evaluationEnd - evaluationStart;
        assertTrue("evaluation took more than 1m: " + evaluationTime / 1000 + "s", evaluationTime < 60000);
        double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
        assertTrue(5000 > avgClassificationTime);
        double f1 = confusionMatrix.getF1Measure();
        assertTrue(f1 >= 0d);
        assertTrue(f1 <= 1d);
        double accuracy = confusionMatrix.getAccuracy();
        assertTrue(accuracy >= 0d);
        assertTrue(accuracy <= 1d);
        double recall = confusionMatrix.getRecall();
        assertTrue(recall >= 0d);
        assertTrue(recall <= 1d);
        double precision = confusionMatrix.getPrecision();
        assertTrue(precision >= 0d);
        assertTrue(precision <= 1d);
        Terms terms = MultiFields.getTerms(leafReader, booleanFieldName);
        TermsEnum iterator = terms.iterator();
        BytesRef term;
        while ((term = iterator.next()) != null) {
            String s = term.utf8ToString();
            recall = confusionMatrix.getRecall(s);
            assertTrue(recall >= 0d);
            assertTrue(recall <= 1d);
            precision = confusionMatrix.getPrecision(s);
            assertTrue(precision >= 0d);
            assertTrue(precision <= 1d);
            double f1Measure = confusionMatrix.getF1Measure(s);
            assertTrue(f1Measure >= 0d);
            assertTrue(f1Measure <= 1d);
        }
    } finally {
        leafReader.close();
    }
}
Also used : MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) LeafReader(org.apache.lucene.index.LeafReader) Terms(org.apache.lucene.index.Terms) ConfusionMatrixGenerator(org.apache.lucene.classification.utils.ConfusionMatrixGenerator) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum) Test(org.junit.Test)

Example 57 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class DocToDoubleVectorUtils method toDenseLocalFreqDoubleArray.

/**
   * create a dense <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
   *
   * @param docTerms term vectors for a given document
   * @return a dense vector of <code>Double</code>s as an array
   * @throws IOException in case accessing the underlying index fails
   */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
    Double[] freqVector = null;
    if (docTerms != null) {
        freqVector = new Double[(int) docTerms.size()];
        int i = 0;
        TermsEnum docTermsEnum = docTerms.iterator();
        while (docTermsEnum.next() != null) {
            // the total number of occurrences of this term in the given document
            long termFreqLocal = docTermsEnum.totalTermFreq();
            freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
            i++;
        }
    }
    return freqVector;
}
Also used : TermsEnum(org.apache.lucene.index.TermsEnum)

Example 58 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestLucene54DocValuesFormat method doTestTermsEnumRandom.

// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene54DocValuesFormat();
    conf.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        final int length = TestUtil.nextInt(random(), minLength, maxLength);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(TestUtil.randomSimpleString(random(), minLength, length));
        }
        // add in any order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        for (String v : values) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add in any order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    int numDeletions = random().nextInt(numDocs / 10);
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
Also used : ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) DocValuesFormat(org.apache.lucene.codecs.DocValuesFormat) TermsEnum(org.apache.lucene.index.TermsEnum) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) AssertingCodec(org.apache.lucene.codecs.asserting.AssertingCodec) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) StringField(org.apache.lucene.document.StringField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 59 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestLucene70DocValuesFormat method doTestTermsEnumRandom.

// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene70DocValuesFormat();
    conf.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(valuesProducer.get());
        }
        // add in any order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        for (String v : values) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add in any order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    int numDeletions = random().nextInt(numDocs / 10);
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
Also used : Lucene70DocValuesFormat(org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) Lucene70DocValuesFormat(org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat) DocValuesFormat(org.apache.lucene.codecs.DocValuesFormat) TermsEnum(org.apache.lucene.index.TermsEnum) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) AssertingCodec(org.apache.lucene.codecs.asserting.AssertingCodec) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) StringField(org.apache.lucene.document.StringField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 60 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestPhrasePrefixQuery method testPhrasePrefix.

/**
     *
     */
public void testPhrasePrefix() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    Document doc1 = new Document();
    Document doc2 = new Document();
    Document doc3 = new Document();
    Document doc4 = new Document();
    Document doc5 = new Document();
    doc1.add(newTextField("body", "blueberry pie", Field.Store.YES));
    doc2.add(newTextField("body", "blueberry strudel", Field.Store.YES));
    doc3.add(newTextField("body", "blueberry pizza", Field.Store.YES));
    doc4.add(newTextField("body", "blueberry chewing gum", Field.Store.YES));
    doc5.add(newTextField("body", "piccadilly circus", Field.Store.YES));
    writer.addDocument(doc1);
    writer.addDocument(doc2);
    writer.addDocument(doc3);
    writer.addDocument(doc4);
    writer.addDocument(doc5);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    // PhrasePrefixQuery query1 = new PhrasePrefixQuery();
    MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
    // PhrasePrefixQuery query2 = new PhrasePrefixQuery();
    MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
    query1builder.add(new Term("body", "blueberry"));
    query2builder.add(new Term("body", "strawberry"));
    LinkedList<Term> termsWithPrefix = new LinkedList<>();
    // this TermEnum gives "piccadilly", "pie" and "pizza".
    String prefix = "pi";
    TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
    te.seekCeil(new BytesRef(prefix));
    do {
        String s = te.term().utf8ToString();
        if (s.startsWith(prefix)) {
            termsWithPrefix.add(new Term("body", s));
        } else {
            break;
        }
    } while (te.next() != null);
    query1builder.add(termsWithPrefix.toArray(new Term[0]));
    query2builder.add(termsWithPrefix.toArray(new Term[0]));
    ScoreDoc[] result;
    result = searcher.search(query1builder.build(), 1000).scoreDocs;
    assertEquals(2, result.length);
    result = searcher.search(query2builder.build(), 1000).scoreDocs;
    assertEquals(0, result.length);
    reader.close();
    indexStore.close();
}
Also used : Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) LinkedList(java.util.LinkedList) TermsEnum(org.apache.lucene.index.TermsEnum) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10