Search in sources :

Example 91 with IndexableField

Use of org.apache.lucene.index.IndexableField in the project lucene-solr by Apache.

The class KNearestFuzzyClassifier, method buildListFromTopDocs.

/**
 * Builds a list of classification results from the k-nearest-neighbour search results.
 * Each class found among the top docs gets one result whose score combines how many of
 * the k neighbours carry that class with a boost derived from their search scores.
 *
 * @param topDocs the search results as a {@link TopDocs} object
 * @return a {@link List} of {@link ClassificationResult}, one for each existing class
 * @throws IOException if it's not possible to get the stored value of class field
 */
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
    // number of neighbour docs observed per class
    Map<BytesRef, Integer> classCounts = new HashMap<>();
    // accumulated boost per class, based on score ranking positions in topDocs
    Map<BytesRef, Double> classBoosts = new HashMap<>();
    float maxScore = topDocs.getMaxScore();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
        if (storableField != null) {
            BytesRef cl = new BytesRef(storableField.stringValue());
            // update count for this class
            classCounts.merge(cl, 1, Integer::sum);
            // update boost: each hit contributes its score normalized by the best score
            classBoosts.merge(cl, (double) (scoreDoc.score / maxScore), Double::sum);
        }
    }
    List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
    int sumdoc = 0;
    for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
        Integer count = entry.getValue();
        // the boost is normalized to be 0<b<1 by averaging over the class's hits
        Double normBoost = classBoosts.get(entry.getKey()) / count;
        temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
        sumdoc += count;
    }
    // correction: if fewer than k docs actually matched, rescale scores accordingly
    if (sumdoc < k) {
        for (ClassificationResult<BytesRef> cr : temporaryList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
        }
    } else {
        returnList = temporaryList;
    }
    return returnList;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ScoreDoc(org.apache.lucene.search.ScoreDoc) IndexableField(org.apache.lucene.index.IndexableField) HashMap(java.util.HashMap) Map(java.util.Map) BytesRef(org.apache.lucene.util.BytesRef)

Example 92 with IndexableField

Use of org.apache.lucene.index.IndexableField in the project jackrabbit-oak by Apache.

The class LuceneIndex, method getExcerpt.

/**
 * Builds a highlighted excerpt for the given hit by running the highlighter over the
 * stored full-text / analyzed fields of the document, stopping at the first field that
 * produces fragments.
 *
 * @param analyzer the analyzer used to tokenize the stored field text
 * @param searcher the searcher whose index reader provides the stored document
 * @param doc the score doc identifying the hit to build the excerpt for
 * @return the best text fragments joined by "...", or an empty string if none were found
 * @throws IOException if the stored document cannot be read
 */
private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException {
    StringBuilder excerpt = new StringBuilder();
    for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
        String name = field.name();
        // only full text or analyzed fields
        if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
            String text = field.stringValue();
            TokenStream tokenStream = analyzer.tokenStream(name, text);
            try {
                TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 2);
                if (textFragments != null && textFragments.length > 0) {
                    for (TextFragment fragment : textFragments) {
                        if (excerpt.length() > 0) {
                            excerpt.append("...");
                        }
                        excerpt.append(fragment.toString());
                    }
                    // first field that yielded fragments wins; no need to scan the rest
                    break;
                }
            } catch (InvalidTokenOffsetsException e) {
                // best-effort: excerpt generation failure must not fail the query
                // (fixed typo in log message: "higlighting" -> "highlighting")
                LOG.error("highlighting failed", e);
            }
        }
    }
    return excerpt.toString();
}
Also used : IndexableField(org.apache.lucene.index.IndexableField) TokenStream(org.apache.lucene.analysis.TokenStream) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) TextFragment(org.apache.lucene.search.highlight.TextFragment)

Example 93 with IndexableField

Use of org.apache.lucene.index.IndexableField in the project lucene-solr by Apache.

The class TestLucene54DocValuesFormat, method doTestSparseDocValuesVsStoredFields.

/**
 * Tests that sparse doc values (numeric, binary, sorted, sorted_numeric, sorted_set)
 * round-trip correctly by comparing them against stored fields on the same documents.
 * Sparsity is forced by interleaving runs of empty documents between value-bearing ones.
 *
 * @throws Exception on any index read/write failure
 */
private void doTestSparseDocValuesVsStoredFields() throws Exception {
    // pool of candidate values; duplicates across docs are expected and exercised
    final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = random().nextLong();
    }
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    // serial merges keep merge ordering deterministic for this test
    conf.setMergeScheduler(new SerialMergeScheduler());
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // sparse compression is only enabled if less than 1% of docs have a value
    final int avgGap = 100;
    final int numDocs = atLeast(200);
    // leading gap of empty docs so the first value-bearing doc is not doc 0
    for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
        writer.addDocument(new Document());
    }
    final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
    for (int i = 0; i < numDocs; ++i) {
        Document doc = new Document();
        // single-valued
        long docValue = values[random().nextInt(values.length)];
        doc.add(new NumericDocValuesField("numeric", docValue));
        doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
        doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
        // stored copy is the source of truth checked against doc values below
        doc.add(new StoredField("value", docValue));
        // multi-valued
        final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
        for (int j = 0; j < numValues; ++j) {
            docValue = values[random().nextInt(values.length)];
            doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
            doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
            doc.add(new StoredField("values", docValue));
        }
        writer.addDocument(doc);
        // add a gap
        for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
            writer.addDocument(new Document());
        }
    }
    if (random().nextBoolean()) {
        writer.forceMerge(1);
    }
    final IndexReader indexReader = writer.getReader();
    TestUtil.checkReader(indexReader);
    writer.close();
    for (LeafReaderContext context : indexReader.leaves()) {
        final LeafReader reader = context.reader();
        final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
        final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
        final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
        final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
        final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
        for (int i = 0; i < reader.maxDoc(); ++i) {
            final Document doc = reader.document(i);
            final IndexableField valueField = doc.getField("value");
            final Long value = valueField == null ? null : valueField.numericValue().longValue();
            if (value == null) {
                // doc has no value: the doc-values iterator must not have advanced to it
                assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
            } else {
                // doc has a value: each single-valued iterator must land exactly on it
                assertEquals(i, numeric.nextDoc());
                assertEquals(i, binary.nextDoc());
                assertEquals(i, sorted.nextDoc());
                assertEquals(value.longValue(), numeric.longValue());
                assertTrue(sorted.ordValue() >= 0);
                assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
                assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
            }
            final IndexableField[] valuesFields = doc.getFields("values");
            if (valuesFields.length == 0) {
                assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
            } else {
                // collect the expected multi-values from the stored fields
                final Set<Long> valueSet = new HashSet<>();
                for (IndexableField sf : valuesFields) {
                    valueSet.add(sf.numericValue().longValue());
                }
                assertEquals(i, sortedNumeric.nextDoc());
                assertEquals(valuesFields.length, sortedNumeric.docValueCount());
                for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
                    assertTrue(valueSet.contains(sortedNumeric.nextValue()));
                }
                assertEquals(i, sortedSet.nextDoc());
                // sorted_set dedupes ords, so compare against the set size, not field count
                int sortedSetCount = 0;
                while (true) {
                    long ord = sortedSet.nextOrd();
                    if (ord == SortedSetDocValues.NO_MORE_ORDS) {
                        break;
                    }
                    assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
                    sortedSetCount++;
                }
                assertEquals(valueSet.size(), sortedSetCount);
            }
        }
    }
    indexReader.close();
    dir.close();
}
Also used : SortedNumericDocValues(org.apache.lucene.index.SortedNumericDocValues) NumericDocValues(org.apache.lucene.index.NumericDocValues) SparseNumericDocValues(org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseNumericDocValues) SortedNumericDocValues(org.apache.lucene.index.SortedNumericDocValues) Document(org.apache.lucene.document.Document) BinaryDocValues(org.apache.lucene.index.BinaryDocValues) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) StoredField(org.apache.lucene.document.StoredField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet) LeafReader(org.apache.lucene.index.LeafReader) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValues(org.apache.lucene.index.SortedDocValues) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) IndexReader(org.apache.lucene.index.IndexReader) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 94 with IndexableField

Use of org.apache.lucene.index.IndexableField in the project lucene-solr by Apache.

The class TestLucene70DocValuesFormat, method doTestSparseDocValuesVsStoredFields.

/**
 * Tests that sparse doc values (numeric, binary, sorted, sorted_numeric, sorted_set)
 * round-trip correctly by comparing them against stored fields on the same documents.
 * Sparsity is forced by interleaving runs of empty documents between value-bearing ones.
 *
 * @throws Exception on any index read/write failure
 */
private void doTestSparseDocValuesVsStoredFields() throws Exception {
    // pool of candidate values; duplicates across docs are expected and exercised
    final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = random().nextLong();
    }
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    // serial merges keep merge ordering deterministic for this test
    conf.setMergeScheduler(new SerialMergeScheduler());
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // sparse compression is only enabled if less than 1% of docs have a value
    final int avgGap = 100;
    final int numDocs = atLeast(200);
    // leading gap of empty docs so the first value-bearing doc is not doc 0
    for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
        writer.addDocument(new Document());
    }
    final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
    for (int i = 0; i < numDocs; ++i) {
        Document doc = new Document();
        // single-valued
        long docValue = values[random().nextInt(values.length)];
        doc.add(new NumericDocValuesField("numeric", docValue));
        doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
        doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
        // stored copy is the source of truth checked against doc values below
        doc.add(new StoredField("value", docValue));
        // multi-valued
        final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
        for (int j = 0; j < numValues; ++j) {
            docValue = values[random().nextInt(values.length)];
            doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
            doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
            doc.add(new StoredField("values", docValue));
        }
        writer.addDocument(doc);
        // add a gap
        for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
            writer.addDocument(new Document());
        }
    }
    if (random().nextBoolean()) {
        writer.forceMerge(1);
    }
    final IndexReader indexReader = writer.getReader();
    writer.close();
    for (LeafReaderContext context : indexReader.leaves()) {
        final LeafReader reader = context.reader();
        final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
        final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
        final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
        final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
        final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
        for (int i = 0; i < reader.maxDoc(); ++i) {
            final Document doc = reader.document(i);
            final IndexableField valueField = doc.getField("value");
            final Long value = valueField == null ? null : valueField.numericValue().longValue();
            if (value == null) {
                // doc has no value: the doc-values iterator must not have advanced to it
                assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
            } else {
                // doc has a value: each single-valued iterator must land exactly on it
                assertEquals(i, numeric.nextDoc());
                assertEquals(i, binary.nextDoc());
                assertEquals(i, sorted.nextDoc());
                assertEquals(value.longValue(), numeric.longValue());
                assertTrue(sorted.ordValue() >= 0);
                assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
                assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
            }
            final IndexableField[] valuesFields = doc.getFields("values");
            if (valuesFields.length == 0) {
                assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
            } else {
                // collect the expected multi-values from the stored fields
                final Set<Long> valueSet = new HashSet<>();
                for (IndexableField sf : valuesFields) {
                    valueSet.add(sf.numericValue().longValue());
                }
                assertEquals(i, sortedNumeric.nextDoc());
                assertEquals(valuesFields.length, sortedNumeric.docValueCount());
                for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
                    assertTrue(valueSet.contains(sortedNumeric.nextValue()));
                }
                assertEquals(i, sortedSet.nextDoc());
                // sorted_set dedupes ords, so compare against the set size, not field count
                int sortedSetCount = 0;
                while (true) {
                    long ord = sortedSet.nextOrd();
                    if (ord == SortedSetDocValues.NO_MORE_ORDS) {
                        break;
                    }
                    assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
                    sortedSetCount++;
                }
                assertEquals(valueSet.size(), sortedSetCount);
            }
        }
    }
    indexReader.close();
    dir.close();
}
Also used : SortedNumericDocValues(org.apache.lucene.index.SortedNumericDocValues) NumericDocValues(org.apache.lucene.index.NumericDocValues) SortedNumericDocValues(org.apache.lucene.index.SortedNumericDocValues) Document(org.apache.lucene.document.Document) BinaryDocValues(org.apache.lucene.index.BinaryDocValues) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) StoredField(org.apache.lucene.document.StoredField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet) LeafReader(org.apache.lucene.index.LeafReader) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValues(org.apache.lucene.index.SortedDocValues) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) IndexReader(org.apache.lucene.index.IndexReader) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 95 with IndexableField

Use of org.apache.lucene.index.IndexableField in the project lucene-solr by Apache.

The class TestBlockPostingsFormat2, method testTTFBlockSizeMultiple.

/**
 * Tests terms whose total term frequency is an exact multiple of the block size
 * (ttf % blocksize == 0), by indexing BLOCK_SIZE/2 documents that each repeat the
 * field name (and its "_2" variant) a fixed number of times.
 */
public void testTTFBlockSizeMultiple() throws Exception {
    Document doc = newDocument();
    final int docCount = Lucene50PostingsFormat.BLOCK_SIZE / 2;
    for (int docNum = 0; docNum < docCount; docNum++) {
        for (IndexableField field : doc.getFields()) {
            // base unit: the field name four times, then its "_2" variant four times
            String unit = field.name() + " " + field.name() + " " + field.name() + " " + field.name() + " " + field.name() + "_2 " + field.name() + "_2 " + field.name() + "_2 " + field.name() + "_2";
            StringBuilder content = new StringBuilder();
            // repeat the unit 16 times so each doc carries a fixed term frequency
            for (int rep = 0; rep < 16; rep++) {
                content.append(unit).append(' ');
            }
            ((Field) field).setStringValue(content.toString());
        }
        iw.addDocument(doc);
    }
}
Also used : IndexableField(org.apache.lucene.index.IndexableField) IndexableField(org.apache.lucene.index.IndexableField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document)

Aggregations

IndexableField (org.apache.lucene.index.IndexableField)276 Document (org.apache.lucene.document.Document)90 CompressedXContent (org.elasticsearch.common.compress.CompressedXContent)75 Matchers.containsString (org.hamcrest.Matchers.containsString)57 BytesRef (org.apache.lucene.util.BytesRef)53 ArrayList (java.util.ArrayList)50 Field (org.apache.lucene.document.Field)34 Test (org.junit.Test)28 IOException (java.io.IOException)27 HashMap (java.util.HashMap)24 IndexReader (org.apache.lucene.index.IndexReader)24 Directory (org.apache.lucene.store.Directory)23 Map (java.util.Map)22 TopDocs (org.apache.lucene.search.TopDocs)22 Term (org.apache.lucene.index.Term)21 HashSet (java.util.HashSet)20 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)20 DocumentMapper (org.elasticsearch.index.mapper.DocumentMapper)20 Query (org.apache.lucene.search.Query)19 Analyzer (org.apache.lucene.analysis.Analyzer)18