Example 81 with ScoreDoc

use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.

the class ConfusionMatrixGenerator method getConfusionMatrix.

/**
   * get the {@link org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix} of a given {@link Classifier},
   * generated on the given {@link IndexReader}, class and text fields.
   *
   * @param reader              the {@link IndexReader} containing the index used for creating the {@link Classifier}
   * @param classifier          the {@link Classifier} whose confusion matrix has to be generated
   * @param classFieldName      the name of the Lucene field used as the classifier's output
   * @param textFieldName       the name of the Lucene field used as the classifier's input
   * @param timeoutMilliseconds the maximum time, in milliseconds, to spend building the confusion matrix
   * @param <T>                 the return type of the {@link ClassificationResult} returned by the given {@link Classifier}
   * @return a {@link org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix}
   * @throws IOException if problems occur while reading the index or using the classifier
   */
public static <T> ConfusionMatrix getConfusionMatrix(IndexReader reader, Classifier<T> classifier, String classFieldName, String textFieldName, long timeoutMilliseconds) throws IOException {
    ExecutorService executorService = Executors.newFixedThreadPool(1, new NamedThreadFactory("confusion-matrix-gen-"));
    try {
        Map<String, Map<String, Long>> counts = new HashMap<>();
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        TopDocs topDocs = indexSearcher.search(new TermRangeQuery(classFieldName, null, null, true, true), Integer.MAX_VALUE);
        double time = 0d;
        int counter = 0;
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            if (timeoutMilliseconds > 0 && time >= timeoutMilliseconds) {
                break;
            }
            Document doc = reader.document(scoreDoc.doc);
            String[] correctAnswers = doc.getValues(classFieldName);
            if (correctAnswers != null && correctAnswers.length > 0) {
                Arrays.sort(correctAnswers);
                ClassificationResult<T> result;
                String text = doc.get(textFieldName);
                if (text != null) {
                    try {
                        // fail if classification takes more than 5s
                        long start = System.currentTimeMillis();
                        result = executorService.submit(() -> classifier.assignClass(text)).get(5, TimeUnit.SECONDS);
                        long end = System.currentTimeMillis();
                        time += end - start;
                        if (result != null) {
                            T assignedClass = result.getAssignedClass();
                            if (assignedClass != null) {
                                counter++;
                                String classified = assignedClass instanceof BytesRef ? ((BytesRef) assignedClass).utf8ToString() : assignedClass.toString();
                                String correctAnswer;
                                if (Arrays.binarySearch(correctAnswers, classified) >= 0) {
                                    correctAnswer = classified;
                                } else {
                                    correctAnswer = correctAnswers[0];
                                }
                                Map<String, Long> stringLongMap = counts.get(correctAnswer);
                                if (stringLongMap != null) {
                                    Long aLong = stringLongMap.get(classified);
                                    if (aLong != null) {
                                        stringLongMap.put(classified, aLong + 1);
                                    } else {
                                        stringLongMap.put(classified, 1L);
                                    }
                                } else {
                                    stringLongMap = new HashMap<>();
                                    stringLongMap.put(classified, 1L);
                                    counts.put(correctAnswer, stringLongMap);
                                }
                            }
                        }
                    } catch (TimeoutException timeoutException) {
                        // a classification timed out: charge the full 5s against the time budget
                        time += 5000;
                    } catch (ExecutionException | InterruptedException executionException) {
                        throw new RuntimeException(executionException);
                    }
                }
            }
        }
        return new ConfusionMatrix(counts, time / counter, counter);
    } finally {
        executorService.shutdown();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) HashMap(java.util.HashMap) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) ExecutionException(java.util.concurrent.ExecutionException) BytesRef(org.apache.lucene.util.BytesRef) TimeoutException(java.util.concurrent.TimeoutException) NamedThreadFactory(org.apache.lucene.util.NamedThreadFactory) TermRangeQuery(org.apache.lucene.search.TermRangeQuery) ExecutorService(java.util.concurrent.ExecutorService) HashMap(java.util.HashMap) Map(java.util.Map)
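
For orientation, here is a minimal usage sketch for the method above. The index path and the field names "category" and "body" are hypothetical, and the sketch assumes the classification module's SimpleNaiveBayesClassifier and the ConfusionMatrix accessors from the same Lucene 6.x codebase:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.Classifier;
import org.apache.lucene.classification.SimpleNaiveBayesClassifier;
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ConfusionMatrixDemo {
    public static void main(String[] args) throws Exception {
        // open an existing index whose documents carry a "category" class
        // field and a "body" text field (hypothetical names)
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/demo-index"));
             IndexReader reader = DirectoryReader.open(dir)) {
            // train a classifier over the whole index (null query = all docs)
            Classifier<BytesRef> classifier = new SimpleNaiveBayesClassifier(
                reader, new StandardAnalyzer(), null, "category", "body");
            // evaluate against the same index with a 30s overall budget
            ConfusionMatrixGenerator.ConfusionMatrix matrix =
                ConfusionMatrixGenerator.getConfusionMatrix(reader, classifier, "category", "body", 30_000);
            System.out.println("accuracy  = " + matrix.getAccuracy());
            System.out.println("avg time  = " + matrix.getAvgClassificationTime() + " ms");
            System.out.println("docs used = " + matrix.getNumberOfEvaluatedDocs());
        }
    }
}

Note that a timeoutMilliseconds value of zero or less disables the overall budget, since the break condition above only fires for positive values; each individual classification is still capped at 5 seconds.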

Example 82 with ScoreDoc

use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.

the class KNearestFuzzyClassifier method buildListFromTopDocs.

/**
   * build a list of classification results from search results
   *
   * @param topDocs the search results as a {@link TopDocs} object
   * @return a {@link List} of {@link ClassificationResult}, one for each existing class
   * @throws IOException if it's not possible to get the stored value of the class field
   */
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
    Map<BytesRef, Integer> classCounts = new HashMap<>();
    // this is a boost based on class ranking positions in topDocs
    Map<BytesRef, Double> classBoosts = new HashMap<>();
    float maxScore = topDocs.getMaxScore();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
        if (storableField != null) {
            BytesRef cl = new BytesRef(storableField.stringValue());
            //update count
            Integer count = classCounts.get(cl);
            if (count != null) {
                classCounts.put(cl, count + 1);
            } else {
                classCounts.put(cl, 1);
            }
            //update boost; each hit contributes its score normalized by the best score
            Double totalBoost = classBoosts.get(cl);
            double singleBoost = scoreDoc.score / maxScore;
            if (totalBoost != null) {
                classBoosts.put(cl, totalBoost + singleBoost);
            } else {
                classBoosts.put(cl, singleBoost);
            }
        }
    }
    List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
    int sumdoc = 0;
    for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
        Integer count = entry.getValue();
        //the per-class boost is normalized into the range (0, 1]
        Double normBoost = classBoosts.get(entry.getKey()) / count;
        temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
        sumdoc += count;
    }
    //correction
    if (sumdoc < k) {
        for (ClassificationResult<BytesRef> cr : temporaryList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
        }
    } else {
        returnList = temporaryList;
    }
    return returnList;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ScoreDoc(org.apache.lucene.search.ScoreDoc) IndexableField(org.apache.lucene.index.IndexableField) HashMap(java.util.HashMap) Map(java.util.Map) BytesRef(org.apache.lucene.util.BytesRef)
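
Note the arithmetic: because normBoost is totalBoost / count, the final score (count * normBoost) / k collapses to totalBoost / k, i.e. the sum of the class's normalized hit scores divided by k. A standalone sketch of that aggregation, with made-up neighbor classes and scores standing in for the stored class field and ScoreDoc.score:

import java.util.HashMap;
import java.util.Map;

public class KnnVoteSketch {
    public static void main(String[] args) {
        int k = 5;
        String[] neighborClass = {"spam", "spam", "ham", "spam", "ham"};
        float[] neighborScore = {0.9f, 0.8f, 0.7f, 0.6f, 0.5f};
        float maxScore = 0.9f;

        Map<String, Integer> counts = new HashMap<>();
        Map<String, Double> boosts = new HashMap<>();
        for (int i = 0; i < k; i++) {
            counts.merge(neighborClass[i], 1, Integer::sum);
            // each neighbor contributes its score relative to the best hit
            boosts.merge(neighborClass[i], (double) (neighborScore[i] / maxScore), Double::sum);
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            double normBoost = boosts.get(e.getKey()) / e.getValue();
            // (count * normBoost) / k == totalBoost / k
            System.out.printf("%s -> %.3f%n", e.getKey(), (e.getValue() * normBoost) / k);
        }
    }
}

With these numbers the sketch prints spam -> 0.511 and ham -> 0.267 (iteration order may vary), mirroring what buildListFromTopDocs would return before the sumdoc < k correction.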

Example 83 with ScoreDoc

use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.

the class TestIndexWriterOnDiskFull method testAddIndexOnDiskFull.

// TODO: make @Nightly variant that provokes more disk fulls
// TODO: have test fail if on any given top iter there was not a single IOE hit
/*
  Test: make sure when we run out of disk space or hit
  random IOExceptions in any of the addIndexes(*) calls
  that 1) index is not corrupt (searcher can open/search
  it) and 2) transactional semantics are followed:
  either all or none of the incoming documents were in
  fact added.
   */
public void testAddIndexOnDiskFull() throws IOException {
    // MemoryCodec, since it uses FST, is not necessarily
    // "additive", ie if you add up N small FSTs, then merge
    // them, the merged result can easily be larger than the
    // sum because the merged FST may use array encoding for
    // some arcs (which uses more space):
    final String idFormat = TestUtil.getPostingsFormat("id");
    final String contentFormat = TestUtil.getPostingsFormat("content");
    assumeFalse("This test cannot run with Memory codec", idFormat.equals("Memory") || contentFormat.equals("Memory"));
    int START_COUNT = 57;
    int NUM_DIR = TEST_NIGHTLY ? 50 : 5;
    int END_COUNT = START_COUNT + NUM_DIR * (TEST_NIGHTLY ? 25 : 5);
    // Build up a bunch of dirs that have indexes which we
    // will then merge together by calling addIndexes(*):
    Directory[] dirs = new Directory[NUM_DIR];
    long inputDiskUsage = 0;
    for (int i = 0; i < NUM_DIR; i++) {
        dirs[i] = newDirectory();
        IndexWriter writer = new IndexWriter(dirs[i], newIndexWriterConfig(new MockAnalyzer(random())));
        for (int j = 0; j < 25; j++) {
            addDocWithIndex(writer, 25 * i + j);
        }
        writer.close();
        String[] files = dirs[i].listAll();
        for (int j = 0; j < files.length; j++) {
            inputDiskUsage += dirs[i].fileLength(files[j]);
        }
    }
    // Now, build a starting index that has START_COUNT docs.  We
    // will then try to addIndexes into a copy of this:
    MockDirectoryWrapper startDir = newMockDirectory();
    IndexWriter writer = new IndexWriter(startDir, newIndexWriterConfig(new MockAnalyzer(random())));
    for (int j = 0; j < START_COUNT; j++) {
        addDocWithIndex(writer, j);
    }
    writer.close();
    // Make sure starting index seems to be working properly:
    Term searchTerm = new Term("content", "aaa");
    IndexReader reader = DirectoryReader.open(startDir);
    assertEquals("first docFreq", 57, reader.docFreq(searchTerm));
    IndexSearcher searcher = newSearcher(reader);
    ScoreDoc[] hits = searcher.search(new TermQuery(searchTerm), 1000).scoreDocs;
    assertEquals("first number of hits", 57, hits.length);
    reader.close();
    // Iterate with larger and larger amounts of free
    // disk space.  With little free disk space,
    // addIndexes will certainly run out of space &
    // fail.  Verify that when this happens, index is
    // not corrupt and index in fact has added no
    // documents.  Then, we increase the free-space
    // budget each iteration.  At some point there is
    // enough free disk space and addIndexes should
    // succeed and index should show all documents were
    // added.
    long diskUsage = startDir.sizeInBytes();
    long startDiskUsage = 0;
    String[] files = startDir.listAll();
    for (int i = 0; i < files.length; i++) {
        startDiskUsage += startDir.fileLength(files[i]);
    }
    for (int iter = 0; iter < 3; iter++) {
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter);
        }
        // Start with 50-200 bytes more than we are currently using:
        long diskFree = diskUsage + TestUtil.nextInt(random(), 50, 200);
        int method = iter;
        boolean success = false;
        boolean done = false;
        String methodName;
        if (0 == method) {
            methodName = "addIndexes(Directory[]) + forceMerge(1)";
        } else if (1 == method) {
            methodName = "addIndexes(IndexReader[])";
        } else {
            methodName = "addIndexes(Directory[])";
        }
        while (!done) {
            if (VERBOSE) {
                System.out.println("TEST: cycle...");
            }
            // Make a new dir that will enforce disk usage:
            MockDirectoryWrapper dir = new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(startDir));
            IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy(false));
            writer = new IndexWriter(dir, iwc);
            Exception err = null;
            for (int x = 0; x < 2; x++) {
                MergeScheduler ms = writer.getConfig().getMergeScheduler();
                if (ms instanceof ConcurrentMergeScheduler) {
                    // this test intentionally provokes exceptions in the
                    // threads that CMS launches; we don't want to pollute
                    // test output with these:
                    if (0 == x) {
                        ((ConcurrentMergeScheduler) ms).setSuppressExceptions();
                    } else {
                        ((ConcurrentMergeScheduler) ms).clearSuppressExceptions();
                    }
                }
                // Two loops: first time, limit disk space &
                // throw random IOExceptions; second time, no
                // disk space limit:
                double rate = 0.05;
                double diskRatio = ((double) diskFree) / diskUsage;
                long thisDiskFree;
                String testName = null;
                if (0 == x) {
                    dir.setRandomIOExceptionRateOnOpen(random().nextDouble() * 0.01);
                    thisDiskFree = diskFree;
                    if (diskRatio >= 2.0) {
                        rate /= 2;
                    }
                    if (diskRatio >= 4.0) {
                        rate /= 2;
                    }
                    if (diskRatio >= 6.0) {
                        rate = 0.0;
                    }
                    if (VERBOSE) {
                        testName = "disk full test " + methodName + " with disk full at " + diskFree + " bytes";
                    }
                } else {
                    dir.setRandomIOExceptionRateOnOpen(0.0);
                    thisDiskFree = 0;
                    rate = 0.0;
                    if (VERBOSE) {
                        testName = "disk full test " + methodName + " with unlimited disk space";
                    }
                }
                if (VERBOSE) {
                    System.out.println("\ncycle: " + testName);
                }
                dir.setTrackDiskUsage(true);
                dir.setMaxSizeInBytes(thisDiskFree);
                dir.setRandomIOExceptionRate(rate);
                try {
                    if (0 == method) {
                        if (VERBOSE) {
                            System.out.println("TEST: now addIndexes count=" + dirs.length);
                        }
                        writer.addIndexes(dirs);
                        if (VERBOSE) {
                            System.out.println("TEST: now forceMerge");
                        }
                        writer.forceMerge(1);
                    } else if (1 == method) {
                        DirectoryReader[] readers = new DirectoryReader[dirs.length];
                        for (int i = 0; i < dirs.length; i++) {
                            readers[i] = DirectoryReader.open(dirs[i]);
                        }
                        try {
                            TestUtil.addIndexesSlowly(writer, readers);
                        } finally {
                            for (int i = 0; i < dirs.length; i++) {
                                readers[i].close();
                            }
                        }
                    } else {
                        writer.addIndexes(dirs);
                    }
                    success = true;
                    if (VERBOSE) {
                        System.out.println("  success!");
                    }
                    if (0 == x) {
                        done = true;
                    }
                } catch (IllegalStateException | IOException e) {
                    success = false;
                    err = e;
                    if (VERBOSE) {
                        System.out.println("  hit Exception: " + e);
                        e.printStackTrace(System.out);
                    }
                    if (1 == x) {
                        e.printStackTrace(System.out);
                        fail(methodName + " hit IOException after disk space was freed up");
                    }
                }
                if (x == 1) {
                    // Make sure all threads from ConcurrentMergeScheduler are done
                    TestUtil.syncConcurrentMerges(writer);
                } else {
                    dir.setRandomIOExceptionRateOnOpen(0.0);
                    writer.rollback();
                    writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy(false)));
                }
                if (VERBOSE) {
                    System.out.println("  now test readers");
                }
                // Finally, verify index is not corrupt, and, if
                // we succeeded, we see all docs added, and if we
                // failed, we see either all docs or no docs added
                // (transactional semantics):
                dir.setRandomIOExceptionRateOnOpen(0.0);
                try {
                    reader = DirectoryReader.open(dir);
                } catch (IOException e) {
                    e.printStackTrace(System.out);
                    fail(testName + ": exception when creating IndexReader: " + e);
                }
                int result = reader.docFreq(searchTerm);
                if (success) {
                    if (result != START_COUNT) {
                        fail(testName + ": method did not throw exception but docFreq('aaa') is " + result + " instead of expected " + START_COUNT);
                    }
                } else {
                    // on hitting an exception we may still have added all docs:
                    if (result != START_COUNT && result != END_COUNT) {
                        err.printStackTrace(System.out);
                        fail(testName + ": method did throw exception but docFreq('aaa') is " + result + " instead of expected " + START_COUNT + " or " + END_COUNT);
                    }
                }
                searcher = newSearcher(reader);
                try {
                    hits = searcher.search(new TermQuery(searchTerm), END_COUNT).scoreDocs;
                } catch (IOException e) {
                    e.printStackTrace(System.out);
                    fail(testName + ": exception when searching: " + e);
                }
                int result2 = hits.length;
                if (success) {
                    if (result2 != result) {
                        fail(testName + ": method did not throw exception but hits.length for search on term 'aaa' is " + result2 + " instead of expected " + result);
                    }
                } else {
                    // on hitting an exception we may still have added all docs:
                    if (result2 != result) {
                        err.printStackTrace(System.out);
                        fail(testName + ": method did throw exception but hits.length for search on term 'aaa' is " + result2 + " instead of expected " + result);
                    }
                }
                reader.close();
                if (VERBOSE) {
                    System.out.println("  count is " + result);
                }
                if (done || result == END_COUNT) {
                    break;
                }
            }
            if (VERBOSE) {
                System.out.println("  start disk = " + startDiskUsage + "; input disk = " + inputDiskUsage + "; max used = " + dir.getMaxUsedSizeInBytes());
            }
            if (done) {
                // Javadocs state that temp free Directory space
                // required is at most 2X total input size of
                // indices so let's make sure:
                assertTrue("max free Directory space required exceeded 1X the total input index sizes during " + methodName + ": max temp usage = " + (dir.getMaxUsedSizeInBytes() - startDiskUsage) + " bytes vs limit=" + (2 * (startDiskUsage + inputDiskUsage)) + "; starting disk usage = " + startDiskUsage + " bytes; " + "input index disk usage = " + inputDiskUsage + " bytes", (dir.getMaxUsedSizeInBytes() - startDiskUsage) < 2 * (startDiskUsage + inputDiskUsage));
            }
            // Make sure we don't hit disk full during close below:
            dir.setMaxSizeInBytes(0);
            dir.setRandomIOExceptionRate(0.0);
            dir.setRandomIOExceptionRateOnOpen(0.0);
            writer.close();
            dir.close();
            // Try again with more free space:
            diskFree += TEST_NIGHTLY ? TestUtil.nextInt(random(), 4000, 8000) : TestUtil.nextInt(random(), 40000, 80000);
        }
    }
    startDir.close();
    for (Directory dir : dirs) dir.close();
}
Also used : MockDirectoryWrapper(org.apache.lucene.store.MockDirectoryWrapper) IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) IOException(java.io.IOException) IntPoint(org.apache.lucene.document.IntPoint) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) IOException(java.io.IOException) ScoreDoc(org.apache.lucene.search.ScoreDoc) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory)
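
The disk-full machinery here comes from MockDirectoryWrapper in Lucene's test framework. A stripped-down sketch of that mechanism outside the LuceneTestCase helpers; the analyzer, sizes, and document contents are arbitrary choices for illustration:

import java.io.IOException;
import java.util.Random;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;

public class DiskFullSketch {
    public static void main(String[] args) throws IOException {
        MockDirectoryWrapper dir = new MockDirectoryWrapper(new Random(42), new RAMDirectory());
        // build a small committed baseline with no size limit
        try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
            for (int i = 0; i < 10; i++) {
                Document doc = new Document();
                doc.add(new TextField("content", "aaa " + i, Field.Store.NO));
                w.addDocument(doc);
            }
            w.commit();
        }
        // leave almost no headroom, then try to add far more data
        dir.setMaxSizeInBytes(dir.sizeInBytes() + 512);
        try (IndexWriter w = new IndexWriter(dir,
                new IndexWriterConfig(new WhitespaceAnalyzer()).setOpenMode(OpenMode.APPEND))) {
            for (int i = 0; i < 100_000; i++) {
                Document doc = new Document();
                doc.add(new TextField("content", "bbb " + i, Field.Store.NO));
                w.addDocument(doc);
            }
            w.commit();
        } catch (IOException expected) {
            System.out.println("hit simulated disk full: " + expected);
        }
        // lift the cap; transactional semantics mean the last commit survives
        dir.setMaxSizeInBytes(0);
        try (DirectoryReader r = DirectoryReader.open(dir)) {
            System.out.println("index still opens, numDocs=" + r.numDocs());
        }
        dir.close();
    }
}

The property being exercised is the same one the test asserts: once the writer fails on a full "disk", the directory still holds the last successful commit, so a reader opens cleanly and sees either all or none of the newly added documents.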

Example 84 with ScoreDoc

use of org.apache.lucene.search.ScoreDoc in project jackrabbit-oak by apache.

the class FilteredSortedSetDocValuesFacetCounts method getTopChildren.

@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
    FacetResult topChildren = super.getTopChildren(topN, dim, path);
    LabelAndValue[] labelAndValues = topChildren.labelValues;
    for (ScoreDoc scoreDoc : docs.scoreDocs) {
        labelAndValues = filterFacet(scoreDoc.doc, dim, labelAndValues);
    }
    int childCount = labelAndValues.length;
    Number value = 0;
    for (LabelAndValue lv : labelAndValues) {
        value = value.longValue() + lv.value.longValue();
    }
    return new FacetResult(dim, path, value, labelAndValues, childCount);
}
Also used : FacetResult(org.apache.lucene.facet.FacetResult) LabelAndValue(org.apache.lucene.facet.LabelAndValue) ScoreDoc(org.apache.lucene.search.ScoreDoc)
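
The super.getTopChildren call delegates to Lucene's stock SortedSetDocValuesFacetCounts; the Oak subclass then re-derives value and childCount after filterFacet (not shown here) prunes labels per matching document. A minimal sketch of the unfiltered baseline that super provides, with a made-up "tags" dimension:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.RAMDirectory;

public class FacetCountsSketch {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        FacetsConfig config = new FacetsConfig();
        try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
            for (String tag : new String[] { "red", "red", "blue" }) {
                Document doc = new Document();
                doc.add(new SortedSetDocValuesFacetField("tags", tag));
                w.addDocument(config.build(doc));
            }
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(reader);
            FacetsCollector fc = new FacetsCollector();
            new IndexSearcher(reader).search(new MatchAllDocsQuery(), fc);
            Facets facets = new SortedSetDocValuesFacetCounts(state, fc);
            FacetResult top = facets.getTopChildren(10, "tags");
            for (LabelAndValue lv : top.labelValues) {
                System.out.println(lv.label + " = " + lv.value); // red = 2, blue = 1
            }
        }
    }
}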

Example 85 with ScoreDoc

use of org.apache.lucene.search.ScoreDoc in project jena by apache.

the class TextIndexLucene method query$.

private List<TextHit> query$(IndexReader indexReader, Node property, String qs, String graphURI, String lang, int limit) throws ParseException, IOException {
    String textField = docDef.getField(property);
    String textClause;
    String langClause = null;
    String graphClause = null;
    //for language-based search extension
    if (getDocDef().getLangField() != null) {
        String langField = getDocDef().getLangField();
        if (lang != null) {
            if (this.isMultilingual && !lang.equals("none")) {
                textField = textField + "_" + lang;
            }
            langClause = !"none".equals(lang) ? langField + ":" + lang : "-" + langField + ":*";
        }
    }
    if (textField != null)
        textClause = textField + ":" + qs;
    else
        textClause = qs;
    if (graphURI != null) {
        String escaped = QueryParserBase.escape(graphURI);
        graphClause = getDocDef().getGraphField() + ":" + escaped;
    }
    String queryString = textClause;
    if (langClause != null)
        queryString = "(" + queryString + ") AND " + langClause;
    if (graphClause != null)
        queryString = "(" + queryString + ") AND " + graphClause;
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    Query query = parseQuery(queryString, queryAnalyzer);
    if (limit <= 0)
        limit = MAX_N;
    ScoreDoc[] sDocs = indexSearcher.search(query, limit).scoreDocs;
    List<TextHit> results = new ArrayList<>();
    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        String[] values = doc.getValues(docDef.getEntityField());
        Node literal = null;
        String field = (property != null) ? docDef.getField(property) : docDef.getPrimaryField();
        String[] lexicals = doc.getValues(field);
        if (lexicals.length > 0) {
            String lexical = lexicals[0];
            String[] langs = doc.getValues(docDef.getLangField());
            if (langs.length > 0) {
                String doclang = langs[0];
                if (doclang.startsWith(DATATYPE_PREFIX)) {
                    String datatype = doclang.substring(DATATYPE_PREFIX.length());
                    TypeMapper tmap = TypeMapper.getInstance();
                    literal = NodeFactory.createLiteral(lexical, tmap.getSafeTypeByName(datatype));
                } else {
                    literal = NodeFactory.createLiteral(lexical, doclang);
                }
            } else {
                literal = NodeFactory.createLiteral(lexical);
            }
        }
        for (String v : values) {
            Node n = TextQueryFuncs.stringToNode(v);
            TextHit hit = new TextHit(n, sd.score, literal);
            results.add(hit);
        }
    }
    return results;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TypeMapper(org.apache.jena.datatypes.TypeMapper) Query(org.apache.lucene.search.Query) Node(org.apache.jena.graph.Node) ScoreDoc(org.apache.lucene.search.ScoreDoc)
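
The interesting part above is the clause composition: the text clause is wrapped in parentheses and ANDed with the optional language and graph clauses before being handed to parseQuery. A standalone sketch of that composition, with hypothetical field names ("label", "lang", "graph") in place of the ones jena-text derives from its entity definition:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Query;

public class TextClauseSketch {
    public static void main(String[] args) throws Exception {
        String textClause = "label:printer";                  // textField + ":" + qs
        String langClause = "lang:en";                        // langField + ":" + lang
        String graphClause = "graph:" + QueryParserBase.escape("http://example.org/g1");
        // same nesting as query$: ((text) AND lang) AND graph
        String queryString = "(" + "(" + textClause + ") AND " + langClause + ") AND " + graphClause;
        Query q = new QueryParser("label", new StandardAnalyzer()).parse(queryString);
        System.out.println(q); // the parsed boolean query, ready for IndexSearcher.search
    }
}

For a lang value of "none" the method instead emits a negated wildcard clause (-lang:*), matching only literals indexed without a language tag.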

Aggregations

ScoreDoc (org.apache.lucene.search.ScoreDoc): 222
TopDocs (org.apache.lucene.search.TopDocs): 124
IndexSearcher (org.apache.lucene.search.IndexSearcher): 98
Document (org.apache.lucene.document.Document): 95
Query (org.apache.lucene.search.Query): 71
TermQuery (org.apache.lucene.search.TermQuery): 52
IOException (java.io.IOException): 48
ArrayList (java.util.ArrayList): 46
IndexReader (org.apache.lucene.index.IndexReader): 45
Term (org.apache.lucene.index.Term): 39
Directory (org.apache.lucene.store.Directory): 37
BooleanQuery (org.apache.lucene.search.BooleanQuery): 27
MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery): 26
Test (org.junit.Test): 23
Sort (org.apache.lucene.search.Sort): 22
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 21
HashMap (java.util.HashMap): 20
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 20
FieldDoc (org.apache.lucene.search.FieldDoc): 20
QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 18