Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class ConfusionMatrixGenerator, method getConfusionMatrix.
/**
* get the {@link org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix} of a given {@link Classifier},
* generated on the given {@link IndexReader}, class and text fields.
*
* @param reader the {@link IndexReader} containing the index used for creating the {@link Classifier}
* @param classifier the {@link Classifier} whose confusion matrix has to be generated
* @param classFieldName the name of the Lucene field used as the classifier's output
* @param textFieldName the name of the Lucene field used as the classifier's input
* @param timeoutMilliseconds the timeout in milliseconds after which building the confusion matrix is stopped
* @param <T> the return type of the {@link ClassificationResult} returned by the given {@link Classifier}
* @return a {@link org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix}
* @throws IOException if problems occur while reading the index or using the classifier
*/
public static <T> ConfusionMatrix getConfusionMatrix(IndexReader reader, Classifier<T> classifier, String classFieldName, String textFieldName, long timeoutMilliseconds) throws IOException {
ExecutorService executorService = Executors.newFixedThreadPool(1, new NamedThreadFactory("confusion-matrix-gen-"));
try {
Map<String, Map<String, Long>> counts = new HashMap<>();
IndexSearcher indexSearcher = new IndexSearcher(reader);
TopDocs topDocs = indexSearcher.search(new TermRangeQuery(classFieldName, null, null, true, true), Integer.MAX_VALUE);
double time = 0d;
int counter = 0;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
if (timeoutMilliseconds > 0 && time >= timeoutMilliseconds) {
break;
}
Document doc = reader.document(scoreDoc.doc);
String[] correctAnswers = doc.getValues(classFieldName);
if (correctAnswers != null && correctAnswers.length > 0) {
Arrays.sort(correctAnswers);
ClassificationResult<T> result;
String text = doc.get(textFieldName);
if (text != null) {
try {
// fail if classification takes more than 5s
long start = System.currentTimeMillis();
result = executorService.submit(() -> classifier.assignClass(text)).get(5, TimeUnit.SECONDS);
long end = System.currentTimeMillis();
time += end - start;
if (result != null) {
T assignedClass = result.getAssignedClass();
if (assignedClass != null) {
counter++;
String classified = assignedClass instanceof BytesRef ? ((BytesRef) assignedClass).utf8ToString() : assignedClass.toString();
String correctAnswer;
if (Arrays.binarySearch(correctAnswers, classified) >= 0) {
correctAnswer = classified;
} else {
correctAnswer = correctAnswers[0];
}
Map<String, Long> stringLongMap = counts.get(correctAnswer);
if (stringLongMap != null) {
Long aLong = stringLongMap.get(classified);
if (aLong != null) {
stringLongMap.put(classified, aLong + 1);
} else {
stringLongMap.put(classified, 1L);
}
} else {
stringLongMap = new HashMap<>();
stringLongMap.put(classified, 1L);
counts.put(correctAnswer, stringLongMap);
}
}
}
} catch (TimeoutException timeoutException) {
// count the timed-out classification (5 seconds) towards the total time
time += 5000;
} catch (ExecutionException | InterruptedException executionException) {
throw new RuntimeException(executionException);
}
}
}
}
return new ConfusionMatrix(counts, time / counter, counter);
} finally {
executorService.shutdown();
}
}
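Below is a minimal usage sketch of the method above, assuming the Lucene 6.x/7.x classification API in which classifiers are trained through their constructors; the index path, the "category" and "body" field names, the 30-second timeout and the ConfusionMatrix accessor names reflect that assumption rather than anything stated in the snippet.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.Classifier;
import org.apache.lucene.classification.SimpleNaiveBayesClassifier;
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ConfusionMatrixExample {
  public static void main(String[] args) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      // train on the whole index: the class label is stored in "category", the text in "body" (placeholder names)
      Classifier<BytesRef> classifier =
          new SimpleNaiveBayesClassifier(reader, new StandardAnalyzer(), null, "category", "body");
      // build the confusion matrix, giving up after 30 seconds overall
      ConfusionMatrix cm =
          ConfusionMatrixGenerator.getConfusionMatrix(reader, classifier, "category", "body", 30_000);
      System.out.println("accuracy: " + cm.getAccuracy());
      System.out.println("avg classification time (ms): " + cm.getAvgClassificationTime());
      System.out.println(cm.getLinearizedMatrix());
    }
  }
}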
Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class KNearestFuzzyClassifier, method buildListFromTopDocs.
/**
* build a list of classification results from search results
*
* @param topDocs the search results as a {@link TopDocs} object
* @return a {@link List} of {@link ClassificationResult}, one for each existing class
* @throws IOException if it's not possible to get the stored value of the class field
*/
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
Map<BytesRef, Integer> classCounts = new HashMap<>();
// per-class boost accumulated from document scores in topDocs (each score normalized by the top score)
Map<BytesRef, Double> classBoosts = new HashMap<>();
float maxScore = topDocs.getMaxScore();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
if (storableField != null) {
BytesRef cl = new BytesRef(storableField.stringValue());
//update count
Integer count = classCounts.get(cl);
if (count != null) {
classCounts.put(cl, count + 1);
} else {
classCounts.put(cl, 1);
}
//update boost: each matching doc adds its score normalized by the best score
Double totalBoost = classBoosts.get(cl);
double singleBoost = scoreDoc.score / maxScore;
if (totalBoost != null) {
classBoosts.put(cl, totalBoost + singleBoost);
} else {
classBoosts.put(cl, singleBoost);
}
}
}
List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
int sumdoc = 0;
for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
Integer count = entry.getValue();
//the boost is normalized to be 0<b<1
Double normBoost = classBoosts.get(entry.getKey()) / count;
temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
sumdoc += count;
}
//correction
if (sumdoc < k) {
for (ClassificationResult<BytesRef> cr : temporaryList) {
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
}
} else {
returnList = temporaryList;
}
return returnList;
}
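The list built above surfaces through the public Classifier API: getClasses(String) returns one ClassificationResult per candidate class, scored as (count * normalizedBoost) / k as computed in buildListFromTopDocs. A hypothetical usage sketch follows; the KNearestFuzzyClassifier constructor arguments, the BM25 similarity, the field names and the index path are assumptions, not details taken from the snippet.

import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.classification.KNearestFuzzyClassifier;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class KnnFuzzyExample {
  public static void main(String[] args) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      KNearestFuzzyClassifier classifier = new KNearestFuzzyClassifier(
          reader, new BM25Similarity(), new StandardAnalyzer(),
          null /* no filter query */, 10 /* k */, "category", "body");
      // one result per class found among the k nearest neighbours
      List<ClassificationResult<BytesRef>> classes = classifier.getClasses("some text to classify");
      for (ClassificationResult<BytesRef> r : classes) {
        System.out.println(r.getAssignedClass().utf8ToString() + " -> " + r.getScore());
      }
    }
  }
}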
Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class TestIndexWriterOnDiskFull, method testAddIndexOnDiskFull.
// TODO: make @Nightly variant that provokes more disk
// fulls
// TODO: have test fail if on any given top
// iter there was not a single IOE hit
/*
Test: make sure when we run out of disk space or hit
random IOExceptions in any of the addIndexes(*) calls
that 1) index is not corrupt (searcher can open/search
it) and 2) transactional semantics are followed:
either all or none of the incoming documents were in
fact added.
*/
public void testAddIndexOnDiskFull() throws IOException {
// MemoryCodec, since it uses FST, is not necessarily
// "additive", ie if you add up N small FSTs, then merge
// them, the merged result can easily be larger than the
// sum because the merged FST may use array encoding for
// some arcs (which uses more space):
final String idFormat = TestUtil.getPostingsFormat("id");
final String contentFormat = TestUtil.getPostingsFormat("content");
assumeFalse("This test cannot run with Memory codec", idFormat.equals("Memory") || contentFormat.equals("Memory"));
int START_COUNT = 57;
int NUM_DIR = TEST_NIGHTLY ? 50 : 5;
int END_COUNT = START_COUNT + NUM_DIR * (TEST_NIGHTLY ? 25 : 5);
// Build up a bunch of dirs that have indexes which we
// will then merge together by calling addIndexes(*):
Directory[] dirs = new Directory[NUM_DIR];
long inputDiskUsage = 0;
for (int i = 0; i < NUM_DIR; i++) {
dirs[i] = newDirectory();
IndexWriter writer = new IndexWriter(dirs[i], newIndexWriterConfig(new MockAnalyzer(random())));
for (int j = 0; j < 25; j++) {
addDocWithIndex(writer, 25 * i + j);
}
writer.close();
String[] files = dirs[i].listAll();
for (int j = 0; j < files.length; j++) {
inputDiskUsage += dirs[i].fileLength(files[j]);
}
}
// Now, build a starting index that has START_COUNT docs. We
// will then try to addIndexes into a copy of this:
MockDirectoryWrapper startDir = newMockDirectory();
IndexWriter writer = new IndexWriter(startDir, newIndexWriterConfig(new MockAnalyzer(random())));
for (int j = 0; j < START_COUNT; j++) {
addDocWithIndex(writer, j);
}
writer.close();
// Make sure starting index seems to be working properly:
Term searchTerm = new Term("content", "aaa");
IndexReader reader = DirectoryReader.open(startDir);
assertEquals("first docFreq", 57, reader.docFreq(searchTerm));
IndexSearcher searcher = newSearcher(reader);
ScoreDoc[] hits = searcher.search(new TermQuery(searchTerm), 1000).scoreDocs;
assertEquals("first number of hits", 57, hits.length);
reader.close();
// Iterate with larger and larger amounts of free
// disk space. With little free disk space,
// addIndexes will certainly run out of space &
// fail. Verify that when this happens, index is
// not corrupt and index in fact has added no
// documents. Then, we increase disk space by 2000
// bytes each iteration. At some point there is
// enough free disk space and addIndexes should
// succeed and index should show all documents were
// added.
// String[] files = startDir.listAll();
long diskUsage = startDir.sizeInBytes();
long startDiskUsage = 0;
String[] files = startDir.listAll();
for (int i = 0; i < files.length; i++) {
startDiskUsage += startDir.fileLength(files[i]);
}
for (int iter = 0; iter < 3; iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + iter);
}
// Start with 100 bytes more than we are currently using:
long diskFree = diskUsage + TestUtil.nextInt(random(), 50, 200);
int method = iter;
boolean success = false;
boolean done = false;
String methodName;
if (0 == method) {
methodName = "addIndexes(Directory[]) + forceMerge(1)";
} else if (1 == method) {
methodName = "addIndexes(IndexReader[])";
} else {
methodName = "addIndexes(Directory[])";
}
while (!done) {
if (VERBOSE) {
System.out.println("TEST: cycle...");
}
// Make a new dir that will enforce disk usage:
MockDirectoryWrapper dir = new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(startDir));
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy(false));
writer = new IndexWriter(dir, iwc);
Exception err = null;
for (int x = 0; x < 2; x++) {
MergeScheduler ms = writer.getConfig().getMergeScheduler();
if (ms instanceof ConcurrentMergeScheduler) {
// this test intentionally produces exceptions in the threads that CMS launches; we don't want to pollute test output with these.
if (0 == x) {
((ConcurrentMergeScheduler) ms).setSuppressExceptions();
} else {
((ConcurrentMergeScheduler) ms).clearSuppressExceptions();
}
}
// Two loops: first time, limit disk space &
// throw random IOExceptions; second time, no
// disk space limit:
double rate = 0.05;
double diskRatio = ((double) diskFree) / diskUsage;
long thisDiskFree;
String testName = null;
if (0 == x) {
dir.setRandomIOExceptionRateOnOpen(random().nextDouble() * 0.01);
thisDiskFree = diskFree;
if (diskRatio >= 2.0) {
rate /= 2;
}
if (diskRatio >= 4.0) {
rate /= 2;
}
if (diskRatio >= 6.0) {
rate = 0.0;
}
if (VERBOSE) {
testName = "disk full test " + methodName + " with disk full at " + diskFree + " bytes";
}
} else {
dir.setRandomIOExceptionRateOnOpen(0.0);
thisDiskFree = 0;
rate = 0.0;
if (VERBOSE) {
testName = "disk full test " + methodName + " with unlimited disk space";
}
}
if (VERBOSE) {
System.out.println("\ncycle: " + testName);
}
dir.setTrackDiskUsage(true);
dir.setMaxSizeInBytes(thisDiskFree);
dir.setRandomIOExceptionRate(rate);
try {
if (0 == method) {
if (VERBOSE) {
System.out.println("TEST: now addIndexes count=" + dirs.length);
}
writer.addIndexes(dirs);
if (VERBOSE) {
System.out.println("TEST: now forceMerge");
}
writer.forceMerge(1);
} else if (1 == method) {
DirectoryReader[] readers = new DirectoryReader[dirs.length];
for (int i = 0; i < dirs.length; i++) {
readers[i] = DirectoryReader.open(dirs[i]);
}
try {
TestUtil.addIndexesSlowly(writer, readers);
} finally {
for (int i = 0; i < dirs.length; i++) {
readers[i].close();
}
}
} else {
writer.addIndexes(dirs);
}
success = true;
if (VERBOSE) {
System.out.println(" success!");
}
if (0 == x) {
done = true;
}
} catch (IllegalStateException | IOException e) {
success = false;
err = e;
if (VERBOSE) {
System.out.println(" hit Exception: " + e);
e.printStackTrace(System.out);
}
if (1 == x) {
e.printStackTrace(System.out);
fail(methodName + " hit IOException after disk space was freed up");
}
}
if (x == 1) {
// Make sure all threads from ConcurrentMergeScheduler are done
TestUtil.syncConcurrentMerges(writer);
} else {
dir.setRandomIOExceptionRateOnOpen(0.0);
writer.rollback();
writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy(false)));
}
if (VERBOSE) {
System.out.println(" now test readers");
}
// Finally, verify index is not corrupt, and, if
// we succeeded, we see all docs added, and if we
// failed, we see either all docs or no docs added
// (transactional semantics):
dir.setRandomIOExceptionRateOnOpen(0.0);
try {
reader = DirectoryReader.open(dir);
} catch (IOException e) {
e.printStackTrace(System.out);
fail(testName + ": exception when creating IndexReader: " + e);
}
int result = reader.docFreq(searchTerm);
if (success) {
if (result != START_COUNT) {
fail(testName + ": method did not throw exception but docFreq('aaa') is " + result + " instead of expected " + START_COUNT);
}
} else {
// On hitting exception we still may have added all docs:
if (result != START_COUNT && result != END_COUNT) {
err.printStackTrace(System.out);
fail(testName + ": method did throw exception but docFreq('aaa') is " + result + " instead of expected " + START_COUNT + " or " + END_COUNT);
}
}
searcher = newSearcher(reader);
try {
hits = searcher.search(new TermQuery(searchTerm), END_COUNT).scoreDocs;
} catch (IOException e) {
e.printStackTrace(System.out);
fail(testName + ": exception when searching: " + e);
}
int result2 = hits.length;
if (success) {
if (result2 != result) {
fail(testName + ": method did not throw exception but hits.length for search on term 'aaa' is " + result2 + " instead of expected " + result);
}
} else {
// On hitting exception we still may have added all docs:
if (result2 != result) {
err.printStackTrace(System.out);
fail(testName + ": method did throw exception but hits.length for search on term 'aaa' is " + result2 + " instead of expected " + result);
}
}
reader.close();
if (VERBOSE) {
System.out.println(" count is " + result);
}
if (done || result == END_COUNT) {
break;
}
}
if (VERBOSE) {
System.out.println(" start disk = " + startDiskUsage + "; input disk = " + inputDiskUsage + "; max used = " + dir.getMaxUsedSizeInBytes());
}
if (done) {
// Javadocs state that temp free Directory space
// required is at most 2X total input size of
// indices so let's make sure:
assertTrue("max free Directory space required exceeded 1X the total input index sizes during " + methodName + ": max temp usage = " + (dir.getMaxUsedSizeInBytes() - startDiskUsage) + " bytes vs limit=" + (2 * (startDiskUsage + inputDiskUsage)) + "; starting disk usage = " + startDiskUsage + " bytes; " + "input index disk usage = " + inputDiskUsage + " bytes", (dir.getMaxUsedSizeInBytes() - startDiskUsage) < 2 * (startDiskUsage + inputDiskUsage));
}
// Make sure we don't hit disk full during close below:
dir.setMaxSizeInBytes(0);
dir.setRandomIOExceptionRate(0.0);
dir.setRandomIOExceptionRateOnOpen(0.0);
writer.close();
dir.close();
// Try again with more free space:
diskFree += TEST_NIGHTLY ? TestUtil.nextInt(random(), 4000, 8000) : TestUtil.nextInt(random(), 40000, 80000);
}
}
startDir.close();
for (Directory dir : dirs) dir.close();
}
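Outside the test framework, the transactional guarantee this test exercises looks roughly like the following sketch: if addIndexes fails part-way (for example on a full disk), rolling back the writer leaves the target index exactly as it was. The directory paths are placeholders and this is an illustration of the pattern, not code from the test.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddIndexesExample {
  public static void main(String[] args) throws IOException {
    try (Directory target = FSDirectory.open(Paths.get("/path/to/target"));
         Directory source = FSDirectory.open(Paths.get("/path/to/source"))) {
      IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer())
          .setOpenMode(OpenMode.CREATE_OR_APPEND);
      IndexWriter writer = new IndexWriter(target, iwc);
      try {
        writer.addIndexes(source);  // either all documents from source are added, or none
        writer.commit();
        writer.close();             // keep the merged-in documents
      } catch (IOException e) {
        writer.rollback();          // e.g. disk full: target is left exactly as it was before addIndexes
      }
      try (DirectoryReader reader = DirectoryReader.open(target)) {
        System.out.println("docs in target: " + reader.numDocs());
      }
    }
  }
}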
Use of org.apache.lucene.search.ScoreDoc in project jackrabbit-oak by apache.
The class FilteredSortedSetDocValuesFacetCounts, method getTopChildren.
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
FacetResult topChildren = super.getTopChildren(topN, dim, path);
LabelAndValue[] labelAndValues = topChildren.labelValues;
for (ScoreDoc scoreDoc : docs.scoreDocs) {
labelAndValues = filterFacet(scoreDoc.doc, dim, labelAndValues);
}
int childCount = labelAndValues.length;
Number value = 0;
for (LabelAndValue lv : labelAndValues) {
value = value.longValue() + lv.value.longValue();
}
return new FacetResult(dim, path, value, labelAndValues, childCount);
}
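The loop above collapses the filtered per-label counts into a single total for the returned FacetResult. A standalone sketch of just that aggregation step, with made-up labels and counts:

import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.LabelAndValue;

public class FacetSumExample {
  public static void main(String[] args) {
    // labels and counts that survived filtering (illustrative values)
    LabelAndValue[] filtered = new LabelAndValue[] {
        new LabelAndValue("tag/a", 3),
        new LabelAndValue("tag/b", 2)
    };
    Number value = 0;
    for (LabelAndValue lv : filtered) {
      value = value.longValue() + lv.value.longValue();  // total count across remaining labels
    }
    FacetResult result = new FacetResult("tags", new String[0], value, filtered, filtered.length);
    System.out.println(result);
  }
}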
Use of org.apache.lucene.search.ScoreDoc in project jena by apache.
The class TextIndexLucene, method query$.
private List<TextHit> query$(IndexReader indexReader, Node property, String qs, String graphURI, String lang, int limit) throws ParseException, IOException {
String textField = docDef.getField(property);
String textClause;
String langClause = null;
String graphClause = null;
//for language-based search extension
if (getDocDef().getLangField() != null) {
String langField = getDocDef().getLangField();
if (lang != null) {
if (this.isMultilingual && !lang.equals("none")) {
textField = textField + "_" + lang;
}
langClause = !"none".equals(lang) ? langField + ":" + lang : "-" + langField + ":*";
}
}
if (textField != null)
textClause = textField + ":" + qs;
else
textClause = qs;
if (graphURI != null) {
String escaped = QueryParserBase.escape(graphURI);
graphClause = getDocDef().getGraphField() + ":" + escaped;
}
String queryString = textClause;
if (langClause != null)
queryString = "(" + queryString + ") AND " + langClause;
if (graphClause != null)
queryString = "(" + queryString + ") AND " + graphClause;
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
Query query = parseQuery(queryString, queryAnalyzer);
if (limit <= 0)
limit = MAX_N;
ScoreDoc[] sDocs = indexSearcher.search(query, limit).scoreDocs;
List<TextHit> results = new ArrayList<>();
for (ScoreDoc sd : sDocs) {
Document doc = indexSearcher.doc(sd.doc);
String[] values = doc.getValues(docDef.getEntityField());
Node literal = null;
String field = (property != null) ? docDef.getField(property) : docDef.getPrimaryField();
String[] lexicals = doc.getValues(field);
if (lexicals.length > 0) {
String lexical = lexicals[0];
String[] langs = doc.getValues(docDef.getLangField());
if (langs.length > 0) {
String doclang = langs[0];
if (doclang.startsWith(DATATYPE_PREFIX)) {
String datatype = doclang.substring(DATATYPE_PREFIX.length());
TypeMapper tmap = TypeMapper.getInstance();
literal = NodeFactory.createLiteral(lexical, tmap.getSafeTypeByName(datatype));
} else {
literal = NodeFactory.createLiteral(lexical, doclang);
}
} else {
literal = NodeFactory.createLiteral(lexical);
}
}
for (String v : values) {
Node n = TextQueryFuncs.stringToNode(v);
TextHit hit = new TextHit(n, sd.score, literal);
results.add(hit);
}
}
return results;
}
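To make the clause assembly above concrete, here is a standalone sketch (field names and values invented for illustration) showing how the text, language and graph clauses combine into the final Lucene query string:

public class QueryStringAssembly {
  public static void main(String[] args) {
    String textField = "label_en";                    // per-language field when isMultilingual is true
    String qs = "apache AND jena";
    String langClause = "lang:en";                    // or "-lang:*" when lang == "none"
    String graphClause = "graph:urn\\:example\\:g1";  // graph URI after QueryParserBase.escape

    String queryString = textField + ":" + qs;
    if (langClause != null)
      queryString = "(" + queryString + ") AND " + langClause;
    if (graphClause != null)
      queryString = "(" + queryString + ") AND " + graphClause;

    // -> ((label_en:apache AND jena) AND lang:en) AND graph:urn\:example\:g1
    System.out.println(queryString);
  }
}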