use of org.apache.lucene.index.IndexableField in project lucene-solr by apache.
the class KNearestFuzzyClassifier method buildListFromTopDocs.
/**
* build a list of classification results from search results
*
* @param topDocs the search results as a {@link TopDocs} object
* @return a {@link List} of {@link ClassificationResult}, one for each existing class
* @throws IOException if it's not possible to get the stored value of class field
*/
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
Map<BytesRef, Integer> classCounts = new HashMap<>();
// this is a boost based on class ranking positions in topDocs
Map<BytesRef, Double> classBoosts = new HashMap<>();
float maxScore = topDocs.getMaxScore();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
if (storableField != null) {
BytesRef cl = new BytesRef(storableField.stringValue());
//update count
Integer count = classCounts.get(cl);
if (count != null) {
classCounts.put(cl, count + 1);
} else {
classCounts.put(cl, 1);
}
//update boost, the boost is based on the best score
Double totalBoost = classBoosts.get(cl);
double singleBoost = scoreDoc.score / maxScore;
if (totalBoost != null) {
classBoosts.put(cl, totalBoost + singleBoost);
} else {
classBoosts.put(cl, singleBoost);
}
}
}
List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
int sumdoc = 0;
for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
Integer count = entry.getValue();
//the boost is normalized to be 0<b<1
Double normBoost = classBoosts.get(entry.getKey()) / count;
temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
sumdoc += count;
}
//correction
if (sumdoc < k) {
for (ClassificationResult<BytesRef> cr : temporaryList) {
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
}
} else {
returnList = temporaryList;
}
return returnList;
}
use of org.apache.lucene.index.IndexableField in project jackrabbit-oak by apache.
the class LuceneIndex method getExcerpt.
private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException {
StringBuilder excerpt = new StringBuilder();
for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
String name = field.name();
// only full text or analyzed fields
if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
String text = field.stringValue();
TokenStream tokenStream = analyzer.tokenStream(name, text);
try {
TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 2);
if (textFragments != null && textFragments.length > 0) {
for (TextFragment fragment : textFragments) {
if (excerpt.length() > 0) {
excerpt.append("...");
}
excerpt.append(fragment.toString());
}
break;
}
} catch (InvalidTokenOffsetsException e) {
LOG.error("higlighting failed", e);
}
}
}
return excerpt.toString();
}
use of org.apache.lucene.index.IndexableField in project lucene-solr by apache.
the class TestLucene54DocValuesFormat method doTestSparseDocValuesVsStoredFields.
private void doTestSparseDocValuesVsStoredFields() throws Exception {
final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
for (int i = 0; i < values.length; ++i) {
values[i] = random().nextLong();
}
Directory dir = newFSDirectory(createTempDir());
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMergeScheduler(new SerialMergeScheduler());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
// sparse compression is only enabled if less than 1% of docs have a value
final int avgGap = 100;
final int numDocs = atLeast(200);
for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
writer.addDocument(new Document());
}
final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
// single-valued
long docValue = values[random().nextInt(values.length)];
doc.add(new NumericDocValuesField("numeric", docValue));
doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
doc.add(new StoredField("value", docValue));
// multi-valued
final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
for (int j = 0; j < numValues; ++j) {
docValue = values[random().nextInt(values.length)];
doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
doc.add(new StoredField("values", docValue));
}
writer.addDocument(doc);
// add a gap
for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
writer.addDocument(new Document());
}
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
final IndexReader indexReader = writer.getReader();
TestUtil.checkReader(indexReader);
writer.close();
for (LeafReaderContext context : indexReader.leaves()) {
final LeafReader reader = context.reader();
final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
for (int i = 0; i < reader.maxDoc(); ++i) {
final Document doc = reader.document(i);
final IndexableField valueField = doc.getField("value");
final Long value = valueField == null ? null : valueField.numericValue().longValue();
if (value == null) {
assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
} else {
assertEquals(i, numeric.nextDoc());
assertEquals(i, binary.nextDoc());
assertEquals(i, sorted.nextDoc());
assertEquals(value.longValue(), numeric.longValue());
assertTrue(sorted.ordValue() >= 0);
assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
}
final IndexableField[] valuesFields = doc.getFields("values");
if (valuesFields.length == 0) {
assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
} else {
final Set<Long> valueSet = new HashSet<>();
for (IndexableField sf : valuesFields) {
valueSet.add(sf.numericValue().longValue());
}
assertEquals(i, sortedNumeric.nextDoc());
assertEquals(valuesFields.length, sortedNumeric.docValueCount());
for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
assertTrue(valueSet.contains(sortedNumeric.nextValue()));
}
assertEquals(i, sortedSet.nextDoc());
int sortedSetCount = 0;
while (true) {
long ord = sortedSet.nextOrd();
if (ord == SortedSetDocValues.NO_MORE_ORDS) {
break;
}
assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
sortedSetCount++;
}
assertEquals(valueSet.size(), sortedSetCount);
}
}
}
indexReader.close();
dir.close();
}
use of org.apache.lucene.index.IndexableField in project lucene-solr by apache.
the class TestLucene70DocValuesFormat method doTestSparseDocValuesVsStoredFields.
private void doTestSparseDocValuesVsStoredFields() throws Exception {
final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
for (int i = 0; i < values.length; ++i) {
values[i] = random().nextLong();
}
Directory dir = newFSDirectory(createTempDir());
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMergeScheduler(new SerialMergeScheduler());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
// sparse compression is only enabled if less than 1% of docs have a value
final int avgGap = 100;
final int numDocs = atLeast(200);
for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
writer.addDocument(new Document());
}
final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
// single-valued
long docValue = values[random().nextInt(values.length)];
doc.add(new NumericDocValuesField("numeric", docValue));
doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
doc.add(new StoredField("value", docValue));
// multi-valued
final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
for (int j = 0; j < numValues; ++j) {
docValue = values[random().nextInt(values.length)];
doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
doc.add(new StoredField("values", docValue));
}
writer.addDocument(doc);
// add a gap
for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
writer.addDocument(new Document());
}
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
final IndexReader indexReader = writer.getReader();
writer.close();
for (LeafReaderContext context : indexReader.leaves()) {
final LeafReader reader = context.reader();
final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
for (int i = 0; i < reader.maxDoc(); ++i) {
final Document doc = reader.document(i);
final IndexableField valueField = doc.getField("value");
final Long value = valueField == null ? null : valueField.numericValue().longValue();
if (value == null) {
assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
} else {
assertEquals(i, numeric.nextDoc());
assertEquals(i, binary.nextDoc());
assertEquals(i, sorted.nextDoc());
assertEquals(value.longValue(), numeric.longValue());
assertTrue(sorted.ordValue() >= 0);
assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
}
final IndexableField[] valuesFields = doc.getFields("values");
if (valuesFields.length == 0) {
assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
} else {
final Set<Long> valueSet = new HashSet<>();
for (IndexableField sf : valuesFields) {
valueSet.add(sf.numericValue().longValue());
}
assertEquals(i, sortedNumeric.nextDoc());
assertEquals(valuesFields.length, sortedNumeric.docValueCount());
for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
assertTrue(valueSet.contains(sortedNumeric.nextValue()));
}
assertEquals(i, sortedSet.nextDoc());
int sortedSetCount = 0;
while (true) {
long ord = sortedSet.nextOrd();
if (ord == SortedSetDocValues.NO_MORE_ORDS) {
break;
}
assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
sortedSetCount++;
}
assertEquals(valueSet.size(), sortedSetCount);
}
}
}
indexReader.close();
dir.close();
}
use of org.apache.lucene.index.IndexableField in project lucene-solr by apache.
the class TestBlockPostingsFormat2 method testTTFBlockSizeMultiple.
/** tests terms with ttf % blocksize = 0 */
public void testTTFBlockSizeMultiple() throws Exception {
Document doc = newDocument();
for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE / 2; i++) {
for (IndexableField f : doc.getFields()) {
String proto = (f.name() + " " + f.name() + " " + f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2");
StringBuilder val = new StringBuilder();
for (int j = 0; j < 16; j++) {
val.append(proto);
val.append(" ");
}
((Field) f).setStringValue(val.toString());
}
iw.addDocument(doc);
}
}
Aggregations