Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
The class FloatPayloadValueSource, method getValues:
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
  Fields fields = readerContext.reader().fields();
  final Terms terms = fields.terms(indexedField);
  FunctionValues defaultValues = defaultValueSource.getValues(context, readerContext);
  // copied the bulk of this from TFValueSource - TODO: this is a very repeated pattern - base-class this advance logic stuff?
  return new FloatDocValues(this) {
    PostingsEnum docs;
    int atDoc;
    int lastDocRequested = -1;

    {
      reset();
    }

    public void reset() throws IOException {
      if (terms != null) {
        final TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(indexedBytes)) {
          docs = termsEnum.postings(null, PostingsEnum.ALL);
        } else {
          docs = null;
        }
      } else {
        docs = null;
      }
      if (docs == null) {
        // dummy PostingsEnum so floatVal() can work
        // when would this be called? if field/val did not match? this is called for every doc? create once and cache?
        docs = new PostingsEnum() {
          @Override
          public int freq() {
            return 0;
          }

          @Override
          public int nextPosition() throws IOException {
            return -1;
          }

          @Override
          public int startOffset() throws IOException {
            return -1;
          }

          @Override
          public int endOffset() throws IOException {
            return -1;
          }

          @Override
          public BytesRef getPayload() throws IOException {
            return null;
          }

          @Override
          public int docID() {
            return DocIdSetIterator.NO_MORE_DOCS;
          }

          @Override
          public int nextDoc() {
            return DocIdSetIterator.NO_MORE_DOCS;
          }

          @Override
          public int advance(int target) {
            return DocIdSetIterator.NO_MORE_DOCS;
          }

          @Override
          public long cost() {
            return 0;
          }
        };
      }
      atDoc = -1;
    }

    @Override
    public float floatVal(int doc) {
      try {
        if (doc < lastDocRequested) {
          // out-of-order access... reset
          reset();
        }
        lastDocRequested = doc;
        if (atDoc < doc) {
          atDoc = docs.advance(doc);
        }
        if (atDoc > doc) {
          // postings are exhausted, or the next doc containing this term is after the requested doc
          return defaultValues.floatVal(doc);
        }
        // a match!
        int freq = docs.freq();
        int numPayloadsSeen = 0;
        float currentScore = 0;
        for (int i = 0; i < freq; i++) {
          docs.nextPosition();
          BytesRef payload = docs.getPayload();
          if (payload != null) {
            float payloadVal = decoder.decode(atDoc, docs.startOffset(), docs.endOffset(), payload);
            // payloadFunction == null represents "first"
            if (payloadFunction == null) return payloadVal;
            currentScore = payloadFunction.currentScore(doc, indexedField, docs.startOffset(), docs.endOffset(), numPayloadsSeen, currentScore, payloadVal);
            numPayloadsSeen++;
          }
        }
        return (numPayloadsSeen > 0) ? payloadFunction.docScore(doc, indexedField, numPayloadsSeen, currentScore) : defaultValues.floatVal(doc);
      } catch (IOException e) {
        throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
      }
    }
  };
}
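The advance logic above reduces to a small core: seek the TermsEnum to one term, obtain a PostingsEnum, and advance it to the requested document. A minimal sketch of just that pattern, outside the FunctionValues machinery (the helper name freqForDoc is illustrative, and a Lucene version with LeafReader.terms(String) is assumed, as in the MemoryIndex tests further down):

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Returns the term frequency of `term` in `field` for `doc`, or 0 if the doc lacks the term.
static int freqForDoc(LeafReader reader, String field, BytesRef term, int doc) throws IOException {
  Terms terms = reader.terms(field);
  if (terms == null) {
    return 0; // field has no postings in this segment
  }
  TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(term)) {
    return 0; // term does not occur in this segment
  }
  PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
  int advanced = postings.advance(doc);
  // advance() may land past `doc` (or run off the end) when the doc has no match
  return advanced == doc ? postings.freq() : 0;
}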
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
The class FieldOffsetStrategy, method createAutomataOffsetsFromTerms:
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
  List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
  for (int i = 0; i < automata.length; i++) {
    automataPostings.add(new ArrayList<>());
  }
  TermsEnum termsEnum = termsIndex.iterator();
  BytesRef term;
  CharsRefBuilder refBuilder = new CharsRefBuilder();
  while ((term = termsEnum.next()) != null) {
    for (int i = 0; i < automata.length; i++) {
      CharacterRunAutomaton automaton = automata[i];
      refBuilder.copyUTF8Bytes(term);
      if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (doc == postings.advance(doc)) {
          automataPostings.get(i).add(postings);
        }
      }
    }
  }
  // will be at most this long
  List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length);
  for (int i = 0; i < automata.length; i++) {
    CharacterRunAutomaton automaton = automata[i];
    List<PostingsEnum> postingsEnums = automataPostings.get(i);
    int size = postingsEnums.size();
    if (size > 0) {
      // only add if we have offsets
      BytesRef wildcardTerm = new BytesRef(automaton.toString());
      if (size == 1) {
        // don't wrap in a composite if there's only one OffsetsEnum
        offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
      } else {
        offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
      }
    }
  }
  return offsetsEnums;
}
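The term-by-automaton matching in the first loop can be exercised on its own. A hedged sketch that collects the terms of a field accepted by a wildcard-derived CharacterRunAutomaton (the helper name is illustrative, and the "f" field name passed to Term is irrelevant to the automaton):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

// Collects the terms accepted by a wildcard pattern such as "qui*".
static List<String> matchingTerms(Terms terms, String wildcard) throws IOException {
  CharacterRunAutomaton automaton =
      new CharacterRunAutomaton(WildcardQuery.toAutomaton(new Term("f", wildcard)));
  List<String> matches = new ArrayList<>();
  TermsEnum termsEnum = terms.iterator();
  CharsRefBuilder refBuilder = new CharsRefBuilder();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    refBuilder.copyUTF8Bytes(term); // terms are UTF-8 bytes; the automaton runs over chars
    if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
      matches.add(term.utf8ToString());
    }
  }
  return matches;
}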
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
The class TestJoinUtil, method createContext:
private IndexIterationContext createContext(int nDocs, boolean multipleValuesPerDocument, boolean globalOrdinalJoin) throws IOException {
  if (globalOrdinalJoin) {
    assertFalse("ordinal join doesn't support multiple join values per document", multipleValuesPerDocument);
  }
  Directory dir = newDirectory();
  final Random random = random();
  RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(new MockAnalyzer(random, MockTokenizer.KEYWORD, false)));
  IndexIterationContext context = new IndexIterationContext();
  int numRandomValues = nDocs / RandomNumbers.randomIntBetween(random, 1, 4);
  context.randomUniqueValues = new String[numRandomValues];
  Set<String> trackSet = new HashSet<>();
  context.randomFrom = new boolean[numRandomValues];
  for (int i = 0; i < numRandomValues; i++) {
    String uniqueRandomValue;
    // Generate unique, non-empty values. The trick is to generate values which will be
    // ordered similarly for strings and for ints/longs; positive numbers make that easier.
    do {
      final int nextInt = random.nextInt(Integer.MAX_VALUE);
      uniqueRandomValue = String.format(Locale.ROOT, "%08x", nextInt);
      assert nextInt == Integer.parseUnsignedInt(uniqueRandomValue, 16);
    } while ("".equals(uniqueRandomValue) || trackSet.contains(uniqueRandomValue));
    trackSet.add(uniqueRandomValue);
    context.randomFrom[i] = random.nextBoolean();
    context.randomUniqueValues[i] = uniqueRandomValue;
  }
  List<String> randomUniqueValuesReplica = new ArrayList<>(Arrays.asList(context.randomUniqueValues));
  RandomDoc[] docs = new RandomDoc[nDocs];
  for (int i = 0; i < nDocs; i++) {
    String id = Integer.toString(i);
    int randomI = random.nextInt(context.randomUniqueValues.length);
    String value = context.randomUniqueValues[randomI];
    Document document = new Document();
    document.add(newTextField(random, "id", id, Field.Store.YES));
    document.add(newTextField(random, "value", value, Field.Store.NO));
    boolean from = context.randomFrom[randomI];
    int numberOfLinkValues = multipleValuesPerDocument ? Math.min(2 + random.nextInt(10), context.randomUniqueValues.length) : 1;
    docs[i] = new RandomDoc(id, numberOfLinkValues, value, from);
    if (globalOrdinalJoin) {
      document.add(newStringField("type", from ? "from" : "to", Field.Store.NO));
    }
    final List<String> subValues;
    {
      int start = randomUniqueValuesReplica.size() == numberOfLinkValues ? 0 : random.nextInt(randomUniqueValuesReplica.size() - numberOfLinkValues);
      subValues = randomUniqueValuesReplica.subList(start, start + numberOfLinkValues);
      Collections.shuffle(subValues, random);
    }
    for (String linkValue : subValues) {
      assert !docs[i].linkValues.contains(linkValue);
      docs[i].linkValues.add(linkValue);
      if (from) {
        if (!context.fromDocuments.containsKey(linkValue)) {
          context.fromDocuments.put(linkValue, new ArrayList<>());
        }
        if (!context.randomValueFromDocs.containsKey(value)) {
          context.randomValueFromDocs.put(value, new ArrayList<>());
        }
        context.fromDocuments.get(linkValue).add(docs[i]);
        context.randomValueFromDocs.get(value).add(docs[i]);
        addLinkFields(random, document, "from", linkValue, multipleValuesPerDocument, globalOrdinalJoin);
      } else {
        if (!context.toDocuments.containsKey(linkValue)) {
          context.toDocuments.put(linkValue, new ArrayList<>());
        }
        if (!context.randomValueToDocs.containsKey(value)) {
          context.randomValueToDocs.put(value, new ArrayList<>());
        }
        context.toDocuments.get(linkValue).add(docs[i]);
        context.randomValueToDocs.get(value).add(docs[i]);
        addLinkFields(random, document, "to", linkValue, multipleValuesPerDocument, globalOrdinalJoin);
      }
    }
    w.addDocument(document);
    if (random.nextInt(10) == 4) {
      w.commit();
    }
    if (VERBOSE) {
      System.out.println("Added document[" + docs[i].id + "]: " + document);
    }
  }
  if (random.nextBoolean()) {
    w.forceMerge(1);
  }
  w.close();
  // Pre-compute all possible hits for all unique random values, and on top of that the
  // possible score for each ScoreMode.
  DirectoryReader topLevelReader = DirectoryReader.open(dir);
  IndexSearcher searcher = newSearcher(topLevelReader);
  for (int i = 0; i < context.randomUniqueValues.length; i++) {
    String uniqueRandomValue = context.randomUniqueValues[i];
    final String fromField;
    final String toField;
    final Map<String, Map<Integer, JoinScore>> queryVals;
    if (context.randomFrom[i]) {
      fromField = "from";
      toField = "to";
      queryVals = context.fromHitsToJoinScore;
    } else {
      fromField = "to";
      toField = "from";
      queryVals = context.toHitsToJoinScore;
    }
    final Map<BytesRef, JoinScore> joinValueToJoinScores = new HashMap<>();
    if (multipleValuesPerDocument) {
      searcher.search(new TermQuery(new Term("value", uniqueRandomValue)), new SimpleCollector() {
        private Scorer scorer;
        private SortedSetDocValues docTermOrds;

        @Override
        public void collect(int doc) throws IOException {
          if (doc > docTermOrds.docID()) {
            docTermOrds.advance(doc);
          }
          if (doc == docTermOrds.docID()) {
            long ord;
            while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
              final BytesRef joinValue = docTermOrds.lookupOrd(ord);
              JoinScore joinScore = joinValueToJoinScores.get(joinValue);
              if (joinScore == null) {
                joinValueToJoinScores.put(BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
              }
              joinScore.addScore(scorer.score());
            }
          }
        }

        @Override
        protected void doSetNextReader(LeafReaderContext context) throws IOException {
          docTermOrds = DocValues.getSortedSet(context.reader(), fromField);
        }

        @Override
        public void setScorer(Scorer scorer) {
          this.scorer = scorer;
        }

        @Override
        public boolean needsScores() {
          return true;
        }
      });
    } else {
      searcher.search(new TermQuery(new Term("value", uniqueRandomValue)), new SimpleCollector() {
        private Scorer scorer;
        private BinaryDocValues terms;

        @Override
        public void collect(int doc) throws IOException {
          if (doc > terms.docID()) {
            terms.advance(doc);
          }
          final BytesRef joinValue;
          if (doc == terms.docID()) {
            joinValue = terms.binaryValue();
          } else {
            // missing
            return;
          }
          JoinScore joinScore = joinValueToJoinScores.get(joinValue);
          if (joinScore == null) {
            joinValueToJoinScores.put(BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
          }
          if (VERBOSE) {
            System.out.println("expected val=" + joinValue.utf8ToString() + " expected score=" + scorer.score());
          }
          joinScore.addScore(scorer.score());
        }

        @Override
        protected void doSetNextReader(LeafReaderContext context) throws IOException {
          terms = DocValues.getBinary(context.reader(), fromField);
        }

        @Override
        public void setScorer(Scorer scorer) {
          this.scorer = scorer;
        }

        @Override
        public boolean needsScores() {
          return true;
        }
      });
    }
    final Map<Integer, JoinScore> docToJoinScore = new HashMap<>();
    if (multipleValuesPerDocument) {
      Terms terms = MultiFields.getTerms(topLevelReader, toField);
      if (terms != null) {
        PostingsEnum postingsEnum = null;
        SortedSet<BytesRef> joinValues = new TreeSet<>();
        joinValues.addAll(joinValueToJoinScores.keySet());
        for (BytesRef joinValue : joinValues) {
          TermsEnum termsEnum = terms.iterator();
          if (termsEnum.seekExact(joinValue)) {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            JoinScore joinScore = joinValueToJoinScores.get(joinValue);
            for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
              // Only keep the first join score per doc. Something to keep in mind for many-to-many relations.
              if (!docToJoinScore.containsKey(doc)) {
                docToJoinScore.put(doc, joinScore);
              }
            }
          }
        }
      }
    } else {
      searcher.search(new MatchAllDocsQuery(), new SimpleCollector() {
        private BinaryDocValues terms;
        private int docBase;

        @Override
        public void collect(int doc) throws IOException {
          if (doc > terms.docID()) {
            terms.advance(doc);
          }
          final BytesRef joinValue;
          if (doc == terms.docID()) {
            joinValue = terms.binaryValue();
          } else {
            // missing
            joinValue = new BytesRef(BytesRef.EMPTY_BYTES);
          }
          JoinScore joinScore = joinValueToJoinScores.get(joinValue);
          if (joinScore == null) {
            return;
          }
          docToJoinScore.put(docBase + doc, joinScore);
        }

        @Override
        protected void doSetNextReader(LeafReaderContext context) throws IOException {
          terms = DocValues.getBinary(context.reader(), toField);
          docBase = context.docBase;
        }

        @Override
        public void setScorer(Scorer scorer) {
        }

        @Override
        public boolean needsScores() {
          return false;
        }
      });
    }
    queryVals.put(uniqueRandomValue, docToJoinScore);
  }
  if (globalOrdinalJoin) {
    SortedDocValues[] values = new SortedDocValues[topLevelReader.leaves().size()];
    for (LeafReaderContext leafContext : topLevelReader.leaves()) {
      values[leafContext.ord] = DocValues.getSorted(leafContext.reader(), "join_field");
    }
    context.ordinalMap = MultiDocValues.OrdinalMap.build(null, values, PackedInts.DEFAULT);
  }
  context.searcher = searcher;
  context.dir = dir;
  return context;
}
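The TermsEnum-specific part of this helper is the block guarded by multipleValuesPerDocument: seekExact per join value, reuse of the PostingsEnum across postings() calls, and a nextDoc() loop over matching documents. A condensed sketch of just that pattern (the helper name docsForTerms is illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

// Maps each requested term to the docIDs containing it. The PostingsEnum is passed back
// into postings() so the enum can be reused across terms where the codec allows it.
static Map<BytesRef, List<Integer>> docsForTerms(Terms terms, Iterable<BytesRef> wanted) throws IOException {
  Map<BytesRef, List<Integer>> result = new HashMap<>();
  TermsEnum termsEnum = terms.iterator();
  PostingsEnum postingsEnum = null;
  for (BytesRef term : wanted) {
    if (termsEnum.seekExact(term)) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      List<Integer> docs = new ArrayList<>();
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        docs.add(doc);
      }
      result.put(BytesRef.deepCopyOf(term), docs); // defensive copy, mirroring the test above
    }
  }
  return result;
}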
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
The class TestMemoryIndex, method testDocValuesDoNotAffectBoostPositionsOrOffset:
public void testDocValuesDoNotAffectBoostPositionsOrOffset() throws Exception {
  Document doc = new Document();
  doc.add(new BinaryDocValuesField("text", new BytesRef("quick brown fox")));
  doc.add(new TextField("text", "quick brown fox", Field.Store.NO));
  MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer, true, true);
  LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
  TermsEnum tenum = leafReader.terms("text").iterator();

  assertEquals("brown", tenum.next().utf8ToString());
  PostingsEnum penum = tenum.postings(null, PostingsEnum.OFFSETS);
  assertEquals(0, penum.nextDoc());
  assertEquals(1, penum.freq());
  assertEquals(1, penum.nextPosition());
  assertEquals(6, penum.startOffset());
  assertEquals(11, penum.endOffset());

  assertEquals("fox", tenum.next().utf8ToString());
  penum = tenum.postings(penum, PostingsEnum.OFFSETS);
  assertEquals(0, penum.nextDoc());
  assertEquals(1, penum.freq());
  assertEquals(2, penum.nextPosition());
  assertEquals(12, penum.startOffset());
  assertEquals(15, penum.endOffset());

  assertEquals("quick", tenum.next().utf8ToString());
  penum = tenum.postings(penum, PostingsEnum.OFFSETS);
  assertEquals(0, penum.nextDoc());
  assertEquals(1, penum.freq());
  assertEquals(0, penum.nextPosition());
  assertEquals(0, penum.startOffset());
  assertEquals(5, penum.endOffset());

  BinaryDocValues binaryDocValues = leafReader.getBinaryDocValues("text");
  assertEquals(0, binaryDocValues.nextDoc());
  assertEquals("quick brown fox", binaryDocValues.binaryValue().utf8ToString());
}
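The assertions above visit terms in their sorted order (brown, fox, quick). A hedged sketch of the same traversal written as a generic dump loop over all terms and positions; the helper name dumpOffsets is illustrative, and StandardAnalyzer is an assumption standing in for the test's analyzer:

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

static void dumpOffsets() throws IOException {
  MemoryIndex mi = new MemoryIndex(true, true); // store offsets and payloads
  mi.addField("text", "quick brown fox", new StandardAnalyzer());
  LeafReader reader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
  TermsEnum tenum = reader.terms("text").iterator();
  BytesRef term;
  while ((term = tenum.next()) != null) { // terms come back in sorted order
    PostingsEnum penum = tenum.postings(null, PostingsEnum.OFFSETS);
    while (penum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      for (int i = 0; i < penum.freq(); i++) {
        int pos = penum.nextPosition(); // must be called freq() times per doc
        System.out.println(term.utf8ToString() + " pos=" + pos
            + " offsets=[" + penum.startOffset() + "," + penum.endOffset() + ")");
      }
    }
  }
}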
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
The class TestMemoryIndex, method testPointValuesDoNotAffectPositionsOrOffset:
public void testPointValuesDoNotAffectPositionsOrOffset() throws Exception {
  MemoryIndex mi = new MemoryIndex(true, true);
  mi.addField(new TextField("text", "quick brown fox", Field.Store.NO), analyzer);
  mi.addField(new BinaryPoint("text", "quick".getBytes(StandardCharsets.UTF_8)), analyzer);
  mi.addField(new BinaryPoint("text", "brown".getBytes(StandardCharsets.UTF_8)), analyzer);
  LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
  TermsEnum tenum = leafReader.terms("text").iterator();

  assertEquals("brown", tenum.next().utf8ToString());
  PostingsEnum penum = tenum.postings(null, PostingsEnum.OFFSETS);
  assertEquals(0, penum.nextDoc());
  assertEquals(1, penum.freq());
  assertEquals(1, penum.nextPosition());
  assertEquals(6, penum.startOffset());
  assertEquals(11, penum.endOffset());

  assertEquals("fox", tenum.next().utf8ToString());
  penum = tenum.postings(penum, PostingsEnum.OFFSETS);
  assertEquals(0, penum.nextDoc());
  assertEquals(1, penum.freq());
  assertEquals(2, penum.nextPosition());
  assertEquals(12, penum.startOffset());
  assertEquals(15, penum.endOffset());

  assertEquals("quick", tenum.next().utf8ToString());
  penum = tenum.postings(penum, PostingsEnum.OFFSETS);
  assertEquals(0, penum.nextDoc());
  assertEquals(1, penum.freq());
  assertEquals(0, penum.nextPosition());
  assertEquals(0, penum.startOffset());
  assertEquals(5, penum.endOffset());

  IndexSearcher indexSearcher = mi.createSearcher();
  assertEquals(1, indexSearcher.count(BinaryPoint.newExactQuery("text", "quick".getBytes(StandardCharsets.UTF_8))));
  assertEquals(1, indexSearcher.count(BinaryPoint.newExactQuery("text", "brown".getBytes(StandardCharsets.UTF_8))));
  assertEquals(0, indexSearcher.count(BinaryPoint.newExactQuery("text", "jumps".getBytes(StandardCharsets.UTF_8))));
}
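For comparison, the same coexistence of postings and point values under one field name can be checked against an ordinary on-disk index. A hedged sketch using the era's RAMDirectory; the field contents mirror the test, while the method name and StandardAnalyzer are assumptions:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;

static void pointsAndPostingsCoexist() throws IOException {
  RAMDirectory dir = new RAMDirectory();
  try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
    Document doc = new Document();
    doc.add(new TextField("text", "quick brown fox", Field.Store.NO));
    doc.add(new BinaryPoint("text", "quick".getBytes(StandardCharsets.UTF_8)));
    doc.add(new BinaryPoint("text", "brown".getBytes(StandardCharsets.UTF_8)));
    w.addDocument(doc);
  }
  try (DirectoryReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    // The points structure answers the exact query; the inverted index keeps its postings.
    System.out.println(searcher.count(BinaryPoint.newExactQuery("text", "quick".getBytes(StandardCharsets.UTF_8)))); // expected: 1
  }
}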