Use of org.apache.lucene.index.TermPositions in project greplin-lucene-utils by Cue:
class PhraseFilter, method getDocIdSet.
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
PhraseFilterMatchList[] results = new PhraseFilterMatchList[subReaders.size()];
int matchCount = 0;
int readerNumber = 0;
for (IndexReader subReader : subReaders) {
SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
for (int i = 0; i < this.terms.length; i++) {
Term t = this.terms[i];
termsOrderedByFrequency.add(new TermWithFrequency(t, subReader.docFreq(t), i));
}
PhraseFilterMatchList matches = null;
TermPositions termPositions = subReader.termPositions();
try {
for (TermWithFrequency term : termsOrderedByFrequency) {
if (term.docFreq == 0) {
break;
}
termPositions.seek(term.term);
if (matches == null) {
// If this is the first term, collect all matches that intersect
// with the provided initial document set.
Intersection intersection = this.intersectionProvider.get(reader);
matches = new PhraseFilterMatchList(term.docFreq);
while (intersection.advanceToNextIntersection(termPositions)) {
int freq = termPositions.freq();
PhraseFilterIntList list = new PhraseFilterIntList(freq);
for (int i = 0; i < freq; i++) {
list.add(termPositions.nextPosition() - term.offset);
}
matches.add(termPositions.doc(), list);
}
} else {
// Otherwise, intersect with the existing matches.
matches.intersect(termPositions, term.offset);
}
if (matches.getCount() == 0) {
break;
}
}
} finally {
termPositions.close();
}
if (matches != null) {
results[readerNumber] = matches;
matchCount += matches.getCount();
}
readerNumber++;
}
// 2^5 = 32
final int bitsPerIntPowerLogTwo = 5;
if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
FixedBitSet result = new FixedBitSet(reader.maxDoc());
int readerOffset = 0;
for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
PhraseFilterMatchList matches = results[readerIndex];
if (matches != null) {
int count = matches.getCount();
int[] docIds = matches.getDocIds();
for (int i = 0; i < count; i++) {
result.set(docIds[i] + readerOffset);
}
}
readerOffset += subReaders.get(readerIndex).maxDoc();
}
return result;
} else if (matchCount == 0) {
return DocIdSets.EMPTY;
} else {
int[] result = new int[matchCount];
int base = 0;
int readerOffset = 0;
for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
PhraseFilterMatchList matches = results[readerIndex];
if (matches != null) {
int count = matches.getCount();
int[] docIds = matches.getDocIds();
for (int i = 0; i < count; i++) {
result[base + i] = docIds[i] + readerOffset;
}
base += count;
}
readerOffset += subReaders.get(readerIndex).maxDoc();
}
return new SortedIntArrayDocIdSet(result);
}
}
Use of org.apache.lucene.index.TermPositions in project greplin-lucene-utils by Cue:
class FilteredMultiReader, method termPositions.
/**
 * Returns a {@link TermPositions} enumerator pre-positioned on the given term.
 *
 * @param term the term to seek to.
 * @return a positions enumerator already seeked to {@code term}.
 * @throws IOException if the underlying reader fails.
 */
@Override
public TermPositions termPositions(final Term term) throws IOException {
  final TermPositions positions = termPositions();
  positions.seek(term);
  return positions;
}
Use of org.apache.lucene.index.TermPositions in project zm-mailbox by Zimbra:
class LuceneViewer, method dumpTerms.
/**
 * Dumps every term in the index (in Term.compareTo() order) together with its
 * per-document postings: for each term, one line per matching document listing
 * the document number, the term frequency in that document, and each position
 * at which the term occurs.
 * <p>
 * Fix: the {@code TermEnum} and each per-term {@code TermPositions} are now
 * closed in {@code finally} blocks, so an IOException thrown mid-dump no
 * longer leaks the enumerators.
 *
 * @throws IOException if reading from the index fails.
 */
private void dumpTerms() throws IOException {
    outputBanner("Terms (in Term.compareTo() order)");
    TermEnum terms = mIndexReader.terms();
    int order = 0;
    try {
        while (terms.next()) {
            order++;
            Term term = terms.term();
            String field = term.field();
            String text = term.text();
            if (!wantThisTerm(field, text)) {
                continue;
            }
            outputLn(order + " " + field + ": " + text);
            /*
             * For each term, print the <document, frequency, <position>*>
             * tuples.
             *
             * document:  document in which the Term appears
             * frequency: number of times the Term appears in the document
             * position:  position of each appearance in the document
             *
             * e.g. for a document "one two three two four five", the tuple
             * for Term("field", "two") would look like:
             *     88, 2, <2, 4>
             * where 88 is the document number, 2 the frequency, and <2, 4>
             * the positions of each appearance.
             */
            outputLn(" document, frequency, <position>*");
            // Track docs that appear for this term so the intersection across
            // all filtered-in terms can be computed afterwards.
            Set<Integer> docNums = null;
            if (hasFilters()) {
                docNums = new HashSet<Integer>();
            }
            TermPositions termPos = mIndexReader.termPositions(term);
            try {
                while (termPos.next()) {
                    int docNum = termPos.doc();
                    int freq = termPos.freq();
                    if (docNums != null) {
                        docNums.add(docNum);
                    }
                    output(" " + docNum + ", " + freq + ", <");
                    boolean first = true;
                    for (int f = 0; f < freq; f++) {
                        int positionInDoc = termPos.nextPosition();
                        if (!first) {
                            output(" ");
                        } else {
                            first = false;
                        }
                        output(positionInDoc + "");
                    }
                    outputLn(">");
                }
            } finally {
                // Release the postings enumerator even if output fails.
                termPos.close();
            }
            if (docNums != null) {
                computeDocsIntersection(docNums);
            }
            outputLn();
            if (order % 1000 == 0) {
                mConsole.debug("Dumped " + order + " terms");
            }
        }
    } finally {
        // Release the term enumerator even on early exit via exception.
        terms.close();
    }
}
Use of org.apache.lucene.index.TermPositions in project jackrabbit by apache:
class SharedFieldCache, method getValueIndex.
/**
 * Creates a <code>ValueIndex</code> for a <code>field</code> and a term
 * <code>prefix</code>. The term prefix acts as the property name for the
 * shared <code>field</code>.
 * <p>
 * This method is an adapted version of: <code>FieldCacheImpl.getStringIndex()</code>
 * <p>
 * Results are cached per (reader, field, prefix); a cache hit returns the
 * stored index without re-reading the terms.
 *
 * @param reader the <code>IndexReader</code>.
 * @param field name of the shared field.
 * @param prefix the property name, will be used as term prefix.
 * @return a ValueIndex that contains the field values and order
 *         information.
 * @throws IOException if an error occurs while reading from the index.
 */
public ValueIndex getValueIndex(IndexReader reader, String field, String prefix) throws IOException {
    // Unwrap to the base reader so the cache key is stable across wrappers.
    if (reader instanceof ReadOnlyIndexReader) {
        reader = ((ReadOnlyIndexReader) reader).getBase();
    }
    // Intern the field name: the identity comparison (term.field() != field)
    // below relies on Lucene interning its term field names.
    field = field.intern();
    ValueIndex ret = lookup(reader, field, prefix);
    if (ret == null) {
        final int maxDocs = reader.maxDoc();
        // One value slot per document; starts out as plain Comparables and is
        // promoted to ComparableArray entries if a doc gets multiple values.
        Comparable<?>[] retArray = new Comparable<?>[maxDocs];
        // doc -> term position, kept only while the simple representation is
        // in use, so a later promotion can reconstruct the positions.
        Map<Integer, Integer> positions = new HashMap<Integer, Integer>();
        boolean usingSimpleComparable = true;
        int setValues = 0;
        if (maxDocs > 0) {
            // V3+ indexes store the property type in a posting payload; older
            // indexes have no payloads, so plain TermDocs suffice.
            IndexFormatVersion version = IndexFormatVersion.getVersion(reader);
            boolean hasPayloads = version.isAtLeast(IndexFormatVersion.V3);
            TermDocs termDocs;
            byte[] payload = null;
            int type;
            if (hasPayloads) {
                termDocs = reader.termPositions();
                payload = new byte[1];
            } else {
                termDocs = reader.termDocs();
            }
            // Start enumerating at the first term >= (field, prefix).
            TermEnum termEnum = reader.terms(new Term(field, prefix));
            try {
                if (termEnum.term() == null) {
                    throw new RuntimeException("no terms in field " + field);
                }
                do {
                    Term term = termEnum.term();
                    // Stop once we leave the field or the prefix range.
                    // Identity compare is safe: both strings are interned.
                    if (term.field() != field || !term.text().startsWith(prefix)) {
                        break;
                    }
                    final String value = termValueAsString(term, prefix);
                    termDocs.seek(term);
                    while (termDocs.next()) {
                        int termPosition = 0;
                        type = PropertyType.UNDEFINED;
                        if (hasPayloads) {
                            // termDocs was created via termPositions() above,
                            // so this downcast is safe in the payload branch.
                            TermPositions termPos = (TermPositions) termDocs;
                            termPosition = termPos.nextPosition();
                            if (termPos.isPayloadAvailable()) {
                                // Payload encodes the JCR property type.
                                payload = termPos.getPayload(payload, 0);
                                type = PropertyMetaData.fromByteArray(payload).getPropertyType();
                            }
                        }
                        setValues++;
                        Comparable<?> v = getValue(value, type);
                        int doc = termDocs.doc();
                        Comparable<?> ca = retArray[doc];
                        if (ca == null) {
                            if (usingSimpleComparable) {
                                // put simple value on the queue
                                positions.put(doc, termPosition);
                                retArray[doc] = v;
                            } else {
                                retArray[doc] = new ComparableArray(v, termPosition);
                            }
                        } else {
                            if (ca instanceof ComparableArray) {
                                ((ComparableArray) ca).insert(v, termPosition);
                            } else {
                                // Comparable to ComparableArray: first doc seen
                                // with multiple values — promote every stored
                                // simple value using the remembered positions,
                                // then drop the position map for good.
                                for (int pos : positions.keySet()) {
                                    retArray[pos] = new ComparableArray(retArray[pos], positions.get(pos));
                                }
                                positions = null;
                                usingSimpleComparable = false;
                                ComparableArray caNew = (ComparableArray) retArray[doc];
                                retArray[doc] = caNew.insert(v, termPosition);
                            }
                        }
                    }
                } while (termEnum.next());
            } finally {
                termDocs.close();
                termEnum.close();
            }
        }
        ValueIndex value = new ValueIndex(retArray, setValues);
        store(reader, field, prefix, value);
        return value;
    }
    return ret;
}
Aggregations