Example usage of org.apache.lucene.index.PostingsEnum in the Apache lucene-solr project: the testChangeGaps method of the TestMockAnalyzer class.
/**
 * LUCENE-5324: verifies that a DelegatingAnalyzerWrapper can override the
 * position-increment and offset gaps of the analyzer it wraps, and that those
 * gaps are applied between the values of a multi-valued field.
 */
public void testChangeGaps() throws Exception {
    final int posGap = random().nextInt(1000);
    final int offGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer wrapper = new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {

        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
            return posGap;
        }

        @Override
        public int getOffsetGap(String fieldName) {
            return offGap;
        }
    };
    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), wrapper);
    // Index one document holding two instances of field "f"; term vectors
    // with positions and offsets let us observe the gaps afterwards.
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectors(true);
    final Document doc = new Document();
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlyLeafReader(writer.getReader());
    final TermsEnum te = reader.getTermVectors(0).terms("f").iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    // First field value: position 0, offset 0.
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int firstEndOffset = dpe.endOffset();
    // Second field value: shifted by the configured gaps (plus the token itself).
    assertEquals(1 + posGap, dpe.nextPosition());
    assertEquals(1 + firstEndOffset + offGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
}
Example usage of org.apache.lucene.index.PostingsEnum in the Anserini project (by castorini): the extract method of the PMIFeatureExtractor class.
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    // Computes the average pointwise mutual information (PMI) over all unordered
    // pairs of distinct query tokens. The result is cached per query text so the
    // expensive index scans run only once per query, not once per document.
    if (!this.lastQueryProcessed.equals(context.getQueryText())) {
        this.lastQueryProcessed = context.getQueryText();
        this.lastComputedValue = 0.0f;
        Set<String> querySet = new HashSet<>(context.getQueryTokens());
        IndexReader reader = context.getIndexSearcher().getIndexReader();
        Map<String, Integer> docFreqs = new HashMap<>();
        List<String> queryTokens = new ArrayList<>(querySet);
        try {
            // Document frequency of each unique query token.
            for (String token : querySet) {
                docFreqs.put(token, reader.docFreq(new Term(context.getField(), token)));
            }
            float sumPMI = 0.0f;
            float pairsComputed = 0.0f;
            for (int i = 0; i < queryTokens.size(); i++) {
                String firstToken = queryTokens.get(i);
                for (int j = i + 1; j < queryTokens.size(); j++) {
                    // Every pair counts toward the average, even pairs that end
                    // up contributing no PMI (zero intersection).
                    pairsComputed++;
                    String secondToken = queryTokens.get(j);
                    // The enums are consumed by countPostingIntersect, so fresh
                    // ones must be created for every pair — do not hoist the
                    // firstEnum lookup out of the inner loop.
                    PostingsEnum firstEnum = MultiFields.getTermDocsEnum(reader, context.getField(), new BytesRef(firstToken));
                    PostingsEnum secondEnum = MultiFields.getTermDocsEnum(reader, context.getField(), new BytesRef(secondToken));
                    int intersect;
                    if (firstEnum == null || secondEnum == null) {
                        intersect = 0;
                    } else {
                        intersect = countPostingIntersect(firstEnum, secondEnum);
                    }
                    if (intersect == 0) {
                        continue;
                    }
                    // A non-zero intersection implies both doc freqs are > 0; the
                    // fallback of 1 is purely defensive (single lookup via
                    // getOrDefault instead of containsKey + get).
                    int firstDocFreq = docFreqs.getOrDefault(firstToken, 1);
                    int secondDocFreq = docFreqs.getOrDefault(secondToken, 1);
                    float fraction = intersect / (float) (firstDocFreq * secondDocFreq);
                    if (fraction <= 0) {
                        continue;
                    }
                    sumPMI += Math.log(fraction);
                }
            }
            // Average over all pairs considered, including zero-intersection ones.
            if (pairsComputed != 0) {
                this.lastComputedValue = sumPMI / pairsComputed;
            }
        } catch (IOException e) {
            // Best effort: on index errors report a neutral feature value.
            this.lastComputedValue = 0.0f;
        }
    }
    return this.lastComputedValue;
}
Example usage of org.apache.lucene.index.PostingsEnum in the SearchServices project (by Alfresco): the getFirstLiveDoc method of the AlfrescoLukeRequestHandler class.
// Just get a document with the term in it, the first one will do!
// Is there a better way to do this? Shouldn't actually be very costly
// to do it this way.
/**
 * Returns the first live (non-deleted) document containing any term from the
 * given terms enum, scanning at most 1000 terms; returns {@code null} when the
 * enum is exhausted or the scan limit is reached without finding one.
 */
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
    PostingsEnum postingsEnum = null;
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    // Hoisted: live-docs bits are a property of the reader, invariant per call.
    final Bits liveDocs = reader.getLiveDocs();
    // BUGFIX: the original loop condition also required postingsEnum == null,
    // but postingsEnum becomes (and stays) non-null after the first call to
    // termsEnum.postings(...), so the loop could never advance past the first
    // term — the `continue` below immediately terminated the scan.
    for (int idx = 0; idx < 1000; ++idx) {
        text = termsEnum.next();
        if (text == null) {
            // Ran out of terms without finding a live document.
            return null;
        }
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // BUGFIX: Bits.get(docID) is true for LIVE documents; the original
            // condition was inverted, skipping live docs and returning deleted
            // ones. Skip the term only when its first doc is NOT live.
            if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
                continue;
            }
            return reader.document(postingsEnum.docID());
        }
    }
    return null;
}
Example usage of org.apache.lucene.index.PostingsEnum in the SearchServices project (by Alfresco): the createPathScorer method of the SolrPathScorer class.
/**
 * Builds a SolrPathScorer for the given path query over one leaf reader.
 * Returns {@code null} when a required path term has no postings in this
 * segment (the path cannot match here at all).
 */
public static SolrPathScorer createPathScorer(SolrPathQuery solrPathQuery, LeafReaderContext context, Weight weight, DictionaryService dictionarySertvice, boolean repeat) throws IOException {
    // An empty path query matches the node itself: seed it with two self-axis
    // positions so the container scorer below has something to evaluate.
    if (solrPathQuery.getPathStructuredFieldPositions().isEmpty()) {
        ArrayList<StructuredFieldPosition> answer = new ArrayList<StructuredFieldPosition>(2);
        answer.add(new SelfAxisStructuredFieldPosition());
        answer.add(new SelfAxisStructuredFieldPosition());
        solrPathQuery.appendQuery(answer);
    }
    // Attach cached term positions to every position that carries a term.
    for (StructuredFieldPosition sfp : solrPathQuery.getPathStructuredFieldPositions()) {
        if (sfp.getTermText() != null) {
            PostingsEnum p = context.reader().postings(new Term(solrPathQuery.getPathField(), sfp.getTermText()), PostingsEnum.POSITIONS);
            // A required term absent from this segment means no match is possible.
            if (p == null) {
                return null;
            }
            CachingTermPositions ctp = new CachingTermPositions(p);
            sfp.setCachingTermPositions(ctp);
        }
    }
    SolrContainerScorer cs = null;
    PostingsEnum rootContainerPositions = null;
    if (solrPathQuery.getPathRootTerm() != null) {
        rootContainerPositions = context.reader().postings(solrPathQuery.getPathRootTerm(), PostingsEnum.POSITIONS);
    }
    if (!solrPathQuery.getPathStructuredFieldPositions().isEmpty()) {
        // toArray(T[]) already returns StructuredFieldPosition[]; the redundant
        // cast from the original has been dropped.
        cs = new SolrContainerScorer(weight, rootContainerPositions, solrPathQuery.getPathStructuredFieldPositions().toArray(new StructuredFieldPosition[0]));
    }
    return new SolrPathScorer(weight, cs);
}
Example usage of org.apache.lucene.index.PostingsEnum in the Anserini project (by castorini): the getPostings method of the DocumentFieldContext class.
/**
 * Returns the posting list for {@code term} in this field: a map from document
 * id to the (ordered) token positions of the term in that document. Results
 * are cached; on an index error the partially built (possibly empty) map is
 * still cached and returned, preserving the original best-effort behavior.
 */
public Map<Integer, List<Integer>> getPostings(String term) {
    // Single map lookup instead of containsKey + get.
    Map<Integer, List<Integer>> cached = postings.get(term);
    if (cached != null) {
        return cached;
    }
    Map<Integer, List<Integer>> posting = new HashMap<>();
    try {
        Term t = new Term(fieldName, term);
        PostingsEnum postingsEnum = MultiTerms.getTermPostingsEnum(reader, fieldName, t.bytes(), PostingsEnum.POSITIONS);
        if (postingsEnum != null) {
            int docId;
            while ((docId = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                int freq = postingsEnum.freq();
                // Presized: exactly one position per occurrence of the term.
                List<Integer> positions = new ArrayList<>(freq);
                for (int i = 0; i < freq; i++) {
                    positions.add(postingsEnum.nextPosition());
                }
                posting.put(docId, positions);
            }
        }
    } catch (IOException e) {
        // NOTE(review): printStackTrace kept for behavior parity — no logger is
        // visible in this block; consider routing through the project logger.
        e.printStackTrace();
    }
    postings.put(term, posting);
    return posting;
}
Aggregations