Use of org.apache.lucene.index.TermsEnum in project neo4j by neo4j.
Class SimpleUniquenessVerifier, method verify.
@Override
public void verify(PropertyAccessor accessor, int[] propKeyIds) throws IndexEntryConflictException, IOException {
    try {
        DuplicateCheckingCollector collector = DuplicateCheckingCollector.forProperties(accessor, propKeyIds);
        IndexSearcher searcher = indexSearcher();
        for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) {
            Fields fields = leafReaderContext.reader().fields();
            for (String field : fields) {
                if (LuceneDocumentStructure.NODE_ID_KEY.equals(field)) {
                    continue;
                }
                TermsEnum terms = LuceneDocumentStructure.originalTerms(fields.terms(field), field);
                BytesRef termsRef;
                while ((termsRef = terms.next()) != null) {
                    // a term present in more than one document is a potential uniqueness violation
                    if (terms.docFreq() > 1) {
                        collector.reset();
                        searcher.search(new TermQuery(new Term(field, termsRef)), collector);
                    }
                }
            }
        }
    } catch (IOException e) {
        // the collector wraps conflicts in an IOException; unwrap and rethrow the real cause
        Throwable cause = e.getCause();
        if (cause instanceof IndexEntryConflictException) {
            throw (IndexEntryConflictException) cause;
        }
        throw e;
    }
}
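The snippet relies on the standard TermsEnum contract: next() returns the next term or null once the enumeration is exhausted, and per-term statistics such as docFreq() are only meaningful after a successful next(). A minimal, self-contained sketch of that pattern over a plain Lucene index; the field name "value" is an assumption for illustration, not part of the Neo4j code:

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

static void printDuplicatedTerms(Directory dir) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        for (LeafReaderContext leaf : reader.leaves()) {
            // "value" is a hypothetical field name used only for this sketch
            Terms terms = leaf.reader().terms("value");
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // docFreq() is per-segment and only valid after next() returned a term
                if (termsEnum.docFreq() > 1) {
                    System.out.println(term.utf8ToString() + " appears in " + termsEnum.docFreq() + " docs");
                }
            }
        }
    }
}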
Use of org.apache.lucene.index.TermsEnum in project neo4j by neo4j.
Class NonUniqueDatabaseIndexSamplerTest, method getTerms.
private Terms getTerms(String value, int frequency) throws IOException {
    TermsEnum termsEnum = mock(TermsEnum.class);
    Terms terms = mock(Terms.class);
    when(terms.iterator()).thenReturn(termsEnum);
    // the mocked enum yields a single term, then signals exhaustion with null
    when(termsEnum.next()).thenReturn(new BytesRef(value.getBytes())).thenReturn(null);
    when(termsEnum.docFreq()).thenReturn(frequency);
    return terms;
}
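Consuming the mocked Terms follows the same pattern as production code. A short sketch of what a caller sees; the values are simply whatever was passed to getTerms, nothing here is specific to the Neo4j sampler:

Terms terms = getTerms("term", 3);
TermsEnum termsEnum = terms.iterator();
BytesRef ref;
while ((ref = termsEnum.next()) != null) {
    // the first call yields "term" with docFreq() == 3; the second call returns null and ends the loop
    assertEquals("term", ref.utf8ToString());
    assertEquals(3, termsEnum.docFreq());
}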
Use of org.apache.lucene.index.TermsEnum in project textdb by TextDB.
Class WordCountIndexSource, method computeWordCount.
private void computeWordCount() throws TextDBException {
    try {
        HashMap<String, Integer> wordCountMap = new HashMap<>();
        DataReader dataReader = RelationManager.getRelationManager()
                .getTableDataReader(predicate.getTableName(), new MatchAllDocsQuery());
        dataReader.open();
        IndexReader luceneIndexReader = dataReader.getLuceneIndexReader();
        for (int i = 0; i < luceneIndexReader.numDocs(); i++) {
            Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
            if (termVector == null) {
                // the document stores no term vector for this attribute
                continue;
            }
            TermsEnum termsEnum = termVector.iterator();
            while (termsEnum.next() != null) {
                String key = termsEnum.term().utf8ToString();
                // accumulate each term's in-document frequency across all documents
                wordCountMap.merge(key, (int) termsEnum.totalTermFreq(), Integer::sum);
            }
        }
        luceneIndexReader.close();
        dataReader.close();
        sortedWordCountMap = wordCountMap.entrySet().stream()
                .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
                .collect(Collectors.toList());
        wordCountIterator = sortedWordCountMap.iterator();
    } catch (IOException e) {
        throw new DataFlowException(e);
    }
}
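Within a term vector, totalTermFreq() is the term's frequency inside that single document, so the map above accumulates a corpus-wide count. A small usage sketch of the sorted result, assuming sortedWordCountMap is the List of Map.Entry<String, Integer> produced by the collector above:

// print the ten most frequent words (a hypothetical consumer of sortedWordCountMap)
sortedWordCountMap.stream()
        .limit(10)
        .forEach(entry -> System.out.println(entry.getKey() + "\t" + entry.getValue()));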
Use of org.apache.lucene.index.TermsEnum in project textdb by TextDB.
Class DataReader, method buildPayloadFromTermVector.
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getAttributeName();
        AttributeType attributeType = attr.getAttributeType();
        // only TEXT attributes carry a payload of spans
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;
        // go through the document's terms
        while ((termsEnum.next()) != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // for each term, go through its postings
            for (int i = 0; i < termPostings.freq(); i++) {
                // nextPosition() must be called before reading the offsets
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String analyzedTermStr = termsEnum.term().utf8ToString();
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
                payloadSpanList.add(span);
            }
        }
    }
    return payloadSpanList;
}
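Positions and character offsets are only available here because the field was indexed with term vectors that store them. A self-contained sketch of the same traversal against a throwaway in-memory index; the class name, field name, analyzer choice, and sample text are illustrative assumptions, and RAMDirectory matches the Lucene 5/6-era API used elsewhere on this page:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.RAMDirectory;

public class TermVectorDemo {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        // enable term vectors with positions and offsets so PostingsEnum.ALL has data to return
        FieldType withVectors = new FieldType(TextField.TYPE_STORED);
        withVectors.setStoreTermVectors(true);
        withVectors.setStoreTermVectorPositions(true);
        withVectors.setStoreTermVectorOffsets(true);
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new Field("content", "lucene stores term vectors per document", withVectors));
            writer.addDocument(doc);
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            // the term vector is a per-document mini index for the "content" field of doc 0
            Terms termVector = reader.getTermVector(0, "content");
            TermsEnum termsEnum = termVector.iterator();
            PostingsEnum postings = null;
            while (termsEnum.next() != null) {
                postings = termsEnum.postings(postings, PostingsEnum.ALL);
                if (postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                    continue;
                }
                for (int i = 0; i < postings.freq(); i++) {
                    // nextPosition() must be called before reading the offsets
                    int position = postings.nextPosition();
                    System.out.printf("%s pos=%d offsets=[%d,%d)%n",
                            termsEnum.term().utf8ToString(), position,
                            postings.startOffset(), postings.endOffset());
                }
            }
        }
    }
}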
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
Class TestLucene54DocValuesFormat, method doTestTermsEnumRandom.
// TODO: try to refactor this and some termsenum tests into the base class.
// To do this we need to fix the test class to get a DVF not a Codec so we can set up
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene54DocValuesFormat();
    conf.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        final int length = TestUtil.nextInt(random(), minLength, maxLength);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(TestUtil.randomSimpleString(random(), minLength, length));
        }
        // add them in random order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        for (String v : unordered) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add them in random order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    int numDeletions = random().nextInt(numDocs / 10);
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
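The assertEquals(long, TermsEnum, TermsEnum) call appears to be a helper from the doc-values test base class that walks both enums in lockstep. A rough sketch of what such a comparison looks like; this is an illustration of the idea, not the actual Lucene helper:

// compare two TermsEnums term by term; expectedCount is the number of terms both should yield
static void assertSameTerms(long expectedCount, TermsEnum expected, TermsEnum actual) throws IOException {
    long count = 0;
    BytesRef expectedTerm;
    while ((expectedTerm = expected.next()) != null) {
        BytesRef actualTerm = actual.next();
        assertNotNull("actual enum exhausted early", actualTerm);
        assertEquals(expectedTerm, actualTerm);
        count++;
    }
    // both enums must end at the same point and yield the expected number of terms
    assertNull("actual enum has extra terms", actual.next());
    assertEquals(expectedCount, count);
}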