Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.
In class URIbasedTermExpansionTest, method uriBasedTermExpansion:
/**
 * This test indexes a sample metadata record (= a Lucene document) having a
 * "title", "description", and "subject" field, which is semantically
 * enriched by a URI pointing to a SKOS concept "weapons".
 * <p>
 * A search for "arms" returns that record as a result because "arms" is
 * defined as an alternative label (altLabel) for the concept "weapons".
 *
 * @throws IOException
 */
@Test
public void uriBasedTermExpansion() throws IOException {
    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field("description",
        "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
            + "The spear was mainly a thrusting weapon, but could also be thrown. "
            + "It was the principal weapon of the auxiliary soldier... "
            + "(second - fourth century, Arbeia Roman Fort).",
        TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));
    /* setting up the SKOS analyzer */
    String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
    String indexPath = "build/";
    /* ExpansionType.URI: the field to be analyzed (expanded) contains URIs */
    Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.URI);
    /* Define different analyzers for different fields */
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("subject", skosAnalyzer);
    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField);
    /* setting up a writer with the per-field analyzer (SimpleAnalyzer as the default) */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer));
    /* adding the document to the index */
    writer.addDocument(doc);
    /* defining a query that searches over all fields */
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD)
        .add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD)
        .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);
    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    TopDocs results = searcher.search(builder.build(), 10);
    /* the document matches because "arms" is among the expanded terms */
    assertEquals(1, results.totalHits);
    /* defining a query that searches for a broader concept */
    Query query = new TermQuery(new Term("subject", "military equipment"));
    results = searcher.search(query, 10);
    /* ... also returns the document as a result */
    assertEquals(1, results.totalHits);
}
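In the snippet above, SimpleAnalyzer handles every field except subject: it tokenizes on non-letter characters and lowercases each token, which is what lets the free-text fields match the lowercase term "arms". A minimal, hypothetical sketch of that behavior (the class name, field name, and sample text are illustrative, not taken from the test):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SimpleAnalyzerSketch {
    public static void main(String[] args) throws IOException {
        // SimpleAnalyzer splits on non-letter characters and lowercases, so this
        // text should yield [roman, iron, spearhead, second, fourth, century].
        List<String> tokens = new ArrayList<>();
        try (Analyzer analyzer = new SimpleAnalyzer();
             TokenStream ts = analyzer.tokenStream("description",
                 "Roman iron spearhead (second - fourth century)")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();
        }
        System.out.println(tokens);
    }
}

Because the per-field wrapper maps only "subject" to the SKOSAnalyzer, term expansion never touches the free-text fields.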
Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project camel by apache.
In class LuceneIndexAndQueryProducerTest, method createRegistry:
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = new JndiRegistry(createJndiContext());
    registry.bind("std", new File("target/stdindexDir"));
    registry.bind("load_dir", new File("src/test/resources/sources"));
    registry.bind("stdAnalyzer", new StandardAnalyzer());
    registry.bind("simple", new File("target/simpleindexDir"));
    registry.bind("simpleAnalyzer", new SimpleAnalyzer());
    registry.bind("whitespace", new File("target/whitespaceindexDir"));
    registry.bind("whitespaceAnalyzer", new WhitespaceAnalyzer());
    return registry;
}
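These registry bindings are meant to be referenced by name (with the "#" prefix) from camel-lucene endpoint URIs. A hypothetical route sketch; the endpoint name and the option names analyzer, indexDir, and srcDir are assumptions based on the camel-lucene component, not taken from this test:

import org.apache.camel.builder.RouteBuilder;

public class SimpleAnalyzerIndexRoute extends RouteBuilder {
    @Override
    public void configure() throws Exception {
        // Index incoming message bodies using the SimpleAnalyzer bound as "simpleAnalyzer",
        // writing into the directory bound as "simple" and seeding from "load_dir".
        // The endpoint name "simpleIndex" and the URI options are assumptions.
        from("direct:start")
            .to("lucene:simpleIndex:insert?analyzer=#simpleAnalyzer&indexDir=#simple&srcDir=#load_dir");
    }
}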
Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-solr by apache.
In class TestSort, method testSort:
public void testSort() throws Exception {
    Directory dir = new RAMDirectory();
    Field f = new StringField("f", "0", Field.Store.NO);
    Field f2 = new StringField("f2", "0", Field.Store.NO);
    for (int iterCnt = 0; iterCnt < iter; iterCnt++) {
        IndexWriter iw = new IndexWriter(dir,
            new IndexWriterConfig(new SimpleAnalyzer()).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
        final MyDoc[] mydocs = new MyDoc[ndocs];
        int v1EmptyPercent = 50;
        int v2EmptyPercent = 50;
        int commitCountdown = commitCount;
        for (int i = 0; i < ndocs; i++) {
            MyDoc mydoc = new MyDoc();
            mydoc.doc = i;
            mydocs[i] = mydoc;
            Document document = new Document();
            if (r.nextInt(100) < v1EmptyPercent) {
                mydoc.val = Integer.toString(r.nextInt(maxval));
                f.setStringValue(mydoc.val);
                document.add(f);
            }
            if (r.nextInt(100) < v2EmptyPercent) {
                mydoc.val2 = Integer.toString(r.nextInt(maxval));
                f2.setStringValue(mydoc.val2);
                document.add(f2);
            }
            iw.addDocument(document);
            if (--commitCountdown <= 0) {
                commitCountdown = commitCount;
                iw.commit();
            }
        }
        iw.close();
        Map<String, UninvertingReader.Type> mapping = new HashMap<>();
        mapping.put("f", UninvertingReader.Type.SORTED);
        mapping.put("f2", UninvertingReader.Type.SORTED);
        DirectoryReader reader = UninvertingReader.wrap(DirectoryReader.open(dir), mapping);
        IndexSearcher searcher = new IndexSearcher(reader);
        // System.out.println("segments="+searcher.getIndexReader().getSequentialSubReaders().length);
        assertTrue(reader.leaves().size() > 1);
        for (int i = 0; i < qiter; i++) {
            Filter filt = new Filter() {
                @Override
                public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) {
                    return BitsFilteredDocIdSet.wrap(randSet(context.reader().maxDoc()), acceptDocs);
                }
                @Override
                public String toString(String field) {
                    return "TestSortFilter";
                }
                @Override
                public boolean equals(Object other) {
                    return other == this;
                }
                @Override
                public int hashCode() {
                    return System.identityHashCode(this);
                }
            };
            int top = r.nextInt((ndocs >> 3) + 1) + 1;
            final boolean luceneSort = r.nextBoolean();
            final boolean sortMissingLast = !luceneSort && r.nextBoolean();
            final boolean sortMissingFirst = !luceneSort && !sortMissingLast;
            final boolean reverse = r.nextBoolean();
            List<SortField> sfields = new ArrayList<>();
            final boolean secondary = r.nextBoolean();
            final boolean luceneSort2 = r.nextBoolean();
            final boolean sortMissingLast2 = !luceneSort2 && r.nextBoolean();
            final boolean sortMissingFirst2 = !luceneSort2 && !sortMissingLast2;
            final boolean reverse2 = r.nextBoolean();
            if (r.nextBoolean()) {
                sfields.add(new SortField(null, SortField.Type.SCORE));
            }
            // hit both use-cases of sort-missing-last
            sfields.add(Sorting.getStringSortField("f", reverse, sortMissingLast, sortMissingFirst));
            if (secondary) {
                sfields.add(Sorting.getStringSortField("f2", reverse2, sortMissingLast2, sortMissingFirst2));
            }
            if (r.nextBoolean()) {
                sfields.add(new SortField(null, SortField.Type.SCORE));
            }
            Sort sort = new Sort(sfields.toArray(new SortField[sfields.size()]));
            final String nullRep = (luceneSort || (sortMissingFirst && !reverse) || (sortMissingLast && reverse)) ? "" : "zzz";
            final String nullRep2 = (luceneSort2 || (sortMissingFirst2 && !reverse2) || (sortMissingLast2 && reverse2)) ? "" : "zzz";
            boolean trackScores = r.nextBoolean();
            boolean trackMaxScores = r.nextBoolean();
            boolean scoreInOrder = r.nextBoolean();
            final TopFieldCollector topCollector = TopFieldCollector.create(sort, top, true, trackScores, trackMaxScores);
            final List<MyDoc> collectedDocs = new ArrayList<>();
            // delegate and collect docs ourselves
            Collector myCollector = new FilterCollector(topCollector) {
                @Override
                public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
                    final int docBase = context.docBase;
                    return new FilterLeafCollector(super.getLeafCollector(context)) {
                        @Override
                        public void collect(int doc) throws IOException {
                            super.collect(doc);
                            collectedDocs.add(mydocs[docBase + doc]);
                        }
                    };
                }
            };
            searcher.search(filt, myCollector);
            Collections.sort(collectedDocs, (o1, o2) -> {
                String v1 = o1.val == null ? nullRep : o1.val;
                String v2 = o2.val == null ? nullRep : o2.val;
                int cmp = v1.compareTo(v2);
                if (reverse) {
                    cmp = -cmp;
                }
                if (cmp != 0) {
                    return cmp;
                }
                if (secondary) {
                    v1 = o1.val2 == null ? nullRep2 : o1.val2;
                    v2 = o2.val2 == null ? nullRep2 : o2.val2;
                    cmp = v1.compareTo(v2);
                    if (reverse2) {
                        cmp = -cmp;
                    }
                }
                cmp = cmp == 0 ? o1.doc - o2.doc : cmp;
                return cmp;
            });
            TopDocs topDocs = topCollector.topDocs();
            ScoreDoc[] sdocs = topDocs.scoreDocs;
            for (int j = 0; j < sdocs.length; j++) {
                int id = sdocs[j].doc;
                if (id != collectedDocs.get(j).doc) {
                    log.error("Error at pos " + j
                        + "\n\tsortMissingFirst=" + sortMissingFirst
                        + " sortMissingLast=" + sortMissingLast
                        + " reverse=" + reverse
                        + "\n\tEXPECTED=" + collectedDocs);
                }
                assertEquals(id, collectedDocs.get(j).doc);
            }
        }
        reader.close();
    }
    dir.close();
}
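The test builds its missing-first/missing-last string sorts through Solr's Sorting.getStringSortField helper. A minimal sketch of how the same missing-value behavior can be expressed with plain Lucene SortField, assuming the field is sortable (e.g., uninverted as above or indexed with doc values); the class name, field name, and query are illustrative:

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;

public class MissingLastSortSketch {
    // Returns the top n hits sorted by the string field "f", placing documents
    // that have no value for "f" after all documents that do.
    static TopDocs searchMissingLast(IndexSearcher searcher, int n) throws java.io.IOException {
        SortField byF = new SortField("f", SortField.Type.STRING, /* reverse = */ false);
        byF.setMissingValue(SortField.STRING_LAST); // STRING_FIRST would sort missing docs first
        return searcher.search(new MatchAllDocsQuery(), n, new Sort(byF));
    }
}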
Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-solr by apache.
In class TestPerFieldAnalyzerWrapper, method testPerField:
public void testPerField() throws Exception {
    String text = "Qwerty";
    Map<String, Analyzer> analyzerPerField = Collections.<String, Analyzer>singletonMap("special", new SimpleAnalyzer());
    Analyzer defaultAnalyzer = new WhitespaceAnalyzer();
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerPerField);
    try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        assertTrue(tokenStream.incrementToken());
        assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", termAtt.toString());
        assertFalse(tokenStream.incrementToken());
        tokenStream.end();
    }
    try (TokenStream tokenStream = analyzer.tokenStream("special", text)) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        assertTrue(tokenStream.incrementToken());
        assertEquals("SimpleAnalyzer lowercases", "qwerty", termAtt.toString());
        assertFalse(tokenStream.incrementToken());
        tokenStream.end();
    }
    // TODO: fix this about PFAW, this is crazy
    analyzer.close();
    defaultAnalyzer.close();
    IOUtils.close(analyzerPerField.values());
}
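The same per-field routing applies at query time: a term in the "special" field is lowercased by SimpleAnalyzer while other fields keep WhitespaceAnalyzer's case-preserving behavior. A hypothetical sketch (the class name and the parsed query string are illustrative, and it assumes the lucene-queryparser module is on the classpath):

import java.util.Collections;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class PerFieldQueryParsingSketch {
    public static void main(String[] args) throws Exception {
        // Use the same wrapper at query time as at index time, so query terms
        // get the same per-field analysis as the indexed terms.
        Map<String, Analyzer> perField =
            Collections.<String, Analyzer>singletonMap("special", new SimpleAnalyzer());
        Analyzer analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), perField);
        QueryParser parser = new QueryParser("field", analyzer);
        Query q = parser.parse("special:Qwerty OR field:Qwerty");
        System.out.println(q); // expected: special:qwerty field:Qwerty
    }
}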
Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-solr by apache.
In class TestPerFieldAnalyzerWrapper, method testReuseWrapped:
public void testReuseWrapped() throws Exception {
    final String text = "Qwerty";
    final Analyzer specialAnalyzer = new SimpleAnalyzer();
    final Analyzer defaultAnalyzer = new WhitespaceAnalyzer();
    TokenStream ts1, ts2, ts3, ts4;
    final PerFieldAnalyzerWrapper wrapper1 = new PerFieldAnalyzerWrapper(defaultAnalyzer,
        Collections.<String, Analyzer>singletonMap("special", specialAnalyzer));
    // test that the PerFieldWrapper returns the same instance as the original Analyzer:
    ts1 = defaultAnalyzer.tokenStream("something", text);
    ts2 = wrapper1.tokenStream("something", text);
    assertSame(ts1, ts2);
    ts1 = specialAnalyzer.tokenStream("special", text);
    ts2 = wrapper1.tokenStream("special", text);
    assertSame(ts1, ts2);
    // Wrap with another wrapper, which does *not* extend DelegatingAnalyzerWrapper:
    final AnalyzerWrapper wrapper2 = new AnalyzerWrapper(wrapper1.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            return wrapper1;
        }
        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            assertNotSame(specialAnalyzer.tokenStream("special", text), components.getTokenStream());
            TokenFilter filter = new ASCIIFoldingFilter(components.getTokenStream());
            return new TokenStreamComponents(components.getTokenizer(), filter);
        }
    };
    ts3 = wrapper2.tokenStream("special", text);
    assertNotSame(ts1, ts3);
    assertTrue(ts3 instanceof ASCIIFoldingFilter);
    // check that the cache did not get corrupted:
    ts2 = wrapper1.tokenStream("special", text);
    assertSame(ts1, ts2);
    // Wrap PerField with another PerField. In that case all TokenStreams returned must be the same:
    final PerFieldAnalyzerWrapper wrapper3 = new PerFieldAnalyzerWrapper(wrapper1,
        Collections.<String, Analyzer>singletonMap("moreSpecial", specialAnalyzer));
    ts1 = specialAnalyzer.tokenStream("special", text);
    ts2 = wrapper3.tokenStream("special", text);
    assertSame(ts1, ts2);
    ts3 = specialAnalyzer.tokenStream("moreSpecial", text);
    ts4 = wrapper3.tokenStream("moreSpecial", text);
    assertSame(ts3, ts4);
    assertSame(ts2, ts3);
    IOUtils.close(wrapper3, wrapper2, wrapper1, specialAnalyzer, defaultAnalyzer);
}