use of org.apache.lucene.analysis.core.SimpleAnalyzer in project camel by apache.
the class LuceneIndexAndQueryProducerTest method createRegistry.
/**
 * Builds the registry used by this test: three index directories, the
 * directory holding the source documents, and one analyzer instance per
 * index directory, each bound under a fixed name.
 *
 * @return the populated registry
 * @throws Exception if the JNDI context or the registry cannot be created
 */
@Override
protected JndiRegistry createRegistry() throws Exception {
    final JndiRegistry jndi = new JndiRegistry(createJndiContext());

    // "standard" index directory, the document source directory and the standard analyzer
    jndi.bind("std", new File("target/stdindexDir"));
    jndi.bind("load_dir", new File("src/test/resources/sources"));
    jndi.bind("stdAnalyzer", new StandardAnalyzer());

    // "simple" index directory and its analyzer
    jndi.bind("simple", new File("target/simpleindexDir"));
    jndi.bind("simpleAnalyzer", new SimpleAnalyzer());

    // "whitespace" index directory and its analyzer
    jndi.bind("whitespace", new File("target/whitespaceindexDir"));
    jndi.bind("whitespaceAnalyzer", new WhitespaceAnalyzer());

    return jndi;
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.
the class EntityLinking method search.
/**
 * Returns a list of query results.
 *
 * <p>The search runs in up to two passes. The first pass issues the query
 * name as a quoted phrase (phrase slop 3) against the name field. If that
 * pass yields fewer than {@code numHits} documents, the searcher is switched
 * to {@link ClassicSimilarity} (TFIDF) and a second pass queries the name and
 * label fields with the default operator set to AND, asking only for the
 * number of hits still missing. Results of the second pass are appended
 * after those of the first.
 *
 * @param queryName the entity name to search
 * @param numHits the maximum number of entities to return
 * @throws Exception on error
 * @return a list of top ranked entities
 */
public List<RankedEntity> search(String queryName, int numHits) throws Exception {
    List<RankedEntity> rankedEntities = new ArrayList<>();

    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);

    // Pass 1: phrase search on the query name
    QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
    queryParser.setAutoGeneratePhraseQueries(true);
    queryParser.setPhraseSlop(3);
    // Wrap in double quotes so the parser treats the whole name as one phrase.
    queryName = "\"" + queryName + "\"";
    Query query = queryParser.parse(queryName);
    int phraseHits = appendRankedEntities(searcher, query, numHits, rankedEntities);
    if (phraseHits >= numHits) {
        return rankedEntities;
    }

    // Pass 2: TFIDF search over name and label for the remaining slots
    int numHitsLeft = numHits - phraseHits;
    Similarity similarity = new ClassicSimilarity();
    searcher.setSimilarity(similarity);
    queryParser = new MultiFieldQueryParser(
            new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL }, new SimpleAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.AND);
    // NOTE(review): queryName is still the quoted form built above, and nothing here
    // filters out documents already returned by the first pass; confirm both are intended.
    query = queryParser.parse(queryName);
    appendRankedEntities(searcher, query, numHitsLeft, rankedEntities);
    return rankedEntities;
}

/**
 * Runs {@code query} for at most {@code limit} hits and appends one
 * {@link RankedEntity} per returned document to {@code sink}.
 *
 * @return the number of documents returned by the search
 */
private int appendRankedEntities(IndexSearcher searcher, Query query, int limit,
        List<RankedEntity> sink) throws Exception {
    TopDocs rs = searcher.search(query, limit);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        float score = docs.scores[i];
        String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
        String shortMid = getShortMid(mid);
        String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
        String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
        sink.add(new RankedEntity(shortMid, score, name, label));
    }
    return docs.documents.length;
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.
the class URIbasedTermExpansionTest method uriBasedTermExpansion.
/**
 * Indexes a sample metadata record (a Lucene document) with "title",
 * "description" and "subject" fields, where the subject is a URI pointing
 * to the SKOS concept "weapons".
 * <p>
 * Searching for "arms" must return the record, because "arms" is defined as
 * an alternative label (altLabel) of the concept "weapons". Searching the
 * subject field for the broader concept "military equipment" must return
 * it as well.
 *
 * @throws IOException if an I/O error occurs while indexing or searching
 */
@Test
public void uriBasedTermExpansion() throws IOException {
    /* the record to index: title, free-text description and a subject URI */
    String description = "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
            + "The spear was mainly a thrusting weapon, but could also be thrown. "
            + "It was the principal weapon of the auxiliary soldier... "
            + "(second - fourth century, Arbeia Roman Fort).";
    Document record = new Document();
    record.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    record.add(new Field("description", description, TextField.TYPE_NOT_STORED));
    record.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));

    /* SKOS analyzer in URI mode: the expanded field is expected to contain URIs */
    Analyzer subjectAnalyzer = new SKOSAnalyzer(
            "build/", "src/test/resources/skos_samples/ukat_examples.n3", ExpansionType.URI);

    /* "subject" goes through the SKOS analyzer, every other field through SimpleAnalyzer */
    Map<String, Analyzer> perField = new HashMap<>();
    perField.put("subject", subjectAnalyzer);
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), perField);

    /* index the record into a fresh in-memory directory */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
    writer.addDocument(record);

    /* "arms" in any of the three fields */
    BooleanQuery.Builder anyField = new BooleanQuery.Builder();
    for (String field : new String[] { "title", "description", "subject" }) {
        anyField.add(new TermQuery(new Term(field, "arms")), BooleanClause.Occur.SHOULD);
    }

    /* open a searcher on the writer */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    /* the record matches because "arms" is among the expanded terms */
    TopDocs armsHits = searcher.search(anyField.build(), 10);
    assertEquals(1, armsHits.totalHits);

    /* a query for the broader concept returns the record too */
    Query broader = new TermQuery(new Term("subject", "military equipment"));
    TopDocs broaderHits = searcher.search(broader, 10);
    assertEquals(1, broaderHits.totalHits);
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.
the class SKOSLabelFilterTest method testTermQuery.
/**
 * Adds one document whose "content" field reads "I work for the united
 * nations" and checks that parsing "united nations" against that field
 * with a {@link SimpleAnalyzer}-backed parser finds exactly one hit.
 */
@Test
public void testTermQuery() throws IOException, QueryNodeException {
    // index a single document through the writer held by this test class
    Document sample = new Document();
    sample.add(new Field("content", "I work for the united nations", TextField.TYPE_STORED));
    writer.addDocument(sample);

    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    // parse the query text with "content" as the default field
    StandardQueryParser queryParser = new StandardQueryParser(new SimpleAnalyzer());
    Query unitedNations = queryParser.parse("united nations", "content");

    long matches = searcher.search(unitedNations, 1).totalHits;
    assertEquals(1, matches);
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.
the class LabelbasedTermExpansionTest method labelBasedTermExpansion.
/**
 * Indexes a sample metadata record (a Lucene document) with "title",
 * "description" and "subject" fields, where the subject holds the plain
 * term "weapons".
 * <p>
 * Searching for "arms" must return the record, because "arms" is defined as
 * an alternative label for "weapons". Searching the subject field for the
 * broader concept "military equipment" must return it as well.
 *
 * @throws IOException if an I/O error occurs while indexing or searching
 */
@Test
public void labelBasedTermExpansion() throws IOException {
    /* the record to index: title, free-text description and a subject label */
    String description = "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
            + "The spear was mainly a thrusting weapon, but could also be thrown. "
            + "It was the principal weapon of the auxiliary soldier... "
            + "(second - fourth century, Arbeia Roman Fort).";
    Document record = new Document();
    record.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    record.add(new Field("description", description, TextField.TYPE_NOT_STORED));
    record.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));

    /* SKOS analyzer in LABEL mode, applied below to the "subject" field holding a label */
    Analyzer subjectAnalyzer = new SKOSAnalyzer(
            "build/", "src/test/resources/skos_samples/ukat_examples.n3", ExpansionType.LABEL);

    /* "subject" goes through the SKOS analyzer, every other field through SimpleAnalyzer */
    Map<String, Analyzer> perField = new HashMap<>();
    perField.put("subject", subjectAnalyzer);
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), perField);

    /* index the record into a fresh in-memory directory */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
    writer.addDocument(record);

    /* "arms" in any of the three fields */
    BooleanQuery.Builder anyField = new BooleanQuery.Builder();
    for (String field : new String[] { "title", "description", "subject" }) {
        anyField.add(new TermQuery(new Term(field, "arms")), BooleanClause.Occur.SHOULD);
    }

    /* open a searcher on the writer */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    /* the record matches because "arms" is among the expanded terms */
    TopDocs armsHits = searcher.search(anyField.build(), 10);
    assertEquals(1, armsHits.totalHits);

    /* a query for the broader concept returns the record too */
    Query broader = new TermQuery(new Term("subject", "military equipment"));
    TopDocs broaderHits = searcher.search(broader, 10);
    assertEquals(1, broaderHits.totalHits);
}
Aggregations