use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project elasticsearch by elastic.
the class NoisyChannelSpellCheckerTests method testNgram.
public void testNgram() throws IOException {
RAMDirectory dir = new RAMDirectory();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("body_ngram", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
ShingleFilter tf = new ShingleFilter(t, 2, 3);
tf.setOutputUnigrams(false);
return new TokenStreamComponents(t, new LowerCaseFilter(tf));
}
});
mapping.put("body", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new LowerCaseFilter(t));
}
});
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriterConfig conf = new IndexWriterConfig(wrapper);
IndexWriter writer = new IndexWriter(dir, conf);
String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "USA Hero", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
for (String line : strings) {
Document doc = new Document();
doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
}
DirectoryReader ir = DirectoryReader.open(writer);
WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
Correction[] corrections = result.corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
assertThat(result.cutoffScore, greaterThan(0d));
result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
corrections = result.corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));
suggester = new NoisyChannelSpellChecker(0.85);
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the <em>god</em> jewel"));
assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor the <em>god</em> jewel"));
assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel"));
assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
// Test some of the highlighting corner cases
suggester = new NoisyChannelSpellChecker(0.85);
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr the god</em> jewel"));
assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor <em>the god</em> jewel"));
assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn the god</em> jewel"));
assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xor teh <em>god</em> jewel"));
// test synonyms
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
TokenFilter filter = new LowerCaseFilter(t);
try {
SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer());
parser.parse(new StringReader("usa => usa, america, american"));
filter = new SynonymFilter(filter, parser.build(), true);
} catch (Exception e) {
throw new RuntimeException(e);
}
return new TokenStreamComponents(t, filter);
}
};
spellchecker.setAccuracy(0.0f);
spellchecker.setMinPrefix(1);
spellchecker.setMinQueryLength(1);
suggester = new NoisyChannelSpellChecker(0.85);
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
// Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
}
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project elasticsearch by elastic.
the class NoisyChannelSpellCheckerTests method testTrigram.
public void testTrigram() throws IOException {
RAMDirectory dir = new RAMDirectory();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("body_ngram", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
ShingleFilter tf = new ShingleFilter(t, 2, 3);
tf.setOutputUnigrams(false);
return new TokenStreamComponents(t, new LowerCaseFilter(tf));
}
});
mapping.put("body", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new LowerCaseFilter(t));
}
});
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriterConfig conf = new IndexWriterConfig(wrapper);
IndexWriter writer = new IndexWriter(dir, conf);
String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "USA Hero", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
for (String line : strings) {
Document doc = new Document();
doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
}
DirectoryReader ir = DirectoryReader.open(writer);
WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections;
assertThat(corrections.length, equalTo(0));
// assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));
wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
// test synonyms
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
TokenFilter filter = new LowerCaseFilter(t);
try {
SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer());
parser.parse(new StringReader("usa => usa, america, american"));
filter = new SynonymFilter(filter, parser.build(), true);
} catch (Exception e) {
throw new RuntimeException(e);
}
return new TokenStreamComponents(t, filter);
}
};
spellchecker.setAccuracy(0.0f);
spellchecker.setMinPrefix(1);
spellchecker.setMinQueryLength(1);
suggester = new NoisyChannelSpellChecker(0.95);
wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections;
assertThat(corrections.length, equalTo(2));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
}
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jackrabbit-oak by apache.
the class IndexWriterUtils method getIndexWriterConfig.
public static IndexWriterConfig getIndexWriterConfig(IndexDefinition definition, boolean remoteDir, LuceneIndexWriterConfig writerConfig) {
// FIXME: Hack needed to make Lucene work in an OSGi environment
Thread thread = Thread.currentThread();
ClassLoader loader = thread.getContextClassLoader();
thread.setContextClassLoader(IndexWriterConfig.class.getClassLoader());
try {
Analyzer definitionAnalyzer = definition.getAnalyzer();
Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
analyzers.put(FieldNames.SPELLCHECK, new ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
if (!definition.isSuggestAnalyzed()) {
analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
}
Analyzer analyzer = new PerFieldAnalyzerWrapper(definitionAnalyzer, analyzers);
IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
if (remoteDir) {
config.setMergeScheduler(new SerialMergeScheduler());
}
if (definition.getCodec() != null) {
config.setCodec(definition.getCodec());
}
config.setRAMBufferSizeMB(writerConfig.getRamBufferSizeMB());
return config;
} finally {
thread.setContextClassLoader(loader);
}
}
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project lucene-skos by behas.
the class URIbasedTermExpansionTest method uriBasedTermExpansion.
/**
* This test indexes a sample metadata record (=lucene document) having a
* "title", "description", and "subject" field, which is semantically
* enriched by a URI pointing to a SKOS concept "weapons".
* <p/>
* A search for "arms" returns that record as a result because "arms" is
* defined as an alternative label (altLabel) for the concept "weapons".
*
* @throws IOException
*/
@Test
public void uriBasedTermExpansion() throws IOException {
/* defining the document to be indexed */
Document doc = new Document();
doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
doc.add(new Field("description", "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).", TextField.TYPE_NOT_STORED));
doc.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));
/* setting up the SKOS analyzer */
String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
String indexPath = "build/";
/* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */
Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.URI);
/* Define different analyzers for different fields */
Map<String, Analyzer> analyzerPerField = new HashMap<>();
analyzerPerField.put("subject", skosAnalyzer);
PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField);
/* setting up a writer with a default (simple) analyzer */
writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer));
/* adding the document to the index */
writer.addDocument(doc);
/* defining a query that searches over all fields */
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD).add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD).add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);
/* creating a new searcher */
searcher = new IndexSearcher(DirectoryReader.open(writer, false));
TopDocs results = searcher.search(builder.build(), 10);
/* the document matches because "arms" is among the expanded terms */
assertEquals(1, results.totalHits);
/* defining a query that searches for a broader concept */
Query query = new TermQuery(new Term("subject", "military equipment"));
results = searcher.search(query, 10);
/* ... also returns the document as result */
assertEquals(1, results.totalHits);
}
use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project lucene-skos by behas.
the class LabelbasedTermExpansionTest method labelBasedTermExpansion.
/**
* This test indexes a sample metadata record (=lucene document) having a
* "title", "description", and "subject" field.
* <p/>
* A search for "arms" returns that record as a result because "arms" is
* defined as an alternative label for "weapons", the term which is
* contained in the subject field.
*
* @throws IOException
*/
@Test
public void labelBasedTermExpansion() throws IOException {
/* defining the document to be indexed */
Document doc = new Document();
doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
doc.add(new Field("description", "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).", TextField.TYPE_NOT_STORED));
doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));
/* setting up the SKOS analyzer */
String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
String indexPath = "build/";
/* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */
Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.LABEL);
/* Define different analyzers for different fields */
Map<String, Analyzer> analyzerPerField = new HashMap<>();
analyzerPerField.put("subject", skosAnalyzer);
PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField);
/* setting up a writer with a default (simple) analyzer */
writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer));
/* adding the document to the index */
writer.addDocument(doc);
/* defining a query that searches over all fields */
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD).add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD).add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);
/* creating a new searcher */
searcher = new IndexSearcher(DirectoryReader.open(writer, false));
TopDocs results = searcher.search(builder.build(), 10);
/* the document matches because "arms" is among the expanded terms */
assertEquals(1, results.totalHits);
/* defining a query that searches for a broader concept */
Query query = new TermQuery(new Term("subject", "military equipment"));
results = searcher.search(query, 10);
/* ... also returns the document as result */
assertEquals(1, results.totalHits);
}
Aggregations