use of org.apache.lucene.document.Document in project elasticsearch by elastic.
the class NoisyChannelSpellCheckerTests method testMultiGenerator.
public void testMultiGenerator() throws IOException {
RAMDirectory dir = new RAMDirectory();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("body_ngram", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
ShingleFilter tf = new ShingleFilter(t, 2, 3);
tf.setOutputUnigrams(false);
return new TokenStreamComponents(t, new LowerCaseFilter(tf));
}
});
mapping.put("body", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new LowerCaseFilter(t));
}
});
mapping.put("body_reverse", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
}
});
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriterConfig conf = new IndexWriterConfig(wrapper);
IndexWriter writer = new IndexWriter(dir, conf);
String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
for (String line : strings) {
Document doc = new Document();
doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
}
DirectoryReader ir = DirectoryReader.open(writer);
LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
// only use forward with constant prefix
assertThat(corrections.length, equalTo(0));
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
// Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
}
use of org.apache.lucene.document.Document in project elasticsearch by elastic.
the class SmoothingModelTestCase method testBuildWordScorer.
/**
* Test the WordScorer emitted by the smoothing model
*/
public void testBuildWordScorer() throws IOException {
SmoothingModel testModel = createTestModel();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("field", new WhitespaceAnalyzer());
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
Document doc = new Document();
doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
DirectoryReader ir = DirectoryReader.open(writer);
WordScorer wordScorer = testModel.buildWordScorerFactory().newScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
assertWordScorer(wordScorer, testModel);
}
use of org.apache.lucene.document.Document in project che by eclipse.
the class LuceneSearcher method createDocument.
protected Document createDocument(VirtualFile virtualFile, Reader reader) throws ServerException {
final Document doc = new Document();
doc.add(new StringField(PATH_FIELD, virtualFile.getPath().toString(), Field.Store.YES));
doc.add(new TextField(NAME_FIELD, virtualFile.getName(), Field.Store.YES));
if (reader != null) {
doc.add(new TextField(TEXT_FIELD, reader));
}
return doc;
}
use of org.apache.lucene.document.Document in project elasticsearch by elastic.
the class IndexShardTestCase method getShardDocUIDs.
protected Set<Uid> getShardDocUIDs(final IndexShard shard) throws IOException {
shard.refresh("get_uids");
try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
Set<Uid> ids = new HashSet<>();
for (LeafReaderContext leafContext : searcher.reader().leaves()) {
LeafReader reader = leafContext.reader();
Bits liveDocs = reader.getLiveDocs();
for (int i = 0; i < reader.maxDoc(); i++) {
if (liveDocs == null || liveDocs.get(i)) {
Document uuid = reader.document(i, Collections.singleton(UidFieldMapper.NAME));
ids.add(Uid.createUid(uuid.get(UidFieldMapper.NAME)));
}
}
}
return ids;
}
}
use of org.apache.lucene.document.Document in project elasticsearch by elastic.
the class FreqTermsEnumTests method setUp.
@Before
@Override
public void setUp() throws Exception {
super.setUp();
referenceAll = new HashMap<>();
referenceNotDeleted = new HashMap<>();
referenceFilter = new HashMap<>();
Directory dir = newDirectory();
// use keyword analyzer we rely on the stored field holding the exact term.
IndexWriterConfig conf = newIndexWriterConfig(new KeywordAnalyzer());
if (frequently()) {
// we don't want to do any merges, so we won't expunge deletes
conf.setMergePolicy(NoMergePolicy.INSTANCE);
}
iw = new IndexWriter(dir, conf);
terms = new String[scaledRandomIntBetween(10, 300)];
for (int i = 0; i < terms.length; i++) {
terms[i] = randomAsciiOfLength(5);
}
int numberOfDocs = scaledRandomIntBetween(30, 300);
Document[] docs = new Document[numberOfDocs];
for (int i = 0; i < numberOfDocs; i++) {
Document doc = new Document();
doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
docs[i] = doc;
for (String term : terms) {
if (randomBoolean()) {
continue;
}
int freq = randomIntBetween(1, 3);
for (int j = 0; j < freq; j++) {
doc.add(new TextField("field", term, Field.Store.YES));
}
}
}
for (int i = 0; i < docs.length; i++) {
Document doc = docs[i];
iw.addDocument(doc);
if (rarely()) {
iw.commit();
}
}
Set<String> deletedIds = new HashSet<>();
for (int i = 0; i < docs.length; i++) {
Document doc = docs[i];
if (randomInt(5) == 2) {
Term idTerm = new Term("id", doc.getField("id").stringValue());
deletedIds.add(idTerm.text());
iw.deleteDocuments(idTerm);
}
}
for (String term : terms) {
referenceAll.put(term, new FreqHolder());
referenceFilter.put(term, new FreqHolder());
referenceNotDeleted.put(term, new FreqHolder());
}
// now go over each doc, build the relevant references and filter
reader = DirectoryReader.open(iw);
List<BytesRef> filterTerms = new ArrayList<>();
for (int docId = 0; docId < reader.maxDoc(); docId++) {
Document doc = reader.document(docId);
addFreqs(doc, referenceAll);
if (!deletedIds.contains(doc.getField("id").stringValue())) {
addFreqs(doc, referenceNotDeleted);
if (randomBoolean()) {
filterTerms.add(new BytesRef(doc.getField("id").stringValue()));
addFreqs(doc, referenceFilter);
}
}
}
filter = new TermInSetQuery("id", filterTerms);
}
Aggregations