Example 26 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

The class NoisyChannelSpellCheckerTests, method testMultiGenerator:

public void testMultiGenerator() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    mapping.put("body_reverse", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
        }
    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
    for (String line : strings) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }
    DirectoryReader ir = DirectoryReader.open(writer);
    LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
    DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
    CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    // only use forward with constant prefix
    corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(0));
    corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    // Test a special case where one of the suggested terms is unchanged by the postFilter; 'II' here is unchanged by the reverse analyzer.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
}
Also used: HashMap(java.util.HashMap) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) DirectSpellChecker(org.apache.lucene.search.spell.DirectSpellChecker) BytesRef(org.apache.lucene.util.BytesRef) DirectoryReader(org.apache.lucene.index.DirectoryReader) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
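
The "body_ngram" analyzer above is what feeds the LaplaceScorer its word bigrams and trigrams. A minimal, self-contained sketch of that token stream (the ShingleDemo class name and the sample input are invented for illustration):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {

    public static void main(String[] args) throws IOException {
        Analyzer ngram = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer t = new StandardTokenizer();
                ShingleFilter tf = new ShingleFilter(t, 2, 3);
                // suppress single-word tokens; only 2- and 3-word shingles are emitted
                tf.setOutputUnigrams(false);
                return new TokenStreamComponents(t, new LowerCaseFilter(tf));
            }
        };
        try (TokenStream ts = ngram.tokenStream("body_ngram", "Xorr the God-Jewel")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // prints: "xorr the", "xorr the god", "the god", "the god jewel", "god jewel"
                System.out.println(term);
            }
            ts.end();
        }
    }
}

Because unigrams are suppressed, only multi-word shingles reach the index, and the default shingle separator matches the single-space separator passed to the scorer as new BytesRef(" ").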

Example 27 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

The class SmoothingModelTestCase, method testBuildWordScorer:

/**
 * Test the WordScorer emitted by the smoothing model.
 */
public void testBuildWordScorer() throws IOException {
    SmoothingModel testModel = createTestModel();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("field", new WhitespaceAnalyzer());
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
    Document doc = new Document();
    doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
    writer.addDocument(doc);
    DirectoryReader ir = DirectoryReader.open(writer);
    WordScorer wordScorer = testModel.buildWordScorerFactory().newScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
    assertWordScorer(wordScorer, testModel);
}
Also used: WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) HashMap(java.util.HashMap) DirectoryReader(org.apache.lucene.index.DirectoryReader) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
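
Which concrete WordScorer the factory returns depends on the smoothing model under test; for instance, a Laplace model's factory is expected to yield a scorer equivalent to the LaplaceScorer constructed directly in Example 26. A one-line sketch with illustrative parameter values:

WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "), 0.5);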

Example 28 with Document

Use of org.apache.lucene.document.Document in project che by eclipse.

The class LuceneSearcher, method createDocument:

protected Document createDocument(VirtualFile virtualFile, Reader reader) throws ServerException {
    final Document doc = new Document();
    doc.add(new StringField(PATH_FIELD, virtualFile.getPath().toString(), Field.Store.YES));
    doc.add(new TextField(NAME_FIELD, virtualFile.getName(), Field.Store.YES));
    if (reader != null) {
        doc.add(new TextField(TEXT_FIELD, reader));
    }
    return doc;
}
Also used: StringField(org.apache.lucene.document.StringField) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document)
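
The StringField/TextField split is the point of this method: the path is indexed as a single verbatim token, while the name (and streamed text) are analyzed into individual tokens. A hypothetical usage sketch; the literal field names "path" and "name" stand in for the PATH_FIELD and NAME_FIELD constants, whose values are not shown in the excerpt:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class PathLookupDemo {

    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // StringField: indexed as one exact, unanalyzed token
            doc.add(new StringField("path", "/project/src/Main.java", Field.Store.YES));
            // TextField: analyzed, so individual name tokens become searchable
            doc.add(new TextField("name", "Main.java", Field.Store.YES));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // a TermQuery against the full, exact path matches the StringField
            TopDocs hits = searcher.search(new TermQuery(new Term("path", "/project/src/Main.java")), 1);
            System.out.println(hits.totalHits); // 1
        }
    }
}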

Example 29 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

The class IndexShardTestCase, method getShardDocUIDs:

protected Set<Uid> getShardDocUIDs(final IndexShard shard) throws IOException {
    shard.refresh("get_uids");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        Set<Uid> ids = new HashSet<>();
        for (LeafReaderContext leafContext : searcher.reader().leaves()) {
            LeafReader reader = leafContext.reader();
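            // getLiveDocs() returns null when the segment has no deletions; otherwise a set bit marks a live (non-deleted) doc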
            Bits liveDocs = reader.getLiveDocs();
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (liveDocs == null || liveDocs.get(i)) {
                    Document uuid = reader.document(i, Collections.singleton(UidFieldMapper.NAME));
                    ids.add(Uid.createUid(uuid.get(UidFieldMapper.NAME)));
                }
            }
        }
        return ids;
    }
}
Also used: Uid(org.elasticsearch.index.mapper.Uid) LeafReader(org.apache.lucene.index.LeafReader) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Bits(org.apache.lucene.util.Bits) Document(org.apache.lucene.document.Document) Engine(org.elasticsearch.index.engine.Engine) HashSet(java.util.HashSet)

Example 30 with Document

Use of org.apache.lucene.document.Document in project elasticsearch by elastic.

The class FreqTermsEnumTests, method setUp:

@Before
@Override
public void setUp() throws Exception {
    super.setUp();
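    // expected term frequencies: across all docs, across non-deleted docs only,
    // and across the random subset of live docs selected for the id filter below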
    referenceAll = new HashMap<>();
    referenceNotDeleted = new HashMap<>();
    referenceFilter = new HashMap<>();
    Directory dir = newDirectory();
    // use a keyword analyzer; we rely on the stored field holding the exact term
    IndexWriterConfig conf = newIndexWriterConfig(new KeywordAnalyzer());
    if (frequently()) {
        // we don't want to do any merges, so we won't expunge deletes
        conf.setMergePolicy(NoMergePolicy.INSTANCE);
    }
    iw = new IndexWriter(dir, conf);
    terms = new String[scaledRandomIntBetween(10, 300)];
    for (int i = 0; i < terms.length; i++) {
        terms[i] = randomAsciiOfLength(5);
    }
    int numberOfDocs = scaledRandomIntBetween(30, 300);
    Document[] docs = new Document[numberOfDocs];
    for (int i = 0; i < numberOfDocs; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
        docs[i] = doc;
        for (String term : terms) {
            if (randomBoolean()) {
                continue;
            }
            int freq = randomIntBetween(1, 3);
            for (int j = 0; j < freq; j++) {
                doc.add(new TextField("field", term, Field.Store.YES));
            }
        }
    }
    for (int i = 0; i < docs.length; i++) {
        Document doc = docs[i];
        iw.addDocument(doc);
        if (rarely()) {
            iw.commit();
        }
    }
    Set<String> deletedIds = new HashSet<>();
    for (int i = 0; i < docs.length; i++) {
        Document doc = docs[i];
        if (randomInt(5) == 2) {
            Term idTerm = new Term("id", doc.getField("id").stringValue());
            deletedIds.add(idTerm.text());
            iw.deleteDocuments(idTerm);
        }
    }
    for (String term : terms) {
        referenceAll.put(term, new FreqHolder());
        referenceFilter.put(term, new FreqHolder());
        referenceNotDeleted.put(term, new FreqHolder());
    }
    // now go over each doc, build the relevant references and filter
    reader = DirectoryReader.open(iw);
    List<BytesRef> filterTerms = new ArrayList<>();
    for (int docId = 0; docId < reader.maxDoc(); docId++) {
        Document doc = reader.document(docId);
        addFreqs(doc, referenceAll);
        if (!deletedIds.contains(doc.getField("id").stringValue())) {
            addFreqs(doc, referenceNotDeleted);
            if (randomBoolean()) {
                filterTerms.add(new BytesRef(doc.getField("id").stringValue()));
                addFreqs(doc, referenceFilter);
            }
        }
    }
    filter = new TermInSetQuery("id", filterTerms);
}
Also used: KeywordAnalyzer(org.apache.lucene.analysis.core.KeywordAnalyzer) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) IndexWriter(org.apache.lucene.index.IndexWriter) TermInSetQuery(org.apache.lucene.search.TermInSetQuery) StringField(org.apache.lucene.document.StringField) TextField(org.apache.lucene.document.TextField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) HashSet(java.util.HashSet) Before(org.junit.Before)
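
The addFreqs helper and the FreqHolder class are not shown in this excerpt. A plausible sketch of what they might look like, inferred from how setUp stores one "field" value per term occurrence; the actual test class may track its counts differently:

// assumed context: inside the test class, with org.apache.lucene.index.IndexableField
// and java.util.* imported
static class FreqHolder {
    int docFreq;        // how many docs contain the term
    long totalTermFreq; // total occurrences across all docs
}

void addFreqs(Document doc, Map<String, FreqHolder> reference) {
    Set<String> seenInDoc = new HashSet<>();
    for (IndexableField field : doc.getFields("field")) {
        String term = field.stringValue();
        FreqHolder holder = reference.get(term);
        if (holder == null) {
            // only terms from the generated vocabulary are tracked
            continue;
        }
        if (seenInDoc.add(term)) {
            holder.docFreq++; // count each document once per term
        }
        holder.totalTermFreq++;
    }
}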

Aggregations

Document (org.apache.lucene.document.Document): 2344
Directory (org.apache.lucene.store.Directory): 1374
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 798
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 752
IndexReader (org.apache.lucene.index.IndexReader): 598
Field (org.apache.lucene.document.Field): 480
IndexSearcher (org.apache.lucene.search.IndexSearcher): 470
Term (org.apache.lucene.index.Term): 456
BytesRef (org.apache.lucene.util.BytesRef): 415
StringField (org.apache.lucene.document.StringField): 403
TextField (org.apache.lucene.document.TextField): 389
NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField): 325
IndexWriter (org.apache.lucene.index.IndexWriter): 312
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 278
TopDocs (org.apache.lucene.search.TopDocs): 270
TermQuery (org.apache.lucene.search.TermQuery): 237
FieldType (org.apache.lucene.document.FieldType): 231
DirectoryReader (org.apache.lucene.index.DirectoryReader): 226
Test (org.junit.Test): 222
RAMDirectory (org.apache.lucene.store.RAMDirectory): 211