Search in sources :

Example 66 with TextField

use of org.apache.lucene.document.TextField in project lucene-solr by apache.

the class DistinctValuesCollectorTest method createIndexContext.

private IndexContext createIndexContext() throws Exception {
    Random random = random();
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
    int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER;
    String[] groupValues = new String[numDocs / 5];
    String[] countValues = new String[numDocs / 10];
    for (int i = 0; i < groupValues.length; i++) {
        groupValues[i] = generateRandomNonEmptyString();
    }
    for (int i = 0; i < countValues.length; i++) {
        countValues[i] = generateRandomNonEmptyString();
    }
    List<String> contentStrings = new ArrayList<>();
    Map<String, Map<String, Set<String>>> searchTermToGroupCounts = new HashMap<>();
    for (int i = 1; i <= numDocs; i++) {
        String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)];
        String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)];
        String content = "random" + random.nextInt(numDocs / 20);
        Map<String, Set<String>> groupToCounts = searchTermToGroupCounts.get(content);
        if (groupToCounts == null) {
            // Groups sort always DOCID asc...
            searchTermToGroupCounts.put(content, groupToCounts = new LinkedHashMap<>());
            contentStrings.add(content);
        }
        Set<String> countsVals = groupToCounts.get(groupValue);
        if (countsVals == null) {
            groupToCounts.put(groupValue, countsVals = new HashSet<>());
        }
        countsVals.add(countValue);
        Document doc = new Document();
        doc.add(new StringField("id", String.format(Locale.ROOT, "%09d", i), Field.Store.YES));
        doc.add(new SortedDocValuesField("id", new BytesRef(String.format(Locale.ROOT, "%09d", i))));
        if (groupValue != null) {
            addField(doc, GROUP_FIELD, groupValue);
        }
        if (countValue != null) {
            addField(doc, COUNT_FIELD, countValue);
        }
        doc.add(new TextField("content", content, Field.Store.YES));
        w.addDocument(doc);
    }
    DirectoryReader reader = w.getReader();
    if (VERBOSE) {
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            System.out.println("docID=" + docID + " id=" + doc.get("id") + " content=" + doc.get("content") + " author=" + doc.get("author") + " publisher=" + doc.get("publisher"));
        }
    }
    w.close();
    return new IndexContext(dir, reader, searchTermToGroupCounts, contentStrings.toArray(new String[contentStrings.size()]));
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) DirectoryReader(org.apache.lucene.index.DirectoryReader) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) LinkedHashMap(java.util.LinkedHashMap) Random(java.util.Random) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringField(org.apache.lucene.document.StringField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) TextField(org.apache.lucene.document.TextField) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet)

Example 67 with TextField

use of org.apache.lucene.document.TextField in project lucene-solr by apache.

the class PayloadHelper method setUp.

/**
   * Sets up a RAMDirectory, and adds documents (using English.intToEnglish()) with two fields: field and multiField
   * and analyzes them using the PayloadAnalyzer
   * @param similarity The Similarity class to use in the Searcher
   * @param numDocs The num docs to add
   * @return An IndexSearcher
   */
// TODO: randomize
public IndexSearcher setUp(Random random, Similarity similarity, int numDocs) throws IOException {
    Directory directory = new MockDirectoryWrapper(random, new RAMDirectory());
    PayloadAnalyzer analyzer = new PayloadAnalyzer();
    // TODO randomize this
    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer).setSimilarity(similarity));
    // writer.infoStream = System.out;
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        doc.add(new TextField(FIELD, English.intToEnglish(i), Field.Store.YES));
        doc.add(new TextField(MULTI_FIELD, English.intToEnglish(i) + "  " + English.intToEnglish(i), Field.Store.YES));
        doc.add(new TextField(NO_PAYLOAD_FIELD, English.intToEnglish(i), Field.Store.YES));
        writer.addDocument(doc);
    }
    writer.forceMerge(1);
    reader = DirectoryReader.open(writer);
    writer.close();
    IndexSearcher searcher = LuceneTestCase.newSearcher(LuceneTestCase.getOnlyLeafReader(reader));
    searcher.setSimilarity(similarity);
    return searcher;
}
Also used : MockDirectoryWrapper(org.apache.lucene.store.MockDirectoryWrapper) IndexSearcher(org.apache.lucene.search.IndexSearcher) IndexWriter(org.apache.lucene.index.IndexWriter) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 68 with TextField

use of org.apache.lucene.document.TextField in project lucene-solr by apache.

the class TestPayloadSpans method testShrinkToAfterShortestMatch2.

public void testShrinkToAfterShortestMatch2() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(new TestPayloadAnalyzer()));
    Document doc = new Document();
    doc.add(new TextField("content", new StringReader("a b a d k f a h i k a k")));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    IndexSearcher is = newSearcher(getOnlyLeafReader(reader), false);
    writer.close();
    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
    VerifyingCollector collector = new VerifyingCollector();
    Spans spans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.PAYLOADS);
    TopDocs topDocs = is.search(snq, 1);
    Set<String> payloadSet = new HashSet<>();
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
        while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
            while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                collector.reset();
                spans.collect(collector);
                for (final BytesRef payload : collector.payloads) {
                    payloadSet.add(Term.toString(payload));
                }
            }
        }
    }
    assertEquals(2, payloadSet.size());
    assertTrue(payloadSet.contains("a:Noise:10"));
    assertTrue(payloadSet.contains("k:Noise:11"));
    reader.close();
    directory.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) SpanQuery(org.apache.lucene.search.spans.SpanQuery) Spans(org.apache.lucene.search.spans.Spans) TopDocs(org.apache.lucene.search.TopDocs) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) StringReader(java.io.StringReader) IndexReader(org.apache.lucene.index.IndexReader) TextField(org.apache.lucene.document.TextField) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet)

Example 69 with TextField

use of org.apache.lucene.document.TextField in project lucene-solr by apache.

the class IndexBasedSpellCheckerTest method testAlternateLocation.

@Test
public void testAlternateLocation() throws Exception {
    String[] ALT_DOCS = new String[] { "jumpin jack flash", "Sargent Peppers Lonely Hearts Club Band", "Born to Run", "Thunder Road", "Londons Burning", "A Horse with No Name", "Sweet Caroline" };
    IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    File tmpDir = createTempDir().toFile();
    File indexDir = new File(tmpDir, "spellingIdx");
    //create a standalone index
    File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
    Directory dir = newFSDirectory(altIndexDir.toPath());
    IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()));
    for (int i = 0; i < ALT_DOCS.length; i++) {
        Document doc = new Document();
        doc.add(new TextField("title", ALT_DOCS[i], Field.Store.YES));
        iw.addDocument(doc);
    }
    iw.forceMerge(1);
    iw.close();
    dir.close();
    indexDir.mkdirs();
    spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(AbstractLuceneSpellChecker.LOCATION, altIndexDir.getAbsolutePath());
    spellchecker.add(AbstractLuceneSpellChecker.FIELD, "title");
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core);
    assertTrue(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME) == true);
    RefCounted<SolrIndexSearcher> holder = core.getSearcher();
    SolrIndexSearcher searcher = holder.get();
    try {
        checker.build(core, searcher);
        IndexReader reader = searcher.getIndexReader();
        Collection<Token> tokens = queryConverter.convert("flesh");
        SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, true, 0.5f, null);
        SpellingResult result = checker.getSuggestions(spellOpts);
        assertTrue("result is null and it shouldn't be", result != null);
        //should be lowercased, b/c we are using a lowercasing analyzer
        Map<String, Integer> suggestions = result.get(spellOpts.tokens.iterator().next());
        assertTrue("flesh is null and it shouldn't be", suggestions != null);
        assertTrue("flesh Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
        Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
        assertTrue(entry.getKey() + " is not equal to " + "flash", entry.getKey().equals("flash") == true);
        assertTrue(entry.getValue() + " does not equal: " + 1, entry.getValue() == 1);
        //test something not in the spell checker
        spellOpts.tokens = queryConverter.convert("super");
        result = checker.getSuggestions(spellOpts);
        assertTrue("result is null and it shouldn't be", result != null);
        suggestions = result.get(spellOpts.tokens.iterator().next());
        assertTrue("suggestions size should be 0", suggestions.size() == 0);
        spellOpts.tokens = queryConverter.convert("Caroline");
        result = checker.getSuggestions(spellOpts);
        assertTrue("result is null and it shouldn't be", result != null);
        suggestions = result.get(spellOpts.tokens.iterator().next());
        assertTrue("suggestions is not null and it should be", suggestions == null);
    } finally {
        holder.decref();
    }
}
Also used : SolrCore(org.apache.solr.core.SolrCore) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) TextField(org.apache.lucene.document.TextField) Directory(org.apache.lucene.store.Directory) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) NamedList(org.apache.solr.common.util.NamedList) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) Date(java.util.Date) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) Map(java.util.Map) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Test(org.junit.Test)

Example 70 with TextField

use of org.apache.lucene.document.TextField in project lucene-solr by apache.

the class QueryAutoStopWordAnalyzerTest method setUp.

@Override
public void setUp() throws Exception {
    super.setUp();
    dir = new RAMDirectory();
    appAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(appAnalyzer));
    int numDocs = 200;
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        String variedFieldValue = variedFieldValues[i % variedFieldValues.length];
        String repetitiveFieldValue = repetitiveFieldValues[i % repetitiveFieldValues.length];
        doc.add(new TextField("variedField", variedFieldValue, Field.Store.YES));
        doc.add(new TextField("repetitiveField", repetitiveFieldValue, Field.Store.YES));
        writer.addDocument(doc);
    }
    writer.close();
    reader = DirectoryReader.open(dir);
}
Also used : IndexWriter(org.apache.lucene.index.IndexWriter) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

TextField (org.apache.lucene.document.TextField)194 Document (org.apache.lucene.document.Document)173 Directory (org.apache.lucene.store.Directory)99 Term (org.apache.lucene.index.Term)63 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)61 IndexWriter (org.apache.lucene.index.IndexWriter)58 IndexSearcher (org.apache.lucene.search.IndexSearcher)55 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)52 Field (org.apache.lucene.document.Field)50 StringField (org.apache.lucene.document.StringField)50 BytesRef (org.apache.lucene.util.BytesRef)49 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)44 IndexReader (org.apache.lucene.index.IndexReader)43 TermQuery (org.apache.lucene.search.TermQuery)41 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)31 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)30 TopDocs (org.apache.lucene.search.TopDocs)29 RAMDirectory (org.apache.lucene.store.RAMDirectory)29 FieldType (org.apache.lucene.document.FieldType)23 Query (org.apache.lucene.search.Query)23