Example 71 with TextField

Use of org.apache.lucene.document.TextField in project lucene-solr by apache.

From the class TestTermVectors, method beforeClass:

@BeforeClass
public static void beforeClass() throws Exception {
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)).setMergePolicy(newLogMergePolicy()));
    //writer.infoStream = System.out;
    for (int i = 0; i < 1000; i++) {
        Document doc = new Document();
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        int mod3 = i % 3;
        int mod2 = i % 2;
        if (mod2 == 0 && mod3 == 0) {
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorOffsets(true);
            ft.setStoreTermVectorPositions(true);
        } else if (mod2 == 0) {
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorPositions(true);
        } else if (mod3 == 0) {
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorOffsets(true);
        } else {
            ft.setStoreTermVectors(true);
        }
        doc.add(new Field("field", English.intToEnglish(i), ft));
        //test no term vectors too
        doc.add(new TextField("noTV", English.intToEnglish(i), Field.Store.YES));
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    writer.close();
}
Also used: Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Document (org.apache.lucene.document.Document), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), FieldType (org.apache.lucene.document.FieldType), BeforeClass (org.junit.BeforeClass)
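
Stripped of the test harness, the pattern above is small: copy TextField.TYPE_STORED into a mutable FieldType, switch on the term-vector options you need, and index a Field with that type next to a plain TextField. The following is a minimal, self-contained sketch of that idea, not code from lucene-solr; the class name, field names, StandardAnalyzer and RAMDirectory are illustrative assumptions for a Lucene 6.x-era classpath.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TermVectorTextFieldSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory(); // in-memory index, for illustration only
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        // Copy TextField's stored type and enable term vectors on the copy;
        // TextField.TYPE_STORED itself is frozen and cannot be modified.
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);
        ft.freeze();

        Document doc = new Document();
        doc.add(new Field("field", "some analyzed text", ft)); // with term vectors
        doc.add(new TextField("noTV", "plain analyzed text", Field.Store.YES)); // without term vectors
        writer.addDocument(doc);

        writer.close();
        dir.close();
    }
}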

Example 72 with TextField

Use of org.apache.lucene.document.TextField in project lucene-solr by apache.

From the class TestTerms, method testTermMinMaxRandom:

public void testTermMinMaxRandom() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    int numDocs = atLeast(100);
    BytesRef minTerm = null;
    BytesRef maxTerm = null;
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field field = new TextField("field", "", Field.Store.NO);
        doc.add(field);
        //System.out.println("  doc " + i);
        CannedBinaryTokenStream.BinaryToken[] tokens = new CannedBinaryTokenStream.BinaryToken[atLeast(10)];
        for (int j = 0; j < tokens.length; j++) {
            byte[] bytes = new byte[TestUtil.nextInt(random(), 1, 20)];
            random().nextBytes(bytes);
            BytesRef tokenBytes = new BytesRef(bytes);
            //System.out.println("    token " + tokenBytes);
            if (minTerm == null || tokenBytes.compareTo(minTerm) < 0) {
                //System.out.println("      ** new min");
                minTerm = tokenBytes;
            }
            if (maxTerm == null || tokenBytes.compareTo(maxTerm) > 0) {
                //System.out.println("      ** new max");
                maxTerm = tokenBytes;
            }
            tokens[j] = new CannedBinaryTokenStream.BinaryToken(tokenBytes);
        }
        field.setTokenStream(new CannedBinaryTokenStream(tokens));
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    Terms terms = MultiFields.getTerms(r, "field");
    assertEquals(minTerm, terms.getMin());
    assertEquals(maxTerm, terms.getMax());
    r.close();
    w.close();
    dir.close();
}
Also used: CannedBinaryTokenStream (org.apache.lucene.analysis.CannedBinaryTokenStream), Document (org.apache.lucene.document.Document), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
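
The test drives binary terms through CannedBinaryTokenStream from the Lucene test framework, but the Terms.getMin()/getMax() contract it verifies is plain public API. Below is a minimal sketch of that contract with ordinary analyzed text; the names and the StandardAnalyzer/RAMDirectory setup are illustrative assumptions, and the same Lucene 6.x-era API as the test (MultiFields.getTerms) is assumed.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TermMinMaxSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        // TextField analyzes its value, so the indexed terms are "apple", "banana", "cherry".
        doc.add(new TextField("field", "banana apple cherry", Field.Store.NO));
        w.addDocument(doc);
        w.close();

        IndexReader r = DirectoryReader.open(dir);
        Terms terms = MultiFields.getTerms(r, "field");
        System.out.println("min term: " + terms.getMin().utf8ToString()); // apple
        System.out.println("max term: " + terms.getMax().utf8ToString()); // cherry
        r.close();
        dir.close();
    }
}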

Example 73 with TextField

Use of org.apache.lucene.document.TextField in project lucene-solr by apache.

From the class IndexBasedSpellCheckerTest, method testAlternateLocation:

@Test
public void testAlternateLocation() throws Exception {
    String[] ALT_DOCS = new String[] { "jumpin jack flash", "Sargent Peppers Lonely Hearts Club Band", "Born to Run", "Thunder Road", "Londons Burning", "A Horse with No Name", "Sweet Caroline" };
    IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    File tmpDir = createTempDir().toFile();
    File indexDir = new File(tmpDir, "spellingIdx");
    //create a standalone index
    File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
    Directory dir = newFSDirectory(altIndexDir.toPath());
    IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()));
    for (int i = 0; i < ALT_DOCS.length; i++) {
        Document doc = new Document();
        doc.add(new TextField("title", ALT_DOCS[i], Field.Store.YES));
        iw.addDocument(doc);
    }
    iw.forceMerge(1);
    iw.close();
    dir.close();
    indexDir.mkdirs();
    spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(AbstractLuceneSpellChecker.LOCATION, altIndexDir.getAbsolutePath());
    spellchecker.add(AbstractLuceneSpellChecker.FIELD, "title");
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core);
    assertTrue(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME) == true);
    RefCounted<SolrIndexSearcher> holder = core.getSearcher();
    SolrIndexSearcher searcher = holder.get();
    try {
        checker.build(core, searcher);
        IndexReader reader = searcher.getIndexReader();
        Collection<Token> tokens = queryConverter.convert("flesh");
        SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, true, 0.5f, null);
        SpellingResult result = checker.getSuggestions(spellOpts);
        assertTrue("result is null and it shouldn't be", result != null);
        //should be lowercased, b/c we are using a lowercasing analyzer
        Map<String, Integer> suggestions = result.get(spellOpts.tokens.iterator().next());
        assertTrue("flesh is null and it shouldn't be", suggestions != null);
        assertTrue("flesh Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
        Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
        assertTrue(entry.getKey() + " is not equal to " + "flash", entry.getKey().equals("flash") == true);
        assertTrue(entry.getValue() + " does not equal: " + 1, entry.getValue() == 1);
        //test something not in the spell checker
        spellOpts.tokens = queryConverter.convert("super");
        result = checker.getSuggestions(spellOpts);
        assertTrue("result is null and it shouldn't be", result != null);
        suggestions = result.get(spellOpts.tokens.iterator().next());
        assertTrue("suggestions size should be 0", suggestions.size() == 0);
        spellOpts.tokens = queryConverter.convert("Caroline");
        result = checker.getSuggestions(spellOpts);
        assertTrue("result is null and it shouldn't be", result != null);
        suggestions = result.get(spellOpts.tokens.iterator().next());
        assertTrue("suggestions is not null and it should be", suggestions == null);
    } finally {
        holder.decref();
    }
}
Also used: SolrCore (org.apache.solr.core.SolrCore), Token (org.apache.lucene.analysis.Token), Document (org.apache.lucene.document.Document), TextField (org.apache.lucene.document.TextField), Directory (org.apache.lucene.store.Directory), WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), NamedList (org.apache.solr.common.util.NamedList), SolrIndexSearcher (org.apache.solr.search.SolrIndexSearcher), Date (java.util.Date), IndexWriter (org.apache.lucene.index.IndexWriter), IndexReader (org.apache.lucene.index.IndexReader), File (java.io.File), Map (java.util.Map), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig), Test (org.junit.Test)
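
Most of the test is Solr plumbing; the TextField part is simply building a standalone Lucene index whose analyzed "title" field later feeds the spell checker as its alternate location. A minimal sketch of just that indexing step follows; the class name, temp-directory handling, and the sample titles are illustrative assumptions, not code from the test.

import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SpellingSourceIndexSketch {
    public static void main(String[] args) throws Exception {
        String[] titles = { "jumpin jack flash", "Born to Run", "Thunder Road" };

        Path indexPath = Files.createTempDirectory("alternateIdx");
        Directory dir = FSDirectory.open(indexPath);
        IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()));
        for (String title : titles) {
            Document doc = new Document();
            // Analyzed (and, as in the test, stored); the spell checker reads the indexed terms of "title".
            doc.add(new TextField("title", title, Field.Store.YES));
            iw.addDocument(doc);
        }
        iw.forceMerge(1); // collapse to a single segment, as in the test
        iw.close();
        dir.close();
    }
}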

Example 74 with TextField

Use of org.apache.lucene.document.TextField in project lucene-solr by apache.

From the class TestSimilarityBase, method testLengthEncodingBackwardCompatibility:

public void testLengthEncodingBackwardCompatibility() throws IOException {
    Similarity similarity = RandomPicks.randomFrom(random(), sims);
    for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major }) {
        for (int length : new int[] { 1, 2, 4 }) {
            // these length values are encoded accurately on both cases
            Directory dir = newDirectory();
            // set the version on the directory
            new SegmentInfos(indexCreatedVersionMajor).commit(dir);
            IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
            Document doc = new Document();
            String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
            doc.add(new TextField("foo", value, Store.NO));
            w.addDocument(doc);
            IndexReader reader = DirectoryReader.open(w);
            IndexSearcher searcher = newSearcher(reader);
            searcher.setSimilarity(similarity);
            Term term = new Term("foo", "b");
            TermContext context = TermContext.build(reader.getContext(), term);
            SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"), searcher.termStatistics(term, context));
            SimilarityBase.BasicSimScorer simScorer = (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
            float docLength = simScorer.getLengthValue(0);
            assertEquals(length, (int) docLength);
            w.close();
            reader.close();
            dir.close();
        }
    }
}
Also used: IntStream (java.util.stream.IntStream), Query (org.apache.lucene.search.Query), RandomPicks (com.carrotsearch.randomizedtesting.generators.RandomPicks), FieldType (org.apache.lucene.document.FieldType), Term (org.apache.lucene.index.Term), SimWeight (org.apache.lucene.search.similarities.Similarity.SimWeight), ArrayList (java.util.ArrayList), Document (org.apache.lucene.document.Document), Directory (org.apache.lucene.store.Directory), Store (org.apache.lucene.document.Field.Store), TermStatistics (org.apache.lucene.search.TermStatistics), TopDocs (org.apache.lucene.search.TopDocs), Explanation (org.apache.lucene.search.Explanation), BytesRef (org.apache.lucene.util.BytesRef), DirectoryReader (org.apache.lucene.index.DirectoryReader), IOException (java.io.IOException), TermContext (org.apache.lucene.index.TermContext), Collectors (java.util.stream.Collectors), Version (org.apache.lucene.util.Version), SegmentInfos (org.apache.lucene.index.SegmentInfos), List (java.util.List), FieldInvertState (org.apache.lucene.index.FieldInvertState), IndexWriter (org.apache.lucene.index.IndexWriter), CollectionStatistics (org.apache.lucene.search.CollectionStatistics), TermQuery (org.apache.lucene.search.TermQuery), Field (org.apache.lucene.document.Field), LuceneTestCase (org.apache.lucene.util.LuceneTestCase), TextField (org.apache.lucene.document.TextField), IndexOptions (org.apache.lucene.index.IndexOptions), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), IndexReader (org.apache.lucene.index.IndexReader), IndexSearcher (org.apache.lucene.search.IndexSearcher)
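
The test reaches into SimilarityBase internals (BasicSimScorer and its length value), which only works from inside Lucene's own test code. From the public-API side, the TextField pattern it relies on is simply: index an analyzed field under a chosen Similarity and use the same Similarity on the searcher. Below is a minimal sketch under those assumptions, with BM25Similarity standing in for the randomly picked similarity and illustrative class, field, and directory choices.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class SimilarityOverTextFieldSketch {
    public static void main(String[] args) throws Exception {
        Similarity similarity = new BM25Similarity();

        Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir,
                new IndexWriterConfig(new StandardAnalyzer()).setSimilarity(similarity));
        Document doc = new Document();
        // Four occurrences of "b": the field length flows into the length norm at index time.
        doc.add(new TextField("foo", "b b b b", Field.Store.NO));
        w.addDocument(doc);
        w.close();

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity); // must match the index-time similarity
        TopDocs hits = searcher.search(new TermQuery(new Term("foo", "b")), 10);
        System.out.println("hits=" + hits.totalHits + " score=" + hits.scoreDocs[0].score);
        reader.close();
        dir.close();
    }
}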

Example 75 with TextField

Use of org.apache.lucene.document.TextField in project lucene-solr by apache.

From the class TokenSourcesTest, method testMaxStartOffsetConsistency:

public void testMaxStartOffsetConsistency() throws IOException {
    FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    tvFieldType.setStoreTermVectors(true);
    tvFieldType.setStoreTermVectorOffsets(true);
    tvFieldType.setStoreTermVectorPositions(true);
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    //we don't necessarily consume the whole stream because of limiting by startOffset
    analyzer.setEnableChecks(false);
    Document doc = new Document();
    final String TEXT = " f gg h";
    doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
    doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
    IndexReader reader;
    try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
        writer.addDocument(doc);
        reader = writer.getReader();
    }
    try {
        Fields tvFields = reader.getTermVectors(0);
        for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
            TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
            TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
            //assert have same tokens, none of which has a start offset > maxStartOffset
            final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
            final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
            tvStream.reset();
            anaStream.reset();
            while (tvStream.incrementToken()) {
                assertTrue(anaStream.incrementToken());
                assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
                if (maxStartOffset >= 0)
                    assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
            }
            assertTrue(anaStream.incrementToken() == false);
            tvStream.end();
            anaStream.end();
            tvStream.close();
            anaStream.close();
        }
    } finally {
        reader.close();
    }
    dir.close();
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Document (org.apache.lucene.document.Document), FieldType (org.apache.lucene.document.FieldType), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), Fields (org.apache.lucene.index.Fields), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), IndexReader (org.apache.lucene.index.IndexReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory)
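
The document above mixes the two unstored TextField flavors: one built from a raw TokenStream (no term vectors) next to a Field whose FieldType adds term vectors. For reference, here is a minimal sketch of the three TextField constructors (String, Reader, TokenStream). It is not code from the test; class and field names are illustrative, and a second analyzer instance is used for the pre-built stream so the writer's own analysis cannot recycle it before the document is indexed.

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TextFieldConstructorsSketch {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer writerAnalyzer = new StandardAnalyzer();
        // Separate instance so the writer's analysis cannot recycle the pre-built stream below.
        StandardAnalyzer streamAnalyzer = new StandardAnalyzer();

        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(writerAnalyzer));

        String text = " f gg h";
        Document doc = new Document();
        // 1) String value: analyzed by the writer's analyzer, optionally stored.
        doc.add(new TextField("fld_string", text, Field.Store.YES));
        // 2) Reader value: analyzed, never stored.
        doc.add(new TextField("fld_reader", new StringReader(text)));
        // 3) Pre-built TokenStream value: indexed as provided, never stored (as in the test above).
        doc.add(new TextField("fld_stream", streamAnalyzer.tokenStream("fld_stream", text)));
        writer.addDocument(doc);

        writer.close();
        dir.close();
    }
}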

Aggregations

TextField (org.apache.lucene.document.TextField): 192
Document (org.apache.lucene.document.Document): 171
Directory (org.apache.lucene.store.Directory): 99
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 61
Term (org.apache.lucene.index.Term): 61
IndexWriter (org.apache.lucene.index.IndexWriter): 58
IndexSearcher (org.apache.lucene.search.IndexSearcher): 55
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 52
Field (org.apache.lucene.document.Field): 50
StringField (org.apache.lucene.document.StringField): 48
BytesRef (org.apache.lucene.util.BytesRef): 48
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 44
IndexReader (org.apache.lucene.index.IndexReader): 43
TermQuery (org.apache.lucene.search.TermQuery): 41
NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField): 31
SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField): 30
TopDocs (org.apache.lucene.search.TopDocs): 29
RAMDirectory (org.apache.lucene.store.RAMDirectory): 29
FieldType (org.apache.lucene.document.FieldType): 23
Query (org.apache.lucene.search.Query): 23