Search in sources :

Example 26 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class TokenSourcesTest method testOverlapWithPositionsAndOffsetExactPhrase.

public void testOverlapWithPositionsAndOffsetExactPhrase() throws IOException, InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
    try {
        final Document document = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
        customType.setStoreTermVectorOffsets(true);
        document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
        indexWriter.addDocument(document);
    } finally {
        indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
        assertEquals(1, indexReader.numDocs());
        final IndexSearcher indexSearcher = newSearcher(indexReader);
        // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
        // query.add(new SpanTermQuery(new Term(FIELD, "the")));
        // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
        final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD, "the")), new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
        TopDocs hits = indexSearcher.search(phraseQuery, 1);
        assertEquals(1, hits.totalHits);
        final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
        final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
        assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
        indexReader.close();
        directory.close();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) IndexReader(org.apache.lucene.index.IndexReader) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) Directory(org.apache.lucene.store.Directory)

Example 27 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class FastVectorHighlighterTest method testCommonTermsQueryHighlight.

public void testCommonTermsQueryHighlight() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
    FieldType type = new FieldType(TextField.TYPE_STORED);
    type.setStoreTermVectorOffsets(true);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectors(true);
    type.freeze();
    String[] texts = { "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", "JFK has been shot", "John Kennedy has been shot", "This text has a typo in referring to Keneddy", "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
    for (int i = 0; i < texts.length; i++) {
        Document doc = new Document();
        Field field = new Field("field", texts[i], type);
        doc.add(field);
        writer.addDocument(doc);
    }
    CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 2);
    query.add(new Term("field", "text"));
    query.add(new Term("field", "long"));
    query.add(new Term("field", "very"));
    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    IndexReader reader = DirectoryReader.open(writer);
    IndexSearcher searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    assertEquals(2, hits.totalHits);
    FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
    String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
    assertEquals("This piece of <b>text</b> refers to Kennedy at the beginning then has a longer piece of <b>text</b> that is <b>very</b> <b>long</b> in the middle and finally ends with another reference to Kennedy", bestFragments[0]);
    fieldQuery = highlighter.getFieldQuery(query, reader);
    bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[1].doc, "field", 1000, 1);
    assertEquals("Hello this is a piece of <b>text</b> that is <b>very</b> <b>long</b> and contains too much preamble and the meat is really here which says kennedy has been shot", bestFragments[0]);
    reader.close();
    writer.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) CommonTermsQuery(org.apache.lucene.queries.CommonTermsQuery) TopDocs(org.apache.lucene.search.TopDocs) StoredField(org.apache.lucene.document.StoredField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) Directory(org.apache.lucene.store.Directory)

Example 28 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class FastVectorHighlighterTest method matchedFieldsTestCase.

private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
    Document doc = new Document();
    FieldType stored = new FieldType(TextField.TYPE_STORED);
    stored.setStoreTermVectorOffsets(true);
    stored.setStoreTermVectorPositions(true);
    stored.setStoreTermVectors(true);
    stored.freeze();
    FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
    matched.setStoreTermVectorOffsets(true);
    matched.setStoreTermVectorPositions(true);
    matched.setStoreTermVectors(true);
    matched.freeze();
    // Whitespace tokenized with English stop words
    doc.add(new Field("field", fieldValue, stored));
    // Whitespace tokenized without stop words
    doc.add(new Field("field_exact", fieldValue, matched));
    // Whitespace tokenized without toLower
    doc.add(new Field("field_super_exact", fieldValue, matched));
    // Each letter is a token
    doc.add(new Field("field_characters", fieldValue, matched));
    // Every three letters is a token
    doc.add(new Field("field_tripples", fieldValue, matched));
    doc.add(new Field("field_sliced", // Sliced at 10 chars then analyzed just like field
    fieldValue.substring(// Sliced at 10 chars then analyzed just like field
    0, Math.min(fieldValue.length() - 1, 10)), matched));
    doc.add(new Field("field_der_red", new // Hacky field containing "der" and "red" at pos = 0
    CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
    fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
    fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
    // This is required even though we provide a token stream
    fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
    Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {

        public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
        }
    };
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    writer.addDocument(doc);
    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    FragListBuilder fragListBuilder = new SimpleFragListBuilder();
    FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
    IndexReader reader = DirectoryReader.open(writer);
    String[] preTags = new String[] { "<b>" };
    String[] postTags = new String[] { "</b>" };
    Encoder encoder = new DefaultEncoder();
    int docId = 0;
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query clause : queryClauses) {
        query.add(clause, Occur.MUST);
    }
    FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
    String[] bestFragments;
    if (useMatchedFields) {
        Set<String> matchedFields = new HashSet<>();
        matchedFields.add("field");
        matchedFields.add("field_exact");
        matchedFields.add("field_super_exact");
        matchedFields.add("field_characters");
        matchedFields.add("field_tripples");
        matchedFields.add("field_sliced");
        matchedFields.add("field_der_red");
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    } else {
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    }
    assertEquals(expected, bestFragments[0]);
    reader.close();
    writer.close();
    dir.close();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) CommonTermsQuery(org.apache.lucene.queries.CommonTermsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) CustomScoreQuery(org.apache.lucene.queries.CustomScoreQuery) TermQuery(org.apache.lucene.search.TermQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) Document(org.apache.lucene.document.Document) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StoredField(org.apache.lucene.document.StoredField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) Encoder(org.apache.lucene.search.highlight.Encoder) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet) RegExp(org.apache.lucene.util.automaton.RegExp) TreeMap(java.util.TreeMap) FieldType(org.apache.lucene.document.FieldType) DelegatingAnalyzerWrapper(org.apache.lucene.analysis.DelegatingAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader)

Example 29 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class AbstractTestCase method make1dmfIndex.

// make 1 doc with multi valued field
protected void make1dmfIndex(Analyzer analyzer, String... values) throws Exception {
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer).setOpenMode(OpenMode.CREATE));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectorPositions(true);
    for (String value : values) {
        doc.add(new Field(F, value, customType));
    }
    writer.addDocument(doc);
    writer.close();
    if (reader != null)
        reader.close();
    reader = DirectoryReader.open(dir);
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) Document(org.apache.lucene.document.Document) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) FieldType(org.apache.lucene.document.FieldType)

Example 30 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class AbstractTestCase method make1dmfIndexNA.

// make 1 doc with multi valued & not analyzed field
protected void make1dmfIndexNA(String... values) throws Exception {
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzerK).setOpenMode(OpenMode.CREATE));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectorPositions(true);
    for (String value : values) {
        doc.add(new Field(F, value, customType));
    //doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
    }
    writer.addDocument(doc);
    writer.close();
    if (reader != null)
        reader.close();
    reader = DirectoryReader.open(dir);
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) Document(org.apache.lucene.document.Document) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) FieldType(org.apache.lucene.document.FieldType)

Aggregations

FieldType (org.apache.lucene.document.FieldType)283 Document (org.apache.lucene.document.Document)244 Field (org.apache.lucene.document.Field)209 Directory (org.apache.lucene.store.Directory)175 TextField (org.apache.lucene.document.TextField)168 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)125 StringField (org.apache.lucene.document.StringField)88 StoredField (org.apache.lucene.document.StoredField)77 BytesRef (org.apache.lucene.util.BytesRef)56 IndexWriter (org.apache.lucene.index.IndexWriter)53 IndexReader (org.apache.lucene.index.IndexReader)49 IndexSearcher (org.apache.lucene.search.IndexSearcher)46 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)45 Term (org.apache.lucene.index.Term)41 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)39 RAMDirectory (org.apache.lucene.store.RAMDirectory)37 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)34 TermQuery (org.apache.lucene.search.TermQuery)34 Analyzer (org.apache.lucene.analysis.Analyzer)33 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)33