Search in sources :

Example 21 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class TokenSourcesTest method testMaxStartOffsetConsistency.

public void testMaxStartOffsetConsistency() throws IOException {
    FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    tvFieldType.setStoreTermVectors(true);
    tvFieldType.setStoreTermVectorOffsets(true);
    tvFieldType.setStoreTermVectorPositions(true);
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    //we don't necessarily consume the whole stream because of limiting by startOffset
    analyzer.setEnableChecks(false);
    Document doc = new Document();
    final String TEXT = " f gg h";
    doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
    doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
    IndexReader reader;
    try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
        writer.addDocument(doc);
        reader = writer.getReader();
    }
    try {
        Fields tvFields = reader.getTermVectors(0);
        for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
            TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
            TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
            //assert have same tokens, none of which has a start offset > maxStartOffset
            final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
            final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
            tvStream.reset();
            anaStream.reset();
            while (tvStream.incrementToken()) {
                assertTrue(anaStream.incrementToken());
                assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
                if (maxStartOffset >= 0)
                    assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
            }
            assertTrue(anaStream.incrementToken() == false);
            tvStream.end();
            anaStream.end();
            tvStream.close();
            anaStream.close();
        }
    } finally {
        reader.close();
    }
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Fields(org.apache.lucene.index.Fields) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) TextField(org.apache.lucene.document.TextField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 22 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class TokenSourcesTest method testOverlapWithOffset.

public void testOverlapWithOffset() throws IOException, InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
    try {
        final Document document = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setStoreTermVectors(true);
        // no positions!
        customType.setStoreTermVectorOffsets(true);
        document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
        indexWriter.addDocument(document);
    } finally {
        indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    try {
        final DisjunctionMaxQuery query = new DisjunctionMaxQuery(Arrays.asList(new SpanTermQuery(new Term(FIELD, "{fox}")), new SpanTermQuery(new Term(FIELD, "fox"))), 1);
        // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
        // new SpanTermQuery(new Term(FIELD, "{fox}")),
        // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
        TopDocs hits = indexSearcher.search(query, 1);
        assertEquals(1, hits.totalHits);
        final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query));
        final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
        assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
        indexReader.close();
        directory.close();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) IndexReader(org.apache.lucene.index.IndexReader) Directory(org.apache.lucene.store.Directory)

Example 23 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class TokenSourcesTest method testOverlapWithOffsetExactPhrase.

public void testOverlapWithOffsetExactPhrase() throws IOException, InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
    try {
        final Document document = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setStoreTermVectors(true);
        // no positions!
        customType.setStoreTermVectorOffsets(true);
        document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
        indexWriter.addDocument(document);
    } finally {
        indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
        assertEquals(1, indexReader.numDocs());
        final IndexSearcher indexSearcher = newSearcher(indexReader);
        // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
        // query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
        // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
        final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD, "the")), new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
        TopDocs hits = indexSearcher.search(phraseQuery, 1);
        assertEquals(1, hits.totalHits);
        final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
        final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
        assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
        indexReader.close();
        directory.close();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) IndexReader(org.apache.lucene.index.IndexReader) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) Directory(org.apache.lucene.store.Directory)

Example 24 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class TokenSourcesTest method testPayloads.

// LUCENE-5294
public void testPayloads() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    myFieldType.setStoreTermVectors(true);
    myFieldType.setStoreTermVectorOffsets(true);
    myFieldType.setStoreTermVectorPositions(true);
    myFieldType.setStoreTermVectorPayloads(true);
    curOffset = 0;
    Token[] tokens = new Token[] { getToken("foxes"), getToken("can"), getToken("jump"), getToken("high") };
    Document doc = new Document();
    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    assertEquals(1, reader.numDocs());
    TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    for (Token token : tokens) {
        assertTrue(ts.incrementToken());
        assertEquals(token.toString(), termAtt.toString());
        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
        assertEquals(token.getPayload(), payloadAtt.getPayload());
        assertEquals(token.startOffset(), offsetAtt.startOffset());
        assertEquals(token.endOffset(), offsetAtt.endOffset());
    }
    assertFalse(ts.incrementToken());
    reader.close();
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IndexReader(org.apache.lucene.index.IndexReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 25 with FieldType

use of org.apache.lucene.document.FieldType in project lucene-solr by apache.

the class TokenSourcesTest method testOverlapWithPositionsAndOffset.

public void testOverlapWithPositionsAndOffset() throws IOException, InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
    try {
        final Document document = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorOffsets(true);
        customType.setStoreTermVectorPositions(true);
        document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
        indexWriter.addDocument(document);
    } finally {
        indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
        assertEquals(1, indexReader.numDocs());
        final IndexSearcher indexSearcher = newSearcher(indexReader);
        final DisjunctionMaxQuery query = new DisjunctionMaxQuery(Arrays.asList(new SpanTermQuery(new Term(FIELD, "{fox}")), new SpanTermQuery(new Term(FIELD, "fox"))), 1);
        // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
        // new SpanTermQuery(new Term(FIELD, "{fox}")),
        // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
        TopDocs hits = indexSearcher.search(query, 1);
        assertEquals(1, hits.totalHits);
        final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query));
        final TokenStream tokenStream = TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
        assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
        indexReader.close();
        directory.close();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) IndexReader(org.apache.lucene.index.IndexReader) Directory(org.apache.lucene.store.Directory)

Aggregations

FieldType (org.apache.lucene.document.FieldType)283 Document (org.apache.lucene.document.Document)244 Field (org.apache.lucene.document.Field)209 Directory (org.apache.lucene.store.Directory)175 TextField (org.apache.lucene.document.TextField)168 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)125 StringField (org.apache.lucene.document.StringField)88 StoredField (org.apache.lucene.document.StoredField)77 BytesRef (org.apache.lucene.util.BytesRef)56 IndexWriter (org.apache.lucene.index.IndexWriter)53 IndexReader (org.apache.lucene.index.IndexReader)49 IndexSearcher (org.apache.lucene.search.IndexSearcher)46 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)45 Term (org.apache.lucene.index.Term)41 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)39 RAMDirectory (org.apache.lucene.store.RAMDirectory)37 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)34 TermQuery (org.apache.lucene.search.TermQuery)34 Analyzer (org.apache.lucene.analysis.Analyzer)33 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)33