use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
the class TokenSourcesTest method testMaxStartOffsetConsistency.
/**
 * Checks that limiting a token stream by {@code maxStartOffset} behaves the same for a field
 * indexed with term vectors ("fld_tv") and a field indexed without them ("fld_notv").
 *
 * <p>For every {@code maxStartOffset} from -1 through {@code TEXT.length()}, both streams
 * returned by {@code TokenSources.getTokenStream} must yield the same sequence of start
 * offsets, and must end together. When the limit is non-negative, no token may start beyond it.
 *
 * @throws IOException if indexing the document or reading it back fails
 */
public void testMaxStartOffsetConsistency() throws IOException {
  FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  tvFieldType.setStoreTermVectors(true);
  tvFieldType.setStoreTermVectorOffsets(true);
  tvFieldType.setStoreTermVectorPositions(true);

  Directory dir = newDirectory();
  try {
    MockAnalyzer analyzer = new MockAnalyzer(random());
    // we don't necessarily consume the whole stream because of limiting by startOffset
    analyzer.setEnableChecks(false);

    Document doc = new Document();
    final String TEXT = " f gg h";
    doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
    doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));

    IndexReader reader;
    try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
      writer.addDocument(doc);
      reader = writer.getReader();
    }

    try {
      Fields tvFields = reader.getTermVectors(0);
      for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
        TokenStream tvStream =
            TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
        TokenStream anaStream =
            TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);

        // assert have same tokens, none of which has a start offset > maxStartOffset
        final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
        final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
        tvStream.reset();
        anaStream.reset();
        while (tvStream.incrementToken()) {
          assertTrue(anaStream.incrementToken());
          assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
          // A negative limit is not checked here; only non-negative limits bound the offsets.
          if (maxStartOffset >= 0) {
            assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
          }
        }
        // Both streams must be exhausted at the same point.
        assertTrue(anaStream.incrementToken() == false);
        tvStream.end();
        anaStream.end();
        tvStream.close();
        anaStream.close();
      }
    } finally {
      reader.close();
    }
  } finally {
    // Close the directory even when indexing or an assertion above fails.
    dir.close();
  }
}
use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
the class TokenSourcesTest method testOverlapWithOffset.
/**
 * Highlights a {@code DisjunctionMaxQuery} over the terms "{fox}" and "fox" using a token stream
 * rebuilt from term vectors that store offsets but no positions.
 *
 * <p>The single indexed document is fed from {@code OverlappingTokenStream} (defined elsewhere
 * in this class), and the best fragment for {@code TEXT} is expected to be
 * {@code "<B>the fox</B> did not jump"}.
 *
 * @throws IOException if indexing, opening the reader, or searching fails
 * @throws InvalidTokenOffsetsException if the highlighter rejects the token offsets
 */
public void testOverlapWithOffset() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    // no positions!
    customType.setStoreTermVectorOffsets(true);
    document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    // Kept inside the try so a failing check still releases the reader and directory.
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final DisjunctionMaxQuery query = new DisjunctionMaxQuery(
        Arrays.asList(
            new SpanTermQuery(new Term(FIELD, "{fox}")),
            new SpanTermQuery(new Term(FIELD, "fox"))),
        1);
    TopDocs hits = indexSearcher.search(query, 1);
    assertEquals(1, hits.totalHits);
    final Highlighter highlighter =
        new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("<B>the fox</B> did not jump", highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
the class TokenSourcesTest method testOverlapWithOffsetExactPhrase.
/**
 * Highlights an ordered, zero-slop {@code SpanNearQuery} for "the" followed by "fox" using a
 * token stream rebuilt from term vectors that store offsets but no positions.
 *
 * <p>Expects exactly one hit and the best fragment {@code "<B>the fox</B> did not jump"}.
 *
 * @throws IOException if indexing, opening the reader, or searching fails
 * @throws InvalidTokenOffsetsException if the highlighter rejects the token offsets
 */
public void testOverlapWithOffsetExactPhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory dir = newDirectory();
  final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(null));
  try {
    final Document doc = new Document();
    // Term vectors carry offsets only; positions are deliberately left out.
    final FieldType offsetsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    offsetsOnlyType.setStoreTermVectors(true);
    offsetsOnlyType.setStoreTermVectorOffsets(true);
    doc.add(new Field(FIELD, new OverlappingTokenStream(), offsetsOnlyType));
    writer.addDocument(doc);
  } finally {
    writer.close();
  }

  final IndexReader reader = DirectoryReader.open(dir);
  try {
    assertEquals(1, reader.numDocs());
    final IndexSearcher searcher = newSearcher(reader);

    final SpanQuery[] clauses = new SpanQuery[] {
      new SpanTermQuery(new Term(FIELD, "the")),
      new SpanTermQuery(new Term(FIELD, "fox"))
    };
    final Query phraseQuery = new SpanNearQuery(clauses, 0, true);

    final TopDocs hits = searcher.search(phraseQuery, 1);
    assertEquals(1, hits.totalHits);

    final Highlighter highlighter =
        new Highlighter(
            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(phraseQuery));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, reader.getTermVectors(0), -1);
    final String bestFragment = highlighter.getBestFragment(tokenStream, TEXT);
    assertEquals("<B>the fox</B> did not jump", bestFragment);
  } finally {
    reader.close();
    dir.close();
  }
}
use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
the class TokenSourcesTest method testPayloads.
/**
 * Checks that a token stream rebuilt from term vectors reproduces term text, position
 * increment, payload and offsets for each indexed token (see LUCENE-5294).
 *
 * <p>Four tokens built by {@code getToken} are indexed through a {@code CannedTokenStream} into
 * a field whose term vectors store offsets, positions and payloads. The stream returned by
 * {@code TokenSources.getTermVectorTokenStreamOrNull} must then yield those tokens in order and
 * nothing more.
 *
 * @throws Exception if indexing or reading the term vectors fails
 */
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  try {
    IndexReader reader;
    Token[] tokens;
    try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
      FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
      myFieldType.setStoreTermVectors(true);
      myFieldType.setStoreTermVectorOffsets(true);
      myFieldType.setStoreTermVectorPositions(true);
      myFieldType.setStoreTermVectorPayloads(true);

      // Reset the shared offset counter before getToken is used to build the tokens.
      curOffset = 0;
      tokens = new Token[] {
        getToken("foxes"), getToken("can"), getToken("jump"), getToken("high")
      };

      Document doc = new Document();
      doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
      writer.addDocument(doc);
      reader = writer.getReader();
    }

    try {
      assertEquals(1, reader.numDocs());

      TokenStream ts =
          TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
      try {
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

        ts.reset();
        for (Token token : tokens) {
          assertTrue(ts.incrementToken());
          assertEquals(token.toString(), termAtt.toString());
          assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
          assertEquals(token.getPayload(), payloadAtt.getPayload());
          assertEquals(token.startOffset(), offsetAtt.startOffset());
          assertEquals(token.endOffset(), offsetAtt.endOffset());
        }
        assertFalse(ts.incrementToken());
        // Finish the consumer workflow once the stream is exhausted.
        ts.end();
      } finally {
        ts.close();
      }
    } finally {
      reader.close();
    }
  } finally {
    // Close the directory even when indexing or an assertion above fails.
    dir.close();
  }
}
use of org.apache.lucene.document.FieldType in project lucene-solr by apache.
the class TokenSourcesTest method testOverlapWithPositionsAndOffset.
/**
 * Highlights a {@code DisjunctionMaxQuery} over the terms "{fox}" and "fox" using a token stream
 * rebuilt from term vectors that store both offsets and positions.
 *
 * <p>Expects exactly one hit and the best fragment {@code "<B>the fox</B> did not jump"}.
 *
 * @throws IOException if indexing, opening the reader, or searching fails
 * @throws InvalidTokenOffsetsException if the highlighter rejects the token offsets
 */
public void testOverlapWithPositionsAndOffset() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory dir = newDirectory();
  final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(null));
  try {
    final Document doc = new Document();
    // Term vectors record offsets as well as positions for this variant.
    final FieldType offsetsAndPositionsType = new FieldType(TextField.TYPE_NOT_STORED);
    offsetsAndPositionsType.setStoreTermVectors(true);
    offsetsAndPositionsType.setStoreTermVectorOffsets(true);
    offsetsAndPositionsType.setStoreTermVectorPositions(true);
    doc.add(new Field(FIELD, new OverlappingTokenStream(), offsetsAndPositionsType));
    writer.addDocument(doc);
  } finally {
    writer.close();
  }

  final IndexReader reader = DirectoryReader.open(dir);
  try {
    assertEquals(1, reader.numDocs());
    final IndexSearcher searcher = newSearcher(reader);

    final SpanTermQuery bracedFox = new SpanTermQuery(new Term(FIELD, "{fox}"));
    final SpanTermQuery plainFox = new SpanTermQuery(new Term(FIELD, "fox"));
    final DisjunctionMaxQuery query =
        new DisjunctionMaxQuery(Arrays.asList(bracedFox, plainFox), 1);

    final TopDocs hits = searcher.search(query, 1);
    assertEquals(1, hits.totalHits);

    final Highlighter highlighter =
        new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), new QueryScorer(query));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, reader.getTermVectors(0), -1);
    final String bestFragment = highlighter.getBestFragment(tokenStream, TEXT);
    assertEquals("<B>the fox</B> did not jump", bestFragment);
  } finally {
    reader.close();
    dir.close();
  }
}
Aggregations