Use of org.apache.lucene.document.TextField in project lucene-solr by apache.
From the class TestTermVectors, the method beforeClass:
@BeforeClass
public static void beforeClass() throws Exception {
  directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
          .setMergePolicy(newLogMergePolicy()));
  //writer.infoStream = System.out;
  for (int i = 0; i < 1000; i++) {
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    int mod3 = i % 3;
    int mod2 = i % 2;
    if (mod2 == 0 && mod3 == 0) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      ft.setStoreTermVectorPositions(true);
    } else if (mod2 == 0) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
    } else if (mod3 == 0) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
    } else {
      ft.setStoreTermVectors(true);
    }
    doc.add(new Field("field", English.intToEnglish(i), ft));
    // test no term vectors too
    doc.add(new TextField("noTV", English.intToEnglish(i), Field.Store.YES));
    writer.addDocument(doc);
  }
  reader = writer.getReader();
  writer.close();
}
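
Every document indexes "field" with term vectors enabled, adding positions and/or offsets depending on i % 2 and i % 3, while "noTV" is a plain TextField with no vectors as a control. A minimal sketch (reusing the `reader` and the "field" name from the setup above) of reading a stored term vector back:

    Terms vector = reader.getTermVectors(0).terms("field");
    TermsEnum termsEnum = vector.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      // each distinct term of document 0's "field", with its within-document frequency
      System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
    }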
Use of org.apache.lucene.document.TextField in project lucene-solr by apache.
From the class TestTerms, the method testTermMinMaxRandom:
public void testTermMinMaxRandom() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  int numDocs = atLeast(100);
  BytesRef minTerm = null;
  BytesRef maxTerm = null;
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    Field field = new TextField("field", "", Field.Store.NO);
    doc.add(field);
    //System.out.println("  doc " + i);
    CannedBinaryTokenStream.BinaryToken[] tokens = new CannedBinaryTokenStream.BinaryToken[atLeast(10)];
    for (int j = 0; j < tokens.length; j++) {
      byte[] bytes = new byte[TestUtil.nextInt(random(), 1, 20)];
      random().nextBytes(bytes);
      BytesRef tokenBytes = new BytesRef(bytes);
      //System.out.println("    token " + tokenBytes);
      if (minTerm == null || tokenBytes.compareTo(minTerm) < 0) {
        //System.out.println("      ** new min");
        minTerm = tokenBytes;
      }
      if (maxTerm == null || tokenBytes.compareTo(maxTerm) > 0) {
        //System.out.println("      ** new max");
        maxTerm = tokenBytes;
      }
      tokens[j] = new CannedBinaryTokenStream.BinaryToken(tokenBytes);
    }
    field.setTokenStream(new CannedBinaryTokenStream(tokens));
    w.addDocument(doc);
  }
  IndexReader r = w.getReader();
  Terms terms = MultiFields.getTerms(r, "field");
  assertEquals(minTerm, terms.getMin());
  assertEquals(maxTerm, terms.getMax());
  r.close();
  w.close();
  dir.close();
}
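
Because the field's token stream is set directly with setTokenStream, the analyzer is bypassed and the test can index arbitrary binary tokens; getMin() and getMax() then return the smallest and largest term in unsigned-byte order. The same calls work on an ordinary analyzed field. A minimal, self-contained sketch (names are illustrative, not from the test):

    Directory dir = new RAMDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
    Document doc = new Document();
    doc.add(new TextField("body", "alpha omega middle", Field.Store.NO));
    w.addDocument(doc);
    try (IndexReader r = DirectoryReader.open(w)) {
      Terms terms = MultiFields.getTerms(r, "body");
      // terms compare as the unsigned bytes of their UTF-8 form
      System.out.println(terms.getMin().utf8ToString()); // alpha
      System.out.println(terms.getMax().utf8ToString()); // omega
    }
    w.close();
    dir.close();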
Use of org.apache.lucene.document.TextField in project lucene-solr by apache.
From the class IndexBasedSpellCheckerTest, the method testAlternateLocation:
@Test
public void testAlternateLocation() throws Exception {
  String[] ALT_DOCS = new String[] {
      "jumpin jack flash",
      "Sargent Peppers Lonely Hearts Club Band",
      "Born to Run",
      "Thunder Road",
      "Londons Burning",
      "A Horse with No Name",
      "Sweet Caroline"
  };
  IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
  NamedList spellchecker = new NamedList();
  spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
  File tmpDir = createTempDir().toFile();
  File indexDir = new File(tmpDir, "spellingIdx");
  // create a standalone index
  File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
  Directory dir = newFSDirectory(altIndexDir.toPath());
  IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()));
  for (int i = 0; i < ALT_DOCS.length; i++) {
    Document doc = new Document();
    doc.add(new TextField("title", ALT_DOCS[i], Field.Store.YES));
    iw.addDocument(doc);
  }
  iw.forceMerge(1);
  iw.close();
  dir.close();
  indexDir.mkdirs();
  spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
  spellchecker.add(AbstractLuceneSpellChecker.LOCATION, altIndexDir.getAbsolutePath());
  spellchecker.add(AbstractLuceneSpellChecker.FIELD, "title");
  spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
  SolrCore core = h.getCore();
  String dictName = checker.init(spellchecker, core);
  assertEquals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName);
  RefCounted<SolrIndexSearcher> holder = core.getSearcher();
  SolrIndexSearcher searcher = holder.get();
  try {
    checker.build(core, searcher);
    IndexReader reader = searcher.getIndexReader();
    Collection<Token> tokens = queryConverter.convert("flesh");
    SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1,
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, true, 0.5f, null);
    SpellingResult result = checker.getSuggestions(spellOpts);
    assertNotNull("result shouldn't be null", result);
    // should be lowercased, b/c we are using a lowercasing analyzer
    Map<String, Integer> suggestions = result.get(spellOpts.tokens.iterator().next());
    assertNotNull("suggestions for 'flesh' shouldn't be null", suggestions);
    assertEquals("wrong number of suggestions", 1, suggestions.size());
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertEquals("flash", entry.getKey());
    assertEquals(1, (int) entry.getValue());
    // test something not in the spell checker
    spellOpts.tokens = queryConverter.convert("super");
    result = checker.getSuggestions(spellOpts);
    assertNotNull("result shouldn't be null", result);
    suggestions = result.get(spellOpts.tokens.iterator().next());
    assertEquals("suggestions size should be 0", 0, suggestions.size());
    spellOpts.tokens = queryConverter.convert("Caroline");
    result = checker.getSuggestions(spellOpts);
    assertNotNull("result shouldn't be null", result);
    suggestions = result.get(spellOpts.tokens.iterator().next());
    assertNull("suggestions should be null", suggestions);
  } finally {
    holder.decref();
  }
}
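
The test drives the spell checker through Solr's NamedList configuration, pointing LOCATION at an index built elsewhere. A minimal, self-contained sketch (paths and names are illustrative, not from the test) of the plain Lucene spell checking that IndexBasedSpellChecker wraps: build a dictionary from a field of an existing index, then ask for the closest terms.

    Directory spellDir = FSDirectory.open(Paths.get("spellingIdx"));
    SpellChecker spell = new SpellChecker(spellDir);
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("alternateIdx")))) {
      // derive the dictionary from every term of the "title" field
      spell.indexDictionary(new LuceneDictionary(reader, "title"),
          new IndexWriterConfig(new WhitespaceAnalyzer()), true);
      String[] suggestions = spell.suggestSimilar("flesh", 1); // e.g. ["flash"]
    }
    spell.close();
    spellDir.close();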
Use of org.apache.lucene.document.TextField in project lucene-solr by apache.
From the class TestSimilarityBase, the method testLengthEncodingBackwardCompatibility:
public void testLengthEncodingBackwardCompatibility() throws IOException {
  Similarity similarity = RandomPicks.randomFrom(random(), sims);
  for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major }) {
    for (int length : new int[] { 1, 2, 4 }) {
      // these length values are encoded accurately in both cases
      Directory dir = newDirectory();
      // set the version on the directory
      new SegmentInfos(indexCreatedVersionMajor).commit(dir);
      IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
      Document doc = new Document();
      String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
      doc.add(new TextField("foo", value, Store.NO));
      w.addDocument(doc);
      IndexReader reader = DirectoryReader.open(w);
      IndexSearcher searcher = newSearcher(reader);
      searcher.setSimilarity(similarity);
      Term term = new Term("foo", "b");
      TermContext context = TermContext.build(reader.getContext(), term);
      SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"),
          searcher.termStatistics(term, context));
      SimilarityBase.BasicSimScorer simScorer =
          (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
      float docLength = simScorer.getLengthValue(0);
      assertEquals(length, (int) docLength);
      w.close();
      reader.close();
      dir.close();
    }
  }
}
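
Norms are stored in a lossy compressed form, which is why the test restricts itself to lengths whose encodings round-trip exactly under both the pre-7.0 and the current scheme, as its own comment notes. The generated field value is simply `length` copies of "b" joined by spaces, so the decoded length should equal the token count; a quick check of the generator expression for length 4 (illustrative, not part of the test):

    String value = IntStream.range(0, 4).mapToObj(i -> "b").collect(Collectors.joining(" "));
    System.out.println(value); // prints "b b b b": four tokens of the single term "b"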
Use of org.apache.lucene.document.TextField in project lucene-solr by apache.
From the class TokenSourcesTest, the method testMaxStartOffsetConsistency:
public void testMaxStartOffsetConsistency() throws IOException {
  FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  tvFieldType.setStoreTermVectors(true);
  tvFieldType.setStoreTermVectorOffsets(true);
  tvFieldType.setStoreTermVectorPositions(true);
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  // we don't necessarily consume the whole stream because of the startOffset limit
  analyzer.setEnableChecks(false);
  Document doc = new Document();
  final String TEXT = " f gg h";
  doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
  doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
  IndexReader reader;
  try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    writer.addDocument(doc);
    reader = writer.getReader();
  }
  try {
    Fields tvFields = reader.getTermVectors(0);
    for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
      TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
      TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
      // assert that both streams yield the same tokens, none with a start offset > maxStartOffset
      final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
      final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
      tvStream.reset();
      anaStream.reset();
      while (tvStream.incrementToken()) {
        assertTrue(anaStream.incrementToken());
        assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
        if (maxStartOffset >= 0) {
          assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
        }
      }
      assertFalse(anaStream.incrementToken());
      tvStream.end();
      anaStream.end();
      tvStream.close();
      anaStream.close();
    }
  } finally {
    reader.close();
  }
  dir.close();
}
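
TokenSources.getTokenStream returns a stream backed by stored term vectors when the field has them ("fld_tv") and falls back to re-analyzing the text otherwise ("fld_notv"); the test checks that the two agree under every maxStartOffset cutoff. A minimal sketch (reusing `reader`, `analyzer`, and TEXT from the test above; passing -1 disables the startOffset limit):

    TokenStream stream = TokenSources.getTokenStream("fld_tv", reader.getTermVectors(0), TEXT, analyzer, -1);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // each token with the start offset it carries in the vector
      System.out.println(termAtt + " @ " + offAtt.startOffset());
    }
    stream.end();
    stream.close();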