Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class FastVectorHighlighterTest, method matchedFieldsTestCase:
private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
  Document doc = new Document();
  FieldType stored = new FieldType(TextField.TYPE_STORED);
  stored.setStoreTermVectorOffsets(true);
  stored.setStoreTermVectorPositions(true);
  stored.setStoreTermVectors(true);
  stored.freeze();
  FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
  matched.setStoreTermVectorOffsets(true);
  matched.setStoreTermVectorPositions(true);
  matched.setStoreTermVectors(true);
  matched.freeze();
  // Whitespace tokenized with English stop words
  doc.add(new Field("field", fieldValue, stored));
  // Whitespace tokenized without stop words
  doc.add(new Field("field_exact", fieldValue, matched));
  // Whitespace tokenized without toLower
  doc.add(new Field("field_super_exact", fieldValue, matched));
  // Each letter is a token
  doc.add(new Field("field_characters", fieldValue, matched));
  // Every three letters is a token
  doc.add(new Field("field_tripples", fieldValue, matched));
  // Sliced at 10 chars then analyzed just like field
  doc.add(new Field("field_sliced", fieldValue.substring(0, Math.min(fieldValue.length() - 1, 10)), matched));
  // Hacky field containing "der" and "red" at pos = 0
  doc.add(new Field("field_der_red", new CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
  final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
  fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
  fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
  fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
  fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
  fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
  fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
  // This is required even though we provide a token stream
  fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
  Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    public Analyzer getWrappedAnalyzer(String fieldName) {
      return fieldAnalyzers.get(fieldName);
    }
  };
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  writer.addDocument(doc);
  FastVectorHighlighter highlighter = new FastVectorHighlighter();
  FragListBuilder fragListBuilder = new SimpleFragListBuilder();
  FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
  IndexReader reader = DirectoryReader.open(writer);
  String[] preTags = new String[] { "<b>" };
  String[] postTags = new String[] { "</b>" };
  Encoder encoder = new DefaultEncoder();
  int docId = 0;
  BooleanQuery.Builder query = new BooleanQuery.Builder();
  for (Query clause : queryClauses) {
    query.add(clause, Occur.MUST);
  }
  FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
  String[] bestFragments;
  if (useMatchedFields) {
    Set<String> matchedFields = new HashSet<>();
    matchedFields.add("field");
    matchedFields.add("field_exact");
    matchedFields.add("field_super_exact");
    matchedFields.add("field_characters");
    matchedFields.add("field_tripples");
    matchedFields.add("field_sliced");
    matchedFields.add("field_der_red");
    bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
  } else {
    bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
  }
  assertEquals(expected, bestFragments[0]);
  reader.close();
  writer.close();
  dir.close();
}
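The RegExp usage worth noting here is in the per-field analyzers: the automaton compiled from the expression tells MockTokenizer what a single token looks like, which is how "." produces one-letter tokens and "..." three-letter tokens. Below is a minimal standalone sketch of that pattern, assuming the Lucene test framework (MockAnalyzer) is on the classpath; the class name, field name, and input string are illustrative, not taken from the test:

import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class RegExpTokenizerSketch {
  public static void main(String[] args) throws Exception {
    // "." accepts exactly one character, so each letter becomes its own token,
    // matching the "field_characters" comment in the test above.
    CharacterRunAutomaton oneChar = new CharacterRunAutomaton(new RegExp(".").toAutomaton());
    MockAnalyzer analyzer = new MockAnalyzer(new Random(0), oneChar, true);
    try (TokenStream ts = analyzer.tokenStream("field_characters", "cat")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // per the test's comment: c, a, t
      }
      ts.end();
    }
  }
}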
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class TestQPHelper, method testStopwords:
public void testStopwords() throws Exception {
  StandardQueryParser qp = new StandardQueryParser();
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
  qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
  Query result = qp.parse("a:the OR a:foo", "a");
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a MatchNoDocsQuery", result instanceof MatchNoDocsQuery);
  result = qp.parse("a:woo OR a:the", "a");
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a TermQuery", result instanceof TermQuery);
  result = qp.parse("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", "a");
  Query expected = new BooleanQuery.Builder()
      .add(new TermQuery(new Term("fieldX", "xxxxx")), Occur.SHOULD)
      .add(new TermQuery(new Term("fieldy", "xxxxxxxx")), Occur.SHOULD)
      .build();
  expected = new BoostQuery(expected, 2f);
  assertEquals(expected, result);
}
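The first assertion holds because both terms of "a:the OR a:foo" are accepted by the stopword automaton, so analysis drops every clause and the parser collapses the query to a MatchNoDocsQuery. A quick standalone probe of the automaton itself (the class name is illustrative):

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class StopSetProbe {
  public static void main(String[] args) {
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    System.out.println(stopSet.run("the")); // true: dropped as a stopword
    System.out.println(stopSet.run("foo")); // true: dropped as a stopword
    System.out.println(stopSet.run("woo")); // false: survives analysis as a TermQuery
  }
}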
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class TestFSTs, method testRealTerms:
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
  final LineFileDocs docs = new LineFileDocs(random());
  final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
  final Path tempDir = createTempDir("fstlines");
  final Directory dir = newFSDirectory(tempDir);
  final IndexWriter writer = new IndexWriter(dir, conf);
  Document doc;
  int docCount = 0;
  while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
    writer.addDocument(doc);
    docCount++;
  }
  IndexReader r = DirectoryReader.open(writer);
  writer.close();
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
  boolean storeOrd = random().nextBoolean();
  if (VERBOSE) {
    if (storeOrd) {
      System.out.println("FST stores ord");
    } else {
      System.out.println("FST stores docFreq");
    }
  }
  Terms terms = MultiFields.getTerms(r, "body");
  if (terms != null) {
    final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
    final TermsEnum termsEnum = terms.iterator();
    if (VERBOSE) {
      System.out.println("TEST: got termsEnum=" + termsEnum);
    }
    BytesRef term;
    int ord = 0;
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
    while ((term = termsEnum.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertNotNull(term2);
      assertEquals(term, term2);
      assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
      assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termsEnum.docFreq();
      }
      builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
      ord++;
      if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
    }
    FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
    }
    if (ord > 0) {
      final Random random = new Random(random().nextLong());
      // Now confirm BytesRefFSTEnum and TermsEnum act the same:
      final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
      int num = atLeast(1000);
      for (int iter = 0; iter < num; iter++) {
        final BytesRef randomTerm = new BytesRef(getRandomString(random));
        if (VERBOSE) {
          System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
        }
        final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
        final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
        if (seekResult == TermsEnum.SeekStatus.END) {
          assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
        } else {
          assertSame(termsEnum, fstEnum, storeOrd);
          for (int nextIter = 0; nextIter < 10; nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              if (storeOrd) {
                System.out.println(" ord=" + termsEnum.ord());
              }
            }
            if (termsEnum.next() != null) {
              if (VERBOSE) {
                System.out.println(" term=" + termsEnum.term().utf8ToString());
              }
              assertNotNull(fstEnum.next());
              assertSame(termsEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
                System.out.println(" end!");
              }
              BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
              if (nextResult != null) {
                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                fail();
              }
              break;
            }
          }
        }
      }
    }
  }
  r.close();
  dir.close();
}
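The intersect call above compiles the match-everything expression ".*" only to drive a second, parallel enumeration of every term. The same API restricts the walk when the expression is narrower; here is a sketch against the same Lucene-6.x-era API used by the test, assuming a Terms instance obtained as in the test (the class name, method name, and pattern are illustrative):

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class IntersectSketch {
  // e.g. terms = MultiFields.getTerms(reader, "body"), as in the test above
  static void printTermsMatching(Terms terms, String regex) throws IOException {
    // Compile the regex to an automaton; RegExp.NONE keeps the core syntax only.
    Automaton a = new RegExp(regex, RegExp.NONE).toAutomaton();
    // intersect() enumerates just the terms accepted by the automaton,
    // starting from the beginning (null start term).
    TermsEnum te = terms.intersect(new CompiledAutomaton(a, false, false), null);
    BytesRef term;
    while ((term = te.next()) != null) {
      System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
    }
  }
}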
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class QueryParserTestBase, method testPhraseQueryPositionIncrements:
public void testPhraseQueryPositionIncrements() throws Exception {
  CharacterRunAutomaton stopStopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
  CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
  qp.setEnablePositionIncrements(true);
  PhraseQuery.Builder phraseQuery = new PhraseQuery.Builder();
  phraseQuery.add(new Term("field", "1"));
  phraseQuery.add(new Term("field", "2"), 2);
  assertEquals(phraseQuery.build(), getQuery("\"1 stop 2\"", qp));
}
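Why the expected PhraseQuery adds the second term at position 2: the stopword filter removes "stop" (the case-insensitive [sS][tT][oO][pP] automaton accepts it), and with position increments enabled the hole it leaves is preserved, so "1" analyzes to position 0 and "2" to position 2. A sketch of that analysis, assuming the Lucene test framework is available (class name and random seed are illustrative):

import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class PositionGapSketch {
  public static void main(String[] args) throws Exception {
    CharacterRunAutomaton stop = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
    MockAnalyzer analyzer = new MockAnalyzer(new Random(0), MockTokenizer.WHITESPACE, false, stop);
    try (TokenStream ts = analyzer.tokenStream("field", "1 stop 2")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // expected: "1" with increment 1, then "2" with increment 2 (the stopword gap)
        System.out.println(term + " +" + posIncr.getPositionIncrement());
      }
      ts.end();
    }
  }
}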
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class TestTermsEnum, method testIntersectEmptyString:
public void testIntersectEmptyString() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(newStringField("field", "", Field.Store.NO));
  doc.add(newStringField("field", "abc", Field.Store.NO));
  w.addDocument(doc);
  doc = new Document();
  // Add the empty string to both documents, so that singletonDocID == -1.
  // For an FST-based term dict, we expect the first arc to be
  // flagged with HAS_FINAL_OUTPUT.
  doc.add(newStringField("field", "abc", Field.Store.NO));
  doc.add(newStringField("field", "", Field.Store.NO));
  w.addDocument(doc);
  w.forceMerge(1);
  DirectoryReader r = w.getReader();
  w.close();
  LeafReader sub = getOnlyLeafReader(r);
  Terms terms = sub.fields().terms("field");
  // accept ALL
  Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
  TermsEnum te = terms.intersect(ca, null);
  PostingsEnum de;
  assertEquals("", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertEquals("abc", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertNull(te.next());
  // pass the empty string as the start term; enumeration resumes after it, so "abc" comes first
  te = terms.intersect(ca, new BytesRef(""));
  assertEquals("abc", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertNull(te.next());
  r.close();
  dir.close();
}
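A note on the RegExp.NONE flag used above: the one-argument RegExp constructor enables all of Lucene's optional syntax extensions (intersection '&', complement '~', intervals '<n-m>', and so on), while NONE limits parsing to the core operators so those characters are taken literally. A small probe of that difference, under the assumption that the default flag set is ALL, as in the dk.brics-derived syntax (the class name is illustrative):

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class RegExpFlagsSketch {
  public static void main(String[] args) {
    // With NONE, '&' is an ordinary character, so "a&b" matches the literal string.
    System.out.println(new CharacterRunAutomaton(new RegExp("a&b", RegExp.NONE).toAutomaton()).run("a&b")); // true
    // With the default flags, '&' is language intersection, and "a" & "b" is the empty language.
    System.out.println(new CharacterRunAutomaton(new RegExp("a&b").toAutomaton()).run("a&b")); // false
  }
}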