use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
the class TestTermsEnum method testIntersectRegexp.
/**
 * LUCENE-7576: checks that {@code Terms.intersect} refuses the automaton compiled from the
 * regexp {@code "do_not_match_anything"} by throwing an {@link IllegalArgumentException}
 * whose message points the caller at {@code CompiledAutomaton.getTermsEnum}.
 */
public void testIntersectRegexp() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

  // Index a single document whose "field" holds the term "foobar".
  Document singleDoc = new Document();
  singleDoc.add(newStringField("field", "foobar", Field.Store.NO));
  writer.addDocument(singleDoc);

  IndexReader reader = writer.getReader();
  Terms fieldTerms = MultiFields.getFields(reader).terms("field");
  CompiledAutomaton compiled =
      new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());

  // intersect() is expected to reject this automaton outright.
  IllegalArgumentException thrown = expectThrows(IllegalArgumentException.class, () -> {
    fieldTerms.intersect(compiled, null);
  });
  assertEquals("please use CompiledAutomaton.getTermsEnum instead", thrown.getMessage());

  reader.close();
  writer.close();
  dir.close();
}
use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
the class TestSpanFirstQuery method testStartPositions.
/**
 * Indexes "the quick brown fox" and "quick brown fox" with an analyzer configured with the
 * stop set {@code the|a|of}, then checks the hit counts of span-first queries on "quick"
 * with end positions 1 and 2.
 */
public void testStartPositions() throws Exception {
  Directory directory = newDirectory();

  // mimic StopAnalyzer
  CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
  Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, stopAnalyzer);

  Document withLeadingStopWord = new Document();
  withLeadingStopWord.add(newTextField("field", "the quick brown fox", Field.Store.NO));
  indexWriter.addDocument(withLeadingStopWord);

  Document withoutStopWord = new Document();
  withoutStopWord.add(newTextField("field", "quick brown fox", Field.Store.NO));
  indexWriter.addDocument(withoutStopWord);

  IndexReader indexReader = indexWriter.getReader();
  IndexSearcher indexSearcher = newSearcher(indexReader);

  // user queries on "starts-with quick"
  SpanQuery startsWithQuick = spanFirstQuery(spanTermQuery("field", "quick"), 1);
  assertEquals(1, indexSearcher.search(startsWithQuick, 10).totalHits);

  // user queries on "starts-with the quick":
  // everything matching within the first two positions, minus the plain "starts-with quick" hits
  SpanQuery withinTwoPositions = spanFirstQuery(spanTermQuery("field", "quick"), 2);
  SpanQuery startsWithTheQuick = spanNotQuery(withinTwoPositions, startsWithQuick);
  assertEquals(1, indexSearcher.search(startsWithTheQuick, 10).totalHits);

  indexWriter.close();
  indexReader.close();
  directory.close();
}
use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
the class Dictionary method parseAffix.
/**
 * Parses a specific affix rule putting the result into the provided affix map.
 * <p>
 * The header is split on whitespace: field 2 equal to {@code "Y"} enables cross product and
 * field 3 is the number of rule lines that follow. Each rule line is encoded into
 * {@code affixData} as four shorts (8 bytes): flag, strip ordinal, pattern ordinal with the
 * cross-product bit in the lowest bit, and append-flags ordinal.
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader BufferedReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 *                         pattern
 * @param seenPatterns map from condition -> index of patterns, for deduplication.
 * @param seenStrips map from strip string -> ordinal of that strip, for deduplication.
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if the header or a rule line has fewer than four elements, or if the
 *                        input ends before all announced rule lines were read
 */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException {
  BytesRefBuilder scratch = new BytesRefBuilder();
  StringBuilder sb = new StringBuilder();
  String[] args = header.split("\\s+");
  // args[2] and args[3] are read below; fail with a parse error instead of an index error
  if (args.length < 4) {
    throw new ParseException("The affix file contains a header with less than four elements: " + header, reader.getLineNumber());
  }
  boolean crossProduct = args[2].equals("Y");
  // reference comparison: true only when the caller passed the suffix pattern constant itself
  boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
  int numLines = Integer.parseInt(args[3]);
  // every affix occupies 8 bytes (four shorts), hence the << 3
  affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
  ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
  for (int i = 0; i < numLines; i++) {
    assert affixWriter.getPosition() == currentAffix << 3;
    String line = reader.readLine();
    // readLine() returns null at end of stream: the header announced more rules than exist
    if (line == null) {
      throw new ParseException("The affix file ended before all " + numLines + " rules announced by the header were read: " + header, reader.getLineNumber());
    }
    String[] ruleArgs = line.split("\\s+");
    // condition is optional
    if (ruleArgs.length < 4) {
      throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
    }
    char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
    // a strip of "0" means nothing is stripped
    String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
    String affixArg = ruleArgs[3];
    char[] appendFlags = null;
    // first: parse continuation classes out of affix
    int flagSep = affixArg.lastIndexOf('/');
    if (flagSep != -1) {
      String flagPart = affixArg.substring(flagSep + 1);
      affixArg = affixArg.substring(0, flagSep);
      if (aliasCount > 0) {
        // with aliases defined, the flag part is a numeric reference to an alias
        flagPart = getAliasValue(Integer.parseInt(flagPart));
      }
      appendFlags = flagParsingStrategy.parseFlags(flagPart);
      Arrays.sort(appendFlags);
      twoStageAffix = true;
    }
    // zero affix -> empty string
    if ("0".equals(affixArg)) {
      affixArg = "";
    }
    String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
    // at least the gascon affix file has this issue
    if (condition.startsWith("[") && condition.indexOf(']') == -1) {
      condition = condition + "]";
    }
    // "dash hasn't got special meaning" (we must escape it)
    if (condition.indexOf('-') >= 0) {
      condition = escapeDash(condition);
    }
    final String regex;
    if (".".equals(condition)) {
      // Zero condition is indicated by dot
      regex = ".*";
    } else if (condition.equals(strip)) {
      // TODO: optimize this better:
      regex = ".*";
      // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
      // but this is complicated...
    } else {
      regex = String.format(Locale.ROOT, conditionPattern, condition);
    }
    // deduplicate patterns
    Integer patternIndex = seenPatterns.get(regex);
    if (patternIndex == null) {
      patternIndex = patterns.size();
      if (patternIndex > Short.MAX_VALUE) {
        throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
      }
      seenPatterns.put(regex, patternIndex);
      CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
      patterns.add(pattern);
    }
    // deduplicate strips: each distinct strip string gets the next free ordinal
    Integer stripOrd = seenStrips.get(strip);
    if (stripOrd == null) {
      stripOrd = seenStrips.size();
      seenStrips.put(strip, stripOrd);
      if (stripOrd > Character.MAX_VALUE) {
        throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
      }
    }
    if (appendFlags == null) {
      appendFlags = NOFLAGS;
    }
    encodeFlags(scratch, appendFlags);
    int appendFlagsOrd = flagLookup.add(scratch.get());
    if (appendFlagsOrd < 0) {
      // already exists in our hash
      appendFlagsOrd = (-appendFlagsOrd) - 1;
    } else if (appendFlagsOrd > Short.MAX_VALUE) {
      // this limit is probably flexible, but it's a good sanity check too
      throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
    }
    // fixed-width record: flag, strip ordinal, pattern ordinal (+ cross-product bit), append-flags ordinal
    affixWriter.writeShort((short) flag);
    affixWriter.writeShort((short) stripOrd.intValue());
    // encode crossProduct into patternIndex
    int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
    affixWriter.writeShort((short) patternOrd);
    affixWriter.writeShort((short) appendFlagsOrd);
    if (needsInputCleaning) {
      CharSequence cleaned = cleanInput(affixArg, sb);
      affixArg = cleaned.toString();
    }
    if (isSuffix) {
      // suffixes are keyed by their reversed text
      affixArg = new StringBuilder(affixArg).reverse().toString();
    }
    // register this affix's index under its (possibly cleaned/reversed) text
    List<Integer> list = affixes.get(affixArg);
    if (list == null) {
      list = new ArrayList<>();
      affixes.put(affixArg, list);
    }
    list.add(currentAffix);
    currentAffix++;
  }
}
use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
the class MinHashFilterTest method createMockShingleTokenizer.
/**
 * Creates a {@link MockTokenizer} driven by a regexp that matches one run of characters other
 * than space/tab/CR/LF, followed by exactly {@code shingleSize - 1} repetitions of
 * "run of space/tab/CR/LF + run of other characters".
 *
 * @param shingleSize number of character runs that make up one token
 * @param shingles text to tokenize; when {@code null} no reader is set on the tokenizer
 * @return the tokenizer, with checks enabled
 */
private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
  String shingleRegex = "[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}";
  CharacterRunAutomaton shingleAutomaton = new CharacterRunAutomaton(new RegExp(shingleRegex).toAutomaton());

  MockTokenizer mockTokenizer = new MockTokenizer(shingleAutomaton, true);
  mockTokenizer.setEnableChecks(true);
  if (shingles == null) {
    return mockTokenizer;
  }
  mockTokenizer.setReader(new StringReader(shingles));
  return mockTokenizer;
}
use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
the class TestBlockPostingsFormat3 method assertTerms.
// following code is almost an exact dup of code from TestDuelingCodecs: sorry!
/**
 * Asserts that two {@code Terms} instances agree: both null or both non-null, same statistics,
 * same enumeration and same seeking behavior. When {@code deep} is set, additionally compares
 * the results of intersecting both sides with at least three random regexp automata.
 *
 * @param leftTerms first terms instance, may be null
 * @param rightTerms second terms instance, may be null
 * @param deep whether to also run the random-automaton intersection comparison
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  if (leftTerms == null || rightTerms == null) {
    // one side is missing, so the other side has to be missing as well
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }

  assertTermsStatistics(leftTerms, rightTerms);

  // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different
  final boolean positionsOnBothSides = leftTerms.hasPositions() && rightTerms.hasPositions();

  assertTermsEnum(leftTerms.iterator(), rightTerms.iterator(), true, positionsOnBothSides);
  assertTermsSeeking(leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  int remaining = atLeast(3);
  while (remaining-- > 0) {
    String randomRe = AutomatonTestUtil.randomRegexp(random());
    CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(randomRe, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // only NORMAL automata are passed to intersect()
      continue;
    }
    // TODO: test start term too
    TermsEnum leftIntersected = leftTerms.intersect(compiled, null);
    TermsEnum rightIntersected = rightTerms.intersect(compiled, null);
    assertTermsEnum(leftIntersected, rightIntersected, rarely(), positionsOnBothSides);
  }
}
Aggregations