Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project elasticsearch by elastic.
From the class XContentMapValues, the method filter:
/**
 * Returns a function that filters a document map based on the given include and exclude rules.
 * @see #filter(Map, String[], String[]) for details
 */
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());
    CharacterRunAutomaton include;
    if (includes == null || includes.length == 0) {
        include = matchAllAutomaton;
    } else {
        Automaton includeA = Regex.simpleMatchToAutomaton(includes);
        includeA = makeMatchDotsInFieldNames(includeA);
        include = new CharacterRunAutomaton(includeA);
    }
    Automaton excludeA;
    if (excludes == null || excludes.length == 0) {
        excludeA = Automata.makeEmpty();
    } else {
        excludeA = Regex.simpleMatchToAutomaton(excludes);
        excludeA = makeMatchDotsInFieldNames(excludeA);
    }
    CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);
    return (map) -> filter(map, include, 0, exclude, 0, matchAllAutomaton);
}
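For context, here is a minimal usage sketch of the function above. The document contents and field names are hypothetical, not taken from the Elasticsearch sources; only the filter(...) call itself comes from the snippet:

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;
import org.elasticsearch.common.xcontent.support.XContentMapValues;

// Hypothetical source document: only "address.city" should survive filtering.
Map<String, Object> address = new HashMap<>();
address.put("city", "Amsterdam");
address.put("zip", "1016");
Map<String, Object> source = new HashMap<>();
source.put("name", "kimchy");
source.put("address", address);
Function<Map<String, ?>, Map<String, Object>> f =
    XContentMapValues.filter(new String[] { "address.city" }, null);
Map<String, Object> filtered = f.apply(source); // {address={city=Amsterdam}}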
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
From the class TestSpanFirstQuery, the method testStartPositions:
public void testStartPositions() throws Exception {
    Directory dir = newDirectory();
    // mimic StopAnalyzer
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
    Document doc = new Document();
    doc.add(newTextField("field", "the quick brown fox", Field.Store.NO));
    writer.addDocument(doc);
    Document doc2 = new Document();
    doc2.add(newTextField("field", "quick brown fox", Field.Store.NO));
    writer.addDocument(doc2);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    // user queries on "starts-with quick"
    SpanQuery sfq = spanFirstQuery(spanTermQuery("field", "quick"), 1);
    assertEquals(1, searcher.search(sfq, 10).totalHits);
    // user queries on "starts-with the quick"
    SpanQuery include = spanFirstQuery(spanTermQuery("field", "quick"), 2);
    sfq = spanNotQuery(include, sfq);
    assertEquals(1, searcher.search(sfq, 10).totalHits);
    writer.close();
    reader.close();
    dir.close();
}
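As a standalone aside (not part of the test above), the stop set works because a CharacterRunAutomaton compiled from a RegExp is itself a whole-string matcher; MockAnalyzer simply runs each token through it. A minimal sketch:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

CharacterRunAutomaton stopSet =
    new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
System.out.println(stopSet.run("the"));   // true: matches the stopword alternation
System.out.println(stopSet.run("quick")); // false: not a stopword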
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
From the class Dictionary, the method parseAffix:
/**
 * Parses a specific affix rule, putting the result into the provided affix map.
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader BufferedReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 *                         pattern
 * @param seenPatterns map from condition regex -> index into the patterns list, for deduplication
 * @param seenStrips map from strip string -> ordinal, for deduplication
 * @throws IOException Can be thrown while reading the rule
 */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
                        String conditionPattern, Map<String, Integer> seenPatterns,
                        Map<String, Integer> seenStrips) throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String[] args = header.split("\\s+");
    boolean crossProduct = args[2].equals("Y");
    // intentional reference comparison: the caller passes one of two shared pattern constants
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
    int numLines = Integer.parseInt(args[3]);
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        String[] ruleArgs = line.split("\\s+");
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
        }
        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char[] appendFlags = null;
        // first: parse continuation classes out of affix
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }
            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }
        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }
        final String regex;
        if (".".equals(condition)) {
            // Zero condition is indicated by dot
            regex = ".*";
        } else if (condition.equals(strip)) {
            // TODO: optimize this better:
            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
            // but this is complicated...
            regex = ".*";
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }
        // deduplicate patterns
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }
        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }
        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
        }
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);
        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }
        if (isSuffix) {
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }
        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);
        currentAffix++;
    }
}
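To make the role of the deduplicated patterns list concrete, here is a hedged sketch of how such a compiled condition can be consulted when deciding whether an affix applies to a stem; the regex and variable names are illustrative, not the actual Dictionary internals:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

// Illustrative suffix condition: the stem must end in a consonant followed by 'y'.
CharacterRunAutomaton condition =
    new CharacterRunAutomaton(new RegExp(".*[^aeiou]y", RegExp.NONE).toAutomaton());
String stem = "happy";
boolean affixApplies = condition.run(stem.toCharArray(), 0, stem.length()); // true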
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
From the class MinHashFilterTest, the method createMockShingleTokenizer:
private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
    MockTokenizer tokenizer = new MockTokenizer(
        new CharacterRunAutomaton(
            new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}").toAutomaton()),
        true);
    tokenizer.setEnableChecks(true);
    if (shingles != null) {
        tokenizer.setReader(new StringReader(shingles));
    }
    return tokenizer;
}
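A quick standalone sanity sketch (not part of the test): for shingleSize == 2 the pattern accepts exactly two whitespace-separated tokens, which is what lets the MockTokenizer above emit two-word shingles as single tokens:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

CharacterRunAutomaton twoShingle = new CharacterRunAutomaton(
    new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){1}").toAutomaton());
System.out.println(twoShingle.run("quick brown"));     // true: exactly two tokens
System.out.println(twoShingle.run("quick"));           // false: one token
System.out.println(twoShingle.run("quick brown fox")); // false: three tokens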
Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
From the class MemoryIndexOffsetStrategy, the method buildCombinedAutomaton:
/**
 * Build one {@link CharacterRunAutomaton} matching any term the query might match.
 */
private static CharacterRunAutomaton buildCombinedAutomaton(Predicate<String> fieldMatcher, BytesRef[] terms,
                                                            CharacterRunAutomaton[] automata, PhraseHelper strictPhrases,
                                                            Function<Query, Collection<Query>> multiTermQueryRewrite) {
    List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
    if (terms.length > 0) {
        allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
    }
    Collections.addAll(allAutomata, automata);
    for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
        Collections.addAll(allAutomata,
            MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite)); // true == lookInSpan
    }
    if (allAutomata.size() == 1) {
        return allAutomata.get(0);
    }
    // Return an aggregate CharacterRunAutomaton of the others;
    // the makeEmpty() is bogus and won't be used because run() is overridden.
    return new CharacterRunAutomaton(Automata.makeEmpty()) {
        @Override
        public boolean run(char[] chars, int offset, int length) {
            for (int i = 0; i < allAutomata.size(); i++) {
                // don't use foreach to avoid Iterator allocation
                if (allAutomata.get(i).run(chars, offset, length)) {
                    return true;
                }
            }
            return false;
        }
    };
}
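The same delegating trick can be shown in isolation: subclass CharacterRunAutomaton with a throwaway automaton and override run() to try each delegate in turn. The delegate patterns below are illustrative:

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

List<CharacterRunAutomaton> delegates = Arrays.asList(
    new CharacterRunAutomaton(new RegExp("foo.*").toAutomaton()),
    new CharacterRunAutomaton(new RegExp("bar").toAutomaton()));
CharacterRunAutomaton any = new CharacterRunAutomaton(Automata.makeEmpty()) {
    @Override
    public boolean run(char[] chars, int offset, int length) {
        for (int i = 0; i < delegates.size(); i++) { // indexed loop, as in the original, to avoid Iterator allocation
            if (delegates.get(i).run(chars, offset, length)) {
                return true;
            }
        }
        return false;
    }
};
char[] s = "foobar".toCharArray();
System.out.println(any.run(s, 0, s.length)); // true: "foo.*" accepts "foobar"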