Use of org.apache.lucene.analysis.shingle.ShingleFilterFactory in project SearchServices by Alfresco.
From the class MinHashFilterTest, method createMinHashAnalyzer.
/**
 * Assembles an analysis chain made of a whitespace tokenizer followed by a
 * shingle filter and a min-hash filter.
 *
 * @param min         shingle size; passed as both "minShingleSize" and "maxShingleSize"
 * @param hashCount   value passed to the min-hash filter factory as "hashCount"
 * @param hashSetSize value passed to the min-hash filter factory as "hashSetSize"
 * @return a {@code TokenizerChain} with no char filters, the whitespace tokenizer
 *         and the two token filters in the order shingle, min-hash
 */
public static TokenizerChain createMinHashAnalyzer(int min, int hashCount, int hashSetSize) {
    // Tokenizer: plain whitespace splitting, no extra arguments.
    WhitespaceTokenizerFactory tokenizerFactory = new WhitespaceTokenizerFactory(Collections.<String, String>emptyMap());

    // Shingle filter: only shingles of exactly `min` tokens, never unigrams.
    String shingleSize = Integer.toString(min);
    HashMap<String, String> shingleArgs = new HashMap<>();
    shingleArgs.put("minShingleSize", shingleSize);
    shingleArgs.put("maxShingleSize", shingleSize);
    shingleArgs.put("outputUnigrams", "false");
    shingleArgs.put("outputUnigramsIfNoShingles", "false");
    shingleArgs.put("tokenSeparator", " ");
    ShingleFilterFactory shingleFactory = new ShingleFilterFactory(shingleArgs);

    // Min-hash filter configured from the caller's parameters.
    HashMap<String, String> minHashArgs = new HashMap<>();
    minHashArgs.put("hashCount", Integer.toString(hashCount));
    minHashArgs.put("hashSetSize", Integer.toString(hashSetSize));
    MinHashFilterFactory minHashFactory = new MinHashFilterFactory(minHashArgs);

    TokenFilterFactory[] filters = new TokenFilterFactory[] { shingleFactory, minHashFactory };
    return new TokenizerChain(new CharFilterFactory[0], tokenizerFactory, filters);
}
Use of org.apache.lucene.analysis.shingle.ShingleFilterFactory in project SearchServices by Alfresco.
From the class Solr4QueryParser, method getFieldQueryImpl.
/**
 * Builds the query for a single field from raw query text, re-inserting the
 * wildcard characters ({@code *} and {@code ?}) at the positions reported by
 * {@code getWildcardPositions} into the tokens produced by the analyzer.
 *
 * <p>Outline of the processing:
 * <ol>
 * <li>Return a dummy term query when the field is not in the schema.</li>
 * <li>Strip a leading <code>{locale}</code> prefix when it names an available locale;
 *     the prefix is added back onto every resulting token at the end.</li>
 * <li>For property fields without such a prefix, detect a
 *     <code>\u0000locale\u0000</code> header, which switches on ML token handling.</li>
 * <li>Run the analyzer and copy every token (text, offsets, type, position increment).</li>
 * <li>For each wildcard position, patch the wildcard back into the covering token or
 *     synthesise tokens for adjacent letter/digit runs the analyzer did not emit.</li>
 * <li>Group tokens by start offset, build the candidate token sequences and glue
 *     wildcards onto / between the tokens of each sequence.</li>
 * <li>Build a term, wildcard, boolean or span query from the result.</li>
 * </ol>
 *
 * @param field          the field to query
 * @param queryText      the raw query text, optionally carrying a locale prefix
 * @param analysisMode   not referenced by this method body
 * @param luceneFunction must be {@code LuceneFunction.FIELD}
 * @return the generated query, or {@code null} when analysis yields no tokens
 * @throws UnsupportedOperationException if {@code luceneFunction} is not {@code FIELD}
 * @throws ParseException declared by the signature
 * @throws IOException if the token stream fails while being read
 */
@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode, LuceneFunction luceneFunction) throws ParseException, IOException {
    // make sure the field exists or return a dummy query so we have no
    // error ....ACE-3231
    SchemaField schemaField = schema.getFieldOrNull(field);
    boolean isNumeric = false;
    if (schemaField == null) {
        return new TermQuery(new Term("_dummy_", "_miss_"));
    } else {
        isNumeric = (schemaField.getType().getNumericType() != null);
    }
    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException("Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }
    // If the incoming string already has a language identifier we strip it
    // off and add it back on again once the tokens have been built.
    String localePrefix = "";
    String toTokenise = queryText;
    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        if (position > 0) {
            String language = queryText.substring(0, position + 1);
            Locale locale = new Locale(queryText.substring(1, position));
            String token = queryText.substring(position + 1);
            boolean found = false;
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
            if (found) {
                localePrefix = language;
                toTokenise = token;
            } else {
                // Not a known locale: leave the text untouched.
                // toTokenise = token;
            }
        }
    }
    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (isPropertyField(field) && (localePrefix.length() == 0)) {
        // A leading \u0000<locale>\u0000 header marks ML text for property fields.
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);
        }
    }
    // find the positions of any escaped * and ? and ignore them
    Set<Integer> wildcardPoistions = getWildcardPositions(testText);
    TokenStream source = null;
    ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>();
    boolean severalTokensAtSamePosition = false;
    PackedTokenAttributeImpl nextToken;
    int positionCount = 0;
    try {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            // Snapshot the stream state into a standalone token.
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            list.add(nextToken);
            if (nextToken.getPositionIncrement() != 0)
                positionCount += nextToken.getPositionIncrement();
            else
                severalTokensAtSamePosition = true;
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    // Walk the text and, at each recorded wildcard position, repair the token list.
    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start
                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (list.size() == 0)) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append("", 0, 0);
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    } else {
                        // content
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been
                // removed
                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = test.toString();
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setEmpty();
                                test.append(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.length()) {
                                // Read the term text BEFORE clearing the buffer; reading it
                                // after setEmpty() would yield "" and leave only the wildcard.
                                String termText = test.toString();
                                test.setEmpty();
                                test.append(termText + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }
                if (!tokenFound) {
                    // Collect the run of letters/digits directly before the wildcard
                    // that no existing token covers.
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                PackedTokenAttributeImpl test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the
                        // tokeniser
                        PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                        newToken.setEmpty().append(pre.toString());
                        newToken.setOffset(index - pre.length(), index);
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE);
                            Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        } else {
                            // content
                            list.add(newToken);
                        }
                    }
                }
            }
            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                // Collect the run of letters/digits directly after the wildcard
                // that no existing token covers.
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            PackedTokenAttributeImpl test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the
                    // tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append(post.toString());
                    newToken.setOffset(index + 1, index + 1 + post.length());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    } else {
                        // content
                        list.add(newToken);
                    }
                }
            }
        }
    }
    // Put in real position increments as we treat them correctly
    int curentIncrement = -1;
    for (PackedTokenAttributeImpl c : list) {
        if (curentIncrement == -1) {
            curentIncrement = c.getPositionIncrement();
        } else if (c.getPositionIncrement() > 0) {
            curentIncrement = c.getPositionIncrement();
        } else {
            c.setPositionIncrement(curentIncrement);
        }
    }
    // Fix up position increments for in phrase isolated wildcards
    boolean lastWasWild = false;
    for (int i = 0; i < list.size() - 1; i++) {
        for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) {
            if (wildcardPoistions.contains(j)) {
                if (!lastWasWild) {
                    list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1);
                }
                lastWasWild = true;
            } else {
                lastWasWild = false;
            }
        }
    }
    // Order tokens by where they start in the text.
    Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() {
        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            return dif;
        }
    });
    // Combined * and ? based strings - should redo the tokeniser
    // Build tokens by position
    LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    LinkedList<PackedTokenAttributeImpl> currentList = null;
    int lastStart = 0;
    for (PackedTokenAttributeImpl c : list) {
        if (c.startOffset() == lastStart) {
            if (currentList == null) {
                currentList = new LinkedList<PackedTokenAttributeImpl>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<PackedTokenAttributeImpl>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
        lastStart = c.startOffset();
    }
    // Build all the token sequences and see which ones get strung together
    OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) {
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
        FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) {
            // Work on a copy so the sequences never share the original token instance.
            PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl();
            replace.setEmpty().append(t);
            replace.setOffset(t.startOffset(), t.endOffset());
            replace.setType(t.type());
            replace.setPositionIncrement(t.getPositionIncrement());
            boolean tokenFoundSequence = false;
            for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.addAll(tokenSequence);
                if ((newEntry.getLast().endOffset() == replace.endOffset()) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                    if ((newEntry.getLast().startOffset() == replace.startOffset()) && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                        newEntry.add(replace);
                        tokenFoundSequence = true;
                    } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                        if (newEntry.toString().endsWith(replace.toString())) {
                            // already in the gram
                            positionalSynonymSequencesSet.add(tokenSequence);
                            tokenFoundSequence = true;
                        } else {
                            // need to replace the synonym in the current
                            // gram
                            tokenFoundSequence = true;
                            StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                            old.replace(replace.startOffset() - newEntry.getLast().startOffset(), replace.endOffset() - newEntry.getLast().startOffset(), replace.toString());
                            PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                            newToken.setEmpty().append(old.toString());
                            newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset());
                            newEntry.removeLast();
                            newEntry.add(newToken);
                        }
                    }
                } else if ((newEntry.getLast().startOffset() < replace.startOffset()) && (newEntry.getLast().endOffset() < replace.endOffset())) {
                    if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                    }
                    newEntry.add(replace);
                    tokenFoundSequence = true;
                }
                newAllTokenSequencesSet.add(newEntry);
            }
            if (false == tokenFoundSequence) {
                // Second chance: try to attach to the sequences built in this pass.
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) {
                    LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                    newEntry.addAll(tokenSequence);
                    if ((newEntry.getLast().endOffset() == replace.endOffset()) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        if ((newEntry.getLast().startOffset() == replace.startOffset()) && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                            if (newEntry.toString().endsWith(replace.toString())) {
                                // already in the gram
                                positionalSynonymSequencesSet.add(tokenSequence);
                                tokenFoundSequence = true;
                            } else {
                                // need to replace the synonym in the
                                // current gram
                                tokenFoundSequence = true;
                                StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                                old.replace(replace.startOffset() - newEntry.getLast().startOffset(), replace.endOffset() - newEntry.getLast().startOffset(), replace.toString());
                                PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                                newToken.setEmpty().append(old.toString());
                                newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset());
                                newEntry.removeLast();
                                newEntry.add(newToken);
                                positionalSynonymSequencesSet.add(newEntry);
                            }
                        }
                    } else if ((newEntry.getLast().startOffset() < replace.startOffset()) && (newEntry.getLast().endOffset() < replace.endOffset())) {
                        if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        }
                    }
                }
            }
            if (false == tokenFoundSequence) {
                // Nothing to attach to: the token starts a sequence of its own.
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.add(replace);
                newAllTokenSequencesSet.add(newEntry);
            }
            // Limit the max number of permutations we consider
            if (newAllTokenSequencesSet.size() > 64) {
                break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
            }
        }
        allTokenSequencesSet = newAllTokenSequencesSet;
        allTokenSequencesSet.addAll(positionalSynonymSequencesSet);
    }
    LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(allTokenSequencesSet);
    // build the unique
    LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) {
        LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>();
        fixedTokenSequences.add(fixedTokenSequence);
        // "replace" is the token currently being assembled; it is emitted once the
        // next token turns out not to be joined to it by wildcards.
        PackedTokenAttributeImpl replace = null;
        for (PackedTokenAttributeImpl c : tokenSequence) {
            if (replace == null) {
                // First token of the sequence: pull in wildcards directly before it.
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = c.toString();
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(language + pre + token);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = c.toString();
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(pre + termText);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                // Scan the gap between the pending token and this one, collecting
                // wildcards adjacent to this token (prefix) and, after the first
                // non-wildcard character, those nearer the pending token (postfix).
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();
                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = c.toString();
                    // Capture the pending token's state BEFORE replacing it with a fresh
                    // instance; a new PackedTokenAttributeImpl reports start offset 0 and
                    // the default type, which would otherwise be copied onto the merge.
                    int oldPositionIncrement = replace.getPositionIncrement();
                    int oldStartOffset = replace.startOffset();
                    String oldType = replace.type();
                    String replaceTermText = replace.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused") String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + token);
                        replace.setOffset(oldStartOffset, c.endOffset());
                        replace.setType(oldType);
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + termText);
                        replace.setOffset(oldStartOffset, c.endOffset());
                        replace.setType(oldType);
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    // No bridge: emit the pending token (plus trailing wildcards) and
                    // start a new pending token from this one.
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(language + pre + token);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(pre + termText);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = replace.toString();
            PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl();
            terminal.setEmpty().append(replaceTermText + post);
            terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length());
            terminal.setType(replace.type());
            terminal.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(terminal);
        }
    }
    // rebuild fixed list
    ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
        for (PackedTokenAttributeImpl token : tokenSequence) {
            fixed.add(token);
        }
    }
    // reorder by start position and increment
    Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() {
        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });
    // make sure we remove any tokens we have duplicated
    @SuppressWarnings("rawtypes") OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<PackedTokenAttributeImpl>(unique);
    list = fixed;
    // Re-apply the {locale} prefix that was stripped before tokenising.
    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            PackedTokenAttributeImpl currentToken = list.get(j);
            String termText = currentToken.toString();
            currentToken.setEmpty();
            currentToken.append(localePrefix + termText);
        }
    }
    SchemaField sf = schema.getField(field);
    boolean isShingled = false;
    @SuppressWarnings("resource") TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain) ? ((TokenizerChain) sf.getType().getQueryAnalyzer()) : null;
    if (tokenizerChain != null) {
        for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) {
            if (factory instanceof ShingleFilterFactory) {
                isShingled = true;
                break;
            }
        }
    }
    @SuppressWarnings("resource") AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType().getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper) ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer()) : null;
    if (analyzerWrapper != null) {
        // assume if there are no term positions it is shingled ....
        isShingled = true;
    }
    boolean forceConjuncion = rerankPhase == RerankPhase.QUERY_PHASE;
    if (list.size() == 0) {
        return null;
    } else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = nextToken.toString();
        if (!isNumeric && (termText.contains("*") || termText.contains("?"))) {
            return newWildcardQuery(new Term(field, termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                Builder q = newBooleanQuery();
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field, termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q.build();
            } else if (forceConjuncion) {
                // OR over the sequences, AND over the tokens within each sequence.
                BooleanQuery.Builder or = new BooleanQuery.Builder();
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();
                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else if (sf.omitPositions() && isShingled) {
                // shingle
                ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list);
                Query currentQuery;
                BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder();
                for (PackedTokenAttributeImpl shingleToken : nonContained) {
                    String termText = shingleToken.toString();
                    Term term = new Term(field, termText);
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        currentQuery = new org.apache.lucene.search.WildcardQuery(term);
                    } else {
                        currentQuery = new TermQuery(term);
                    }
                    weakPhrase.add(currentQuery, Occur.MUST);
                }
                return weakPhrase.build();
            } else {
                // Word delimiter factory and other odd things generate complex
                // token patterns
                // Smart skip token sequences with small tokens that generate
                // too many wildcards
                // Fall back to the larger pattern
                // e.g Site1* will not do (S ite 1*) or (Site 1*) if 1* matches
                // too much (S ite1*) and (Site1*) will still be OK
                // If we skip all (for just 1* in the input) this is still an
                // issue.
                return generateSpanOrQuery(field, fixedTokenSequences);
            }
        } else {
            if (forceConjuncion) {
                // OR over the sequences, AND over the tokens within each sequence.
                BooleanQuery.Builder or = new BooleanQuery.Builder();
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();
                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else {
                // Chain the tokens into nested span-near queries; tokens that arrive
                // while gap == 0 are buffered and later combined with a span-or.
                SpanQuery spanQuery = null;
                ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                int gap = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    Term term = new Term(field, termText);
                    if (getEnablePositionIncrements()) {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(wildQuery);
                            wrapper.setRewriteMethod(new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (gap == 0) {
                            atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                        } else {
                            if (atSamePositionSpanOrQueryParts.size() == 0) {
                                if (spanQuery == null) {
                                    spanQuery = nextSpanQuery;
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery }, (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                            } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                                if (spanQuery == null) {
                                    spanQuery = atSamePositionSpanOrQueryParts.get(0);
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) }, (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            } else {
                                if (spanQuery == null) {
                                    spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) }, (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            }
                        }
                        gap = nextToken.getPositionIncrement();
                    } else {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(wildQuery);
                            wrapper.setRewriteMethod(new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (spanQuery == null) {
                            spanQuery = new SpanOrQuery(nextSpanQuery);
                        } else {
                            spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery);
                        }
                    }
                }
                // Flush whatever is still buffered for the final position.
                if (atSamePositionSpanOrQueryParts.size() == 0) {
                    return spanQuery;
                } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                    if (spanQuery == null) {
                        spanQuery = atSamePositionSpanOrQueryParts.get(0);
                    } else {
                        spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) }, (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                } else {
                    if (spanQuery == null) {
                        spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                    } else {
                        spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) }, (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                }
            }
        }
    }
}
Aggregations