Use of org.apache.lucene.search.suggest.InputIterator in project lucene-solr by apache.
From the class TestFreeTextSuggester, method testWiki.
@Ignore
public void testWiki() throws Exception {
  final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt");
  // Skip header:
  lfd.nextDoc();
  Analyzer analyzer = new MockAnalyzer(random());
  FreeTextSuggester sug = new FreeTextSuggester(analyzer);
  sug.build(new InputIterator() {
    private int count;

    @Override
    public long weight() {
      return 1;
    }

    @Override
    public BytesRef next() {
      Document doc;
      try {
        doc = lfd.nextDoc();
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
      if (doc == null) {
        return null;
      }
      if (count++ == 10000) {
        return null;
      }
      return new BytesRef(doc.get("body"));
    }

    @Override
    public BytesRef payload() {
      return null;
    }

    @Override
    public boolean hasPayloads() {
      return false;
    }

    @Override
    public Set<BytesRef> contexts() {
      return null;
    }

    @Override
    public boolean hasContexts() {
      return false;
    }
  });
  if (VERBOSE) {
    System.out.println(sug.ramBytesUsed() + " bytes");
    List<LookupResult> results = sug.lookup("general r", 10);
    System.out.println("results:");
    for (LookupResult result : results) {
      System.out.println(" " + result);
    }
  }
  analyzer.close();
}
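The anonymous class above spells out the full InputIterator contract by hand: next() streams suggestion keys, weight() scores the entry most recently returned, and payload()/contexts() are opted out of via hasPayloads()/hasContexts(). For small in-memory inputs, the same build can be driven by InputArrayIterator and Input from the same org.apache.lucene.search.suggest package. A minimal sketch; the sample entries are made up, not from the test:

Analyzer analyzer = new MockAnalyzer(random());
FreeTextSuggester sug = new FreeTextSuggester(analyzer);
// InputArrayIterator implements InputIterator over fixed, pre-weighted entries:
sug.build(new InputArrayIterator(new Input[] {
  new Input("this is a test", 1),
  new Input("this was a test", 1)
}));
List<LookupResult> results = sug.lookup("this i", 10);
analyzer.close();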
Use of org.apache.lucene.search.suggest.InputIterator in project lucene-solr by apache.
From the class WFSTCompletionLookup, method build.
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  count = 0;
  BytesRef scratch = new BytesRef();
  InputIterator iter = new WFSTInputIterator(tempDir, tempFileNamePrefix, iterator);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  BytesRefBuilder previous = null;
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  while ((scratch = iter.next()) != null) {
    long cost = iter.weight();
    if (previous == null) {
      previous = new BytesRefBuilder();
    } else if (scratch.equals(previous.get())) {
      // for duplicate suggestions, the best weight is actually added
      continue;
    }
    Util.toIntsRef(scratch, scratchInts);
    builder.add(scratchInts.get(), cost);
    previous.copyBytes(scratch);
    count++;
  }
  fst = builder.finish();
}
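Because build rejects iterators with payloads or contexts, a caller only needs to supply keys and weights. A minimal usage sketch, assuming the Lucene 7.x-era constructor that takes a temp Directory and file-name prefix; the directory, prefix, and entries here are illustrative:

Directory tempDir = new RAMDirectory(); // any Directory implementation works here
WFSTCompletionLookup lookup = new WFSTCompletionLookup(tempDir, "wfst");
lookup.build(new InputArrayIterator(new Input[] {
  new Input("apache lucene", 5),
  new Input("apache solr", 3)
}));
// onlyMorePopular=false: rank results strictly by the stored weights
List<LookupResult> results = lookup.lookup("apa", false, 2);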
Use of org.apache.lucene.search.suggest.InputIterator in project lucene-solr by apache.
From the class TestFreeTextSuggester, method testRandom.
public void testRandom() throws IOException {
  String[] terms = new String[TestUtil.nextInt(random(), 2, 10)];
  Set<String> seen = new HashSet<>();
  while (seen.size() < terms.length) {
    String token = TestUtil.randomSimpleString(random(), 1, 5);
    if (!seen.contains(token)) {
      terms[seen.size()] = token;
      seen.add(token);
    }
  }
  Analyzer a = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  long totTokens = 0;
  final String[][] docs = new String[numDocs][];
  for (int i = 0; i < numDocs; i++) {
    docs[i] = new String[atLeast(100)];
    if (VERBOSE) {
      System.out.print(" doc " + i + ":");
    }
    for (int j = 0; j < docs[i].length; j++) {
      docs[i][j] = getZipfToken(terms);
      if (VERBOSE) {
        System.out.print(" " + docs[i][j]);
      }
    }
    if (VERBOSE) {
      System.out.println();
    }
    totTokens += docs[i].length;
  }
  int grams = TestUtil.nextInt(random(), 1, 4);
  if (VERBOSE) {
    System.out.println("TEST: " + terms.length + " terms; " + numDocs + " docs; " + grams + " grams");
  }
  // Build suggester model:
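  // FreeTextSuggester(indexAnalyzer, queryAnalyzer, grams, separator): same analyzer for
  // indexing and queries, a random ngram order, and ' ' (0x20) as the ngram token separator.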
  FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte) 0x20);
  sug.build(new InputIterator() {
    int upto;

    @Override
    public BytesRef next() {
      if (upto == docs.length) {
        return null;
      } else {
        StringBuilder b = new StringBuilder();
        for (String token : docs[upto]) {
          b.append(' ');
          b.append(token);
        }
        upto++;
        return new BytesRef(b.toString());
      }
    }

    @Override
    public long weight() {
      return random().nextLong();
    }

    @Override
    public BytesRef payload() {
      return null;
    }

    @Override
    public boolean hasPayloads() {
      return false;
    }

    @Override
    public Set<BytesRef> contexts() {
      return null;
    }

    @Override
    public boolean hasContexts() {
      return false;
    }
  });
  // Build inefficient but hopefully correct model:
  List<Map<String, Integer>> gramCounts = new ArrayList<>(grams);
  for (int gram = 0; gram < grams; gram++) {
    if (VERBOSE) {
      System.out.println("TEST: build model for gram=" + gram);
    }
    Map<String, Integer> model = new HashMap<>();
    gramCounts.add(model);
    for (String[] doc : docs) {
      for (int i = 0; i < doc.length - gram; i++) {
        StringBuilder b = new StringBuilder();
        for (int j = i; j <= i + gram; j++) {
          if (j > i) {
            b.append(' ');
          }
          b.append(doc[j]);
        }
        String token = b.toString();
        Integer curCount = model.get(token);
        if (curCount == null) {
          model.put(token, 1);
        } else {
          model.put(token, 1 + curCount);
        }
        if (VERBOSE) {
          System.out.println(" add '" + token + "' -> count=" + model.get(token));
        }
      }
    }
  }
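  // Run random lookups and check the suggester against the brute-force model: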
  int lookups = atLeast(100);
  for (int iter = 0; iter < lookups; iter++) {
    String[] tokens = new String[TestUtil.nextInt(random(), 1, 5)];
    for (int i = 0; i < tokens.length; i++) {
      tokens[i] = getZipfToken(terms);
    }
    // Maybe trim last token; be sure not to create the
    // empty string:
    int trimStart;
    if (tokens.length == 1) {
      trimStart = 1;
    } else {
      trimStart = 0;
    }
    int trimAt = TestUtil.nextInt(random(), trimStart, tokens[tokens.length - 1].length());
    tokens[tokens.length - 1] = tokens[tokens.length - 1].substring(0, trimAt);
    int num = TestUtil.nextInt(random(), 1, 100);
    StringBuilder b = new StringBuilder();
    for (String token : tokens) {
      b.append(' ');
      b.append(token);
    }
    String query = b.toString();
    query = query.substring(1);
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
    }
    // Expected:
    List<LookupResult> expected = new ArrayList<>();
    double backoff = 1.0;
    seen = new HashSet<>();
    if (VERBOSE) {
      System.out.println(" compute expected");
    }
    for (int i = grams - 1; i >= 0; i--) {
      if (VERBOSE) {
        System.out.println(" grams=" + i);
      }
      if (tokens.length < i + 1) {
        // Don't have enough tokens to use this model
        if (VERBOSE) {
          System.out.println(" skip");
        }
        continue;
      }
      if (i == 0 && tokens[tokens.length - 1].length() == 0) {
        // Never suggest unigrams from empty string:
        if (VERBOSE) {
          System.out.println(" skip unigram priors only");
        }
        continue;
      }
      // Build up "context" ngram:
      b = new StringBuilder();
      for (int j = tokens.length - i - 1; j < tokens.length - 1; j++) {
        b.append(' ');
        b.append(tokens[j]);
      }
      String context = b.toString();
      if (context.length() > 0) {
        context = context.substring(1);
      }
      if (VERBOSE) {
        System.out.println(" context='" + context + "'");
      }
      long contextCount;
      if (context.length() == 0) {
        contextCount = totTokens;
      } else {
        Integer count = gramCounts.get(i - 1).get(context);
        if (count == null) {
          // We never saw this context:
          backoff *= FreeTextSuggester.ALPHA;
          if (VERBOSE) {
            System.out.println(" skip: never saw context");
          }
          continue;
        }
        contextCount = count;
      }
      if (VERBOSE) {
        System.out.println(" contextCount=" + contextCount);
      }
      Map<String, Integer> model = gramCounts.get(i);
      // First pass, gather all predictions for this model:
      if (VERBOSE) {
        System.out.println(" find terms w/ prefix=" + tokens[tokens.length - 1]);
      }
      List<LookupResult> tmp = new ArrayList<>();
      for (String term : terms) {
        if (term.startsWith(tokens[tokens.length - 1])) {
          if (VERBOSE) {
            System.out.println(" term=" + term);
          }
          if (seen.contains(term)) {
            if (VERBOSE) {
              System.out.println(" skip seen");
            }
            continue;
          }
          String ngram = (context + " " + term).trim();
          Integer count = model.get(ngram);
          if (count != null) {
            LookupResult lr = new LookupResult(ngram, (long) (Long.MAX_VALUE * (backoff * (double) count / contextCount)));
            tmp.add(lr);
            if (VERBOSE) {
              System.out.println(" add tmp key='" + lr.key + "' score=" + lr.value);
            }
          }
        }
      }
      // Second pass, trim to only top N, and fold those
      // into overall suggestions:
      Collections.sort(tmp, byScoreThenKey);
      if (tmp.size() > num) {
        tmp.subList(num, tmp.size()).clear();
      }
      for (LookupResult result : tmp) {
        String key = result.key.toString();
        int idx = key.lastIndexOf(' ');
        String lastToken;
        if (idx != -1) {
          lastToken = key.substring(idx + 1);
        } else {
          lastToken = key;
        }
        if (!seen.contains(lastToken)) {
          seen.add(lastToken);
          expected.add(result);
          if (VERBOSE) {
            System.out.println(" keep key='" + result.key + "' score=" + result.value);
          }
        }
      }
      backoff *= FreeTextSuggester.ALPHA;
    }
    Collections.sort(expected, byScoreThenKey);
    if (expected.size() > num) {
      expected.subList(num, expected.size()).clear();
    }
    // Actual:
    List<LookupResult> actual = sug.lookup(query, num);
    if (VERBOSE) {
      System.out.println(" expected: " + expected);
      System.out.println(" actual: " + actual);
    }
    assertEquals(expected.toString(), actual.toString());
  }
  a.close();
}
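The expected-results loop reimplements FreeTextSuggester's "stupid backoff" smoothing by hand: each candidate ngram is scored against its context count, and the score is discounted by FreeTextSuggester.ALPHA every time the test has to fall back from a higher-order gram model to a lower-order one. Condensed into a standalone helper that mirrors the scoring expression in the loop; the method name is illustrative, not from the test:

// score = Long.MAX_VALUE * backoff * count(ngram) / count(context),
// where backoff starts at 1.0 and shrinks by a factor of ALPHA per fallback.
static long stupidBackoffScore(double backoff, long ngramCount, long contextCount) {
  return (long) (Long.MAX_VALUE * (backoff * (double) ngramCount / contextCount));
}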