Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestPhraseQuery, the method testRandomPhrases:
public void testRandomPhrases() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
  List<List<String>> docs = new ArrayList<>();
  Document d = new Document();
  Field f = newTextField("f", "", Field.Store.NO);
  d.add(f);
  Random r = random();
  int NUM_DOCS = atLeast(10);
  for (int i = 0; i < NUM_DOCS; i++) {
    // must be > 4096 so it spans multiple chunks
    int termCount = TestUtil.nextInt(random(), 4097, 8200);
    List<String> doc = new ArrayList<>();
    StringBuilder sb = new StringBuilder();
    while (doc.size() < termCount) {
      if (r.nextInt(5) == 1 || docs.size() == 0) {
        // make new non-empty-string term
        String term;
        while (true) {
          term = TestUtil.randomUnicodeString(r);
          if (term.length() > 0) {
            break;
          }
        }
        try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
          CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            String text = termAttr.toString();
            doc.add(text);
            sb.append(text).append(' ');
          }
          ts.end();
        }
      } else {
        // pick existing sub-phrase
        List<String> lastDoc = docs.get(r.nextInt(docs.size()));
        int len = TestUtil.nextInt(r, 1, 10);
        int start = r.nextInt(lastDoc.size() - len);
        for (int k = start; k < start + len; k++) {
          String t = lastDoc.get(k);
          doc.add(t);
          sb.append(t).append(' ');
        }
      }
    }
    docs.add(doc);
    f.setStringValue(sb.toString());
    w.addDocument(d);
  }
  IndexReader reader = w.getReader();
  IndexSearcher s = newSearcher(reader);
  w.close();
  // now search
  int num = atLeast(10);
  for (int i = 0; i < num; i++) {
    int docID = r.nextInt(docs.size());
    List<String> doc = docs.get(docID);
    final int numTerm = TestUtil.nextInt(r, 2, 20);
    final int start = r.nextInt(doc.size() - numTerm);
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    StringBuilder sb = new StringBuilder();
    for (int t = start; t < start + numTerm; t++) {
      builder.add(new Term("f", doc.get(t)), t);
      sb.append(doc.get(t)).append(' ');
    }
    PhraseQuery pq = builder.build();
    TopDocs hits = s.search(pq, NUM_DOCS);
    boolean found = false;
    for (int j = 0; j < hits.scoreDocs.length; j++) {
      if (hits.scoreDocs[j].doc == docID) {
        found = true;
        break;
      }
    }
    assertTrue("phrase '" + sb + "' not found; start=" + start + ", it=" + i + ", expected doc " + docID, found);
  }
  reader.close();
  dir.close();
}
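The try-with-resources block in the middle of this test is the standard way to pull terms out of a TokenStream: add a CharTermAttribute, call reset(), consume with incrementToken(), then call end() before the stream is closed. Below is a minimal standalone sketch of that same consume pattern, assuming a stock StandardAnalyzer; the TokenCollector class and its collect method are hypothetical names, not part of the test.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper, not part of the test: collects the terms an analyzer
// produces for a piece of text, using the same reset/incrementToken/end cycle.
public class TokenCollector {
  public static List<String> collect(Analyzer analyzer, String field, String text) throws IOException {
    List<String> terms = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
      CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                        // required before the first incrementToken()
      while (ts.incrementToken()) {
        terms.add(termAttr.toString()); // copy the term; the attribute instance is reused per token
      }
      ts.end();                          // records end-of-stream state
    }                                    // try-with-resources closes the stream
    return terms;
  }

  public static void main(String[] args) throws IOException {
    System.out.println(collect(new StandardAnalyzer(), "f", "Hello Lucene token streams"));
  }
}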
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class EdgeNGramTokenFilterTest, the method testSupplementaryCharacters:
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  TokenStream tk = new KeywordTokenizer();
  ((Tokenizer) tk).setReader(new StringReader(s));
  tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
  final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
  tk.reset();
  for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
    assertTrue(tk.incrementToken());
    assertEquals(0, offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
    final int end = Character.offsetByCodePoints(s, 0, i);
    assertEquals(s.substring(0, end), termAtt.toString());
  }
  assertFalse(tk.incrementToken());
}
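For reference, here is a standalone sketch of the analysis chain the test builds by hand: a KeywordTokenizer feeding an EdgeNGramTokenFilter, consumed through CharTermAttribute. The test above asserts that gram lengths are counted in code points, which is what this sketch prints. The EdgeNGramDemo class name and the sample input are assumptions for illustration only.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Standalone sketch (not part of the test): emit the edge n-grams of a string
// containing a supplementary character, so gram boundaries fall on code points
// rather than on UTF-16 code units.
public class EdgeNGramDemo {
  public static void main(String[] args) throws IOException {
    // 3 code points but 4 chars: the middle code point is supplementary
    String s = "a" + new String(Character.toChars(0x1F600)) + "b";
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(s));
    TokenStream tk = new EdgeNGramTokenFilter(tokenizer, 1, 3);
    CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    tk.reset();
    while (tk.incrementToken()) {
      // prints the 1-, 2-, and 3-code-point prefixes of s
      System.out.println(termAtt.toString());
    }
    tk.end();
    tk.close();
  }
}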
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class NGramTokenFilterTest, the method testSupplementaryCharacters:
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  TokenStream tk = new KeywordTokenizer();
  ((Tokenizer) tk).setReader(new StringReader(s));
  tk = new NGramTokenFilter(tk, minGram, maxGram);
  final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
  tk.reset();
  for (int start = 0; start < codePointCount; ++start) {
    for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int startIndex = Character.offsetByCodePoints(s, 0, start);
      final int endIndex = Character.offsetByCodePoints(s, 0, end);
      assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
    }
  }
  assertFalse(tk.incrementToken());
}
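The nested loops mirror how NGramTokenFilter enumerates grams: every start position in code-point order, and for each start every gram length from minGram up to maxGram, clamped to the end of the input. A small helper like the one below, with an assumed name and not taken from the test, produces the same list of expected substrings by converting code-point indices to char indices with Character.offsetByCodePoints.

import java.util.ArrayList;
import java.util.List;

// Hypothetical helper: builds the list of grams the nested loops above expect,
// slicing on code-point boundaries so supplementary characters are never split.
public class ExpectedNGrams {
  public static List<String> of(String s, int minGram, int maxGram) {
    final int codePointCount = s.codePointCount(0, s.length());
    List<String> grams = new ArrayList<>();
    for (int start = 0; start < codePointCount; ++start) {
      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
        int startIndex = Character.offsetByCodePoints(s, 0, start);
        int endIndex = Character.offsetByCodePoints(s, 0, end);
        grams.add(s.substring(startIndex, endIndex));
      }
    }
    return grams;
  }

  public static void main(String[] args) {
    // "ab" plus one supplementary code point: three code points in total
    String s = "ab" + new String(Character.toChars(0x1F600));
    // prints the five grams of code-point lengths 1 and 2
    System.out.println(of(s, 1, 2));
  }
}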
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestJapaneseNumberFilter, the method analyze:
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();
  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
  // write one analyzed term per line
  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }
  reader.close();
  writer.close();
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
From the class TestJapaneseTokenizer, the method testSurrogates2:
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(10000);
  for (int i = 0; i < numIterations; i++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + i);
    }
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
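The assertion leans on UnicodeUtil.validUTF16String, which accepts a CharSequence (CharTermAttribute implements it) and returns false when the text contains an unpaired surrogate. A tiny sketch of what passes and fails that check, with an assumed class name:

import org.apache.lucene.util.UnicodeUtil;

// Sketch, not from the test: a supplementary code point kept whole is valid
// UTF-16, while a lone high surrogate (a "split" supplementary) is not.
public class SurrogateCheckDemo {
  public static void main(String[] args) {
    String wellFormed = new String(Character.toChars(0x29E3D)); // one supplementary code point, two chars
    String broken = wellFormed.substring(0, 1);                  // lone high surrogate
    System.out.println(UnicodeUtil.validUTF16String(wellFormed)); // true
    System.out.println(UnicodeUtil.validUTF16String(broken));     // false
  }
}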