Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestAsciiFoldingFilterFactory, method testMultiTermAnalysis.
public void testMultiTermAnalysis() throws IOException {
  // Default factory: "Été" is folded to plain ASCII.
  TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });

  // Its multi-term component folds the same way.
  factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });

  // With preserveOriginal=true, the filter emits both the folded token and the original.
  factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete", "Été" });

  // ...but its multi-term component still emits only the folded form.
  factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });
}
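For readers new to the helper, here is a minimal standalone sketch of what CannedTokenStream does; the class name CannedTokenStreamDemo and the token texts are illustrative, not taken from the test above. The stream simply replays a fixed sequence of pre-built Tokens through the usual TokenStream contract (reset, incrementToken, end, close), which is what makes it convenient for exercising a single filter or factory in isolation.

import java.io.IOException;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class CannedTokenStreamDemo {
  public static void main(String[] args) throws IOException {
    // Two hand-built tokens with explicit offsets; no analyzer is involved.
    TokenStream ts = new CannedTokenStream(new Token("foo", 0, 3), new Token("bar", 4, 7));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsets = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Prints: foo [0,3) then bar [4,7)
      System.out.println(term.toString() + " [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
    }
    ts.end();
    ts.close();
  }
}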
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestIndexWriter, method testEmptyNorm.
public void testEmptyNorm() throws Exception {
  Directory d = newDirectory();
  IndexWriter w = new IndexWriter(d, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  // The field is indexed from a CannedTokenStream with no tokens, i.e. it contains zero terms.
  doc.add(new TextField("foo", new CannedTokenStream()));
  w.addDocument(doc);
  w.commit();
  w.close();
  DirectoryReader r = DirectoryReader.open(d);
  NumericDocValues norms = getOnlyLeafReader(r).getNormValues("foo");
  assertEquals(0, norms.nextDoc());
  assertEquals(0, norms.longValue());
  r.close();
  d.close();
}
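A note on the read side: the norms API used here is iterator-style, which is why the test advances with nextDoc() before calling longValue(). Outside a controlled test, LeafReader.getNormValues can also return null when the field indexed no norms at all, so a defensive read looks roughly like the sketch below. The helper name readFirstNormOrZero is made up for illustration; getOnlyLeafReader is the LuceneTestCase utility already used above.

// A hedged sketch of a defensive norms read for the first document of a single-segment reader.
static long readFirstNormOrZero(DirectoryReader reader, String field) throws IOException {
  NumericDocValues norms = getOnlyLeafReader(reader).getNormValues(field);
  if (norms == null || norms.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
    return 0;   // no norms for this field, or no document has a value
  }
  return norms.longValue();   // 0 in the test above, since "foo" was fed an empty token stream
}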
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestPostingsOffsets, method testRandom.
public void testRandom() throws Exception {
  // token -> docID -> tokens
  final Map<String, Map<Integer, List<Token>>> actualTokens = new HashMap<>();
  Directory dir = newDirectory();
  // iwc is an IndexWriterConfig field that the test class sets up elsewhere (not shown in this excerpt).
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  final int numDocs = atLeast(20);
  //final int numDocs = atLeast(5);
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // TODO: randomize what IndexOptions we use; also test
  // changing this up in one IW buffered segment...:
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(random().nextBoolean());
    ft.setStoreTermVectorPositions(random().nextBoolean());
  }
  for (int docCount = 0; docCount < numDocs; docCount++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("id", docCount));
    List<Token> tokens = new ArrayList<>();
    final int numTokens = atLeast(100);
    //final int numTokens = atLeast(20);
    int pos = -1;
    int offset = 0;
    //System.out.println("doc id=" + docCount);
    for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) {
      final String text;
      if (random().nextBoolean()) {
        text = "a";
      } else if (random().nextBoolean()) {
        text = "b";
      } else if (random().nextBoolean()) {
        text = "c";
      } else {
        text = "d";
      }
      int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
      if (tokenCount == 0 && posIncr == 0) {
        posIncr = 1;
      }
      final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
      final int tokenOffset = random().nextInt(5);
      final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
      if (!actualTokens.containsKey(text)) {
        actualTokens.put(text, new HashMap<Integer, List<Token>>());
      }
      final Map<Integer, List<Token>> postingsByDoc = actualTokens.get(text);
      if (!postingsByDoc.containsKey(docCount)) {
        postingsByDoc.put(docCount, new ArrayList<Token>());
      }
      postingsByDoc.get(docCount).add(token);
      tokens.add(token);
      pos += posIncr;
      // stuff abs position into type:
      token.setType("" + pos);
      offset += offIncr + tokenOffset;
      //System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
    }
    doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
    w.addDocument(doc);
  }
  final DirectoryReader r = w.getReader();
  w.close();
  final String[] terms = new String[] { "a", "b", "c", "d" };
  for (LeafReaderContext ctx : r.leaves()) {
    // TODO: improve this
    LeafReader sub = ctx.reader();
    //System.out.println("\nsub=" + sub);
    final TermsEnum termsEnum = sub.fields().terms("content").iterator();
    PostingsEnum docs = null;
    PostingsEnum docsAndPositions = null;
    PostingsEnum docsAndPositionsAndOffsets = null;
    int[] docIDToID = new int[sub.maxDoc()];
    NumericDocValues values = DocValues.getNumeric(sub, "id");
    for (int i = 0; i < sub.maxDoc(); i++) {
      assertEquals(i, values.nextDoc());
      docIDToID[i] = (int) values.longValue();
    }
    for (String term : terms) {
      //System.out.println(" term=" + term);
      if (termsEnum.seekExact(new BytesRef(term))) {
        docs = termsEnum.postings(docs);
        assertNotNull(docs);
        int doc;
        //System.out.println(" doc/freq");
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docs.freq());
        }
        // explicitly exclude offsets here
        docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
        assertNotNull(docsAndPositions);
        //System.out.println(" doc/freq/pos");
        while ((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositions.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            //System.out.println(" pos=" + pos);
            assertEquals(pos, docsAndPositions.nextPosition());
          }
        }
        docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
        assertNotNull(docsAndPositionsAndOffsets);
        //System.out.println(" doc/freq/pos/offs");
        while ((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          //System.out.println(" doc=" + docIDToID[doc] + " " + expected.size() + " freq");
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            //System.out.println(" pos=" + pos);
            assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
            assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
            assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
          }
        }
      }
    }
    // TODO: test advance:
  }
  r.close();
  dir.close();
}
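One thing to note: the snippet calls a makeToken helper that is defined elsewhere in TestPostingsOffsets and is not shown in this excerpt. It presumably does little more than the following reconstruction (a sketch, not copied from the source): build a test-framework Token with the given term text, position increment, and offsets.

// Presumed shape of the makeToken helper referenced above.
private static Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
  final Token t = new Token();
  t.append(text);
  t.setPositionIncrement(posIncr);
  t.setOffset(startOffset, endOffset);
  return t;
}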
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestPhraseQuery, method testZeroPosIncr.
/** Tests PhraseQuery with terms at the same position in the query. */
public void testZeroPosIncr() throws IOException {
  Directory dir = newDirectory();
  final Token[] tokens = new Token[3];
  tokens[0] = new Token();
  tokens[0].append("a");
  tokens[0].setPositionIncrement(1);
  tokens[1] = new Token();
  tokens[1].append("aa");
  tokens[1].setPositionIncrement(0);
  tokens[2] = new Token();
  tokens[2].append("b");
  tokens[2].setPositionIncrement(1);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(new TextField("field", new CannedTokenStream(tokens)));
  writer.addDocument(doc);
  IndexReader r = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(r);

  // Sanity check; simple "a b" phrase:
  PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
  pqBuilder.add(new Term("field", "a"), 0);
  pqBuilder.add(new Term("field", "b"), 1);
  assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);

  // Now with "a|aa b"
  pqBuilder = new PhraseQuery.Builder();
  pqBuilder.add(new Term("field", "a"), 0);
  pqBuilder.add(new Term("field", "aa"), 0);
  pqBuilder.add(new Term("field", "b"), 1);
  assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);

  // Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
  pqBuilder = new PhraseQuery.Builder();
  pqBuilder.add(new Term("field", "a"), 0);
  pqBuilder.add(new Term("field", "z"), 0);
  pqBuilder.add(new Term("field", "b"), 1);
  assertEquals(0, searcher.search(pqBuilder.build(), 1).totalHits);

  r.close();
  dir.close();
}
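As the last comment above points out, PhraseQuery treats two terms added at the same position as both required, which is why the "a|z b" query finds nothing. When OR semantics at a position are wanted, MultiPhraseQuery is the tool for the job. A hedged sketch against the same index, to be run before the reader is closed (the builder variable name is illustrative):

// Sketch only: OR of "a" and "z" at position 0, followed by "b" at position 1.
MultiPhraseQuery.Builder orAtPos0 = new MultiPhraseQuery.Builder();
orAtPos0.add(new Term[] { new Term("field", "a"), new Term("field", "z") }, 0);
orAtPos0.add(new Term[] { new Term("field", "b") }, 1);
assertEquals(1, searcher.search(orAtPos0.build(), 1).totalHits);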
Use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.
Class TestMultiPhraseQuery, method testZeroPosIncr.
public void testZeroPosIncr() throws IOException {
  Directory dir = new RAMDirectory();
  final Token[] tokens = new Token[3];
  tokens[0] = new Token();
  tokens[0].append("a");
  tokens[0].setPositionIncrement(1);
  tokens[1] = new Token();
  tokens[1].append("b");
  tokens[1].setPositionIncrement(0);
  tokens[2] = new Token();
  tokens[2].append("c");
  tokens[2].setPositionIncrement(0);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(new TextField("field", new CannedTokenStream(tokens)));
  writer.addDocument(doc);
  doc = new Document();
  doc.add(new TextField("field", new CannedTokenStream(tokens)));
  writer.addDocument(doc);
  IndexReader r = writer.getReader();
  writer.close();
  IndexSearcher s = newSearcher(r);
  MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
  // Add the two clause sets, both at position 0 (the else branch builds the same
  // query with the clauses added in the opposite order; in either case both docs
  // should match):
  if (true) {
    mpqb.add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 0);
    mpqb.add(new Term[] { new Term("field", "a") }, 0);
  } else {
    mpqb.add(new Term[] { new Term("field", "a") }, 0);
    mpqb.add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 0);
  }
  TopDocs hits = s.search(mpqb.build(), 2);
  assertEquals(2, hits.totalHits);
  assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
  /*
  for(int hit=0;hit<hits.totalHits;hit++) {
    ScoreDoc sd = hits.scoreDocs[hit];
    System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
  }
  */
  r.close();
  dir.close();
}
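Everything above hinges on the three canned tokens sharing a single position (position increments of 1, 0, 0). A quick way to confirm that is to replay the same tokens and read the PositionIncrementAttribute back off the stream; a standalone sketch with illustrative variable names:

// Sketch: replay the canned tokens and print their absolute positions.
TokenStream ts = new CannedTokenStream(tokens);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
int position = -1;
while (ts.incrementToken()) {
  position += posIncrAtt.getPositionIncrement();
  System.out.println(termAtt.toString() + " @ position " + position);   // a @ 0, b @ 0, c @ 0
}
ts.end();
ts.close();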