use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestAsciiFoldingFilterFactory method testMultiTermAnalysis.
/**
 * Exercises {@code ASCIIFoldingFilterFactory} in four configurations: the plain factory,
 * its multi-term component, the factory with {@code preserveOriginal=true}, and the
 * multi-term component derived from that last factory. Only the direct
 * {@code preserveOriginal=true} factory is expected to emit the unfolded token as well.
 */
public void testMultiTermAnalysis() throws IOException {
  // Default settings: the accented input is folded to plain ASCII.
  final TokenFilterFactory defaultFactory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  final TokenStream defaultInput = new CannedTokenStream(new Token("Été", 0, 3));
  final TokenStream defaultOutput = defaultFactory.create(defaultInput);
  assertTokenStreamContents(defaultOutput, new String[] { "Ete" });

  // Multi-term component of the default factory: same folded output.
  final TokenFilterFactory defaultMultiTerm =
      (TokenFilterFactory) ((MultiTermAwareComponent) defaultFactory).getMultiTermComponent();
  final TokenStream defaultMultiTermInput = new CannedTokenStream(new Token("Été", 0, 3));
  final TokenStream defaultMultiTermOutput = defaultMultiTerm.create(defaultMultiTermInput);
  assertTokenStreamContents(defaultMultiTermOutput, new String[] { "Ete" });

  // preserveOriginal=true: both the folded and the original token are emitted.
  final TokenFilterFactory preservingFactory =
      new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
  final TokenStream preservingInput = new CannedTokenStream(new Token("Été", 0, 3));
  final TokenStream preservingOutput = preservingFactory.create(preservingInput);
  assertTokenStreamContents(preservingOutput, new String[] { "Ete", "Été" });

  // Multi-term component of the preserving factory: only the folded token is expected.
  final TokenFilterFactory preservingMultiTerm =
      (TokenFilterFactory) ((MultiTermAwareComponent) preservingFactory).getMultiTermComponent();
  final TokenStream preservingMultiTermInput = new CannedTokenStream(new Token("Été", 0, 3));
  final TokenStream preservingMultiTermOutput = preservingMultiTerm.create(preservingMultiTermInput);
  assertTokenStreamContents(preservingMultiTermOutput, new String[] { "Ete" });
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPostingsOffsets method testRandom.
/**
 * Indexes randomly generated token streams (terms "a".."d" with random position
 * increments and offsets) into the "content" field and then, for every leaf reader,
 * checks that the postings report the same frequencies, positions and offsets as the
 * tokens that were fed in.
 *
 * <p>Each term's postings are pulled three times: with default flags (doc/freq only
 * checked), with {@code PostingsEnum.POSITIONS} (positions checked, offsets deliberately
 * not requested), and with {@code PostingsEnum.ALL} (positions and offsets checked).
 *
 * <p>Relies on {@code iwc} and the four-argument {@code makeToken} helper, both defined
 * outside this method.
 */
public void testRandom() throws Exception {
  // token -> docID -> tokens
  final Map<String, Map<Integer, List<Token>>> actualTokens = new HashMap<>();
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  final int numDocs = atLeast(20);
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // TODO: randomize what IndexOptions we use; also test
  // changing this up in one IW buffered segment...:
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(random().nextBoolean());
    ft.setStoreTermVectorPositions(random().nextBoolean());
  }
  for (int docCount = 0; docCount < numDocs; docCount++) {
    Document doc = new Document();
    // The "id" doc value lets us map segment-local docIDs back to docCount later.
    doc.add(new NumericDocValuesField("id", docCount));
    List<Token> tokens = new ArrayList<>();
    final int numTokens = atLeast(100);
    int pos = -1;
    int offset = 0;
    for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) {
      final String text;
      if (random().nextBoolean()) {
        text = "a";
      } else if (random().nextBoolean()) {
        text = "b";
      } else if (random().nextBoolean()) {
        text = "c";
      } else {
        text = "d";
      }
      int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
      // The first token of a document must not have a zero position increment.
      if (tokenCount == 0 && posIncr == 0) {
        posIncr = 1;
      }
      final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
      final int tokenOffset = random().nextInt(5);
      final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
      Map<Integer, List<Token>> postingsByDoc = actualTokens.get(text);
      if (postingsByDoc == null) {
        postingsByDoc = new HashMap<>();
        actualTokens.put(text, postingsByDoc);
      }
      List<Token> postings = postingsByDoc.get(docCount);
      if (postings == null) {
        postings = new ArrayList<>();
        postingsByDoc.put(docCount, postings);
      }
      postings.add(token);
      tokens.add(token);
      pos += posIncr;
      // stuff abs position into type:
      token.setType("" + pos);
      offset += offIncr + tokenOffset;
    }
    doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
    w.addDocument(doc);
  }
  final DirectoryReader r = w.getReader();
  w.close();
  final String[] terms = new String[] { "a", "b", "c", "d" };
  for (LeafReaderContext ctx : r.leaves()) {
    // TODO: improve this
    LeafReader sub = ctx.reader();
    final TermsEnum termsEnum = sub.fields().terms("content").iterator();
    PostingsEnum docs = null;
    PostingsEnum docsAndPositions = null;
    PostingsEnum docsAndPositionsAndOffsets = null;
    // Segment-local docID -> value of the "id" field (the docCount used at index time).
    int[] docIDToID = new int[sub.maxDoc()];
    NumericDocValues values = DocValues.getNumeric(sub, "id");
    for (int i = 0; i < sub.maxDoc(); i++) {
      assertEquals(i, values.nextDoc());
      docIDToID[i] = (int) values.longValue();
    }
    for (String term : terms) {
      if (termsEnum.seekExact(new BytesRef(term))) {
        // Pass 1: doc/freq
        docs = termsEnum.postings(docs);
        assertNotNull(docs);
        int doc;
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          assertNotNull(expected);
          assertEquals(expected.size(), docs.freq());
        }
        // Pass 2: doc/freq/pos -- explicitly exclude offsets here by asking for
        // positions only (previously this requested PostingsEnum.ALL).
        docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.POSITIONS);
        assertNotNull(docsAndPositions);
        while ((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositions.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            assertEquals(pos, docsAndPositions.nextPosition());
          }
        }
        // Pass 3: doc/freq/pos/offs -- reuse this pass's own enum rather than the
        // positions-only one, so the two variables never alias each other.
        docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositionsAndOffsets, PostingsEnum.ALL);
        assertNotNull(docsAndPositionsAndOffsets);
        while ((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
          assertNotNull(expected);
          assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
          for (Token token : expected) {
            int pos = Integer.parseInt(token.type());
            assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
            assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
            assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
          }
        }
      }
    }
    // TODO: test advance:
  }
  r.close();
  dir.close();
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestPhraseQuery method testZeroPosIncr.
/**
 * Verifies PhraseQuery matching when the query places several terms at the same position.
 * The indexed document has "a" and "aa" stacked at one position, followed by "b".
 */
public void testZeroPosIncr() throws IOException {
  Directory dir = newDirectory();

  final Token first = new Token();
  first.append("a");
  first.setPositionIncrement(1);

  // Zero increment: "aa" shares its position with "a".
  final Token stacked = new Token();
  stacked.append("aa");
  stacked.setPositionIncrement(0);

  final Token last = new Token();
  last.append("b");
  last.setPositionIncrement(1);

  final Token[] tokens = { first, stacked, last };

  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document document = new Document();
  document.add(new TextField("field", new CannedTokenStream(tokens)));
  writer.addDocument(document);
  IndexReader r = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(r);

  // Sanity check; simple "a b" phrase:
  PhraseQuery simplePhrase = new PhraseQuery.Builder()
      .add(new Term("field", "a"), 0)
      .add(new Term("field", "b"), 1)
      .build();
  assertEquals(1, searcher.search(simplePhrase, 1).totalHits);

  // Now with "a|aa b"
  PhraseQuery stackedPhrase = new PhraseQuery.Builder()
      .add(new Term("field", "a"), 0)
      .add(new Term("field", "aa"), 0)
      .add(new Term("field", "b"), 1)
      .build();
  assertEquals(1, searcher.search(stackedPhrase, 1).totalHits);

  // Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
  PhraseQuery missingTermPhrase = new PhraseQuery.Builder()
      .add(new Term("field", "a"), 0)
      .add(new Term("field", "z"), 0)
      .add(new Term("field", "b"), 1)
      .build();
  assertEquals(0, searcher.search(missingTermPhrase, 1).totalHits);

  r.close();
  dir.close();
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestMultiPhraseQuery method makeToken.
/**
 * Builds a {@code Token} whose term text is {@code text} and whose position increment
 * is {@code posIncr}.
 *
 * @param text    characters appended to the new token
 * @param posIncr position increment set on the new token
 * @return the newly created token
 */
private static Token makeToken(String text, int posIncr) {
  final Token token = new Token();
  token.append(text);
  token.setPositionIncrement(posIncr);
  return token;
}
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TestMultiPhraseQuery method testZeroPosIncrSloppyMpqAnd.
/**
 * MPQ AND mode: builds a MultiPhraseQuery by hand, adding each query token as its own
 * single-term entry at the position accumulated from the tokens' position increments,
 * then runs the shared sloppy check with slop 0 (default), 1 and 2.
 */
public void testZeroPosIncrSloppyMpqAnd() throws IOException {
  final MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder();
  int position = -1;
  for (Token queryToken : INCR_0_QUERY_TOKENS_AND) {
    position = position + queryToken.getPositionIncrement();
    // AND logic: one term per add() call, so every term is required.
    final Term[] singleTerm = new Term[] { new Term("field", queryToken.toString()) };
    builder.add(singleTerm, position);
  }
  // Default slop: no hit expected.
  doTestZeroPosIncrSloppy(builder.build(), 0);
  // Slop 1: still no hit expected.
  builder.setSlop(1);
  doTestZeroPosIncrSloppy(builder.build(), 0);
  // Slop 2: one hit expected.
  builder.setSlop(2);
  doTestZeroPosIncrSloppy(builder.build(), 1);
}
Aggregations