
Example 6 with WikipediaTokenizer

Use of org.apache.lucene.analysis.wikipedia.WikipediaTokenizer in the apache/lucene-solr project.

From the class WikipediaTokenizerTest, method testRandomStrings.

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
            return new TokenStreamComponents(tokenizer, tokenizer);
        }
    };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER, 20, false, false);
    a.close();
}
Also used : WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer)
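
As a point of reference, here is a minimal sketch, not taken from the test suite, of how such an Analyzer is consumed directly through the standard TokenStream contract (reset(), incrementToken() until it returns false, then end()). The class name AnalyzerDump and the field name "body" are illustrative only:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

final class AnalyzerDump {

    /** Prints each term with its token type for the given analyzer. */
    static void dump(Analyzer analyzer, String text) throws Exception {
        // TokenStream is Closeable, so try-with-resources releases it.
        try (TokenStream ts = analyzer.tokenStream("body", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            TypeAttribute type = ts.addAttribute(TypeAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term + "\t" + type.type());
            }
            ts.end();
        }
    }
}

Called before a.close(), dump(a, "[[Category:foo]] plain text") should print foo with the wiki-specific CATEGORY type and the remaining words as <ALPHANUM>.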

Example 7 with WikipediaTokenizer

Use of org.apache.lucene.analysis.wikipedia.WikipediaTokenizer in the apache/lucene-solr project.

From the class WikipediaTokenizerTest, method testHandwritten.

public void testHandwritten() throws Exception {
    // make sure all tokens are in only one type
    String test = "[[link]] This is a [[Category:foo]] Category  This is a linked [[:Category:bar none withstanding]] "
        + "Category This is (parens) This is a [[link]]  This is an external URL [http://lucene.apache.org] "
        + "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' "
        + " This is a [[link|display info]]  This is a period.  Here is $3.25 and here is 3.50.  Here's Johnny.  "
        + "==heading== ===sub head=== followed by some text  [[Category:blah| ]] "
        + "''[[Category:ital_cat]]''  here is some that is ''italics [[Category:foo]] but is never closed."
        + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
        + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
        + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
    WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(tf, new String[] {
        "link", "This", "is", "a", "foo", "Category", "This", "is", "a", "linked",
        "bar", "none", "withstanding", "Category", "This", "is", "parens", "This", "is", "a",
        "link", "This", "is", "an", "external", "URL", "http://lucene.apache.org", "Here", "is", "italics",
        "and", "more", "italics", "bold", "and", "five", "quotes", "This", "is", "a",
        "link", "display", "info", "This", "is", "a", "period", "Here", "is", "3.25",
        "and", "here", "is", "3.50", "Here's", "Johnny", "heading", "sub", "head", "followed",
        "by", "some", "text", "blah", "ital", "cat", "here", "is", "some", "that",
        "is", "italics", "foo", "but", "is", "never", "closed", "same", "foo", "goes",
        "for", "this", "and2", "foo", "and", "this", "http://foo.boo.com/test/test/", "Test", "Test",
        "http://foo.boo.com/test/test/test.html", "Test", "Test", "http://foo.boo.com/test/test/test.html?g=b&c=d", "Test", "Test",
        "Citation", "martian", "code"
    }, new String[] {
        INTERNAL_LINK, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
        CATEGORY, CATEGORY, CATEGORY, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
        INTERNAL_LINK, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", EXTERNAL_LINK_URL, "<ALPHANUM>", "<ALPHANUM>", ITALICS,
        "<ALPHANUM>", ITALICS, ITALICS, BOLD, "<ALPHANUM>", BOLD_ITALICS, BOLD_ITALICS, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
        INTERNAL_LINK, INTERNAL_LINK, INTERNAL_LINK, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>",
        "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<APOSTROPHE>", "<ALPHANUM>", HEADING, SUB_HEADING, SUB_HEADING, "<ALPHANUM>",
        "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY, CATEGORY, CATEGORY, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
        "<ALPHANUM>", ITALICS, CATEGORY, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", BOLD, CATEGORY, "<ALPHANUM>",
        "<ALPHANUM>", "<ALPHANUM>", BOLD_ITALICS, CATEGORY, "<ALPHANUM>", "<ALPHANUM>", EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK,
        EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK, EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK, CITATION, "<ALPHANUM>", "<ALPHANUM>"
    });
}
Also used : WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) StringReader(java.io.StringReader)
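
The type constants checked above (CATEGORY, ITALICS, BOLD, HEADING, and so on) are public String fields on WikipediaTokenizer, surfaced per token through TypeAttribute. A minimal sketch, not from the test suite, that uses them to collect only the category names from a piece of markup (the class name CategoryCollector is hypothetical):

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class CategoryCollector {

    /** Returns the terms whose token type is WikipediaTokenizer.CATEGORY. */
    public static List<String> categories(String wikiMarkup) throws Exception {
        WikipediaTokenizer tf = new WikipediaTokenizer(
                WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
        tf.setReader(new StringReader(wikiMarkup));
        CharTermAttribute term = tf.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tf.addAttribute(TypeAttribute.class);
        List<String> out = new ArrayList<>();
        tf.reset();
        while (tf.incrementToken()) {
            // Keep only tokens emitted from [[Category:...]] markup.
            if (WikipediaTokenizer.CATEGORY.equals(type.type())) {
                out.add(term.toString());
            }
        }
        tf.end();
        tf.close();
        return out;
    }
}

Applied to the handwritten input above, the result should line up with exactly those entries of the expected-types array that are CATEGORY.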

Example 8 with WikipediaTokenizer

Use of org.apache.lucene.analysis.wikipedia.WikipediaTokenizer in the apache/lucene-solr project.

From the class WikipediaTokenizerTest, method testLucene1133.

public void testLucene1133() throws Exception {
    Set<String> untoks = new HashSet<>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    //should be exactly the same, regardless of untoks
    WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, untoks);
    tf.setReader(new StringReader(LINK_PHRASES));
    checkLinkPhrases(tf);
    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
    tf = new WikipediaTokenizer(WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
    tf.setReader(new StringReader(test));
    // expected terms, start offsets, end offsets, and position increments
    assertTokenStreamContents(tf,
        new String[] { "a b c d", "e f g", "link", "here", "link", "there", "italics here", "something", "more italics", "h   i   j" },
        new int[] { 11, 32, 42, 47, 56, 61, 71, 86, 98, 124 },
        new int[] { 18, 37, 46, 51, 60, 66, 83, 95, 110, 133 },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
Also used : WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) StringReader(java.io.StringReader) HashSet(java.util.HashSet)
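
Besides TOKENS_ONLY and UNTOKENIZED_ONLY, the tokenizer also defines a BOTH mode, in which each type listed in untoks is emitted twice: once as the full untokenized span and once as the individual words, the span copy being marked with WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG in its FlagsAttribute. A minimal sketch of that mode, not taken from the test suite (the class name BothModeDemo is illustrative):

import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class BothModeDemo {
    public static void main(String[] args) throws Exception {
        Set<String> untoks = new HashSet<>();
        untoks.add(WikipediaTokenizer.CATEGORY);
        WikipediaTokenizer tf = new WikipediaTokenizer(WikipediaTokenizer.BOTH, untoks);
        tf.setReader(new StringReader("[[Category:a b c d]]"));
        CharTermAttribute term = tf.addAttribute(CharTermAttribute.class);
        FlagsAttribute flags = tf.addAttribute(FlagsAttribute.class);
        tf.reset();
        while (tf.incrementToken()) {
            // The untokenized copy of the span carries the marker flag.
            boolean span = (flags.getFlags() & WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG) != 0;
            System.out.println(term + (span ? "  [untokenized span]" : ""));
        }
        tf.end();
        tf.close();
    }
}

This should print the flagged span a b c d followed by the four individual tokens a, b, c, and d.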

Example 9 with WikipediaTokenizer

Use of org.apache.lucene.analysis.wikipedia.WikipediaTokenizer in the apache/lucene-solr project.

From the class WikipediaTokenizerTest, method testLinks.

public void testLinks() throws Exception {
    // the URL portion and the anchor text of an external link get distinct types
    String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
    WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(tf,
        new String[] { "http://lucene.apache.org/java/docs/index.html#news", "here",
            "http://lucene.apache.org/java/docs/index.html?b=c", "here",
            "https://lucene.apache.org/java/docs/index.html?b=c", "here" },
        new String[] { EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK_URL, EXTERNAL_LINK });
}
Also used : WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) StringReader(java.io.StringReader)
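
Because the URL and the anchor text carry different types, the URLs can be recovered on their own, together with their character offsets. A minimal sketch along those lines, not part of the test (the class name ExternalLinkDump is hypothetical):

import java.io.StringReader;
import java.util.Collections;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class ExternalLinkDump {
    public static void main(String[] args) throws Exception {
        String test = "[http://lucene.apache.org/java/docs/index.html#news here]";
        WikipediaTokenizer tf = new WikipediaTokenizer(
                WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
        tf.setReader(new StringReader(test));
        CharTermAttribute term = tf.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tf.addAttribute(TypeAttribute.class);
        OffsetAttribute offset = tf.addAttribute(OffsetAttribute.class);
        tf.reset();
        while (tf.incrementToken()) {
            // Skip the anchor text (EXTERNAL_LINK); keep only the URL tokens.
            if (WikipediaTokenizer.EXTERNAL_LINK_URL.equals(type.type())) {
                System.out.println(term + " at [" + offset.startOffset()
                        + ", " + offset.endOffset() + ")");
            }
        }
        tf.end();
        tf.close();
    }
}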

Aggregations

WikipediaTokenizer (org.apache.lucene.analysis.wikipedia.WikipediaTokenizer): 9 uses
StringReader (java.io.StringReader): 6 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 3 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 3 uses
HashSet (java.util.HashSet): 2 uses
Random (java.util.Random): 1 use
CharArraySet (org.apache.lucene.analysis.CharArraySet): 1 use
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1 use
TokenStream (org.apache.lucene.analysis.TokenStream): 1 use
WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 1 use
EdgeNGramTokenizer (org.apache.lucene.analysis.ngram.EdgeNGramTokenizer): 1 use
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 1 use