Search in sources :

Example 76 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class PolishWordTokenizerTest method testTokenize.

/**
 * Exercises {@code PolishWordTokenizer.tokenize} in two phases: first without
 * a tagger, where the expected output keeps hyphenated compounds as single
 * tokens, and then after a tagger from {@code Polish} has been set, where the
 * expected output splits those compounds at the hyphen.
 *
 * <p>Each case checks both the token count and the exact token list. All
 * assertions use JUnit's {@code assertEquals(expected, actual)} order so that
 * failure messages label the two values correctly.
 */
@Test
public void testTokenize() {
    final PolishWordTokenizer wordTokenizer = new PolishWordTokenizer();
    // consecutive spaces are expected as separate whitespace tokens
    final List<String> tokens = wordTokenizer.tokenize("To jest  test");
    assertEquals(6, tokens.size());
    assertEquals("[To,  , jest,  ,  , test]", tokens.toString());
    // a carriage return is expected as its own token
    final List<String> tokens2 = wordTokenizer.tokenize("To\rłamie");
    assertEquals(3, tokens2.size());
    assertEquals("[To, \r, łamie]", tokens2.toString());
    //hyphen with no whitespace
    final List<String> tokens3 = wordTokenizer.tokenize("A to jest-naprawdę-test!");
    assertEquals(6, tokens3.size());
    assertEquals("[A,  , to,  , jest-naprawdę-test, !]", tokens3.toString());
    //hyphen at the end of the word
    final List<String> tokens4 = wordTokenizer.tokenize("Niemiecko- i angielsko-polski");
    assertEquals(6, tokens4.size());
    assertEquals("[Niemiecko, -,  , i,  , angielsko-polski]", tokens4.toString());
    //hyphen probably instead of mdash
    final List<String> tokens5 = wordTokenizer.tokenize("Widzę krowę -i to dobrze!");
    assertEquals(11, tokens5.size());
    assertEquals("[Widzę,  , krowę,  , -, i,  , to,  , dobrze, !]", tokens5.toString());
    //mdash
    final List<String> tokens6 = wordTokenizer.tokenize("A to jest zdanie—rzeczywiście—z wtrąceniem.");
    assertEquals(14, tokens6.size());
    assertEquals("[A,  , to,  , jest,  , zdanie, —, rzeczywiście, —, z,  , wtrąceniem, .]", tokens6.toString());
    //compound words with hyphens
    final String compoundSentence = "To jest kobieta-wojownik w polsko-czeskim ubraniu, która wysłała dwa SMS-y.";
    List<String> compoundTokens = wordTokenizer.tokenize(compoundSentence);
    assertEquals(21, compoundTokens.size());
    assertEquals("[To,  , jest,  , kobieta-wojownik,  , w,  , polsko-czeskim,  , ubraniu, ,,  , która,  , wysłała,  , dwa,  , SMS-y, .]", compoundTokens.toString());
    //now setup the tagger...
    Language pl = new Polish();
    wordTokenizer.setTagger(pl.getTagger());
    compoundTokens = wordTokenizer.tokenize(compoundSentence);
    //we should get 4 more tokens: two hyphen tokens and two for the split words
    assertEquals(25, compoundTokens.size());
    assertEquals("[To,  , jest,  , kobieta, -, wojownik,  , " + "w,  , polsko, -, czeskim,  , ubraniu, ,,  " + ", która,  , wysłała,  , dwa,  , SMS-y, .]", compoundTokens.toString());
    compoundTokens = wordTokenizer.tokenize("Miała osiemnaście-dwadzieścia lat.");
    assertEquals(8, compoundTokens.size());
    assertEquals("[Miała,  , osiemnaście, -, dwadzieścia,  , lat, .]", compoundTokens.toString());
    // now three-part adja-adja-adj...:
    compoundTokens = wordTokenizer.tokenize("Słownik polsko-niemiecko-indonezyjski");
    assertEquals(7, compoundTokens.size());
    assertEquals("[Słownik,  , polsko, -, niemiecko, -, indonezyjski]", compoundTokens.toString());
    // number ranges:
    compoundTokens = wordTokenizer.tokenize("Impreza odbędzie się w dniach 1-23 maja.");
    assertEquals(16, compoundTokens.size());
    assertEquals("[Impreza,  , odbędzie,  , się,  , w,  , dniach,  , 1, -, 23,  , maja, .]", compoundTokens.toString());
    // number ranges written with a double hyphen: an empty token is expected between the two hyphens
    compoundTokens = wordTokenizer.tokenize("Impreza odbędzie się w dniach 1--23 maja.");
    assertEquals(18, compoundTokens.size());
    assertEquals("[Impreza,  , odbędzie,  , się,  , w,  , dniach,  , 1, -, , -, 23,  , maja, .]", compoundTokens.toString());
}
Also used : Polish(org.languagetool.language.Polish) Language(org.languagetool.Language) Test(org.junit.Test)

Example 77 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class MorfologikBritishSpellerRuleTest method testSuggestions.

/**
 * Builds a {@code MorfologikBritishSpellerRule} for {@code BritishEnglish}
 * using the "en" message bundle and hands both to the inherited
 * {@code testNonVariantSpecificSuggestions} check.
 *
 * @throws IOException declared by the signature; propagated from the calls below
 */
@Test
public void testSuggestions() throws IOException {
    final Language britishEnglish = new BritishEnglish();
    final Rule spellerRule =
        new MorfologikBritishSpellerRule(TestTools.getMessages("en"), britishEnglish);
    // The assertions themselves live in the superclass helper.
    super.testNonVariantSpecificSuggestions(spellerRule, britishEnglish);
}
Also used : BritishEnglish(org.languagetool.language.BritishEnglish) Language(org.languagetool.Language) Rule(org.languagetool.rules.Rule) Test(org.junit.Test)

Example 78 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class MorfologikCanadianSpellerRuleTest method testSuggestions.

/**
 * Builds a {@code MorfologikCanadianSpellerRule} for {@code CanadianEnglish}
 * using the "en" message bundle and hands both to the inherited
 * {@code testNonVariantSpecificSuggestions} check.
 *
 * @throws IOException declared by the signature; propagated from the calls below
 */
@Test
public void testSuggestions() throws IOException {
    final Language canadianEnglish = new CanadianEnglish();
    final Rule spellerRule =
        new MorfologikCanadianSpellerRule(TestTools.getMessages("en"), canadianEnglish);
    // The assertions themselves live in the superclass helper.
    super.testNonVariantSpecificSuggestions(spellerRule, canadianEnglish);
}
Also used : Language(org.languagetool.Language) CanadianEnglish(org.languagetool.language.CanadianEnglish) Rule(org.languagetool.rules.Rule) Test(org.junit.Test)

Example 79 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class MorfologikSouthAfricanSpellerRuleTest method testSuggestions.

/**
 * Builds a {@code MorfologikSouthAfricanSpellerRule} for
 * {@code SouthAfricanEnglish} using the "en" message bundle and hands both to
 * the inherited {@code testNonVariantSpecificSuggestions} check.
 *
 * @throws IOException declared by the signature; propagated from the calls below
 */
@Test
public void testSuggestions() throws IOException {
    final Language southAfricanEnglish = new SouthAfricanEnglish();
    final Rule spellerRule =
        new MorfologikSouthAfricanSpellerRule(TestTools.getMessages("en"), southAfricanEnglish);
    // The assertions themselves live in the superclass helper.
    super.testNonVariantSpecificSuggestions(spellerRule, southAfricanEnglish);
}
Also used : Language(org.languagetool.Language) SouthAfricanEnglish(org.languagetool.language.SouthAfricanEnglish) Rule(org.languagetool.rules.Rule) Test(org.junit.Test)

Example 80 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class HunspellRuleTest method testPerformance.

/**
 * Manual timing aid, disabled by default through {@code @Ignore}.
 *
 * <p>For every language returned by {@code Languages.get()}, creates a
 * {@code JLanguageTool}, runs two warm-up checks, then times a single check of
 * a sentence made of nonsense words and prints the elapsed milliseconds
 * together with the language to standard output. Nothing is asserted.
 *
 * @throws Exception declared by the signature; propagated from the calls below
 */
@Ignore("just for internal performance testing, thus ignored by default")
@Test
public void testPerformance() throws Exception {
    for (Language language : Languages.get()) {
        final JLanguageTool languageTool = new JLanguageTool(language);
        // two throwaway checks so that initialization is not part of the measurement
        languageTool.check("warmup");
        languageTool.check("anotherwarmup");
        final long begin = System.currentTimeMillis();
        languageTool.check("fdfds fdfdsa fdfdsb fdfdsc fdfdsd fdfdse fdfdsf fdfds fdfdsa fdfdsb fdfdsc fdfdsd fdfdse fdfdsf");
        final long elapsedMillis = System.currentTimeMillis() - begin;
        System.out.println(elapsedMillis + "ms for " + language);
    }
}
Also used : Language(org.languagetool.Language) JLanguageTool(org.languagetool.JLanguageTool) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Language (org.languagetool.Language)84 Test (org.junit.Test)23 File (java.io.File)15 ArrayList (java.util.ArrayList)12 JLanguageTool (org.languagetool.JLanguageTool)11 Rule (org.languagetool.rules.Rule)11 RuleMatch (org.languagetool.rules.RuleMatch)10 IOException (java.io.IOException)7 Ignore (org.junit.Ignore)6 StringTools.readerToString (org.languagetool.tools.StringTools.readerToString)5 InputStream (java.io.InputStream)4 English (org.languagetool.language.English)4 BitextRule (org.languagetool.rules.bitext.BitextRule)4 URL (java.net.URL)3 HashSet (java.util.HashSet)3 MultiThreadedJLanguageTool (org.languagetool.MultiThreadedJLanguageTool)3 AmericanEnglish (org.languagetool.language.AmericanEnglish)3 LanguageModel (org.languagetool.languagemodel.LanguageModel)3 LuceneLanguageModel (org.languagetool.languagemodel.LuceneLanguageModel)3 BufferedReader (java.io.BufferedReader)2