Use of org.languagetool.Language in the project languagetool by languagetool-org.
Class PolishWordTokenizerTest, method testTokenize.
/**
 * Exercises {@code PolishWordTokenizer.tokenize} on plain text, carriage returns,
 * hyphens, em dashes and number ranges. The compound sentence is tokenized twice:
 * first without a tagger, then after the tagger of {@code Polish} has been set,
 * where the hyphenated compounds are expected to be split into separate tokens.
 *
 * <p>All assertions use the JUnit argument order {@code (expected, actual)} so that
 * failure messages report the values the right way round.
 *
 * <p>NOTE(review): several expectations (e.g. 6 tokens for the first sentence) only
 * add up if the literals contain runs of whitespace; the literals here may have had
 * whitespace collapsed at some point - confirm against the upstream test.
 */
@Test
public void testTokenize() {
  final PolishWordTokenizer wordTokenizer = new PolishWordTokenizer();
  final List<String> tokens = wordTokenizer.tokenize("To jest test");
  assertEquals(6, tokens.size());
  assertEquals("[To, , jest, , , test]", tokens.toString());
  final List<String> tokens2 = wordTokenizer.tokenize("To\rłamie");
  assertEquals(3, tokens2.size());
  assertEquals("[To, \r, łamie]", tokens2.toString());
  //hyphen with no whitespace
  final List<String> tokens3 = wordTokenizer.tokenize("A to jest-naprawdę-test!");
  assertEquals(6, tokens3.size());
  assertEquals("[A, , to, , jest-naprawdę-test, !]", tokens3.toString());
  //hyphen at the end of the word
  final List<String> tokens4 = wordTokenizer.tokenize("Niemiecko- i angielsko-polski");
  assertEquals(6, tokens4.size());
  assertEquals("[Niemiecko, -, , i, , angielsko-polski]", tokens4.toString());
  //hyphen probably instead of mdash
  final List<String> tokens5 = wordTokenizer.tokenize("Widzę krowę -i to dobrze!");
  assertEquals(11, tokens5.size());
  assertEquals("[Widzę, , krowę, , -, i, , to, , dobrze, !]", tokens5.toString());
  //mdash
  final List<String> tokens6 = wordTokenizer.tokenize("A to jest zdanie—rzeczywiście—z wtrąceniem.");
  assertEquals(14, tokens6.size());
  assertEquals("[A, , to, , jest, , zdanie, —, rzeczywiście, —, z, , wtrąceniem, .]", tokens6.toString());
  //compound words with hyphens
  final String compoundSentence = "To jest kobieta-wojownik w polsko-czeskim ubraniu, która wysłała dwa SMS-y.";
  List<String> compoundTokens = wordTokenizer.tokenize(compoundSentence);
  assertEquals(21, compoundTokens.size());
  assertEquals("[To, , jest, , kobieta-wojownik, , w, , polsko-czeskim, , ubraniu, ,, , która, , wysłała, , dwa, , SMS-y, .]", compoundTokens.toString());
  //now setup the tagger...
  Language pl = new Polish();
  wordTokenizer.setTagger(pl.getTagger());
  compoundTokens = wordTokenizer.tokenize(compoundSentence);
  //we should get 4 more tokens: two hyphen tokens and two for the split words
  assertEquals(25, compoundTokens.size());
  assertEquals("[To, , jest, , kobieta, -, wojownik, , " + "w, , polsko, -, czeskim, , ubraniu, ,, " + ", która, , wysłała, , dwa, , SMS-y, .]", compoundTokens.toString());
  compoundTokens = wordTokenizer.tokenize("Miała osiemnaście-dwadzieścia lat.");
  assertEquals(8, compoundTokens.size());
  assertEquals("[Miała, , osiemnaście, -, dwadzieścia, , lat, .]", compoundTokens.toString());
  // now three-part adja-adja-adj...:
  compoundTokens = wordTokenizer.tokenize("Słownik polsko-niemiecko-indonezyjski");
  assertEquals(7, compoundTokens.size());
  assertEquals("[Słownik, , polsko, -, niemiecko, -, indonezyjski]", compoundTokens.toString());
  // number ranges:
  compoundTokens = wordTokenizer.tokenize("Impreza odbędzie się w dniach 1-23 maja.");
  assertEquals(16, compoundTokens.size());
  assertEquals("[Impreza, , odbędzie, , się, , w, , dniach, , 1, -, 23, , maja, .]", compoundTokens.toString());
  // number ranges written with a double hyphen:
  compoundTokens = wordTokenizer.tokenize("Impreza odbędzie się w dniach 1--23 maja.");
  assertEquals(18, compoundTokens.size());
  assertEquals("[Impreza, , odbędzie, , się, , w, , dniach, , 1, -, , -, 23, , maja, .]", compoundTokens.toString());
}
Use of org.languagetool.Language in the project languagetool by languagetool-org.
Class MorfologikBritishSpellerRuleTest, method testSuggestions.
/**
 * Runs the inherited non-variant-specific suggestion checks against a
 * {@code MorfologikBritishSpellerRule} built for {@code BritishEnglish}
 * with the "en" message bundle.
 *
 * @throws IOException propagated from the rule construction or the inherited check
 */
@Test
public void testSuggestions() throws IOException {
  final Language britishEnglish = new BritishEnglish();
  super.testNonVariantSpecificSuggestions(
      new MorfologikBritishSpellerRule(TestTools.getMessages("en"), britishEnglish),
      britishEnglish);
}
Use of org.languagetool.Language in the project languagetool by languagetool-org.
Class MorfologikCanadianSpellerRuleTest, method testSuggestions.
/**
 * Runs the inherited non-variant-specific suggestion checks against a
 * {@code MorfologikCanadianSpellerRule} built for {@code CanadianEnglish}
 * with the "en" message bundle.
 *
 * @throws IOException propagated from the rule construction or the inherited check
 */
@Test
public void testSuggestions() throws IOException {
  final Language canadianEnglish = new CanadianEnglish();
  super.testNonVariantSpecificSuggestions(
      new MorfologikCanadianSpellerRule(TestTools.getMessages("en"), canadianEnglish),
      canadianEnglish);
}
Use of org.languagetool.Language in the project languagetool by languagetool-org.
Class MorfologikSouthAfricanSpellerRuleTest, method testSuggestions.
/**
 * Runs the inherited non-variant-specific suggestion checks against a
 * {@code MorfologikSouthAfricanSpellerRule} built for {@code SouthAfricanEnglish}
 * with the "en" message bundle.
 *
 * @throws IOException propagated from the rule construction or the inherited check
 */
@Test
public void testSuggestions() throws IOException {
  final Language southAfricanEnglish = new SouthAfricanEnglish();
  super.testNonVariantSpecificSuggestions(
      new MorfologikSouthAfricanSpellerRule(TestTools.getMessages("en"), southAfricanEnglish),
      southAfricanEnglish);
}
Use of org.languagetool.Language in the project languagetool by languagetool-org.
Class HunspellRuleTest, method testPerformance.
/**
 * Manual timing run, ignored by default: for every language returned by
 * {@code Languages.get()} it creates a {@code JLanguageTool}, performs two warm-up
 * checks and then prints how many milliseconds a single check of a sentence made of
 * nonsense words takes.
 *
 * <p>Elapsed time is taken from {@link System#nanoTime()} rather than the wall clock,
 * so the measurement is not affected by system clock adjustments.
 *
 * @throws Exception propagated from the {@code JLanguageTool} calls
 */
@Ignore("just for internal performance testing, thus ignored by default")
@Test
public void testPerformance() throws Exception {
  List<Language> allLanguages = Languages.get();
  for (Language language : allLanguages) {
    JLanguageTool langTool = new JLanguageTool(language);
    //HunspellRule rule = new HunspellRule(TestTools.getMessages("German"), language);
    // make sure everything is initialized when actually testing
    langTool.check("warmup");
    langTool.check("anotherwarmup");
    long startNanos = System.nanoTime();
    langTool.check("fdfds fdfdsa fdfdsb fdfdsc fdfdsd fdfdse fdfdsf fdfds fdfdsa fdfdsb fdfdsc fdfdsd fdfdse fdfdsf");
    //String[] w = {"foo", "warmup", "Rechtschreipreform", "Theatrekasse", "Zoobesuck", "Handselvertreter", "Mückenstick", "gewönlich", "Traprennen", "Autoverkehrr"};
    //AnalyzedSentence analyzedSentence = langTool.getAnalyzedSentence("fdfds fdfdsa fdfdsb fdfdsc fdfdsd fdfdse fdfdsf");
    //rule.match(analyzedSentence);
    // integer division truncates to whole milliseconds, matching the original output format
    long elapsedMillis = (System.nanoTime() - startNanos) / 1000000L;
    System.out.println(elapsedMillis + "ms for " + language);
  }
}
Aggregations