Usage of org.languagetool.tokenizers.WordTokenizer in the languagetool project by languagetool-org.
Class LanguageModelTest, method testPerformance:
/**
 * Measures how long n-gram count lookups against {@code model} take.
 *
 * <p>The text in {@code FILE} is read as UTF-8 and split with a {@link WordTokenizer};
 * whitespace-only tokens are ignored. For every run of {@code ngramLength} consecutive
 * tokens the count is fetched from the model, and the count, the n-gram and the lookup
 * time are printed. {@code printStats} is called after every 25 lookups and once more
 * at the end. Lookups performed while the lookup counter is not yet greater than
 * {@code SKIP_FIRST_ITEMS} are not added to the accumulated time (presumably to keep
 * warm-up effects out of the total).
 *
 * @param model the language model to query
 * @param ngramLength the n-gram size to look up; only 2 and 3 are supported
 * @throws IllegalArgumentException if {@code ngramLength} is neither 2 nor 3
 * @throws Exception propagated from reading {@code FILE} or from the model lookup
 */
protected void testPerformance(LuceneLanguageModel model, int ngramLength) throws Exception {
  // Fail fast: reject an unsupported length before the file is read and tokenized.
  if (ngramLength != 2 && ngramLength != 3) {
    throw new IllegalArgumentException("ngram length not supported: " + ngramLength);
  }
  try (FileInputStream fis = new FileInputStream(FILE)) {
    String content = StringTools.readStream(fis, "UTF-8");
    WordTokenizer wordTokenizer = new WordTokenizer();
    List<String> words = wordTokenizer.tokenize(content);
    String prevPrevWord = null;
    String prevWord = null;
    int i = 0;
    long totalMicros = 0;
    for (String word : words) {
      if (word.trim().isEmpty()) {
        continue;
      }
      // Only measure once enough preceding words exist for a full n-gram. Without this,
      // trigram mode would report a "null ..." line for a lookup that never happened.
      boolean hasFullContext = prevWord != null && (ngramLength == 2 || prevPrevWord != null);
      if (hasFullContext) {
        List<String> ngram;
        String ngramText;
        if (ngramLength == 2) {
          ngram = Arrays.asList(prevWord, word);
          ngramText = prevWord + " " + word;
        } else {
          ngram = Arrays.asList(prevPrevWord, prevWord, word);
          ngramText = prevPrevWord + " " + prevWord + " " + word;
        }
        // Take the difference in nanoseconds first and convert once, so the result is
        // not distorted by truncating both timestamps separately.
        long startNanos = System.nanoTime();
        long count = model.getCount(ngram);
        long timeMicros = (System.nanoTime() - startNanos) / 1000;
        long timeMillis = timeMicros / 1000;
        System.out.println(count + "\t\t" + ngramText + ": " + timeMicros + "µs = " + timeMillis + "ms");
        if (i > SKIP_FIRST_ITEMS) {
          totalMicros += timeMicros;
        }
        if (++i % 25 == 0) {
          printStats(i, totalMicros);
        }
      }
      prevPrevWord = prevWord;
      prevWord = word;
    }
    printStats(i, totalMicros);
  }
}
Usage of org.languagetool.tokenizers.WordTokenizer in the languagetool project by languagetool-org.
Class EnglishDisambiguationRuleTest, method setUp:
/**
 * Creates fresh English fixtures before each test: a tagger, a word tokenizer,
 * a sentence tokenizer, the XML rule disambiguator and the demo disambiguator.
 */
@Before
public void setUp() {
  this.tagger = new EnglishTagger();
  this.tokenizer = new WordTokenizer();

  // Each component receives its own English instance, created right before use.
  final English sentenceLanguage = new English();
  this.sentenceTokenizer = new SRXSentenceTokenizer(sentenceLanguage);

  final English disambiguationLanguage = new English();
  this.disambiguator = new XmlRuleDisambiguator(disambiguationLanguage);

  this.disamb2 = new DemoDisambiguator();
}
Usage of org.languagetool.tokenizers.WordTokenizer in the languagetool project by languagetool-org.
Class EnglishTaggerTest, method setUp:
/**
 * Creates a fresh English tagger and word tokenizer before each test.
 */
@Before
public void setUp() {
  this.tagger = new EnglishTagger();
  this.tokenizer = new WordTokenizer();
}
Usage of org.languagetool.tokenizers.WordTokenizer in the languagetool project by languagetool-org.
Class CatalanTaggerTest, method setUp:
/**
 * Creates a fresh Catalan tagger and word tokenizer before each test.
 */
@Before
public void setUp() {
  this.tagger = new CatalanTagger();
  this.tokenizer = new WordTokenizer();
}
Usage of org.languagetool.tokenizers.WordTokenizer in the languagetool project by languagetool-org.
Class SwedishTaggerTest, method setUp:
/**
 * Creates a fresh Swedish tagger and word tokenizer before each test.
 */
@Before
public void setUp() {
  this.tagger = new SwedishTagger();
  this.tokenizer = new WordTokenizer();
}
Aggregations