use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.
the class Lingo24LangDetectorTest method testLanguageDetection.
/**
 * Checks that the Lingo24 detector reports the expected language for every
 * sample row in {@code text-test.tsv} whose language it has a model for.
 *
 * <p>A usable row has exactly two tab-separated fields: the expected language
 * code followed by the sample text. Rows with any other field count are
 * skipped. The whole test depends on the JUnit assumption that the detector
 * reports itself as available.
 *
 * @throws Exception if the test data cannot be read or the writer fails
 */
@Test
public void testLanguageDetection() throws Exception {
    // Declared with the concrete type so isAvailable() can be called without a cast.
    Lingo24LangDetector detector = new Lingo24LangDetector();
    assumeTrue(detector.isAvailable());

    // Reusing the test data from OptimaizeLangDetectorTest.
    // The stream is closed explicitly, and decoded with a fixed charset so the
    // multilingual samples do not depend on the platform default encoding.
    // NOTE(review): assumes text-test.tsv is stored as UTF-8 - confirm.
    List<String> lines;
    try (java.io.InputStream in = Lingo24LangDetectorTest.class.getResourceAsStream("text-test.tsv")) {
        lines = IOUtils.readLines(in, "UTF-8");
    }

    // try-with-resources closes the writer even when an assertion fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (String line : lines) {
            String[] data = line.split("\t");
            if (data.length != 2) {
                continue;
            }
            writer.reset();
            writer.append(data[1]);
            // Only check supported languages
            if (detector.hasModel(data[0])) {
                LanguageResult result = detector.detect();
                assertNotNull(result);
                assertEquals(data[0], result.getLanguage());
            }
        }
    }
}
use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.
the class OptimaizeLangDetectorTest method testUniversalDeclarationOfHumanRights.
/*
* The complete list of supported languages (as of 0.5) is below.
* The ones we have tests for have '*' after the name.
*
af Afrikaans
an Aragonese
ar Arabic
ast Asturian
be Belarusian
br Breton
ca Catalan
bg Bulgarian
bn Bengali
cs Czech
cy Welsh
da Danish *
de German *
el Greek *
en English *
es Spanish *
et Estonian
eu Basque
fa Persian
fi Finnish *
fr French *
ga Irish
gl Galician
gu Gujarati
he Hebrew
hi Hindi
hr Croatian
ht Haitian
hu Hungarian
id Indonesian
is Icelandic
it Italian *
ja Japanese *
km Khmer
kn Kannada
ko Korean
lt Lithuanian *
lv Latvian
mk Macedonian
ml Malayalam
mr Marathi
ms Malay
mt Maltese
ne Nepali
nl Dutch *
no Norwegian
oc Occitan
pa Punjabi
pl Polish
pt Portuguese *
ro Romanian
ru Russian
sk Slovak
sl Slovene
so Somali
sq Albanian
sr Serbian
sv Swedish *
sw Swahili
ta Tamil
te Telugu
th Thai *
tl Tagalog
tr Turkish
uk Ukrainian
ur Urdu
vi Vietnamese
yi Yiddish
zh-CN Simplified Chinese * (just generic Chinese)
zh-TW Traditional Chinese * (just generic Chinese)
*/
/**
 * Test correct detection for the many (short) translations of the
 * "Universal Declaration of Human Rights (Article 1)", at
 * http://www.omniglot.com/udhr
 *
 * Also make sure we get uncertain results for some set of unsupported
 * languages.
 *
 * @throws Exception if the test data cannot be loaded or writing to the
 *         detector fails
 */
@Test
public void testUniversalDeclarationOfHumanRights() throws Exception {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();

    // try-with-resources closes the writer even when an assertion fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        // Known languages: the detected language must match the map key.
        Map<String, String> knownText = getTestLanguages("udhr-known.txt");
        for (Map.Entry<String, String> known : knownText.entrySet()) {
            String language = known.getKey();
            writer.reset();
            writer.append(known.getValue());
            LanguageResult result = detector.detect();
            assertNotNull("Language '" + language + "' wasn't detected", result);
            assertEquals("Wrong language detected for '" + language + "'",
                    language, result.getLanguage());
        }

        // Unknown languages: a result is allowed, but it must not be reported
        // as reasonably certain.
        Map<String, String> unknownText = getTestLanguages("udhr-unknown.txt");
        for (Map.Entry<String, String> unknown : unknownText.entrySet()) {
            writer.reset();
            writer.append(unknown.getValue());
            LanguageResult result = detector.detect();
            if (result != null) {
                assertFalse("Looking for '" + unknown.getKey() + "', got '" + result.getLanguage()
                        + "': " + result.getConfidence(), result.isReasonablyCertain());
            }
        }
    }
}
use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.
the class OptimaizeLangDetectorTest method testShortText.
/**
 * Exercises a detector configured with {@code setShortText(true)}.
 *
 * <p>First checks that empty and single-space input yield
 * {@link LanguageConfidence#NONE}, then checks that each test language
 * (except Japanese) is detected with reasonable certainty from the text
 * written by {@code writeTo(language, writer, 300)}.
 *
 * @throws IOException if writing text to the detector fails
 */
@Test
public void testShortText() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector().setShortText(true).loadModels();

    // try-with-resources closes the writer even when an assertion fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        // First verify that we get no result with empty or very short text.
        writer.append("");
        assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

        writer.reset();
        writer.append(" ");
        assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

        for (String language : getTestLanguages()) {
            // Short pieces of Japanese are detected as Chinese
            if ("ja".equals(language)) {
                continue;
            }

            // We need at least 300 characters to detect Chinese reliably.
            writer.reset();
            writeTo(language, writer, 300);

            LanguageResult result = detector.detect();
            assertNotNull(String.format(Locale.US, "Language '%s' wasn't detected", language), result);
            assertTrue(String.format(Locale.US, "Language '%s' was detected as '%s'", language, result.getLanguage()), result.isLanguage(language));
            assertTrue(String.format(Locale.US, "Language '%s' isn't reasonably certain: %s", language, result.getConfidence()), result.isReasonablyCertain());
        }
    }
}
use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.
the class OptimaizeLangDetectorTest method testAllLanguages.
/**
 * Checks that every language returned by {@code getTestLanguages()} is
 * detected, matches the expected language, and is reported as reasonably
 * certain when its sample text is written via {@code writeTo(language, writer)}.
 *
 * @throws IOException if writing text to the detector fails
 */
@Test
public void testAllLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();

    // The writer was previously never closed here; try-with-resources closes
    // it on both success and assertion failure, matching the sibling tests.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (String language : getTestLanguages()) {
            writer.reset();
            writeTo(language, writer);

            LanguageResult result = detector.detect();
            assertNotNull("Language '" + language + "' wasn't detected", result);
            assertTrue("Language '" + language + "' was detected as '" + result.getLanguage() + "'",
                    result.isLanguage(language));
            assertTrue("Language '" + language + "' isn't reasonably certain: " + result.getConfidence(),
                    result.isReasonablyCertain());
        }
    }
}
use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.
the class OptimaizeLangDetectorTest method testMixedLanguages.
/**
 * Checks that a detector configured with {@code setMixedLanguages(true)} does
 * not report a reasonably certain top result when text from two different
 * test languages is written back to back.
 *
 * <p>Every unordered pair of distinct test languages is tried once.
 *
 * @throws IOException if writing text to the detector fails
 */
@Test
public void testMixedLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector().setMixedLanguages(true);
    detector.loadModels();

    // try-with-resources closes the writer even when an assertion fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        String[] languages = getTestLanguages();
        for (int i = 0; i < languages.length; i++) {
            String language = languages[i];
            // Start at i + 1 so each pair is visited once and never paired with itself.
            for (int j = i + 1; j < languages.length; j++) {
                String other = languages[j];

                writer.reset();
                writeTo(language, writer);
                writeTo(other, writer);

                List<LanguageResult> results = detector.detectAll();
                if (!results.isEmpty()) {
                    // Only the first entry of detectAll() is checked.
                    LanguageResult result = results.get(0);
                    assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + result, result.isReasonablyCertain());
                }
            }
        }
    }
}
Aggregations