Search in sources:

Example 1 with LanguageWriter

use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.

the class Lingo24LangDetectorTest method testLanguageDetection.

/**
 * Checks that {@link Lingo24LangDetector} reports the expected language for
 * every sample in {@code text-test.tsv} for which it has a model.
 *
 * <p>Only lines made of exactly two tab-separated fields (expected language
 * code, then sample text) are used; any other line is skipped. The test is
 * skipped rather than failed when the detector reports it is not available.
 *
 * @throws Exception if the test data cannot be read or detection fails
 */
@Test
public void testLanguageDetection() throws Exception {
    LanguageDetector detector = new Lingo24LangDetector();
    assumeTrue(((Lingo24LangDetector) detector).isAvailable());
    // Reusing the test data from OptimaizeLangDetectorTest.
    // Decode with an explicit charset instead of the platform default
    // (assumes the resource is UTF-8 encoded -- TODO confirm).
    List<String> lines = IOUtils.readLines(
            Lingo24LangDetectorTest.class.getResourceAsStream("text-test.tsv"), "UTF-8");
    // try-with-resources closes the writer even when an assertion fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (String line : lines) {
            String[] data = line.split("\t");
            if (data.length != 2) {
                continue;
            }
            // Drop the previous sample before feeding the next one.
            writer.reset();
            writer.append(data[1]);
            // Only check supported languages
            if (detector.hasModel(data[0])) {
                LanguageResult result = detector.detect();
                assertNotNull(result);
                assertEquals(data[0], result.getLanguage());
            }
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 2 with LanguageWriter

use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.

the class OptimaizeLangDetectorTest method testUniversalDeclarationOfHumanRights.

/*
	 * The complete list of supported languages (as of 0.5) is below.
	 * The ones we have tests for have '*' after the name.
	 * 
    af Afrikaans
    an Aragonese
    ar Arabic
    ast Asturian
    be Belarusian
    br Breton
    ca Catalan
    bg Bulgarian
    bn Bengali
    cs Czech
    cy Welsh
    da Danish *
    de German *
    el Greek *
    en English *
    es Spanish *
    et Estonian
    eu Basque
    fa Persian
    fi Finnish *
    fr French *
    ga Irish
    gl Galician
    gu Gujarati
    he Hebrew
    hi Hindi
    hr Croatian
    ht Haitian
    hu Hungarian
    id Indonesian
    is Icelandic
    it Italian *
    ja Japanese *
    km Khmer
    kn Kannada
    ko Korean
    lt Lithuanian *
    lv Latvian
    mk Macedonian
    ml Malayalam
    mr Marathi
    ms Malay
    mt Maltese
    ne Nepali
    nl Dutch *
    no Norwegian
    oc Occitan
    pa Punjabi
    pl Polish
    pt Portuguese *
    ro Romanian
    ru Russian
    sk Slovak
    sl Slovene
    so Somali
    sq Albanian
    sr Serbian
    sv Swedish *
    sw Swahili
    ta Tamil
    te Telugu
    th Thai *
    tl Tagalog
    tr Turkish
    uk Ukrainian
    ur Urdu
    vi Vietnamese
    yi Yiddish
    zh-CN Simplified Chinese * (just generic Chinese)
    zh-TW Traditional Chinese * (just generic Chinese)
	*/
/**
 * Test correct detection for the many (short) translations of the
 * "Universal Declaration of Human Rights (Article 1)", at
 * http://www.omniglot.com/udhr
 *
 * <p>Every text loaded from {@code udhr-known.txt} must be detected as the
 * language it is keyed by. For the texts loaded from {@code udhr-unknown.txt}
 * (unsupported languages) the detector may return no result at all, but any
 * result it does return must not be reasonably certain.
 *
 * @throws Exception if the models or the test data cannot be loaded, or
 *         detection fails
 */
@Test
public void testUniversalDeclarationOfHumanRights() throws Exception {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();
    // try-with-resources closes the writer even when an assertion fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        Map<String, String> knownText = getTestLanguages("udhr-known.txt");
        for (Map.Entry<String, String> known : knownText.entrySet()) {
            String language = known.getKey();
            // Drop the previous sample before feeding the next one.
            writer.reset();
            writer.append(known.getValue());
            LanguageResult result = detector.detect();
            assertNotNull("No result for language '" + language + "'", result);
            assertEquals(language, result.getLanguage());
        }
        Map<String, String> unknownText = getTestLanguages("udhr-unknown.txt");
        for (Map.Entry<String, String> unknown : unknownText.entrySet()) {
            writer.reset();
            writer.append(unknown.getValue());
            LanguageResult result = detector.detect();
            // A missing result is acceptable for an unsupported language.
            if (result != null) {
                assertFalse("Unsupported language '" + unknown.getKey()
                        + "' was confidently detected as '" + result.getLanguage() + "'",
                        result.isReasonablyCertain());
            }
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 3 with LanguageWriter

use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.

the class OptimaizeLangDetectorTest method testShortText.

/**
 * Exercises a detector configured with {@code setShortText(true)}.
 *
 * <p>Empty and whitespace-only input must yield
 * {@link LanguageConfidence#NONE}. After that, for every test language except
 * Japanese, the text produced by {@code writeTo(language, writer, 300)} must
 * be detected as that language with reasonable certainty.
 *
 * @throws IOException declared because the model loading, helper and writer
 *         calls used here may throw it
 */
@Test
public void testShortText() throws IOException {
    LanguageDetector shortTextDetector = new OptimaizeLangDetector().setShortText(true).loadModels();
    LanguageWriter sink = new LanguageWriter(shortTextDetector);

    // Neither empty nor blank text may produce any confidence.
    sink.append("");
    assertEquals(LanguageConfidence.NONE, shortTextDetector.detect().getConfidence());
    sink.reset();
    sink.append("  ");
    assertEquals(LanguageConfidence.NONE, shortTextDetector.detect().getConfidence());

    for (String expected : getTestLanguages()) {
        // Short pieces of Japanese are detected as Chinese, so leave them out.
        if (!expected.equals("ja")) {
            // We need at least 300 characters to detect Chinese reliably.
            sink.reset();
            writeTo(expected, sink, 300);
            LanguageResult detected = shortTextDetector.detect();

            // Each message is built only after the preceding assertion has
            // passed, so a null result fails the first assertion cleanly.
            String missingMessage = String.format(Locale.US, "Language '%s' wasn't detected", expected);
            assertNotNull(missingMessage, detected);

            String mismatchMessage = String.format(Locale.US, "Language '%s' was detected as '%s'", expected, detected.getLanguage());
            assertTrue(mismatchMessage, detected.isLanguage(expected));

            String uncertainMessage = String.format(Locale.US, "Language '%s' isn't reasonably certain: %s", expected, detected.getConfidence());
            assertTrue(uncertainMessage, detected.isReasonablyCertain());
        }
    }
    sink.close();
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 4 with LanguageWriter

use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.

the class OptimaizeLangDetectorTest method testAllLanguages.

/**
 * Verifies that the text written by {@code writeTo(language, writer)} for
 * each test language is detected as that language with reasonable certainty.
 *
 * @throws IOException if the models or the test text cannot be loaded
 */
@Test
public void testAllLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();
    // The writer was previously never closed here; try-with-resources closes
    // it on every exit path, including assertion failures.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (String language : getTestLanguages()) {
            // Drop the previous language's text before feeding the next one.
            writer.reset();
            writeTo(language, writer);
            LanguageResult result = detector.detect();
            assertNotNull(result);
            assertTrue(result.isLanguage(language));
            assertTrue(result.isReasonablyCertain());
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 5 with LanguageWriter

use of org.apache.tika.language.detect.LanguageWriter in project tika by apache.

the class OptimaizeLangDetectorTest method testMixedLanguages.

/**
 * Feeds the detector, configured with {@code setMixedLanguages(true)}, the
 * combined text of every unordered pair of distinct test languages and
 * requires that the first result, when there is one, is not reasonably
 * certain.
 *
 * @throws IOException declared because the model loading, helper and writer
 *         calls used here may throw it
 */
@Test
public void testMixedLanguages() throws IOException {
    LanguageDetector mixedDetector = new OptimaizeLangDetector().setMixedLanguages(true);
    mixedDetector.loadModels();
    LanguageWriter sink = new LanguageWriter(mixedDetector);
    String[] codes = getTestLanguages();
    for (int first = 0; first < codes.length; first++) {
        // Start at first + 1 so each pair is visited once and no language
        // is paired with itself.
        for (int second = first + 1; second < codes.length; second++) {
            String language = codes[first];
            String other = codes[second];
            sink.reset();
            writeTo(language, sink);
            writeTo(other, sink);
            List<LanguageResult> candidates = mixedDetector.detectAll();
            if (candidates.isEmpty()) {
                // Nothing detected at all is acceptable for mixed input.
                continue;
            }
            LanguageResult top = candidates.get(0);
            String failure = "mix of " + language + " and " + other + " incorrectly detected as " + top;
            assertFalse(failure, top.isReasonablyCertain());
        }
    }
    sink.close();
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Aggregations

LanguageDetector (org.apache.tika.language.detect.LanguageDetector)7 LanguageResult (org.apache.tika.language.detect.LanguageResult)7 LanguageWriter (org.apache.tika.language.detect.LanguageWriter)7 Test (org.junit.Test)6 OptimaizeLangDetector (org.apache.tika.langdetect.OptimaizeLangDetector)1