Search in sources :

Example 6 with LanguageResult

use of org.apache.tika.language.detect.LanguageResult in project tika by apache.

the class Lingo24LangDetectorTest method testLanguageDetection.

/**
 * Checks that the Lingo24 detector reports the expected language for every sample in
 * {@code text-test.tsv} whose language it has a model for.
 *
 * <p>Each usable line of the resource has the form {@code <language>\t<text>}; lines that
 * do not split into exactly two tab-separated fields are ignored. The whole test is
 * skipped through a JUnit assumption when the detector reports that it is not available.
 *
 * @throws Exception if the test data cannot be read or detection fails
 */
@Test
public void testLanguageDetection() throws Exception {
    Lingo24LangDetector detector = new Lingo24LangDetector();
    assumeTrue(detector.isAvailable());

    // Reusing the test data from OptimaizeLangDetectorTest
    List<String> lines;
    // Close the resource stream once the lines are read instead of leaking it.
    try (java.io.InputStream in = Lingo24LangDetectorTest.class.getResourceAsStream("text-test.tsv")) {
        assertNotNull("Test resource text-test.tsv was not found", in);
        lines = IOUtils.readLines(in);
    }

    // try-with-resources closes the writer even when an assertion below fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (String line : lines) {
            String[] data = line.split("\t");
            if (data.length != 2) {
                continue;
            }
            writer.reset();
            writer.append(data[1]);
            // Only check supported languages
            if (detector.hasModel(data[0])) {
                LanguageResult result = detector.detect();
                assertNotNull(result);
                assertEquals(data[0], result.getLanguage());
            }
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 7 with LanguageResult

use of org.apache.tika.language.detect.LanguageResult in project tika by apache.

the class OptimaizeLangDetectorTest method testUniversalDeclarationOfHumanRights.

/*
	 * The complete list of supported languages (as of 0.5) is below.
	 * The ones we have tests for have '*' after the name.
	 * 
    af Afrikaans
    an Aragonese
    ar Arabic
    ast Asturian
    be Belarusian
    br Breton
    ca Catalan
    bg Bulgarian
    bn Bengali
    cs Czech
    cy Welsh
    da Danish *
    de German *
    el Greek *
    en English *
    es Spanish *
    et Estonian
    eu Basque
    fa Persian
    fi Finnish *
    fr French *
    ga Irish
    gl Galician
    gu Gujarati
    he Hebrew
    hi Hindi
    hr Croatian
    ht Haitian
    hu Hungarian
    id Indonesian
    is Icelandic
    it Italian *
    ja Japanese *
    km Khmer
    kn Kannada
    ko Korean
    lt Lithuanian *
    lv Latvian
    mk Macedonian
    ml Malayalam
    mr Marathi
    ms Malay
    mt Maltese
    ne Nepali
    nl Dutch *
    no Norwegian
    oc Occitan
    pa Punjabi
    pl Polish
    pt Portuguese *
    ro Romanian
    ru Russian
    sk Slovak
    sl Slovene
    so Somali
    sq Albanian
    sr Serbian
    sv Swedish *
    sw Swahili
    ta Tamil
    te Telugu
    th Thai *
    tl Tagalog
    tr Turkish
    uk Ukrainian
    ur Urdu
    vi Vietnamese
    yi Yiddish
    zh-CN Simplified Chinese * (just generic Chinese)
    zh-TW Traditional Chinese * (just generic Chinese)
	*/
/**
 * Test correct detection for the many (short) translations of the
 * "Universal Declaration of Human Rights (Article 1)", at
 * http://www.omniglot.com/udhr
 *
 * <p>Also make sure we get uncertain results for some set of unsupported
 * languages: for those, a result may be returned but it must not be
 * reasonably certain.
 *
 * @throws Exception if the test data cannot be loaded or detection fails
 */
@Test
public void testUniversalDeclarationOfHumanRights() throws Exception {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();

    // try-with-resources closes the writer even when an assertion below fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        // Keys are the expected language, values the text fed to the detector.
        Map<String, String> knownText = getTestLanguages("udhr-known.txt");
        for (Map.Entry<String, String> sample : knownText.entrySet()) {
            String language = sample.getKey();
            writer.reset();
            writer.append(sample.getValue());
            LanguageResult result = detector.detect();
            assertNotNull("No result for language '" + language + "'", result);
            assertEquals("Wrong language detected for '" + language + "'",
                    language, result.getLanguage());
        }

        Map<String, String> unknownText = getTestLanguages("udhr-unknown.txt");
        for (Map.Entry<String, String> sample : unknownText.entrySet()) {
            String language = sample.getKey();
            writer.reset();
            writer.append(sample.getValue());
            LanguageResult result = detector.detect();
            // A missing result is acceptable here; a confident one is not.
            if (result != null) {
                assertFalse("Unsupported language '" + language + "' was detected as '"
                        + result.getLanguage() + "' with reasonable certainty",
                        result.isReasonablyCertain());
            }
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 8 with LanguageResult

use of org.apache.tika.language.detect.LanguageResult in project tika by apache.

the class OptimaizeLangDetectorTest method testShortText.

/**
 * Exercises the detector configured with {@code setShortText(true)}.
 *
 * <p>Empty and whitespace-only input must yield {@link LanguageConfidence#NONE}. Every
 * test language except Japanese must then be detected with reasonable certainty from the
 * text written by {@code writeTo(language, writer, 300)}.
 *
 * @throws IOException if writing to or closing the {@link LanguageWriter} fails
 */
@Test
public void testShortText() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector().setShortText(true).loadModels();

    // try-with-resources closes the writer even when an assertion below fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        // First verify that we get no result with empty or very short text.
        writer.append("");
        assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

        writer.reset();
        writer.append("  ");
        assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

        for (String language : getTestLanguages()) {
            // Short pieces of Japanese are detected as Chinese
            if (language.equals("ja")) {
                continue;
            }

            // We need at least 300 characters to detect Chinese reliably.
            writer.reset();
            writeTo(language, writer, 300);

            LanguageResult result = detector.detect();
            assertNotNull(String.format(Locale.US, "Language '%s' wasn't detected", language), result);
            assertTrue(String.format(Locale.US, "Language '%s' was detected as '%s'", language, result.getLanguage()), result.isLanguage(language));
            assertTrue(String.format(Locale.US, "Language '%s' isn't reasonably certain: %s", language, result.getConfidence()), result.isReasonablyCertain());
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 9 with LanguageResult

use of org.apache.tika.language.detect.LanguageResult in project tika by apache.

the class OptimaizeLangDetectorTest method testAllLanguages.

/**
 * Verifies that, for every language returned by {@code getTestLanguages()}, the text
 * written by {@code writeTo(language, writer)} is detected as that language with
 * reasonable certainty.
 *
 * @throws IOException if writing to or closing the {@link LanguageWriter} fails
 */
@Test
public void testAllLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();

    // The writer was previously never closed; try-with-resources closes it on every path.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (String language : getTestLanguages()) {
            writer.reset();
            writeTo(language, writer);

            LanguageResult result = detector.detect();
            assertNotNull("Language '" + language + "' wasn't detected", result);
            assertTrue("Language '" + language + "' was detected as '" + result.getLanguage() + "'",
                    result.isLanguage(language));
            assertTrue("Language '" + language + "' isn't reasonably certain: " + result.getConfidence(),
                    result.isReasonablyCertain());
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 10 with LanguageResult

use of org.apache.tika.language.detect.LanguageResult in project tika by apache.

the class OptimaizeLangDetectorTest method testMixedLanguages.

/**
 * Verifies that, with {@code setMixedLanguages(true)}, text combining two different test
 * languages is never reported with reasonable certainty.
 *
 * <p>Every unordered pair of distinct test languages is tried once. Only the first entry
 * of {@code detectAll()} is inspected, and an empty result list is accepted.
 *
 * @throws IOException if writing to or closing the {@link LanguageWriter} fails
 */
@Test
public void testMixedLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector().setMixedLanguages(true);
    detector.loadModels();
    String[] languages = getTestLanguages();

    // try-with-resources closes the writer even when an assertion below fails.
    try (LanguageWriter writer = new LanguageWriter(detector)) {
        for (int i = 0; i < languages.length; i++) {
            String language = languages[i];
            // Start at i + 1 so each pair is visited once and never paired with itself.
            for (int j = i + 1; j < languages.length; j++) {
                String other = languages[j];
                writer.reset();
                writeTo(language, writer);
                writeTo(other, writer);

                List<LanguageResult> results = detector.detectAll();
                if (!results.isEmpty()) {
                    LanguageResult result = results.get(0);
                    assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + result, result.isReasonablyCertain());
                }
            }
        }
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Aggregations

LanguageResult (org.apache.tika.language.detect.LanguageResult)20 LanguageDetector (org.apache.tika.language.detect.LanguageDetector)10 OptimaizeLangDetector (org.apache.tika.langdetect.OptimaizeLangDetector)7 LanguageWriter (org.apache.tika.language.detect.LanguageWriter)7 Test (org.junit.Test)6 Consumes (javax.ws.rs.Consumes)3 POST (javax.ws.rs.POST)3 PUT (javax.ws.rs.PUT)3 Path (javax.ws.rs.Path)3 Produces (javax.ws.rs.Produces)3 ArrayList (java.util.ArrayList)2 LanguageHandler (org.apache.tika.language.detect.LanguageHandler)2 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)2 ParseContext (org.apache.tika.parser.ParseContext)2 ContentHandler (org.xml.sax.ContentHandler)2 DetectedLanguage (com.optimaize.langdetect.DetectedLanguage)1 File (java.io.File)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 Detector (org.apache.tika.detect.Detector)1