Search in sources :

Example 1 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class Language method languageDetection.

/**
 * Runs language detection on a hard-coded sample sentence and prints the outcome.
 *
 * <p>An {@code OptimaizeLangDetector} is created, its models are loaded, the sample is passed
 * to {@code detect}, and the value of {@code getLanguage()} on the result is written to
 * standard output.
 *
 * @throws IOException propagated from the detector calls
 */
public static void languageDetection() throws IOException {
    final String sample = "Alla människor är födda fria och lika i värde och rättigheter.";
    final LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
    final String detected = langDetector.detect(sample).getLanguage();
    System.out.println(detected);
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) OptimaizeLangDetector(org.apache.tika.langdetect.OptimaizeLangDetector)

Example 2 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class LanguageDetectorExample method detectLanguage.

/**
 * Identifies the language of the given text.
 *
 * <p>A new {@code OptimaizeLangDetector} is created and its models are loaded on every call.
 *
 * @param text the text to analyse
 * @return the value of {@code getLanguage()} on the detection result for {@code text}
 * @throws IOException propagated from the detector calls
 */
public String detectLanguage(String text) throws IOException {
    final LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
    return langDetector.detect(text).getLanguage();
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) OptimaizeLangDetector(org.apache.tika.langdetect.OptimaizeLangDetector)

Example 3 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class Lingo24LangDetectorTest method testLanguageDetection.

/**
 * Checks that {@code Lingo24LangDetector} reports the expected language for each sample in
 * the {@code text-test.tsv} resource.
 *
 * <p>Every usable line of the resource consists of exactly two tab-separated fields: the
 * expected language followed by the sample text. Lines with any other number of fields are
 * skipped. The detected language is only asserted for languages for which the detector
 * reports a model. The whole test is skipped through a JUnit assumption when the detector
 * is not available.
 *
 * @throws Exception if reading the test data or using the writer fails
 */
@Test
public void testLanguageDetection() throws Exception {
    // Declared with the concrete type so isAvailable() can be called without a cast.
    Lingo24LangDetector detector = new Lingo24LangDetector();
    assumeTrue(detector.isAvailable());

    // Reusing the test data from OptimaizeLangDetectorTest
    java.io.InputStream testData = Lingo24LangDetectorTest.class.getResourceAsStream("text-test.tsv");
    // Fail with a clear message instead of an opaque error inside readLines.
    assertNotNull("Test resource text-test.tsv not found", testData);
    List<String> lines;
    try {
        lines = IOUtils.readLines(testData);
    } finally {
        // Release the resource stream as soon as its lines are in memory.
        testData.close();
    }

    LanguageWriter writer = new LanguageWriter(detector);
    try {
        for (String line : lines) {
            String[] data = line.split("\t");
            if (data.length != 2) {
                continue;
            }
            writer.reset();
            writer.append(data[1]);
            // Only check supported languages
            if (detector.hasModel(data[0])) {
                LanguageResult result = detector.detect();
                assertNotNull(result);
                assertEquals(data[0], result.getLanguage());
            }
        }
    } finally {
        // Close the writer even when an assertion above fails.
        writer.close();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 4 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class OptimaizeLangDetectorTest method testUniversalDeclarationOfHumanRights.

/*
	 * The complete list of supported languages (as of 0.5) is below.
	 * The ones we have tests for have '*' after the name.
	 * 
    af Afrikaans
    an Aragonese
    ar Arabic
    ast Asturian
    be Belarusian
    br Breton
    ca Catalan
    bg Bulgarian
    bn Bengali
    cs Czech
    cy Welsh
    da Danish *
    de German *
    el Greek *
    en English *
    es Spanish *
    et Estonian
    eu Basque
    fa Persian
    fi Finnish *
    fr French *
    ga Irish
    gl Galician
    gu Gujarati
    he Hebrew
    hi Hindi
    hr Croatian
    ht Haitian
    hu Hungarian
    id Indonesian
    is Icelandic
    it Italian *
    ja Japanese *
    km Khmer
    kn Kannada
    ko Korean
    lt Lithuanian *
    lv Latvian
    mk Macedonian
    ml Malayalam
    mr Marathi
    ms Malay
    mt Maltese
    ne Nepali
    nl Dutch *
    no Norwegian
    oc Occitan
    pa Punjabi
    pl Polish
    pt Portuguese *
    ro Romanian
    ru Russian
    sk Slovak
    sl Slovene
    so Somali
    sq Albanian
    sr Serbian
    sv Swedish *
    sw Swahili
    ta Tamil
    te Telugu
    th Thai *
    tl Tagalog
    tr Turkish
    uk Ukrainian
    ur Urdu
    vi Vietnamese
    yi Yiddish
    zh-CN Simplified Chinese * (just generic Chinese)
    zh-TW Traditional Chinese * (just generic Chinese)
	*/
/**
 * Test correct detection for the many (short) translations of the
 * "Universal Declaration of Human Rights (Article 1)", at
 * http://www.omniglot.com/udhr
 *
 * <p>Texts loaded from {@code udhr-known.txt} must each be detected as the language they
 * are keyed by.
 *
 * <p>Also make sure we get uncertain results for some set of unsupported
 * languages: for texts loaded from {@code udhr-unknown.txt}, any result that is returned
 * must not be reasonably certain.
 *
 * @throws Exception if loading the models, reading the test data or using the writer fails
 */
@Test
public void testUniversalDeclarationOfHumanRights() throws Exception {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();
    LanguageWriter writer = new LanguageWriter(detector);
    try {
        Map<String, String> knownText = getTestLanguages("udhr-known.txt");
        // Iterate entries so each text is obtained without a second map lookup.
        for (Map.Entry<String, String> known : knownText.entrySet()) {
            String language = known.getKey();
            writer.reset();
            writer.append(known.getValue());
            LanguageResult result = detector.detect();
            assertNotNull("Language '" + language + "' wasn't detected", result);
            assertEquals(language, result.getLanguage());
        }

        Map<String, String> unknownText = getTestLanguages("udhr-unknown.txt");
        for (Map.Entry<String, String> unknown : unknownText.entrySet()) {
            writer.reset();
            writer.append(unknown.getValue());
            LanguageResult result = detector.detect();
            // A missing result is acceptable here; only a confident one is an error.
            if (result != null) {
                assertFalse("Unsupported language '" + unknown.getKey() + "' was detected with certainty",
                        result.isReasonablyCertain());
            }
        }
    } finally {
        // Close the writer even when an assertion above fails.
        writer.close();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 5 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class OptimaizeLangDetectorTest method testShortText.

/**
 * Exercises an {@code OptimaizeLangDetector} configured with {@code setShortText(true)}.
 *
 * <p>First checks that empty and whitespace-only input produce a result whose confidence is
 * {@code LanguageConfidence.NONE}. Then, for every language returned by
 * {@code getTestLanguages()} except {@code "ja"}, feeds the writer through
 * {@code writeTo(language, writer, 300)} and asserts that a result exists, matches the
 * language, and is reasonably certain.
 *
 * @throws IOException if loading the models or using the writer fails
 */
@Test
public void testShortText() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector().setShortText(true).loadModels();
    LanguageWriter writer = new LanguageWriter(detector);
    try {
        // First verify that we get no result with empty or very short text.
        writer.append("");
        assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());
        writer.reset();
        writer.append("  ");
        assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

        for (String language : getTestLanguages()) {
            // Short pieces of Japanese are detected as Chinese
            if ("ja".equals(language)) {
                continue;
            }
            // We need at least 300 characters to detect Chinese reliably.
            writer.reset();
            writeTo(language, writer, 300);
            LanguageResult result = detector.detect();
            assertNotNull(String.format(Locale.US, "Language '%s' wasn't detected", language), result);
            assertTrue(String.format(Locale.US, "Language '%s' was detected as '%s'", language, result.getLanguage()), result.isLanguage(language));
            assertTrue(String.format(Locale.US, "Language '%s' isn't reasonably certain: %s", language, result.getConfidence()), result.isReasonablyCertain());
        }
    } finally {
        // Close the writer even when an assertion above fails.
        writer.close();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Aggregations

LanguageDetector (org.apache.tika.language.detect.LanguageDetector)11 LanguageResult (org.apache.tika.language.detect.LanguageResult)10 LanguageWriter (org.apache.tika.language.detect.LanguageWriter)7 Test (org.junit.Test)7 OptimaizeLangDetector (org.apache.tika.langdetect.OptimaizeLangDetector)4 File (java.io.File)1 InputStream (java.io.InputStream)1 Detector (org.apache.tika.detect.Detector)1 TikaInputStream (org.apache.tika.io.TikaInputStream)1 MediaType (org.apache.tika.mime.MediaType)1 MimeTypes (org.apache.tika.mime.MimeTypes)1 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)1 ParseContext (org.apache.tika.parser.ParseContext)1 Parser (org.apache.tika.parser.Parser)1 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)1 ContentHandler (org.xml.sax.ContentHandler)1