Search in sources :

Example 6 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class OptimaizeLangDetectorTest method testAllLanguages.

/**
 * Feeds the sample text of each test language to the detector on its own and
 * checks that the language is recognised with reasonable certainty.
 *
 * @throws IOException propagated from the detector, the writer or the test-data helpers
 */
@Test
public void testAllLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector();
    detector.loadModels();
    LanguageWriter writer = new LanguageWriter(detector);
    try {
        for (String language : getTestLanguages()) {
            // Reset before each sample, presumably so earlier text does not
            // influence this detection.
            writer.reset();
            writeTo(language, writer);
            LanguageResult result = detector.detect();
            assertNotNull("no detection result for " + language, result);
            assertTrue(language + " incorrectly detected as " + result, result.isLanguage(language));
            assertTrue(language + " not detected with reasonable certainty: " + result, result.isReasonablyCertain());
        }
    } finally {
        // Release the writer even when an assertion fails.
        writer.close();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 7 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class OptimaizeLangDetectorTest method testMixedLanguages.

/**
 * Writes the sample text of every unordered pair of distinct test languages
 * into the detector (configured with {@code setMixedLanguages(true)}) and checks
 * that the top-ranked result, if any, is not reported as reasonably certain.
 *
 * @throws IOException propagated from the detector, the writer or the test-data helpers
 */
@Test
public void testMixedLanguages() throws IOException {
    LanguageDetector detector = new OptimaizeLangDetector().setMixedLanguages(true);
    detector.loadModels();
    LanguageWriter writer = new LanguageWriter(detector);
    try {
        String[] languages = getTestLanguages();
        for (int i = 0; i < languages.length; i++) {
            String language = languages[i];
            // Start at i + 1 so each pair is tried once and never paired with itself.
            for (int j = i + 1; j < languages.length; j++) {
                String other = languages[j];
                writer.reset();
                writeTo(language, writer);
                writeTo(other, writer);
                List<LanguageResult> results = detector.detectAll();
                // An empty result list is acceptable: nothing was claimed with certainty.
                if (!results.isEmpty()) {
                    LanguageResult result = results.get(0);
                    assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + result, result.isReasonablyCertain());
                }
            }
        }
    } finally {
        // Release the writer even when an assertion fails.
        writer.close();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 8 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class TextLangDetectorTest method test.

/**
 * Runs the detector over each line of {@code text-test.tsv} and checks the
 * detected language against the expected one.
 *
 * <p>Each usable line has exactly two tab-separated fields: the expected
 * language followed by the text to classify. Lines with any other number of
 * fields are skipped. The test is skipped entirely when
 * {@code TextLangDetector.canRun()} is false.
 *
 * @throws Exception if reading the fixture or running the detector fails
 */
@Test
public void test() throws Exception {
    assumeTrue(TextLangDetector.canRun());
    LanguageDetector detector = new TextLangDetector();
    LanguageWriter writer = new LanguageWriter(detector);
    try {
        List<String> lines;
        // Fully qualified so this snippet does not depend on an extra import.
        try (java.io.InputStream stream = TextLangDetectorTest.class.getResourceAsStream("text-test.tsv")) {
            assertNotNull("test resource text-test.tsv not found on the classpath", stream);
            // Decode explicitly instead of using the platform default charset, so the
            // test behaves the same on every machine (fixture assumed to be UTF-8).
            lines = IOUtils.readLines(stream, "UTF-8");
        }
        for (String line : lines) {
            String[] data = line.split("\t");
            if (data.length != 2) {
                continue;
            }
            writer.reset();
            writer.append(data[1]);
            LanguageResult result = detector.detect();
            assertNotNull("no detection result for line: " + line, result);
            assertEquals(data[0], result.getLanguage());
        }
    } finally {
        // Release the writer even when an assertion fails.
        writer.close();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) LanguageWriter(org.apache.tika.language.detect.LanguageWriter) Test(org.junit.Test)

Example 9 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class Language method languageDetectionWithWriter.

/**
 * Demonstrates incremental language detection: a sentence is appended to a
 * {@code LanguageWriter} in several pieces, then the detected language is
 * printed to standard output.
 *
 * @throws IOException propagated from the detector or the writer
 */
public static void languageDetectionWithWriter() throws IOException {
    // TODO support version of LanguageWriter that doesn't need a detector.
    LanguageWriter writer = new LanguageWriter(new OptimaizeLangDetector().loadModels());
    String[] fragments = {
        "Minden emberi lény",
        " szabadon születik és",
        " egyenlő méltósága és",
        " joga van."
    };
    // Feed the text piece by piece, as a streaming caller would.
    for (String fragment : fragments) {
        writer.append(fragment);
    }
    LanguageResult detected = writer.getLanguage();
    System.out.println(detected.getLanguage());
    writer.close();
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) OptimaizeLangDetector(org.apache.tika.langdetect.OptimaizeLangDetector) LanguageWriter(org.apache.tika.language.detect.LanguageWriter)

Example 10 with LanguageDetector

use of org.apache.tika.language.detect.LanguageDetector in project tika by apache.

the class MyFirstTika method parseUsingComponents.

/**
 * Walks through Tika's individual components for a single file: prints the
 * MIME type as detected from the file name, from the stream content and via the
 * configured {@code Detector}, prints the detected language, and finally parses
 * the file.
 *
 * <p>As side effects, {@code metadata} has {@code Metadata.RESOURCE_NAME_KEY}
 * set to {@code filename} and {@code Metadata.CONTENT_TYPE} set to the detected type.
 *
 * @param filename   path of the file to examine
 * @param tikaConfig configuration supplying the MIME registry, detector and parser
 * @param metadata   metadata object that is updated as described above and passed to the parser
 * @return the string form of the {@code BodyContentHandler} after parsing
 * @throws Exception if the file cannot be read, detected or parsed
 */
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
    System.out.println("Examining: [" + filename + "]");
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");
    // This stream is only needed for the content-based check; close it straight away
    // instead of leaking it when a second stream is opened below.
    try (InputStream magicStream = TikaInputStream.get(new File(filename))) {
        System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(magicStream, metadata) + "]");
    }
    // A fresh stream shared by the Detector calls and the parser; closed on exit.
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        Detector detector = tikaConfig.getDetector();
        System.out.println("The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]");
        LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
        LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8));
        System.out.println("The language of this content is: [" + lang.getLanguage() + "]");
        // Get a non-detecting parser that handles all the types it can
        Parser parser = tikaConfig.getParser();
        // Tell it what we think the content is
        MediaType type = detector.detect(stream, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the file parsed to get the content and metadata
        ContentHandler handler = new BodyContentHandler();
        parser.parse(stream, handler, metadata, new ParseContext());
        return handler.toString();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) OptimaizeLangDetector(org.apache.tika.langdetect.OptimaizeLangDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) MediaType(org.apache.tika.mime.MediaType) MimeTypes(org.apache.tika.mime.MimeTypes) File(java.io.File) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Aggregations

LanguageDetector (org.apache.tika.language.detect.LanguageDetector)11 LanguageResult (org.apache.tika.language.detect.LanguageResult)10 LanguageWriter (org.apache.tika.language.detect.LanguageWriter)7 Test (org.junit.Test)7 OptimaizeLangDetector (org.apache.tika.langdetect.OptimaizeLangDetector)4 File (java.io.File)1 InputStream (java.io.InputStream)1 Detector (org.apache.tika.detect.Detector)1 TikaInputStream (org.apache.tika.io.TikaInputStream)1 MediaType (org.apache.tika.mime.MediaType)1 MimeTypes (org.apache.tika.mime.MimeTypes)1 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)1 ParseContext (org.apache.tika.parser.ParseContext)1 Parser (org.apache.tika.parser.Parser)1 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)1 ContentHandler (org.xml.sax.ContentHandler)1