Search in sources :

Example 36 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaGUI method main.

/**
 * Main method. Sets the Swing look and feel to the operating system
 * settings, and starts the Tika GUI with a {@link DigestingParser} that
 * wraps an {@link AutoDetectParser} and is configured with the MD5 and
 * SHA256 digest algorithms.
 *
 * @param args optional; when at least one argument is given,
 *             {@code args[0]} is used as the path of the Tika
 *             configuration file, otherwise the default configuration
 *             is used. Further arguments are not read.
 * @throws Exception if the configuration cannot be loaded or the look
 *                   and feel cannot be set
 */
public static void main(String[] args) throws Exception {
    // Build the default configuration only when no config file was given,
    // instead of always creating it and then throwing it away.
    final TikaConfig config;
    if (args.length > 0) {
        config = new TikaConfig(new File(args[0]));
    } else {
        config = TikaConfig.getDefaultConfig();
    }
    UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
    // invokeLater defers the GUI construction to the Swing event dispatch thread.
    SwingUtilities.invokeLater(new Runnable() {

        @Override
        public void run() {
            AutoDetectParser autoDetectParser = new AutoDetectParser(config);
            CommonsDigester digester = new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256);
            new TikaGUI(new DigestingParser(autoDetectParser, digester)).setVisible(true);
        }
    });
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) File(java.io.File)

Example 37 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ISATabUtils method parseStudy.

/**
 * Reads the given stream as tab-delimited records ({@link CSVFormat#TDF})
 * and writes them to {@code xhtml} as a {@code table} element.
 * <p>
 * The cells of the first record are emitted as {@code th} elements inside
 * {@code thead}; every following record becomes one {@code tr} of
 * {@code td} cells inside {@code tbody}. The {@code thead} and
 * {@code tbody} elements are written even when there are no records.
 *
 * @param stream   the input to read; it is wrapped in a
 *                 {@link CloseShieldInputStream} before being handed to the reader
 * @param xhtml    handler that receives the generated table
 * @param metadata metadata passed on to the {@link AutoDetectReader}
 * @param context  parse context; a {@link TikaConfig} found here is used,
 *                 otherwise the default configuration is used
 * @throws IOException   if reading the stream fails
 * @throws TikaException if thrown by the reader set-up
 * @throws SAXException  if the handler rejects an event
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tikaStream = TikaInputStream.get(stream);
    // Prefer the configuration supplied through the parse context.
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig config = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();
    // The reader detects the character encoding with the configured encoding detector.
    try (AutoDetectReader input = new AutoDetectReader(new CloseShieldInputStream(tikaStream), metadata, config.getEncodingDetector());
        CSVParser parser = new CSVParser(input, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = parser.iterator();
        xhtml.startElement("table");

        // First record, if any, supplies the header cells.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Remaining records become the table body, one row each.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 38 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ISATabUtils method parseAssay.

/**
 * Reads the given stream as tab-delimited records ({@link CSVFormat#TDF})
 * and writes them to {@code xhtml} as a {@code table} element.
 * <p>
 * The cells of the first record are emitted as {@code th} elements inside
 * {@code thead}; every following record becomes one {@code tr} of
 * {@code td} cells inside {@code tbody}. The {@code thead} and
 * {@code tbody} elements are written even when there are no records.
 *
 * @param stream   the input to read; it is wrapped in a
 *                 {@link CloseShieldInputStream} before being handed to the reader
 * @param xhtml    handler that receives the generated table
 * @param metadata metadata passed on to the {@link AutoDetectReader}
 * @param context  parse context; a {@link TikaConfig} found here is used,
 *                 otherwise the default configuration is used
 * @throws IOException   if reading the stream fails
 * @throws TikaException if thrown by the reader set-up
 * @throws SAXException  if the handler rejects an event
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tikaStream = TikaInputStream.get(stream);
    // Prefer the configuration supplied through the parse context.
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig config = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();
    // The reader detects the character encoding with the configured encoding detector.
    try (AutoDetectReader input = new AutoDetectReader(new CloseShieldInputStream(tikaStream), metadata, config.getEncodingDetector());
        CSVParser parser = new CSVParser(input, CSVFormat.TDF)) {
        xhtml.startElement("table");
        Iterator<CSVRecord> rows = parser.iterator();

        // First record, if any, supplies the header cells.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Remaining records become the table body, one row each.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 39 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ObjectRecognitionParserTest method jpegTesorflowTest.

/**
 * Parses the {@code CAT_IMAGE} resource with a {@link Tika} facade built
 * from the {@code CONFIG_FILE} resource and checks that every expected
 * object label appears both in the extracted text and in the
 * {@link ObjectRecognitionParser#MD_KEY} metadata values.
 *
 * @throws IOException   if a resource cannot be read
 * @throws TikaException if the configuration cannot be loaded
 * @throws SAXException  if the configuration cannot be parsed
 */
@Ignore("If tensorflow not available Ignore")
@Test
public void jpegTesorflowTest() throws IOException, TikaException, SAXException {
    try (InputStream stream = loader.getResourceAsStream(CONFIG_FILE)) {
        // JUnit assertion instead of the `assert` keyword, which is skipped unless the JVM runs with -ea.
        Assert.assertNotNull("Config resource not found: " + CONFIG_FILE, stream);
        Tika tika = new Tika(new TikaConfig(stream));
        Metadata metadata = new Metadata();
        try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE)) {
            Assert.assertNotNull("Image resource not found: " + CAT_IMAGE, imageStream);
            String text;
            // Close the reader returned by Tika once its content has been consumed.
            try (Reader reader = tika.parse(imageStream, metadata)) {
                List<String> lines = IOUtils.readLines(reader);
                text = StringUtils.join(lines, " ");
            }
            String[] expectedObjects = { "Egyptian cat", "tabby, tabby cat" };
            String metaValues = StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY), " ");
            for (String expectedObject : expectedObjects) {
                String message = "'" + expectedObject + "' must have been detected";
                Assert.assertTrue(message, text.contains(expectedObject));
                Assert.assertTrue(message, metaValues.contains(expectedObject));
            }
        }
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 40 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ObjectRecognitionParserTest method testREST.

/**
 * Parses the {@code CAT_IMAGE} resource with a {@link Tika} facade built
 * from the {@code CONFIG_REST_FILE} resource and checks that every
 * expected object label appears both in the extracted text and in the
 * {@link ObjectRecognitionParser#MD_KEY} metadata values.
 *
 * @throws Exception if a resource cannot be read, the configuration
 *                   cannot be loaded, or parsing fails
 */
@Ignore("Configure Rest API service")
@Test
public void testREST() throws Exception {
    try (InputStream stream = loader.getResourceAsStream(CONFIG_REST_FILE)) {
        // JUnit assertion instead of the `assert` keyword, which is skipped unless the JVM runs with -ea.
        Assert.assertNotNull("Config resource not found: " + CONFIG_REST_FILE, stream);
        Tika tika = new Tika(new TikaConfig(stream));
        Metadata metadata = new Metadata();
        try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE)) {
            Assert.assertNotNull("Image resource not found: " + CAT_IMAGE, imageStream);
            String text;
            // Close the reader returned by Tika once its content has been consumed.
            try (Reader reader = tika.parse(imageStream, metadata)) {
                text = IOUtils.toString(reader);
            }
            String[] expectedObjects = { "Egyptian cat", "tabby, tabby cat" };
            String metaValues = StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY), " ");
            for (String expectedObject : expectedObjects) {
                String message = "'" + expectedObject + "' must have been detected";
                Assert.assertTrue(message, text.contains(expectedObject));
                Assert.assertTrue(message, metaValues.contains(expectedObject));
            }
        }
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5