use of org.apache.tika.config.TikaConfig in project tika by apache.
the class TikaGUI method main.
/**
 * Main method. Sets the Swing look and feel to the operating system
 * settings, and starts the Tika GUI with a {@link DigestingParser} that
 * wraps an {@link AutoDetectParser} and computes MD5 and SHA256 digests.
 *
 * @param args optional; when at least one argument is given, the first one
 *             is used as the path of the Tika configuration file to load.
 *             Without arguments the default Tika configuration is used.
 *             Any further arguments are not read.
 * @throws Exception if an error occurs
 */
public static void main(String[] args) throws Exception {
    // Build only the configuration that will actually be used: the default
    // configuration is not constructed when a config file was supplied.
    final TikaConfig config = (args.length > 0)
            ? new TikaConfig(new File(args[0]))
            : TikaConfig.getDefaultConfig();

    UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());

    // Create and show the window via invokeLater, i.e. on the Swing event dispatch thread.
    SwingUtilities.invokeLater(new Runnable() {
        @Override
        public void run() {
            // MAX_MARK is defined elsewhere in the class and is passed through to the
            // digester unchanged (presumably its mark/reset limit; confirm there).
            CommonsDigester digester = new CommonsDigester(
                    MAX_MARK,
                    CommonsDigester.DigestAlgorithm.MD5,
                    CommonsDigester.DigestAlgorithm.SHA256);
            DigestingParser parser = new DigestingParser(new AutoDetectParser(config), digester);
            new TikaGUI(parser).setVisible(true);
        }
    });
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class ISATabUtils method parseStudy.
/**
 * Reads the given stream as tab-delimited text ({@code CSVFormat.TDF}) and
 * writes it to the handler as an XHTML {@code table}.
 * <p>
 * The first record, if any, is emitted as {@code th} cells placed directly
 * inside {@code thead}; every following record becomes one {@code tr} of
 * {@code td} cells inside {@code tbody}. The character encoding is detected
 * with the encoding detector of the {@link TikaConfig} found in the parse
 * context, or of the default configuration when the context has none.
 *
 * @param stream   the input to read
 * @param xhtml    the handler receiving the table elements
 * @param metadata metadata handed to the auto-detecting reader
 * @param context  parse context, consulted for a {@link TikaConfig}
 * @throws IOException   declared by this method; see the reader and CSV parser used
 * @throws TikaException declared by this method; see the Tika calls used
 * @throws SAXException  declared by this method; see the XHTML handler calls used
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);

    // Prefer the configuration carried by the parse context, else the default one.
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig config = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();

    // The reader is given a CloseShieldInputStream around tis, presumably so that
    // closing the reader here leaves the caller's stream open.
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, config.getEncodingDetector());
            CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = csvParser.iterator();

        xhtml.startElement("table");

        // Header: only the first record, and only if the input has one.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: one table row per remaining record.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class ISATabUtils method parseAssay.
/**
 * Reads the given stream as tab-delimited text ({@code CSVFormat.TDF}) and
 * writes it to the handler as an XHTML {@code table}.
 * <p>
 * The first record, if any, is emitted as {@code th} cells placed directly
 * inside {@code thead}; every following record becomes one {@code tr} of
 * {@code td} cells inside {@code tbody}. The character encoding is detected
 * with the encoding detector of the {@link TikaConfig} found in the parse
 * context, or of the default configuration when the context has none.
 *
 * @param stream   the input to read
 * @param xhtml    the handler receiving the table elements
 * @param metadata metadata handed to the auto-detecting reader
 * @param context  parse context, consulted for a {@link TikaConfig}
 * @throws IOException   declared by this method; see the reader and CSV parser used
 * @throws TikaException declared by this method; see the Tika calls used
 * @throws SAXException  declared by this method; see the XHTML handler calls used
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);

    // Use the configuration from the parse context when present, else the default one.
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig config = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();

    // The reader is given a CloseShieldInputStream around tis, presumably so that
    // closing the reader here leaves the caller's stream open.
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, config.getEncodingDetector());
            CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        xhtml.startElement("table");
        Iterator<CSVRecord> records = csvParser.iterator();

        // Header section: filled from the first record when there is one.
        xhtml.startElement("thead");
        if (records.hasNext()) {
            for (String columnName : records.next()) {
                xhtml.startElement("th");
                xhtml.characters(columnName);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body section: each remaining record becomes one table row.
        xhtml.startElement("tbody");
        while (records.hasNext()) {
            CSVRecord current = records.next();
            xhtml.startElement("tr");
            for (String value : current) {
                xhtml.startElement("td");
                xhtml.characters(value);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class ObjectRecognitionParserTest method jpegTesorflowTest.
/**
 * Parses the cat image with a Tika instance built from {@code CONFIG_FILE} and
 * checks that every expected object label appears both in the extracted text
 * and in the metadata values stored under {@code ObjectRecognitionParser.MD_KEY}.
 * Ignored by default, per the annotation below.
 */
@Ignore("If tensorflow not available Ignore")
@Test
public void jpegTesorflowTest() throws IOException, TikaException, SAXException {
    try (InputStream stream = loader.getResourceAsStream(CONFIG_FILE)) {
        // A JUnit assertion instead of the Java 'assert' keyword: the keyword is
        // skipped unless the JVM runs with -ea, which would hide a missing resource.
        Assert.assertNotNull("Config resource not found: " + CONFIG_FILE, stream);
        Tika tika = new Tika(new TikaConfig(stream));
        Metadata metadata = new Metadata();
        try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE)) {
            Assert.assertNotNull("Image resource not found: " + CAT_IMAGE, imageStream);
            // Close the reader returned by Tika once the text has been consumed.
            try (Reader reader = tika.parse(imageStream, metadata)) {
                List<String> lines = IOUtils.readLines(reader);
                // Lines are joined with a space before the labels are searched for.
                String text = StringUtils.join(lines, " ");
                String[] expectedObjects = { "Egyptian cat", "tabby, tabby cat" };
                String metaValues = StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY), " ");
                for (String expectedObject : expectedObjects) {
                    String message = "'" + expectedObject + "' must have been detected";
                    Assert.assertTrue(message, text.contains(expectedObject));
                    Assert.assertTrue(message, metaValues.contains(expectedObject));
                }
            }
        }
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class ObjectRecognitionParserTest method testREST.
/**
 * Parses the cat image with a Tika instance built from {@code CONFIG_REST_FILE}
 * and checks that every expected object label appears both in the extracted
 * text and in the metadata values stored under
 * {@code ObjectRecognitionParser.MD_KEY}.
 * Ignored by default, per the annotation below.
 */
@Ignore("Configure Rest API service")
@Test
public void testREST() throws Exception {
    try (InputStream stream = loader.getResourceAsStream(CONFIG_REST_FILE)) {
        // A JUnit assertion instead of the Java 'assert' keyword: the keyword is
        // skipped unless the JVM runs with -ea, which would hide a missing resource.
        Assert.assertNotNull("Config resource not found: " + CONFIG_REST_FILE, stream);
        Tika tika = new Tika(new TikaConfig(stream));
        Metadata metadata = new Metadata();
        try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE)) {
            Assert.assertNotNull("Image resource not found: " + CAT_IMAGE, imageStream);
            // Close the reader returned by Tika once the text has been consumed.
            try (Reader reader = tika.parse(imageStream, metadata)) {
                String text = IOUtils.toString(reader);
                String[] expectedObjects = { "Egyptian cat", "tabby, tabby cat" };
                String metaValues = StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY), " ");
                for (String expectedObject : expectedObjects) {
                    String message = "'" + expectedObject + "' must have been detected";
                    Assert.assertTrue(message, text.contains(expectedObject));
                    Assert.assertTrue(message, metaValues.contains(expectedObject));
                }
            }
        }
    }
}
Aggregations