Use of org.apache.tika.Tika in the project tika by apache.
Class TikaEncodingDetectorTest, method testEncodingDetectorConfigurability.
/**
 * Checks that encoding detection fails when Tika is configured from
 * {@code TIKA-2273-no-icu4j-encoding-detector.xml}.
 *
 * <p>Both an {@link AutoDetectParser} and a {@link Tika} facade built from that
 * configuration are expected to throw a {@link TikaException} whose message
 * contains "Failed to detect" when given {@code english.cp500.txt}.
 *
 * @throws Exception if the configuration cannot be loaded, or if parsing fails
 *         with anything other than the expected {@link TikaException}
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig tikaConfig;
    // Release the configuration stream as soon as the config has been built.
    // NOTE(review): assumes getResourceAsStream returns a java.io.InputStream
    // and that TikaConfig does not close it itself; a second close is harmless.
    try (java.io.InputStream configStream = getResourceAsStream("/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }

    // Path 1: parsing through an AutoDetectParser built from the config
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    try {
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }

    // Path 2: the same expectation through the Tika facade
    Tika tika = new Tika(tikaConfig);
    try {
        tika.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
Use of org.apache.tika.Tika in the project tika by apache.
Class SimpleTextExtractor, method main.
/**
 * Extracts the text of every file named on the command line and writes it to
 * standard output.
 *
 * @param args paths of the files whose text should be extracted
 * @throws Exception if {@code Tika#parseToString} fails for one of the files
 */
public static void main(String[] args) throws Exception {
    // One default-configured facade is reused for all input files
    Tika extractor = new Tika();
    for (int i = 0; i < args.length; i++) {
        File input = new File(args[i]);
        // Printed as-is: no separator is emitted between files
        System.out.print(extractor.parseToString(input));
    }
}
Use of org.apache.tika.Tika in the project tika by apache.
Class AudioParserTest, method testAU.
/**
 * Parses {@code /test-documents/testAU.au} through the {@link Tika} facade and
 * checks the reported content type, the audio metadata fields, and that no
 * text content is extracted.
 *
 * @throws Exception if the document cannot be read or parsed
 */
@Test
public void testAU() throws Exception {
    Metadata audioMetadata = new Metadata();
    Tika tika = new Tika();
    String extractedText = tika.parseToString(
            AudioParserTest.class.getResourceAsStream("/test-documents/testAU.au"),
            audioMetadata);

    // Detected media type
    assertEquals("audio/basic", audioMetadata.get(Metadata.CONTENT_TYPE));

    // Audio format properties, stored in the metadata as strings
    assertEquals("44100.0", audioMetadata.get("samplerate"));
    assertEquals("2", audioMetadata.get("channels"));
    assertEquals("16", audioMetadata.get("bits"));
    assertEquals("PCM_SIGNED", audioMetadata.get("encoding"));

    // The extracted text is expected to be empty for this file
    assertEquals("", extractedText);
}
Use of org.apache.tika.Tika in the project tika by apache.
Class MidiParserTest, method testMID.
/**
 * Parses {@code /test-documents/testMID.mid} through the {@link Tika} facade
 * and checks the reported content type, the MIDI metadata fields, and that the
 * extracted text contains "Untitled".
 *
 * @throws Exception if the document cannot be read or parsed
 */
@Test
public void testMID() throws Exception {
    Metadata midiMetadata = new Metadata();
    Tika tika = new Tika();
    String extractedText = tika.parseToString(
            MidiParserTest.class.getResourceAsStream("/test-documents/testMID.mid"),
            midiMetadata);

    // Detected media type
    assertEquals("audio/midi", midiMetadata.get(Metadata.CONTENT_TYPE));

    // MIDI sequence properties, stored in the metadata as strings
    assertEquals("2", midiMetadata.get("tracks"));
    assertEquals("0", midiMetadata.get("patches"));
    assertEquals("PPQ", midiMetadata.get("divisionType"));

    // Text pulled from the file
    assertContains("Untitled", extractedText);
}
Use of org.apache.tika.Tika in the project tika by apache.
Class HtmlParserTest, method XtestParseUTF8.
/**
 * Parses {@code /test-documents/testXHTML_utf8.html} through the {@link Tika}
 * facade and checks that the extracted text contains each expected fragment,
 * including fragments with non-ASCII characters.
 *
 * <p>Currently disabled via {@code @Ignore} because the test file is not
 * available.
 *
 * @throws IOException if the document cannot be read
 * @throws SAXException declared by the original signature
 * @throws TikaException if the document cannot be parsed
 */
@Test
@Ignore("The file 'testXHTML_utf8.html' is not available for testing")
public void XtestParseUTF8() throws IOException, SAXException, TikaException {
    Metadata htmlMetadata = new Metadata();
    Tika tika = new Tika();
    String extractedText = tika.parseToString(
            HtmlParserTest.class.getResourceAsStream("/test-documents/testXHTML_utf8.html"),
            htmlMetadata);

    // Checked in order; the first missing fragment fails the test
    String[] expectedFragments = {
        "Title : Tilte with UTF-8 chars öäå",
        "Content with UTF-8 chars",
        "åäö"
    };
    for (String expected : expectedFragments) {
        assertTrue("Did not contain expected text:" + expected, extractedText.contains(expected));
    }
}
Aggregations