use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class Mp3ParserTest method testTIKA424.
/**
 * Checks the complicated set of ID3v2.4 tags carried by the mp3 file
 * attached to TIKA-424.
 * <p>
 * That file cannot be distributed with Tika, so unless it has been
 * downloaded into the test documents this test returns without
 * asserting anything.
 */
@Test
public void testTIKA424() throws Exception {
    // No explicit mp3 parser here: detection should pick the right one
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata tags = new Metadata();

    try (InputStream mp3 = Mp3ParserTest.class.getResourceAsStream("/test-documents/test2.mp3")) {
        if (mp3 == null) {
            // Sample file is not present - skip the test
            return;
        }
        autoDetect.parse(mp3, body, tags, new ParseContext());
    }

    // Tag-derived metadata
    assertEquals("audio/mpeg", tags.get(Metadata.CONTENT_TYPE));
    assertEquals("Plus loin vers l'ouest", tags.get(TikaCoreProperties.TITLE));
    assertEquals("Merzhin", tags.get(TikaCoreProperties.CREATOR));
    assertEquals("Merzhin", tags.get(Metadata.AUTHOR));

    // The title must also appear in the extracted text
    String text = body.toString();
    assertContains("Plus loin vers l'ouest", text);

    // Audio stream properties
    assertEquals("MPEG 3 Layer III Version 1", tags.get("version"));
    assertEquals("44100", tags.get("samplerate"));
    assertEquals("2", tags.get("channels"));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class Mp3ParserTest method testMp3ParsingLyrics.
/**
 * Verifies that when a file carries both lyrics and ID3v2 tags,
 * both are extracted correctly.
 */
@Test
public void testMp3ParsingLyrics() throws Exception {
    // No explicit mp3 parser here: detection should pick the right one
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata tags = new Metadata();

    try (InputStream mp3 = Mp3ParserTest.class.getResourceAsStream("/test-documents/testMP3lyrics.mp3")) {
        autoDetect.parse(mp3, body, tags, new ParseContext());
    }

    // Tag-derived metadata
    assertEquals("audio/mpeg", tags.get(Metadata.CONTENT_TYPE));
    assertEquals("Test Title", tags.get(TikaCoreProperties.TITLE));
    assertEquals("Test Artist", tags.get(TikaCoreProperties.CREATOR));
    assertEquals("Test Artist", tags.get(Metadata.AUTHOR));

    // Every one of these values must show up in the extracted text
    String text = body.toString();
    String[] expectedInText = {
        "Test Title", "Test Artist", "Test Album", "2008", "Test Comment", "Rock"
    };
    for (String expected : expectedInText) {
        assertContains(expected, text);
    }

    // Audio stream properties
    assertEquals("MPEG 3 Layer III Version 1", tags.get("version"));
    assertEquals("44100", tags.get("samplerate"));
    assertEquals("2", tags.get("channels"));
    checkDuration(tags, 1);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ODFParserTest method testODTFooter.
/**
 * Parses testFooter.odt with an auto-detecting parser and checks that the
 * text of both pages and the footer text appear in the extracted content.
 */
@Test
public void testODTFooter() throws Exception {
    try (InputStream odt = ODFParserTest.class.getResourceAsStream("/test-documents/testFooter.odt")) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        AutoDetectParser autoDetect = new AutoDetectParser();
        autoDetect.parse(odt, body, meta);

        String text = body.toString();
        String[] expectedInText = {
            "Here is some text...",
            "Here is some text on page 2",
            "Here is footer text"
        };
        for (String expected : expectedInText) {
            assertContains(expected, text);
        }
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PDFParserTest method testAnnotations.
/**
 * Checks annotation text handling for testAnnotations.pdf:
 * <ul>
 * <li>with an auto-detecting parser and no extra configuration, both the
 * body text and the annotation comment are extracted;</li>
 * <li>with annotation text extraction switched off on a PDFParser's own
 * config, only the body text is extracted;</li>
 * <li>with annotation text extraction switched off through a
 * PDFParserConfig placed in the ParseContext, only the body text is
 * extracted.</li>
 * </ul>
 * Finally checks that opening and closing paragraph tags are balanced in
 * the XML output (TIKA-738).
 */
@Test
public void testAnnotations() throws Exception {
    // Runs of whitespace are collapsed to one space before matching. The
    // no-break space (U+00A0) is not covered by \s, so it is listed
    // explicitly, and as an escape so that it stays visible in the source.
    final String whitespaceRun = "[\\s\u00a0]+";

    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    String content;
    try (InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
        content = getText(stream, parser);
    }
    content = content.replaceAll(whitespaceRun, " ");
    assertContains("Here is some text", content);
    assertContains("Here is a comment", content);

    // Test w/ annotation text disabled:
    PDFParser pdfParser = new PDFParser();
    pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
    try (InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
        content = getText(stream, pdfParser);
    }
    content = content.replaceAll(whitespaceRun, " ");
    assertContains("Here is some text", content);
    assertEquals(-1, content.indexOf("Here is a comment"));

    // annotation text disabled through parsecontext
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    config.setExtractAnnotationText(false);
    context.set(PDFParserConfig.class, config);
    try (InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf")) {
        content = getText(stream, parser, context);
    }
    content = content.replaceAll(whitespaceRun, " ");
    assertContains("Here is some text", content);
    assertEquals(-1, content.indexOf("Here is a comment"));

    // TIKA-738: make sure no extra </p> tags
    String xml = getXML("testAnnotations.pdf").xml;
    assertEquals(substringCount("<p>", xml), substringCount("</p>", xml));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PDFParserTest method testEmbeddedFileMarkup.
/**
 * TIKA-1427: checks the markup emitted for embedded files in PDFs - a
 * regular attachment, an inline image, and a document embedded inside an
 * annotation.
 */
@Test
public void testEmbeddedFileMarkup() throws Exception {
    Parser autoDetect = new AutoDetectParser();

    // Inline images on, and not limited to unique ones
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoDetect);
    parseContext.set(PDFParserConfig.class, pdfConfig);

    String attachmentsXml = getXML("testPDF_childAttachments.pdf", parseContext).xml;
    // regular attachment
    assertContains("<div source=\"attachment\" class=\"embedded\" id=\"Unit10.doc\" />", attachmentsXml);
    // inline image
    assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", attachmentsXml);

    // doc embedded inside an annotation (parsed without the custom context)
    String annotationXml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
    assertContains("<div source=\"annotation\" class=\"embedded\" id=\"Excel.xlsx\" />", annotationXml);
}
Aggregations