Search in sources :

Example 51 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class DcXMLParserTest method testXMLParserNonAsciiChars.

/**
 * Parses the testXML.xml test document with {@code DcXMLParser} and checks
 * that the RIGHTS metadata value keeps its accented (non-ASCII) characters.
 */
@Test
public void testXMLParserNonAsciiChars() throws Exception {
    final String expectedRights = "Archimède et Lius à Châteauneuf testing chars en été";
    final Metadata parsed = new Metadata();
    try (InputStream xml = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
        DcXMLParser dcParser = new DcXMLParser();
        dcParser.parse(xml, new DefaultHandler(), parsed);
    }
    // The metadata object outlives the stream, so it can be checked after the stream is closed.
    assertEquals(expectedRights, parsed.get(TikaCoreProperties.RIGHTS));
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 52 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class RFC822ParserTest method getDate.

/**
 * Builds a two-header mail message whose Date header is {@code dateString},
 * runs it through {@code RFC822Parser} and returns the CREATED date that
 * ends up in the metadata.
 *
 * @param dateString text placed verbatim after "Date: " in the message
 * @return the value of {@code TikaCoreProperties.CREATED} read from the metadata
 * @throws Exception if building the stream or parsing fails
 */
private Date getDate(String dateString) throws Exception {
    StringBuilder message = new StringBuilder();
    message.append("From: dev@tika.apache.org\n");
    message.append("Date: ").append(dateString).append("\n");
    byte[] rawMessage = message.toString().getBytes(StandardCharsets.UTF_8);

    Metadata extracted = new Metadata();
    Parser mailParser = new RFC822Parser();
    try (InputStream mailStream = TikaInputStream.get(rawMessage)) {
        mailParser.parse(mailStream, new DefaultHandler(), extracted, new ParseContext());
    }
    return extracted.getDate(TikaCoreProperties.CREATED);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 53 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class HtmlParserTest method testParseAscii.

/**
 * Parses testHTML.html with {@code HtmlParser}, sending SAX events both to a
 * body-text handler and to a small handler that records anchor attributes,
 * then checks the extracted metadata, anchor values and body text.
 */
@Test
public void testParseAscii() throws Exception {
    final StringWriter hrefValues = new StringWriter();
    final StringWriter nameValues = new StringWriter();

    // Records, for each <a> element, its href attribute or, when there is none, its name attribute.
    ContentHandler anchorCollector = new DefaultHandler() {

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
            if (!"a".equals(localName)) {
                return;
            }
            String hrefAttr = atts.getValue("href");
            if (hrefAttr != null) {
                hrefValues.append(hrefAttr);
                return;
            }
            String nameAttr = atts.getValue("name");
            if (nameAttr != null) {
                nameValues.append(nameAttr);
            }
        }
    };

    ContentHandler bodyText = new BodyContentHandler();
    Metadata parsed = new Metadata();
    String resource = "/test-documents/testHTML.html";
    try (InputStream html = HtmlParserTest.class.getResourceAsStream(resource)) {
        ContentHandler fanOut = new TeeContentHandler(bodyText, anchorCollector);
        new HtmlParser().parse(html, fanOut, parsed, new ParseContext());
    }

    // Metadata taken from the document.
    assertEquals("Title : Test Indexation Html", parsed.get(TikaCoreProperties.TITLE));
    assertEquals("Tika Developers", parsed.get("Author"));
    assertEquals("5", parsed.get("refresh"));
    assertEquals("51.2312", parsed.get(Geographic.LATITUDE));
    assertEquals("-5.1987", parsed.get(Geographic.LONGITUDE));

    // Anchor attributes gathered by the custom handler.
    assertEquals("http://www.apache.org/", hrefValues.toString());
    assertEquals("test-anchor", nameValues.toString());

    // Body text gathered by the BodyContentHandler.
    String extractedText = bodyText.toString();
    assertTrue("Did not contain expected text:" + "Test Indexation Html", extractedText.contains("Test Indexation Html"));
    assertTrue("Did not contain expected text:" + "Indexation du fichier", extractedText.contains("Indexation du fichier"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) StringWriter(java.io.StringWriter) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Attributes(org.xml.sax.Attributes) ParseContext(org.apache.tika.parser.ParseContext) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) ContentHandler(org.xml.sax.ContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 54 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class BPGParserTest method testBPG_Geo.

/**
 * Tests a file with geographic information in it.
 * <p>
 * Parses testBPG_GEO.bpg and checks the image width, length, bits per sample
 * and colour mode. The geographic and EXIF assertions are kept but disabled
 * until that data is extracted properly (see TIKA-1495).
 */
@Test
public void testBPG_Geo() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg");
    // Close the resource stream once parsing is done; previously it was never closed.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testBPG_GEO.bpg")) {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }
    assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
    assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
    assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
    assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));
    // TODO Get the geographic data to be properly extracted, see TIKA-1495
    if (false) {
        assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
        assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
    }
    // TODO Get the exif data to be properly extracted, see TIKA-1495
    if (false) {
        // 1/1600
        assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME));
        assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
        assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
        assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
        assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
        assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
        assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
        assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
    }
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 55 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class BPGParserTest method testBPG_Commented.

/**
 * Tests a file with comments.
 * <p>
 * Parses testBPG_commented.bpg and checks the image width, length, bits per
 * sample and colour mode. The comment and EXIF assertions are kept but
 * disabled until that data is extracted properly (see TIKA-1495).
 */
@Test
public void testBPG_Commented() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg");
    // Close the resource stream once parsing is done; previously it was never closed.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testBPG_commented.bpg")) {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }
    assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
    assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
    assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
    assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));
    // TODO Get the exif comment data to be properly extracted, see TIKA-1495
    if (false) {
        assertEquals("Tosteberga Ängar", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Bird site in north eastern Skåne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
        List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
        assertTrue(keywords.contains("coast"));
        assertTrue(keywords.contains("bird watching"));
        assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
    }
    // TODO Get the exif data to be properly extracted, see TIKA-1495
    if (false) {
        // 1/1000000
        assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME));
        assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
        assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
        assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
        assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
        assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
        assertEquals(null, metadata.get(Metadata.SOFTWARE));
        assertEquals("1", metadata.get(Metadata.ORIENTATION));
        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
    }
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Aggregations

DefaultHandler (org.xml.sax.helpers.DefaultHandler)148 InputStream (java.io.InputStream)65 Metadata (org.apache.tika.metadata.Metadata)59 ParseContext (org.apache.tika.parser.ParseContext)52 Test (org.junit.Test)44 Attributes (org.xml.sax.Attributes)41 SAXParser (javax.xml.parsers.SAXParser)40 SAXException (org.xml.sax.SAXException)39 ByteArrayInputStream (java.io.ByteArrayInputStream)32 SAXParserFactory (javax.xml.parsers.SAXParserFactory)29 IOException (java.io.IOException)26 InputSource (org.xml.sax.InputSource)23 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)22 Parser (org.apache.tika.parser.Parser)22 TikaInputStream (org.apache.tika.io.TikaInputStream)20 ContentHandler (org.xml.sax.ContentHandler)20 File (java.io.File)19 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)17 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)16 FileInputStream (java.io.FileInputStream)15