Search in sources :

Example 81 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class ImageParserTest method testGIF.

/**
 * Parses the bundled {@code testGIF.gif} resource with the content type preset to
 * {@code image/gif} and checks the metadata entries the parser is expected to populate.
 *
 * @throws Exception if opening, parsing or closing the test resource fails
 */
@Test
public void testGIF() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/gif");
    // The test opened the stream, so the test closes it; previously the handle was never released.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testGIF.gif")) {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }
    assertEquals("75", metadata.get("height"));
    assertEquals("100", metadata.get("width"));
    assertEquals("true", metadata.get("Compression Lossless"));
    assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
    assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
    assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
    assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
    assertEquals("Index", metadata.get("Data SampleFormat"));
    assertEquals("3", metadata.get("Chroma NumChannels"));
    assertEquals("1", metadata.get("Compression NumProgressiveScans"));
    assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
    assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
    assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
    assertEquals("true", metadata.get("Chroma BlackIsZero"));
    assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
    assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
    assertEquals("image/gif", metadata.get("Content-Type"));
    assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
    assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
    assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 82 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class IWorkParserTest method setUp.

/**
 * Creates a fresh {@code IWorkPackageParser} and a {@code ParseContext} that has an
 * {@code AutoDetectParser} registered under {@code Parser.class} before each test.
 */
@Before
public void setUp() {
    iWorkParser = new IWorkPackageParser();

    // Assemble the context locally, then publish the fully configured instance to the field.
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    parseContext = context;
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Before(org.junit.Before)

Example 83 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method assertScriptLink.

/**
 * Parses {@code html} as {@code text/html} and asserts that exactly one
 * {@code <script>} element carrying a {@code src} attribute was reported, and that
 * the attribute value equals {@code url}.
 *
 * @param html the HTML document to parse
 * @param url  the expected value of the single script {@code src} attribute
 * @throws Exception if parsing fails
 */
private void assertScriptLink(String html, String url) throws Exception {
    // Without IdentityHtmlMapper the <script> tags would not be extracted.
    ParseContext context = new ParseContext();
    context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "text/html");

    final List<String> links = new ArrayList<String>();
    HtmlParser htmlParser = new HtmlParser();
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));

    // Records the src attribute of every <script> element that has one.
    DefaultHandler scriptCollector = new DefaultHandler() {

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if (!qName.equals("script")) {
                return;
            }
            String src = attributes.getValue("", "src");
            if (src != null) {
                links.add(src);
            }
        }
    };
    htmlParser.parse(input, scriptCollector, metadata, context);

    assertEquals(1, links.size());
    assertEquals(url, links.get(0));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) Attributes(org.xml.sax.Attributes) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 84 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testHtml5Charset.

/**
 * Test case for TIKA-892: a document declaring its charset with the HTML5
 * {@code <meta charset="...">} form must have that charset reported as the
 * content encoding.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
 */
@Test
public void testHtml5Charset() throws Exception {
    String html = "<html><head><meta charset=\"ISO-8859-15\" />"
            + "<title>the name is ándre</title>"
            + "</head><body></body></html>";
    Metadata extracted = new Metadata();
    HtmlParser htmlParser = new HtmlParser();

    // The bytes are produced with ISO-8859-1 while the markup declares ISO-8859-15.
    byte[] encoded = html.getBytes(ISO_8859_1);
    htmlParser.parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), extracted, new ParseContext());

    String reportedEncoding = extracted.get(Metadata.CONTENT_ENCODING);
    assertEquals("ISO-8859-15", reportedEncoding);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 85 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testFrameSrcExtraction.

/**
 * Test case for TIKA-463: elements that carry URLs must not be skipped. The
 * serialized output has to contain the {@code <frame>} element with its
 * {@code src} resolved against the document's {@code <base href>}.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
 */
@Test
public void testFrameSrcExtraction() throws Exception {
    final String html = "<html><head><title>Title</title>"
            + "<base href=\"http://domain.com\" />"
            + "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
    StringWriter output = new StringWriter();
    HtmlParser htmlParser = new HtmlParser();
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    htmlParser.parse(input, makeHtmlTransformer(output), new Metadata(), new ParseContext());

    // The <frame> tag must be present and its src must be the fully resolved URL.
    Pattern resolvedFrame = Pattern.compile("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$");
    String serialized = output.toString();
    assertTrue(resolvedFrame.matcher(serialized).matches());
}
Also used : StringWriter(java.io.StringWriter) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext): 336, Metadata (org.apache.tika.metadata.Metadata): 281, Test (org.junit.Test): 260, InputStream (java.io.InputStream): 195, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 195, TikaTest (org.apache.tika.TikaTest): 186, ContentHandler (org.xml.sax.ContentHandler): 163, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 117, Parser (org.apache.tika.parser.Parser): 107, ByteArrayInputStream (java.io.ByteArrayInputStream): 91, TikaInputStream (org.apache.tika.io.TikaInputStream): 77, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 52, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 31, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 31, TikaException (org.apache.tika.exception.TikaException): 29, StringWriter (java.io.StringWriter): 26, IOException (java.io.IOException): 24, SAXException (org.xml.sax.SAXException): 24, CompositeParser (org.apache.tika.parser.CompositeParser): 22, FileInputStream (java.io.FileInputStream): 19