Search in sources :

Example 46 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ContentHandlerExample method parseBodyToHTML.

/**
 * Parses the {@code test.doc} resource that sits next to
 * {@code ContentHandlerExample} and returns the markup produced for the
 * document body only (no head part), serialized as a string.
 *
 * @return the body markup written by the wrapped {@code ToXMLContentHandler}
 * @throws IOException   if the resource stream cannot be read or closed
 * @throws SAXException  if the content handler rejects a SAX event
 * @throws TikaException if the document cannot be parsed
 */
public String parseBodyToHTML() throws IOException, SAXException, TikaException {
    // The body handler wraps an XML serializer, so toString() yields markup.
    ContentHandler bodyOnlyHandler = new BodyContentHandler(new ToXMLContentHandler());
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    String bodyMarkup;
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, bodyOnlyHandler, docMetadata);
        bodyMarkup = bodyOnlyHandler.toString();
    }
    return bodyMarkup;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 47 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class DisplayMetInstance method getMet.

/**
 * Parses the document at the given URL with {@code PDFParser} and returns
 * the metadata the parser populated.
 *
 * @param url location of the document to parse
 * @return the {@code Metadata} instance filled in during parsing
 * @throws IOException   if the URL cannot be opened, read, or closed
 * @throws SAXException  if the content handler rejects a SAX event
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // Tika's Parser.parse contract leaves closing the stream to the caller,
    // so own it here; otherwise the URL connection leaks on every call,
    // including when parsing throws.
    try (java.io.InputStream stream = url.openStream()) {
        parser.parse(stream, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) PDFParser(org.apache.tika.parser.pdf.PDFParser) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext)

Example 48 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class DisplayMetInstance method main.

/**
 * Command-line entry point: treats the first argument as a URL, extracts
 * its metadata via {@code DisplayMetInstance.getMet} and prints the result
 * to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the document URL
 * @throws Exception if the URL is malformed or metadata extraction fails
 */
public static void main(String[] args) throws Exception {
    URL documentUrl = new URL(args[0]);
    Metadata extracted = DisplayMetInstance.getMet(documentUrl);
    System.out.println(extracted);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) URL(java.net.URL)

Example 49 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class FontParsersTest method testTTFParsing.

/**
 * Parses the sample TrueType font through {@code AutoDetectParser} and
 * checks the metadata it reports, plus the (currently empty) text content.
 */
@Test
public void testTTFParsing() throws Exception {
    // No font-specific parser is named: detection has to pick it
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyText = new BodyContentHandler();
    Metadata fontMeta = new Metadata();
    ParseContext parseContext = new ParseContext();
    try (TikaInputStream ttf = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoParser.parse(ttf, bodyText, fontMeta, parseContext);
    }

    // Content type, title and dates
    assertEquals("application/x-font-ttf", fontMeta.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", fontMeta.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", fontMeta.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", fontMeta.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", fontMeta.get(TikaCoreProperties.MODIFIED));

    // Font naming properties
    assertEquals("Open Sans Bold", fontMeta.get(MET_FONT_NAME));
    assertEquals("Open Sans", fontMeta.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", fontMeta.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", fontMeta.get(MET_PS_NAME));

    // Only the first nine characters of these longer values are compared
    String copyright = fontMeta.get("Copyright");
    assertEquals("Digitized", copyright.substring(0, 9));
    String trademark = fontMeta.get("Trademark");
    assertEquals("Open Sans", trademark.substring(0, 9));

    // These properties are expected to be absent
    assertEquals(null, fontMeta.get(MET_FONT_FULL_NAME));
    assertEquals(null, fontMeta.get(MET_FONT_WEIGHT));
    assertEquals(null, fontMeta.get(MET_FONT_VERSION));

    // The parser currently emits no text content at all
    assertEquals("", bodyText.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AdobeFontMetricParser(org.apache.tika.parser.font.AdobeFontMetricParser) Test(org.junit.Test)

Example 50 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TestGDALParser method testParseMetadata.

/**
 * Parses the sample NetCDF file with {@code GDALParser} and checks the
 * global attributes and the eighth subdataset entry found in the metadata.
 * The test is skipped when {@code canRun()} reports false.
 */
@Test
public void testParseMetadata() {
    assumeTrue(canRun());
    final String expectedNcInst = "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)";
    final String expectedModelNameEnglish = "NCAR CCSM";
    final String expectedProgramId = "Source file unknown Version unknown Date unknown";
    final String expectedProjectId = "IPCC Fourth Assessment";
    final String expectedRealization = "1";
    final String expectedTitle = "model output prepared for IPCC AR4";
    final String expectedSub8Name = "\":ua";
    final String expectedSub8Desc = "[1x17x128x256] eastward_wind (32-bit floating-point)";
    GDALParser parser = new GDALParser();
    Metadata met = new Metadata();
    BodyContentHandler handler = new BodyContentHandler();
    // try-with-resources so the resource stream is closed on both the
    // success path and every failure path.
    try (InputStream stream = TestGDALParser.class.getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
        parser.parse(stream, handler, met, new ParseContext());
        assertNotNull(met);
        assertNotNull(met.get("NC_GLOBAL#institution"));
        assertEquals(expectedNcInst, met.get("NC_GLOBAL#institution"));
        assertNotNull(met.get("NC_GLOBAL#model_name_english"));
        assertEquals(expectedModelNameEnglish, met.get("NC_GLOBAL#model_name_english"));
        // The prg_ID pair used to be asserted twice in a row; once is enough.
        assertNotNull(met.get("NC_GLOBAL#prg_ID"));
        assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID"));
        assertNotNull(met.get("NC_GLOBAL#project_id"));
        assertEquals(expectedProjectId, met.get("NC_GLOBAL#project_id"));
        assertNotNull(met.get("NC_GLOBAL#realization"));
        assertEquals(expectedRealization, met.get("NC_GLOBAL#realization"));
        assertNotNull(met.get("NC_GLOBAL#title"));
        assertEquals(expectedTitle, met.get("NC_GLOBAL#title"));
        assertNotNull(met.get("SUBDATASET_8_NAME"));
        assertTrue(met.get("SUBDATASET_8_NAME").endsWith(expectedSub8Name));
        assertNotNull(met.get("SUBDATASET_8_DESC"));
        assertEquals(expectedSub8Desc, met.get("SUBDATASET_8_DESC"));
    } catch (Exception e) {
        // Fail the test but keep the original exception as the cause, so the
        // report carries its full stack trace (and never a null message).
        throw new AssertionError("GDAL metadata parsing failed: " + e, e);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) SAXException(org.xml.sax.SAXException) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)651 Test (org.junit.Test)467 InputStream (java.io.InputStream)320 ParseContext (org.apache.tika.parser.ParseContext)283 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)269 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)229 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)154 ByteArrayInputStream (java.io.ByteArrayInputStream)143 Parser (org.apache.tika.parser.Parser)136 TikaInputStream (org.apache.tika.io.TikaInputStream)133 IOException (java.io.IOException)66 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)48 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)29 MediaType (org.apache.tika.mime.MediaType)29 SAXException (org.xml.sax.SAXException)29