Search in sources:

Example 11 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

In class ExtractEmbeddedFiles, method extract:

/**
 * Parses {@code is} with this object's {@code parser}, routing embedded
 * documents through a {@code MyEmbeddedDocumentExtractor} that is constructed
 * with {@code outputDir} and the parse context.
 *
 * <p>The stream is not closed here; it stays under the caller's control.
 * The body text and metadata produced by the parse are discarded.
 *
 * @param is        stream to parse
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 * @throws IOException   propagated from the parse call
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    // Register the parser first: the extractor below receives this context
    // and may look the parser up from it.
    context.set(Parser.class, parser);
    context.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(outputDir, context));
    // A write limit of -1 is passed to the body handler
    // (NOTE(review): per Tika docs this disables the limit — confirm).
    parser.parse(is, new BodyContentHandler(-1), new Metadata(), context);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ParsingEmbeddedDocumentExtractor(org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 12 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

In class Language, method languageDetectionWithHandler:

/**
 * Reads a document from standard input, feeds it through an
 * {@code AutoDetectParser} into a {@code LanguageHandler}, and prints the
 * language reported by the handler to standard output.
 *
 * @throws Exception if parsing fails
 */
public static void languageDetectionWithHandler() throws Exception {
    LanguageHandler languageHandler = new LanguageHandler();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    // The handler accumulates the parsed text as the parser emits it.
    autoParser.parse(System.in, languageHandler, metadata, context);

    LanguageResult detected = languageHandler.getLanguage();
    String languageCode = detected.getLanguage();
    System.out.println(languageCode);
}
Also used : LanguageHandler(org.apache.tika.language.detect.LanguageHandler) LanguageResult(org.apache.tika.language.detect.LanguageResult) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 13 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

In class DisplayMetInstance, method getMet:

/**
 * Parses the resource at {@code url} with a {@code PDFParser} and returns the
 * metadata collected during the parse. The extracted body text is discarded.
 *
 * @param url location of the document to parse
 * @return the metadata populated by the parser
 * @throws IOException   if the URL cannot be opened or read, or closing the stream fails
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // The parser does not close the stream it is given, so the stream opened
    // here must be closed here; otherwise the underlying connection leaks.
    // Fully qualified to avoid depending on an import this snippet may lack.
    try (java.io.InputStream in = url.openStream()) {
        parser.parse(in, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) PDFParser(org.apache.tika.parser.pdf.PDFParser) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext)

Example 14 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

In class FontParsersTest, method testTTFParsing:

/**
 * Parses the {@code testTrueType3.ttf} test document with an
 * {@code AutoDetectParser} and checks the detected content type, the
 * title/date properties, the font-specific metadata fields, and that no
 * body text is produced.
 */
@Test
public void testTTFParsing() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    // Should auto-detect!
    Parser autoParser = new AutoDetectParser();

    try (TikaInputStream fontStream = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoParser.parse(fontStream, body, meta, parseContext);
    }

    // Detected type plus generic title/date properties
    assertEquals("application/x-font-ttf", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", meta.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", meta.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", meta.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", meta.get(TikaCoreProperties.MODIFIED));

    // Font-specific fields
    assertEquals("Open Sans Bold", meta.get(MET_FONT_NAME));
    assertEquals("Open Sans", meta.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", meta.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", meta.get(MET_PS_NAME));

    // Only the first nine characters of these longer values are compared
    assertEquals("Digitized", meta.get("Copyright").substring(0, 9));
    assertEquals("Open Sans", meta.get("Trademark").substring(0, 9));

    // Not extracted
    assertEquals(null, meta.get(MET_FONT_FULL_NAME));
    assertEquals(null, meta.get(MET_FONT_WEIGHT));
    assertEquals(null, meta.get(MET_FONT_VERSION));

    // Currently, the parser doesn't extract any contents
    assertEquals("", body.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AdobeFontMetricParser(org.apache.tika.parser.font.AdobeFontMetricParser) Test(org.junit.Test)

Example 15 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

In class TestGDALParser, method testParseMetadata:

/**
 * Parses the {@code sresa1b_ncar_ccsm3_0_run1_200001.nc} test document with a
 * {@code GDALParser} and checks a selection of the metadata keys it reports.
 * The test is skipped (via {@code assumeTrue}) when {@code canRun()} is false.
 */
@Test
public void testParseMetadata() {
    assumeTrue(canRun());
    final String expectedNcInst = "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)";
    final String expectedModelNameEnglish = "NCAR CCSM";
    final String expectedProgramId = "Source file unknown Version unknown Date unknown";
    final String expectedProjectId = "IPCC Fourth Assessment";
    final String expectedRealization = "1";
    final String expectedTitle = "model output prepared for IPCC AR4";
    final String expectedSub8Name = "\":ua";
    final String expectedSub8Desc = "[1x17x128x256] eastward_wind (32-bit floating-point)";
    GDALParser parser = new GDALParser();
    Metadata met = new Metadata();
    BodyContentHandler handler = new BodyContentHandler();
    // try-with-resources: the test opens the resource stream, so the test is
    // responsible for closing it whether or not the assertions pass.
    try (InputStream stream = TestGDALParser.class.getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
        parser.parse(stream, handler, met, new ParseContext());
        assertNotNull(met);
        assertNotNull(met.get("NC_GLOBAL#institution"));
        assertEquals(expectedNcInst, met.get("NC_GLOBAL#institution"));
        assertNotNull(met.get("NC_GLOBAL#model_name_english"));
        assertEquals(expectedModelNameEnglish, met.get("NC_GLOBAL#model_name_english"));
        // Checked once; the original repeated this pair verbatim.
        assertNotNull(met.get("NC_GLOBAL#prg_ID"));
        assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID"));
        assertNotNull(met.get("NC_GLOBAL#project_id"));
        assertEquals(expectedProjectId, met.get("NC_GLOBAL#project_id"));
        assertNotNull(met.get("NC_GLOBAL#realization"));
        assertEquals(expectedRealization, met.get("NC_GLOBAL#realization"));
        assertNotNull(met.get("NC_GLOBAL#title"));
        assertEquals(expectedTitle, met.get("NC_GLOBAL#title"));
        assertNotNull(met.get("SUBDATASET_8_NAME"));
        assertTrue(met.get("SUBDATASET_8_NAME").endsWith(expectedSub8Name));
        assertNotNull(met.get("SUBDATASET_8_DESC"));
        assertEquals(expectedSub8Desc, met.get("SUBDATASET_8_DESC"));
    } catch (Exception e) {
        // Assertion failures are Errors and pass straight through; this only
        // converts parser/IO exceptions into a test failure. The stack trace
        // is printed because fail() keeps only the message.
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) SAXException(org.xml.sax.SAXException) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext): 336; Metadata (org.apache.tika.metadata.Metadata): 281; Test (org.junit.Test): 260; InputStream (java.io.InputStream): 195; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 195; TikaTest (org.apache.tika.TikaTest): 186; ContentHandler (org.xml.sax.ContentHandler): 163; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 117; Parser (org.apache.tika.parser.Parser): 107; ByteArrayInputStream (java.io.ByteArrayInputStream): 91; TikaInputStream (org.apache.tika.io.TikaInputStream): 77; DefaultHandler (org.xml.sax.helpers.DefaultHandler): 52; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 31; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 31; TikaException (org.apache.tika.exception.TikaException): 29; StringWriter (java.io.StringWriter): 26; IOException (java.io.IOException): 24; SAXException (org.xml.sax.SAXException): 24; CompositeParser (org.apache.tika.parser.CompositeParser): 22; FileInputStream (java.io.FileInputStream): 19