Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class ContentHandlerExample, method parseBodyToHTML.
/**
 * Example: parse the bundled {@code test.doc} resource and hand back only
 * the body of the document as markup in a string, leaving the head part out.
 *
 * @return the string form of the body-only handler after parsing
 */
public String parseBodyToHTML() throws IOException, SAXException, TikaException {
    AutoDetectParser autoParser = new AutoDetectParser();
    // The XML-serializing handler is wrapped by BodyContentHandler so that
    // only the body reaches the output.
    ContentHandler bodyOnly = new BodyContentHandler(new ToXMLContentHandler());
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, bodyOnly, new Metadata());
        return bodyOnly.toString();
    }
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class DisplayMetInstance, method getMet.
/**
 * Parses the document behind {@code url} with a {@link PDFParser} and returns
 * the metadata the parser recorded.
 *
 * @param url location of the document to read
 * @return the {@link Metadata} instance populated during parsing
 * @throws IOException   if the stream cannot be opened, read or closed
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the parser cannot process the document
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // The stream opened here belongs to this method, so it is closed here
    // whether parsing succeeds or throws.
    try (java.io.InputStream stream = url.openStream()) {
        parser.parse(stream, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class DisplayMetInstance, method main.
/**
 * Command-line entry point: treats the first argument as a URL, extracts its
 * metadata through {@code DisplayMetInstance.getMet} and prints the result
 * to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the URL to read
 */
public static void main(String[] args) throws Exception {
    URL target = new URL(args[0]);
    System.out.println(DisplayMetInstance.getMet(target));
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class FontParsersTest, method testTTFParsing.
/**
 * Parses the {@code testTrueType3.ttf} test document through an
 * {@link AutoDetectParser} and checks the metadata values that come back,
 * the font properties that are expected to be absent, and that no text
 * content is produced.
 */
@Test
public void testTTFParsing() throws Exception {
    Metadata fontMeta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    // No font-specific parser is chosen here: detection has to pick it.
    Parser autoParser = new AutoDetectParser();
    try (TikaInputStream ttf = TikaInputStream.get(
            FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoParser.parse(ttf, bodyHandler, fontMeta, new ParseContext());
    }

    // Type, title and dates
    assertEquals("application/x-font-ttf", fontMeta.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", fontMeta.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", fontMeta.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", fontMeta.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", fontMeta.get(TikaCoreProperties.MODIFIED));

    // Font naming properties
    assertEquals("Open Sans Bold", fontMeta.get(MET_FONT_NAME));
    assertEquals("Open Sans", fontMeta.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", fontMeta.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", fontMeta.get(MET_PS_NAME));

    // Only the first nine characters of these two values are compared
    String copyright = fontMeta.get("Copyright");
    assertEquals("Digitized", copyright.substring(0, 9));
    String trademark = fontMeta.get("Trademark");
    assertEquals("Open Sans", trademark.substring(0, 9));

    // Properties that are not extracted
    assertEquals(null, fontMeta.get(MET_FONT_FULL_NAME));
    assertEquals(null, fontMeta.get(MET_FONT_WEIGHT));
    assertEquals(null, fontMeta.get(MET_FONT_VERSION));

    // At present the parser yields no textual content
    assertEquals("", bodyHandler.toString());
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class TestGDALParser, method testParseMetadata.
/**
 * Parses the {@code sresa1b_ncar_ccsm3_0_run1_200001.nc} test document with
 * {@link GDALParser} and checks a selection of the metadata entries it
 * reports: several {@code NC_GLOBAL#...} values and the name/description of
 * sub-dataset 8.
 *
 * <p>The test is guarded by {@code assumeTrue(canRun())}. Any exception
 * raised while parsing is propagated to the test runner so that the full
 * stack trace, not just the message, appears in the report.
 */
@Test
public void testParseMetadata() throws Exception {
    assumeTrue(canRun());
    final String expectedNcInst = "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)";
    final String expectedModelNameEnglish = "NCAR CCSM";
    final String expectedProgramId = "Source file unknown Version unknown Date unknown";
    final String expectedProjectId = "IPCC Fourth Assessment";
    final String expectedRealization = "1";
    final String expectedTitle = "model output prepared for IPCC AR4";
    final String expectedSub8Name = "\":ua";
    final String expectedSub8Desc = "[1x17x128x256] eastward_wind (32-bit floating-point)";

    GDALParser parser = new GDALParser();
    Metadata met = new Metadata();
    BodyContentHandler handler = new BodyContentHandler();
    // The resource stream is owned by this test, so close it once parsing is done.
    try (InputStream stream = TestGDALParser.class.getResourceAsStream(
            "/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
        parser.parse(stream, handler, met, new ParseContext());
    }

    // Each key is checked for presence first so a missing entry is reported
    // as such rather than as a value mismatch.
    assertNotNull(met.get("NC_GLOBAL#institution"));
    assertEquals(expectedNcInst, met.get("NC_GLOBAL#institution"));
    assertNotNull(met.get("NC_GLOBAL#model_name_english"));
    assertEquals(expectedModelNameEnglish, met.get("NC_GLOBAL#model_name_english"));
    assertNotNull(met.get("NC_GLOBAL#prg_ID"));
    assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID"));
    assertNotNull(met.get("NC_GLOBAL#project_id"));
    assertEquals(expectedProjectId, met.get("NC_GLOBAL#project_id"));
    assertNotNull(met.get("NC_GLOBAL#realization"));
    assertEquals(expectedRealization, met.get("NC_GLOBAL#realization"));
    assertNotNull(met.get("NC_GLOBAL#title"));
    assertEquals(expectedTitle, met.get("NC_GLOBAL#title"));
    assertNotNull(met.get("SUBDATASET_8_NAME"));
    // Only the suffix of the sub-dataset name is compared.
    assertTrue(met.get("SUBDATASET_8_NAME").endsWith(expectedSub8Name));
    assertNotNull(met.get("SUBDATASET_8_DESC"));
    assertEquals(expectedSub8Desc, met.get("SUBDATASET_8_DESC"));
}
Aggregations