Use of org.apache.tika.io.TikaInputStream in project tika by Apache.
Class TrueTypeParser, method parse.
/**
 * Parses a TrueType font with FontBox and records its details as metadata.
 * Only metadata is produced; the XHTML document written to the handler is empty.
 *
 * @param stream   the font data; when it is a {@link TikaInputStream} that already
 *                 has a file, FontBox is handed that file instead of the stream
 * @param handler  receives the (empty) XHTML document
 * @param metadata populated with the content type, header dates/version and
 *                 the name-table entries listed below
 * @param context  not used by this method
 * @throws IOException   declared for the FontBox parse and font close calls
 * @throws SAXException  declared for writing the XHTML document
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TikaInputStream tis = TikaInputStream.cast(stream);
    // Ask FontBox to parse the file for us
    TrueTypeFont font = null;
    try {
        TTFParser parser = new TTFParser();
        if (tis != null && tis.hasFile()) {
            font = parser.parse(tis.getFile());
        } else {
            font = parser.parse(stream);
        }
        // Report the details of the font
        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
        // FontBox documents getHeader() as returning null when the table is
        // missing; skip the header fields rather than fail with an NPE.
        if (font.getHeader() != null) {
            metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated());
            metadata.set(TikaCoreProperties.MODIFIED, font.getHeader().getModified());
            metadata.set(AdobeFontMetricParser.MET_DOC_VERSION, Float.toString(font.getHeader().getVersion()));
        }
        // Pull out the naming info; getNaming() is likewise documented as
        // nullable, so a font without a name table yields no name metadata.
        NamingTable fontNaming = font.getNaming();
        if (fontNaming != null) {
            for (NameRecord nr : fontNaming.getNameRecords()) {
                int nameId = nr.getNameId();
                if (nameId == NameRecord.NAME_FONT_FAMILY_NAME) {
                    metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
                }
                if (nameId == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
                    metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
                }
                if (nameId == NameRecord.NAME_FULL_FONT_NAME) {
                    // The full font name doubles as the document title.
                    metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
                    metadata.set(TikaCoreProperties.TITLE, nr.getString());
                }
                if (nameId == NameRecord.NAME_POSTSCRIPT_NAME) {
                    metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
                }
                if (nameId == NameRecord.NAME_COPYRIGHT) {
                    metadata.set("Copyright", nr.getString());
                }
                if (nameId == NameRecord.NAME_TRADEMARK) {
                    metadata.set("Trademark", nr.getString());
                }
            }
        }
    } finally {
        // Close the font even when parsing or metadata extraction fails.
        if (font != null) {
            font.close();
        }
    }
    // For now, we only output metadata, no textual contents
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Use of org.apache.tika.io.TikaInputStream in project tika by Apache.
Class WebPParser, method parse.
/**
 * Extracts WebP metadata from the stream and emits an empty XHTML document.
 *
 * @param stream   the WebP data; wrapped in a {@link TikaInputStream} so that a
 *                 file can be handed to the metadata extractor
 * @param handler  receives the (empty) XHTML document
 * @param metadata populated by {@code ImageMetadataExtractor.parseWebP}
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tempResources = new TemporaryResources();
    try {
        TikaInputStream fileBacked = TikaInputStream.get(stream, tempResources);
        ImageMetadataExtractor extractor = new ImageMetadataExtractor(metadata);
        extractor.parseWebP(fileBacked.getFile());
    } finally {
        // Always dispose of the temporary resources registered above
        // (presumably any temp file created for the stream).
        tempResources.dispose();
    }

    // Metadata only: open and immediately close the document.
    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
Use of org.apache.tika.io.TikaInputStream in project tika by Apache.
Class SQLite3DBParser, method getConnectionString.
/**
 * Builds the SQLite JDBC connection string for the given stream.
 *
 * If the stream is a {@link TikaInputStream}, its path is used directly.
 * Otherwise the stream is copied into a new temp file, which is stored in
 * {@code tmpFile} (declared outside this method; presumably cleaned up by
 * the owning class — verify).
 *
 * @param is       the database content
 * @param metadata not used by this method
 * @param context  not used by this method
 * @return {@code "jdbc:sqlite:"} followed by the absolute path of the database file
 * @throws IOException if the temp file cannot be created or written
 */
@Override
protected String getConnectionString(InputStream is, Metadata metadata, ParseContext context) throws IOException {
    final String jdbcPrefix = "jdbc:sqlite:";
    TikaInputStream tikaStream = TikaInputStream.cast(is);

    if (tikaStream == null) {
        // Not a TikaInputStream: copy the content into our own temp file.
        tmpFile = Files.createTempFile("tika-sqlite-tmp", "");
        Files.copy(is, tmpFile, StandardCopyOption.REPLACE_EXISTING);
        return jdbcPrefix + tmpFile.toAbsolutePath().toString();
    }

    // Use the original underlying file.
    Path databasePath = tikaStream.getPath();
    return jdbcPrefix + databasePath.toAbsolutePath().toString();
}
Use of org.apache.tika.io.TikaInputStream in project tika by Apache.
Class ODFParserTest, method testNPEFromFile.
/**
 * Parses {@code testNPEOpenDocument.odt} through a {@link TikaInputStream}
 * created from the resource URL, then checks the detected content type and
 * a known phrase in the extracted text. Judging by the name, this guards
 * against a NullPointerException on this input.
 */
@Test
public void testNPEFromFile() throws Exception {
    final String resourceName = "/test-documents/testNPEOpenDocument.odt";
    OpenDocumentParser odfParser = new OpenDocumentParser();

    try (TikaInputStream input = TikaInputStream.get(getClass().getResource(resourceName))) {
        Metadata parsedMetadata = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();

        odfParser.parse(input, bodyHandler, parsedMetadata, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text", parsedMetadata.get(Metadata.CONTENT_TYPE));
        assertContains("primero hay que generar un par de claves", bodyHandler.toString());
    }
}
Use of org.apache.tika.io.TikaInputStream in project tika by Apache.
Class PDFParserTest, method testEmbeddedPDFEmbeddingAnotherDocument.
/**
 * TIKA-1124: a PDF embedded in a docx that itself embeds another docx.
 *
 * Layout of the test document:
 * <pre>
 * docx/
 *     pdf/
 *         docx
 * </pre>
 * Checks that text from all three levels appears in nesting order in the
 * XML output, and that the container extractor reports the three embedded
 * resources with the expected names and media types.
 */
@Test
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
    String xml = getXML("testPDFEmbeddingAndEmbedded.docx").xml;

    int outerPos = xml.indexOf("Outer_haystack");
    int pdfPos = xml.indexOf("pdf_haystack");
    int needlePos = xml.indexOf("Needle");

    // Every marker must be present...
    assertTrue(outerPos >= 0);
    assertTrue(pdfPos >= 0);
    assertTrue(needlePos >= 0);
    // ...and appear outermost-first.
    assertTrue(needlePos > pdfPos);
    assertTrue(pdfPos > outerPos);

    TrackingHandler tracker = new TrackingHandler();
    ContainerExtractor extractor = new ParserContainerExtractor();
    try (TikaInputStream docx = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
        // The extractor is passed as its own recursing extractor.
        extractor.extract(docx, extractor, tracker);
    }

    assertEquals(3, tracker.filenames.size());
    assertEquals(3, tracker.mediaTypes.size());

    assertEquals("image1.emf", tracker.filenames.get(0));
    // The embedded PDF is reported without a filename.
    assertNull(tracker.filenames.get(1));
    assertEquals("Test.docx", tracker.filenames.get(2));

    assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
    assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
    assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
}
Aggregations