use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class JackcessExtractor method handleOLE.
/**
 * Handles a single OLE blob column of a row, dispatching on the blob's content type.
 * <ul>
 *   <li>LINK: writes the link path as character content.</li>
 *   <li>SIMPLE_PACKAGE / OTHER: passes the content stream to
 *       {@code handleEmbeddedResource}.</li>
 *   <li>COMPOUND_STORAGE: delegates to {@code handleCompoundContent}.</li>
 * </ul>
 * Nothing is emitted when the column holds no blob or the blob has no content.
 * If an embedded stream cannot be opened, the exception is recorded on
 * {@code parentMetadata} and that embedded resource is skipped.
 *
 * @param row   the row holding the OLE column
 * @param cName the name of the OLE column
 * @param xhtml the handler that receives the output
 * @throws IOException   if reading the blob content fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if handling an embedded resource fails
 */
private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    OleBlob blob = row.getBlob(cName);
    //lifted shamelessly from Jackcess's OleBlobTest
    if (blob == null) {
        return;
    }
    OleBlob.Content content = blob.getContent();
    if (content == null) {
        return;
    }
    switch (content.getType()) {
        case LINK:
            xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
            break;
        case SIMPLE_PACKAGE:
            OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
            //TODO: find test file that has this kind of attachment
            //and see if getFilePath or getLocalFilePath is meaningful
            //for TikaCoreProperties.ORIGINAL_RESOURCE_NAME
            TikaInputStream tis = null;
            try {
                tis = TikaInputStream.get(spc.getStream());
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                break;
            }
            if (tis != null) {
                try {
                    handleEmbeddedResource(tis,
                            spc.getFileName(), //filename
                            null, //relationshipId
                            spc.getTypeName(), //mediatype
                            xhtml, false);
                } finally {
                    IOUtils.closeQuietly(tis);
                }
            }
            break;
        case OTHER:
            OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
            TikaInputStream ocStream = null;
            try {
                ocStream = TikaInputStream.get(oc.getStream());
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordException(e, parentMetadata);
                //without a stream there is nothing to hand to the embedded handler
                break;
            }
            if (ocStream != null) {
                try {
                    handleEmbeddedResource(ocStream,
                            null, //filename
                            null, //relationshipId
                            oc.getTypeName(), //mediatype
                            xhtml, false);
                } finally {
                    IOUtils.closeQuietly(ocStream);
                }
            }
            break;
        case COMPOUND_STORAGE:
            OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
            handleCompoundContent(cc, xhtml);
            break;
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ISATabUtils method parseStudy.
/**
 * Writes the tab-delimited content of {@code stream} to {@code xhtml} as a table.
 * The first record, if any, is emitted as {@code th} cells inside {@code thead};
 * every following record becomes one {@code tr} of {@code td} cells inside
 * {@code tbody}. The character encoding is auto-detected using the encoding
 * detector of the {@link TikaConfig} found in {@code context}, or of the default
 * configuration when the context has none.
 *
 * @param stream   the tab-delimited input
 * @param xhtml    the handler that receives the table
 * @param metadata metadata passed to the encoding-detecting reader
 * @param context  parse context, consulted for a {@link TikaConfig}
 * @throws IOException   if reading the stream fails
 * @throws TikaException if the reader cannot be set up
 * @throws SAXException  if the handler rejects an event
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream wrapped = TikaInputStream.get(stream);
    // Use the caller-supplied configuration when present, the default one otherwise
    TikaConfig fromContext = context.get(TikaConfig.class);
    TikaConfig config = (fromContext != null) ? fromContext : TikaConfig.getDefaultConfig();
    // The close shield keeps the reader's close() from closing the wrapped stream
    try (AutoDetectReader in = new AutoDetectReader(new CloseShieldInputStream(wrapped), metadata, config.getEncodingDetector());
         CSVParser parser = new CSVParser(in, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = parser.iterator();
        xhtml.startElement("table");

        // Header: the first record only
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: every remaining record
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ISATabUtils method parseAssay.
/**
 * Writes the tab-delimited content of {@code stream} to {@code xhtml} as a table:
 * the first record (when present) as {@code th} cells in {@code thead}, and each
 * later record as a {@code tr} of {@code td} cells in {@code tbody}. The encoding
 * is auto-detected with the encoding detector of the {@link TikaConfig} taken
 * from {@code context}, falling back to the default configuration.
 *
 * @param stream   the tab-delimited input
 * @param xhtml    the handler that receives the table
 * @param metadata metadata passed to the encoding-detecting reader
 * @param context  parse context, consulted for a {@link TikaConfig}
 * @throws IOException   if reading the stream fails
 * @throws TikaException if the reader cannot be set up
 * @throws SAXException  if the handler rejects an event
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream input = TikaInputStream.get(stream);
    // Prefer the configuration carried by the parse context
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig cfg = (contextConfig == null) ? TikaConfig.getDefaultConfig() : contextConfig;
    // Shield the input so that closing the reader leaves it open
    try (AutoDetectReader detectingReader = new AutoDetectReader(new CloseShieldInputStream(input), metadata, cfg.getEncodingDetector());
         CSVParser tdf = new CSVParser(detectingReader, CSVFormat.TDF)) {
        xhtml.startElement("table");
        Iterator<CSVRecord> records = tdf.iterator();

        xhtml.startElement("thead");
        if (records.hasNext()) {
            // First record supplies the column headings
            for (String title : records.next()) {
                xhtml.startElement("th");
                xhtml.characters(title);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        xhtml.startElement("tbody");
        while (records.hasNext()) {
            xhtml.startElement("tr");
            for (String value : records.next()) {
                xhtml.startElement("td");
                xhtml.characters(value);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ISArchiveParser method parse.
/**
 * Parses an ISA-Tab study file together with the investigation and assay files
 * found next to it.
 * <p>
 * The input is accessed as a file via {@link TikaInputStream#getFile()}. When no
 * location has been set, the study file's parent directory is used. Files in that
 * location whose names match {@code i_.+\.txt} are passed to
 * {@code parseInvestigation}; the study itself and the assays are then parsed, all
 * inside a single XHTML document.
 *
 * @param stream   the study file content
 * @param handler  receives the XHTML output
 * @param metadata document metadata
 * @param context  parse context
 * @throws IOException   if the input cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if parsing fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Temporary resources are only created (and later disposed) when the caller
    // did not already hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    try {
        File studyFile = tis.getFile();
        if (this.location == null) {
            this.location = studyFile.getParent() + File.separator;
        }
        this.studyFileName = studyFile.getName();
        File locationFile = new File(location);
        String[] investigationList = locationFile.list(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.matches("i_.+\\.txt");
            }
        });
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parseInvestigation(investigationList, xhtml, metadata, context);
        // Read the study through tis rather than the raw stream.
        // NOTE(review): getFile() on a wrapped plain stream presumably spools it to a
        // temporary file, which would leave the raw stream consumed - confirm against
        // the TikaInputStream documentation. When the caller passed a TikaInputStream,
        // tis is expected to be that same object, so this is unchanged for that case.
        parseStudy(tis, xhtml, metadata, context);
        parseAssay(xhtml, metadata, context);
        xhtml.endDocument();
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class JDBCTableReader method handleBlob.
/**
 * Emits one blob cell as a {@code span} element and, when
 * {@code embeddedDocumentUtil.shouldParseEmbedded} allows it, parses the blob
 * content as an embedded document.
 * <p>
 * The embedded document's metadata carries the table name, column name, row number
 * and a blob flag, plus a resource name built from the column name, the row number
 * and the extension reported by {@code embeddedDocumentUtil.getExtension}. Nothing
 * is emitted when {@code getBlob} returns {@code null}.
 *
 * @param tableName   name of the table being read
 * @param columnName  name of the blob column
 * @param rowNum      number of the current row
 * @param resultSet   result set positioned on the current row
 * @param columnIndex index of the blob column in the result set
 * @param handler     receives the output
 * @param context     parse context (not used by this implementation)
 * @throws SQLException if the blob cannot be read
 * @throws IOException  if the blob content cannot be read
 * @throws SAXException if the handler rejects an event
 */
protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
    Metadata m = new Metadata();
    m.set(Database.TABLE_NAME, tableName);
    m.set(Database.COLUMN_NAME, columnName);
    m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum));
    m.set(Database.PREFIX + "IS_BLOB", "true");
    Blob blob = null;
    TikaInputStream is = null;
    try {
        blob = getBlob(resultSet, columnIndex, m);
        if (blob == null) {
            return;
        }
        is = TikaInputStream.get(blob, m);
        AttributesImpl attrs = new AttributesImpl();
        attrs.addAttribute("", "type", "type", "CDATA", "blob");
        attrs.addAttribute("", "column_name", "column_name", "CDATA", columnName);
        attrs.addAttribute("", "row_number", "row_number", "CDATA", Integer.toString(rowNum));
        handler.startElement("", "span", "span", attrs);
        String extension = embeddedDocumentUtil.getExtension(is, m);
        //normalize just in case something screwy is going on with the column name
        m.set(TikaMetadataKeys.RESOURCE_NAME_KEY,
                FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + extension)));
        if (embeddedDocumentUtil.shouldParseEmbedded(m)) {
            embeddedDocumentUtil.parseEmbedded(is, handler, m, true);
        }
    } finally {
        //close the stream before freeing the blob it was obtained from:
        //a Blob is invalid once free() has been called
        IOUtils.closeQuietly(is);
        if (blob != null) {
            try {
                blob.free();
            } catch (SQLException | UnsupportedOperationException ignored) {
                //best effort: not every driver supports free()
            }
        }
    }
    handler.endElement("", "span", "span");
}
Aggregations