use of org.apache.tika.exception.TikaException in project tika by apache.
the class TikaEncodingDetectorTest method testEncodingDetectorConfigurability.
/**
 * Checks that the encoding detector can be configured away: using the
 * {@code TIKA-2273-no-icu4j-encoding-detector.xml} configuration, parsing the
 * cp500-encoded test document must fail with a {@link TikaException} whose
 * message contains "Failed to detect".
 *
 * <p>The expectation is verified twice, once through {@link AutoDetectParser}
 * and once through the {@link Tika} facade, both built from the same config.
 *
 * @throws Exception if the configuration cannot be loaded or an unexpected
 *                   (non-{@code TikaException}) failure occurs
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig tikaConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml"));
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    try {
        // The parse result is irrelevant; only the expected failure matters.
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
    Tika tika = new Tika(tikaConfig);
    try {
        // Same expectation through the facade; the extracted text is not inspected.
        tika.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class FLVParser method parse.
/**
 * Parses an FLV stream: validates the file header, records basic metadata
 * and then walks the tag stream, copying the entries of any metadata tag
 * into {@code metadata}.
 *
 * <p>The header must carry a valid signature, version {@code 1}, a header
 * length of {@code 9} and a first "previous block size" of {@code 0};
 * otherwise a {@link TikaException} is thrown. Tag processing stops quietly
 * at end of stream or as soon as a trailing block size does not equal the
 * tag's body length plus 11.
 *
 * @param stream   the FLV input; it is not closed here
 * @param handler  receives the (empty) XHTML document events
 * @param metadata receives content type, {@code hasVideo}/{@code hasAudio}
 *                 and every non-null entry found in metadata tags
 * @param context  parse context (not used by this method)
 * @throws IOException   on read failure
 * @throws SAXException  if the XHTML handler fails
 * @throws TikaException if the header does not look like a supported FLV file
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final DataInputStream in = new DataInputStream(stream);

    // ---- fixed-size file header ----
    if (!checkSignature(in)) {
        throw new TikaException("FLV signature not detected");
    }
    final int version = in.readUnsignedByte();
    if (version != 1) {
        // anything but 1 suggests this is not FLV at all
        throw new TikaException("Unpexpected FLV version: " + version);
    }
    final int flags = in.readUnsignedByte();
    final long headerLength = readUInt32(in);
    if (headerLength != 9) {
        // only the 9-byte header layout is understood
        throw new TikaException("Unpexpected FLV header length: " + headerLength);
    }
    long previousSize = readUInt32(in);
    if (previousSize != 0) {
        // the first back-pointer must be zero in a well-formed file
        throw new TikaException("Unpexpected FLV first previous block size: " + previousSize);
    }

    metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
    final boolean videoPresent = (flags & MASK_VIDEO) != 0;
    final boolean audioPresent = (flags & MASK_AUDIO) != 0;
    metadata.set("hasVideo", String.valueOf(videoPresent));
    metadata.set("hasAudio", String.valueOf(audioPresent));

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // ---- tag stream: one iteration per tag, ends on EOF (-1) ----
    for (int tagType = in.read(); tagType != -1; tagType = in.read()) {
        final int bodyLength = readUInt24(in);
        readUInt32(in); // timestamp, ignored
        readUInt24(in); // stream id, ignored

        if (tagType != TYPE_METADATA) {
            // Not a metadata tag: consume and discard its body.
            for (int remaining = bodyLength; remaining > 0; remaining--) {
                in.readByte();
            }
        } else {
            // Metadata tag: buffer the body; a short read leaves the tail zeroed.
            final byte[] body = new byte[bodyLength];
            int filled = 0;
            while (filled < bodyLength) {
                final int n = stream.read(body, filled, bodyLength - filled);
                if (n == -1) {
                    break;
                }
                filled += n;
            }

            final DataInputStream amf = new DataInputStream(new ByteArrayInputStream(body));
            // Two values are decoded and only the second one is kept.
            readAMFData(amf, -1);
            final Object payload = readAMFData(amf, -1);

            if (payload instanceof Map) {
                // TODO: a key repeated across separate AMF blocks overwrites
                // the value stored earlier.
                final Map<String, Object> values = (Map<String, Object>) payload;
                for (Entry<String, Object> item : values.entrySet()) {
                    final Object value = item.getValue();
                    if (value != null) {
                        metadata.set(item.getKey(), value.toString());
                    }
                }
            }
        }

        // Trailing back-pointer; a mismatch means corrupt or unparseable data.
        previousSize = readUInt32(in);
        if (previousSize != bodyLength + 11) {
            break;
        }
    }

    xhtml.endDocument();
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class XMLParser method parse.
/**
 * Parses an XML document with the SAX parser obtained from {@code context}
 * and forwards its content to {@code handler}, wrapped in a single XHTML
 * {@code <p>} element.
 *
 * <p>If no content type is present in {@code metadata} it is set to
 * {@code application/xml}. The closing {@code </p>} and end-of-document
 * events are emitted in a {@code finally} block, so they are attempted even
 * when parsing fails.
 *
 * @param stream   the XML input; wrapped in a {@code CloseShieldInputStream}
 *                 before being handed to the SAX parser
 * @param handler  downstream content handler
 * @param metadata document metadata, possibly updated with a content type
 * @param context  supplies the SAX parser
 * @throws IOException   on read failure
 * @throws SAXException  if a handler fails (presumably rethrown via
 *                       {@code throwIfCauseOf} when the failure originated
 *                       in {@code handler} - confirm against TaggedContentHandler)
 * @throws TikaException with message "XML parse error" for any other SAX failure
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Supply a default media type only when the caller has not set one.
    if (null == metadata.get(Metadata.CONTENT_TYPE)) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    xhtmlOut.startElement("p");

    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException saxFailure) {
        // Give the tagged handler first chance to rethrow; otherwise wrap.
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endElement("p");
        xhtmlOut.endDocument();
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class EncryptedPrescriptionParser method parse.
/**
 * Decrypts the incoming stream with an RSA cipher keyed by
 * {@code Pharmacy.getKey()} and hands the decrypted stream to a new
 * {@code PrescriptionParser}.
 *
 * @param stream   the encrypted input
 * @param handler  passed through to the delegate parser
 * @param metadata passed through to the delegate parser
 * @param context  passed through to the delegate parser
 * @throws IOException   propagated from the delegate parser or the stream
 * @throws SAXException  propagated from the delegate parser
 * @throws TikaException if the cipher cannot be obtained or initialised
 *                       (wrapping the {@link GeneralSecurityException}), or
 *                       propagated from the delegate parser
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        final Key decryptionKey = Pharmacy.getKey();

        final Cipher rsa = Cipher.getInstance("RSA");
        rsa.init(Cipher.DECRYPT_MODE, decryptionKey);

        // Decryption happens lazily as the delegate reads from this stream.
        final InputStream plainStream = new CipherInputStream(stream, rsa);

        final PrescriptionParser delegate = new PrescriptionParser();
        delegate.parse(plainStream, handler, metadata, context);
    } catch (GeneralSecurityException securityFailure) {
        throw new TikaException("Unable to decrypt a digital prescription", securityFailure);
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class TestChmExtraction method testExtractChmEntry.
/**
 * Extracts every entry of a CHM file and sanity-checks it.
 *
 * <p>For each directory listing entry the method requires that:
 * <ul>
 *   <li>the entry name passes {@code niceAscFileName};</li>
 *   <li>the lower-cased name has not been seen before;</li>
 *   <li>for {@code .html}, {@code .htm}, {@code .hhk} and {@code .hhc}
 *       entries, the data passes the {@code findZero} check and, decoded as
 *       ISO-8859-1, contains an {@code <html ... </html>} pair.</li>
 * </ul>
 * Any violation is reported by throwing a {@link TikaException}.
 *
 * @param stream the CHM file content
 * @throws TikaException if the CHM cannot be read or a check fails
 * @throws IOException   declared for callers; not raised directly here
 */
protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
    final ChmExtractor extractor = new ChmExtractor(stream);
    final ChmDirectoryListingSet listing = extractor.getChmDirList();
    final Pattern htmlPairP = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
    final Set<String> seenNames = new HashSet<String>();

    for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
        final byte[] content = extractor.extractChmEntry(entry);
        final String entryName = entry.getName();

        // Entry names should be nice. Disable this check if the test CHM
        // legitimately contains odd-looking but valid entry names.
        if (!niceAscFileName(entryName)) {
            throw new TikaException("Warning: File name contains a non ascii char : " + entryName);
        }

        // Names are compared case-insensitively; add() reports a repeat.
        final String lowName = entryName.toLowerCase(Locale.ROOT);
        if (!seenNames.add(lowName)) {
            throw new TikaException("Duplicate File name detected : " + entryName);
        }

        final boolean textual = lowName.endsWith(".html")
                || lowName.endsWith(".htm")
                || lowName.endsWith(".hhk")
                || lowName.endsWith(".hhc");
        if (!textual) {
            continue;
        }

        if (findZero(content)) {
            throw new TikaException("Xhtml/text file contains '\\0' : " + entryName);
        }

        // Validate that the markup has an opening and closing html tag.
        final String html = new String(content, ISO_8859_1);
        if (!htmlPairP.matcher(html).find()) {
            System.err.println(lowName + " is invalid.");
            System.err.println(html);
            throw new TikaException("Invalid xhtml file : " + entryName);
        }
    }
}
Aggregations