use of org.apache.tika.exception.TikaException in project tika by apache.
the class TextExtractor method processControlWord.
/**
 * Handle control word that takes a parameter.
 * <p>
 * The control word itself is not passed in; it is tested through the
 * {@code equals(String)} helper (presumably a comparison against the most
 * recently parsed control word -- confirm against that helper).
 * <p>
 * Processing happens in two stages:
 * <ol>
 *   <li>depending on {@code inHeader}, either header words (code page,
 *       default font, document statistics, date parts, font table entries,
 *       list table entries) or in-document words (bold/italic off, font
 *       change, list id/level) are handled;</li>
 *   <li>regardless of {@code inHeader}, the {@code u}, {@code uc} and
 *       {@code bin} words are handled.</li>
 * </ol>
 * Control words that match none of the tests are silently ignored.
 *
 * @param param numeric parameter that followed the control word
 * @param in    stream being parsed; only touched for {@code bin}, where
 *              {@code param} bytes are either handed to the embedded object
 *              handler or skipped
 * @throws IOException   declared; may come from skipping {@code bin} data
 * @throws SAXException  declared; presumably raised by the output helpers
 *                       ({@code pushText}/{@code start}/{@code end}) -- confirm
 * @throws TikaException declared; note that failures of
 *                       {@code embObjHandler.writeBytes} are caught and
 *                       recorded in the metadata rather than propagated
 */
private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
// NOTE(review): the original comment here is truncated in SOURCE. It seems to
// say that the chain of equals() calls below could be replaced by a generated
// lexer (e.g. JFlex), which uses single-pass FSM to do cmp:
if (inHeader) {
if (equals("ansicpg")) {
// ANSI codepage; unknown code page numbers leave globalCharset untouched
Charset cs = ANSICPG_MAP.get(param);
if (cs != null) {
globalCharset = cs;
}
} else if (equals("deff")) {
// Default font
globalDefaultFont = param;
} else if (equals("nofpages")) {
// Document statistics are copied straight into the metadata
metadata.add(Office.PAGE_COUNT, Integer.toString(param));
} else if (equals("nofwords")) {
metadata.add(Office.WORD_COUNT, Integer.toString(param));
} else if (equals("nofchars")) {
metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
} else if (equals("yr")) {
// Date parts are only stored here; they are combined elsewhere
year = param;
} else if (equals("mo")) {
month = param;
} else if (equals("dy")) {
day = param;
} else if (equals("hr")) {
hour = param;
} else if (equals("min")) {
minute = param;
}
// Checked independently of the chain above, so "f"/"fcharset" are
// only interpreted while fontTableState == 1
if (fontTableState == 1) {
// While in the font table (state 1), record
// mappings of fN to the fcharset:
if (groupState.depth < fontTableDepth) {
// Group depth dropped below the depth recorded for the font table;
// state 2 presumably means "font table finished" -- confirm
fontTableState = 2;
} else {
if (equals("f")) {
// Start new font definition
curFontID = param;
} else if (equals("fcharset")) {
// Unknown fcharset values are ignored, leaving the font unmapped
Charset cs = FCHARSET_MAP.get(param);
if (cs != null) {
fontToCharset.put(curFontID, cs);
}
}
}
}
// List table entries are only interpreted while a list is being built
if (currentList != null) {
if (equals("listid")) {
// Register the list under its id as soon as the id is known
currentList.id = param;
currentListTable.put(currentList.id, currentList);
} else if (equals("listtemplateid")) {
currentList.templateID = param;
} else if (equals("levelnfc") || equals("levelnfcn")) {
//sanity check to make sure list information isn't corrupt
if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
currentList.numberType[listTableLevel] = param;
}
}
}
} else {
// In document
if (equals("b")) {
// b0
// NOTE(review): only the "turn bold off" form is expected here; a
// non-zero param trips the assert when assertions are enabled and is
// otherwise treated like b0
assert param == 0;
if (groupState.bold) {
pushText();
// Close "i" before "b" and reopen it afterwards, presumably to keep
// the emitted elements properly nested -- confirm
if (groupState.italic) {
end("i");
}
end("b");
if (groupState.italic) {
start("i");
}
groupState.bold = false;
}
} else if (equals("i")) {
// i0
assert param == 0;
if (groupState.italic) {
pushText();
end("i");
groupState.italic = false;
}
} else if (equals("f")) {
// Change current font
Charset fontCharset = fontToCharset.get(param);
// Push any buffered text before changing
// font:
pushText();
if (fontCharset != null) {
groupState.fontCharset = fontCharset;
} else {
// DOC ERROR: font change referenced a
// non-table'd font number
// TODO: log a warning? Throw an exc?
groupState.fontCharset = null;
}
} else if (equals("ls")) {
groupState.list = param;
} else if (equals("lslvl")) {
groupState.listLevel = param;
}
}
// The words below are handled whether or not we are in the header; text
// in the header can be unicode escaped as well:
if (equals("u")) {
// Unicode escape
// Emit the char unless the group is ignored; sv/sn groups are still
// emitted even when ignored
if (!groupState.ignore || groupState.sv || groupState.sn) {
// Keep only the low 16 bits of the parameter as one UTF-16 code unit
final char utf16CodeUnit = (char) (param & 0xffff);
addOutputChar(utf16CodeUnit);
}
// After seeing a unicode escape we must
// skip the next ucSkip ansi chars (the
// "unicode shadow")
ansiSkip = groupState.ucSkip;
} else if (equals("uc")) {
// Change unicode shadow length
groupState.ucSkip = param;
} else if (equals("bin")) {
// param is the number of raw bytes that follow; negative counts are ignored
if (param >= 0) {
// pictDepth == 1 presumably means "directly inside a picture group" -- confirm
if (groupState.pictDepth == 1) {
try {
embObjHandler.writeBytes(in, param);
} catch (IOException | TikaException e) {
// Best effort: record the failure in the metadata, reset the handler
// and carry on parsing instead of aborting
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
embObjHandler.reset();
}
} else {
// Not collecting this binary payload: skip over it
IOUtils.skipFully(in, param);
}
} else {
// log some warning?
}
}
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class RTFParser method parse.
/**
 * Parses an RTF document from {@code stream}, writing its content as XHTML
 * SAX events to {@code handler} and recording the content type
 * ({@code application/rtf}) in {@code metadata}.
 * <p>
 * The input is read through a {@link TaggedInputStream} wrapper so that an
 * {@link IOException} raised by the underlying stream can be told apart from
 * one raised by the parsing code: the former is rethrown as-is, the latter
 * is wrapped in a {@link TikaException}.
 *
 * @param stream   the RTF document to parse
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata object that is populated during parsing
 * @param context  parse context handed to the embedded object handler
 * @throws IOException   if reading from {@code stream} fails
 * @throws SAXException  declared; presumably raised while emitting SAX events
 * @throws TikaException if the extractor fails with an I/O error that did
 *                       not originate from {@code stream}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
    TaggedInputStream tagged = new TaggedInputStream(stream);
    try {
        XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
        RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
        final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
        // Read through the tagging wrapper, not the raw stream: only
        // exceptions that pass through the wrapper are tagged, and without
        // that throwIfCauseOf() below can never recognise a stream failure.
        ert.extract(tagged);
    } catch (IOException e) {
        // Rethrows e unchanged when it came from the underlying stream
        tagged.throwIfCauseOf(e);
        throw new TikaException("Error parsing an RTF document", e);
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ZipContainerDetector method detect.
/**
 * Detects the container/compression format of {@code input} from its
 * leading bytes.
 * <p>
 * The decision is made in this order: a missing stream yields
 * {@code MediaType.OCTET_STREAM}; a zip archive supplied as a
 * {@code TikaInputStream} is examined further by {@code detectZipFormat};
 * any other recognised archive type is returned directly; otherwise the
 * result of {@code detectCompressorFormat} is returned.
 *
 * @param input    the stream to inspect, may be {@code null}
 * @param metadata not consulted by this method
 * @return the detected media type
 * @throws IOException declared; may come from peeking at the stream
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without a stream there is nothing to look at
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(input, resources);

        // Peek at the start of the stream; 1024 bytes is
        // enough for all known formats
        byte[] head = new byte[1024];
        int headLength = tikaStream.peek(head);

        MediaType archiveType = detectArchiveFormat(head, headLength);

        // Deeper zip inspection is only done when the caller handed us
        // a TikaInputStream in the first place
        boolean zipOnTikaStream = PackageParser.isZipArchive(archiveType) && TikaInputStream.isTikaInputStream(input);
        if (zipOnTikaStream) {
            return detectZipFormat(tikaStream);
        }

        // Not a known archive: fall back to compressor detection
        if (archiveType.equals(MediaType.OCTET_STREAM)) {
            return detectCompressorFormat(head, headLength);
        }
        return archiveType;
    } finally {
        try {
            resources.dispose();
        } catch (TikaException ignored) {
            // ignore
        }
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PRTParser method extractText.
/**
 * Does our best to turn the bytes into text.
 * <p>
 * The bytes are decoded as CP437. By default the final byte is dropped
 * (the text is stored null terminated); with {@code trim} set, decoding
 * stops at the first null byte instead, which also removes any extra null
 * padding. An empty array yields an empty string.
 *
 * @param data the raw bytes holding the text
 * @param trim whether to cut the text at the first null byte
 * @return the decoded text, with the known character issues fixed up
 * @throws TikaException if the JVM does not provide the CP437 charset
 */
private String extractText(byte[] data, boolean trim) throws TikaException {
    // Nothing to decode. Without this guard the default length below
    // would be -1 and the String constructor would throw.
    if (data.length == 0) {
        return "";
    }
    // The text is always stored null terminated, but sometimes
    // may have extra null padding too
    int length = data.length - 1;
    if (trim) {
        for (int i = 0; i < data.length; i++) {
            if (data[i] == 0) {
                length = i;
                break;
            }
        }
    }
    // We believe that the text is basically stored as CP437
    // That said, there are a few characters slightly wrong for that...
    String text;
    try {
        text = new String(data, 0, length, "cp437");
    } catch (UnsupportedEncodingException e) {
        // Keep the original exception as the cause for diagnosis
        throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
    }
    // Fix up the known character issues
    text = text.replace("φ", "Ø");
    // All done, as best as we can!
    return text;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class TikaEncodingDetectorTest method testConfigurabilityOfUserSpecified.
/**
 * Verifies that an encoding detector configured in the TIKA-2273 config
 * file is the one used by every encoding-detecting parser found under an
 * {@code AutoDetectParser}, and that detection of the cp500 sample fails
 * with that configuration.
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig config = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml"));
    AutoDetectParser autoDetectParser = new AutoDetectParser(config);

    // Make sure that all static and non-static parsers are using the same encoding detector!
    List<Parser> detectingParsers = new ArrayList<>();
    findEncodingDetectionParsers(autoDetectParser, detectingParsers);
    assertEquals(3, detectingParsers.size());

    for (Parser candidate : detectingParsers) {
        EncodingDetector detector = ((AbstractEncodingDetectorParser) candidate).getEncodingDetector();
        assertTrue(detector instanceof CompositeEncodingDetector);

        CompositeEncodingDetector composite = (CompositeEncodingDetector) detector;
        assertEquals(2, composite.getDetectors().size());
        for (EncodingDetector member : composite.getDetectors()) {
            assertNotContained("cu4j", member.getClass().getCanonicalName());
        }
    }

    // Also just make sure this is still true: parsing the sample must
    // fail with a TikaException under this configuration
    try {
        Metadata unused = getXML("english.cp500.txt", autoDetectParser).metadata;
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }
}
Aggregations