Use of org.apache.tika.parser.chm.core.ChmExtractor in the Apache Tika project.
Class TestChmExtraction, method testExtractChmEntry.
/**
 * Extracts every entry listed in the CHM read from {@code stream} and runs a
 * series of sanity checks on each one.
 *
 * <p>For every directory listing entry the method:
 * <ol>
 *   <li>extracts the entry's bytes via {@code ChmExtractor.extractChmEntry};</li>
 *   <li>fails if {@code niceAscFileName} rejects the entry name;</li>
 *   <li>fails if the lower-cased ({@code Locale.ROOT}) name was already seen;</li>
 *   <li>for names ending in {@code .html}, {@code .htm}, {@code .hhk} or
 *       {@code .hhc}: fails if {@code findZero} reports a hit on the data, or if
 *       the ISO-8859-1 decoded text contains no {@code <html ... </html>} span
 *       (matched case-insensitively, across lines).</li>
 * </ol>
 *
 * @param stream the CHM input handed to the {@code ChmExtractor} constructor
 * @throws TikaException if any of the checks above fails, or if thrown by the extractor
 * @throws IOException declared for the extractor calls
 */
protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
    final ChmExtractor extractor = new ChmExtractor(stream);
    final ChmDirectoryListingSet listing = extractor.getChmDirList();
    final Pattern htmlDocument = Pattern.compile(
            "\\Q<html\\E.+\\Q</html>\\E",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
    final Set<String> seenNames = new HashSet<String>();

    for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
        // Extraction comes first so that an extractor failure surfaces before any name check.
        final byte[] content = extractor.extractChmEntry(entry);
        final String entryName = entry.getName();

        // Entry names should be nice. Disable this check if the test CHM has
        // odd-looking but valid entry names.
        if (!niceAscFileName(entryName)) {
            throw new TikaException("Warning: File name contains a non ascii char : " + entryName);
        }

        // Set.add returns false when the (case-folded) name has been seen before.
        final String normalizedName = entryName.toLowerCase(Locale.ROOT);
        if (!seenNames.add(normalizedName)) {
            throw new TikaException("Duplicate File name detected : " + entryName);
        }

        final boolean isMarkup = normalizedName.endsWith(".html")
                || normalizedName.endsWith(".htm")
                || normalizedName.endsWith(".hhk")
                || normalizedName.endsWith(".hhc");
        if (!isMarkup) {
            continue;
        }

        if (findZero(content)) {
            throw new TikaException("Xhtml/text file contains '\\0' : " + entryName);
        }

        // Validate the html: it must contain an opening and a closing html tag.
        final String markup = new String(content, ISO_8859_1);
        if (htmlDocument.matcher(markup).find()) {
            continue;
        }
        System.err.println(normalizedName + " is invalid.");
        System.err.println(markup);
        throw new TikaException("Invalid xhtml file : " + entryName);
    }
}
Use of org.apache.tika.parser.chm.core.ChmExtractor in the Apache Tika project.
Class ChmParser, method parse.
/**
 * Parses a CHM stream by extracting its HTML pages and feeding each one to an
 * HTML parser, writing the result to {@code handler} as a single XHTML document.
 *
 * <p>The content type {@code application/vnd.ms-htmlhelp} is set on
 * {@code metadata}. Only directory entries whose name ends in {@code .html} or
 * {@code .htm} are extracted; the comparison is case-insensitive so that
 * entries such as {@code INDEX.HTM} are not skipped.
 *
 * @param stream   the CHM input handed to the {@code ChmExtractor} constructor
 * @param handler  receiver of the generated XHTML events
 * @param metadata metadata object on which the content type is set
 * @param context  parse context, consulted for an existing {@code HtmlParser}
 * @throws IOException   declared for the extractor and page-parsing calls
 * @throws SAXException  declared for the XHTML handler calls
 * @throws TikaException declared for the extractor and page-parsing calls
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ChmExtractor chmExtractor = new ChmExtractor(stream);

    // metadata
    metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");

    // content
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // Reuse an HtmlParser reachable through the context if there is one; otherwise create a new one.
    Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, context);
    if (htmlParser == null) {
        htmlParser = new HtmlParser();
    }

    for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
        // Fold case with Locale.ROOT so the extension test is case-insensitive and
        // independent of the default locale (e.g. "Page.HTM" must still match).
        final String lowName = entry.getName().toLowerCase(java.util.Locale.ROOT);
        if (lowName.endsWith(".html") || lowName.endsWith(".htm")) {
            byte[] data = chmExtractor.extractChmEntry(entry);
            parsePage(data, htmlParser, xhtml, context);
        }
    }

    xhtml.endDocument();
}
Aggregations