Use of net.htmlparser.jericho.StartTag in project Xponents by OpenSextant.
Class TikaHTMLConverter, method parseHTMLMetadata.
/**
 * Backfills the given metadata map with {@code <meta>} tag values that Tika
 * neglects for various reasons.
 *
 * <p>Each meta tag is keyed by its {@code property} attribute when present,
 * otherwise by its {@code name} attribute. The tag's {@code content} attribute
 * is stored under that key, but only if the key passes {@code isUsefulMeta}
 * and the content attribute exists.
 *
 * <p>TODO: an InputStream is difficult to reset after the Tika parser has read
 * it, so this works from the file object and lets Jericho read the raw file
 * again.
 *
 * @param doc file object for document
 * @param md metadata map to backfill
 * @throws IOException propagated from constructing the Jericho source over {@code doc}
 */
private void parseHTMLMetadata(File doc, Map<String, String> md) throws IOException {
    net.htmlparser.jericho.Source htmlDoc = new net.htmlparser.jericho.Source(doc);

    for (StartTag metaTag : htmlDoc.getAllStartTags("meta")) {
        String nameAttr = metaTag.getAttributeValue("name");
        String propertyAttr = metaTag.getAttributeValue("property");

        // A meta tag with neither attribute has nothing to use as a key.
        if (propertyAttr == null && nameAttr == null) {
            log.debug("Unmatched metadata in HTML {}", metaTag.toString());
            continue;
        }

        // "property" takes precedence over "name" when both are present.
        String key = (propertyAttr != null) ? propertyAttr : nameAttr;
        if (isUsefulMeta(key)) {
            /* hopefully value is in content field */
            String content = metaTag.getAttributeValue("content");
            if (content != null) {
                md.put(key, content);
            }
        }
    }
}
Use of net.htmlparser.jericho.StartTag in project zaproxy by zaproxy.
Class SpiderHtmlParser, method parseResource.
/**
 * Parses the HTML of the given message for URLs.
 *
 * <p>The base URL starts as the request URI and is re-canonicalised against the
 * {@code href} of the first {@code BASE} element, if one exists with a non-empty
 * value. The whole source is then handed to {@code parseSource}. When comment
 * parsing is enabled in {@code params}, each HTML comment is parsed as its own
 * source; if {@code parseSource} returns {@code false} for a comment, every match
 * of {@code PLAIN_COMMENTS_URL_PATTERN} in it is passed to {@code processURL}.
 *
 * @param message the HTTP message whose response is being parsed
 * @param source the already-parsed response, or {@code null} to build it from
 *     the response body of {@code message}
 * @param depth the depth value forwarded to {@code parseSource} and {@code processURL}
 * @return always {@code false}
 * @throws NullPointerException if {@code message} is null.
 */
@Override
public boolean parseResource(HttpMessage message, Source source, int depth) {
    // Fall back to the response body when no prepared source was supplied
    Source html = (source != null) ? source : new Source(message.getResponseBody().toString());

    // The request URI is the default context (base url)
    String baseURL = message.getRequestHeader().getURI().toString();

    // A BASE tag, if present with a usable href, overrides the base URL
    Element baseTag = html.getFirstElement(HTMLElementName.BASE);
    if (baseTag != null) {
        if (log.isDebugEnabled()) {
            log.debug("Base tag was found in HTML: " + baseTag.getDebugInfo());
        }
        String baseHref = baseTag.getAttributeValue("href");
        if (baseHref != null && !baseHref.isEmpty()) {
            baseURL = URLCanonicalizer.getCanonicalURL(baseHref, baseURL);
        }
    }

    // Parse the document itself
    parseSource(message, html, depth, baseURL);

    // Comments are only inspected when explicitly enabled
    if (!params.isParseComments()) {
        return false;
    }

    for (StartTag commentTag : html.getAllStartTags(StartTagType.COMMENT)) {
        Source commentSource = new Source(commentTag.getTagContent());
        if (parseSource(message, commentSource, depth, baseURL)) {
            continue;
        }
        // Fall back to plain-text URL matching over the comment content
        Matcher urlMatcher = PLAIN_COMMENTS_URL_PATTERN.matcher(commentSource.toString());
        while (urlMatcher.find()) {
            processURL(message, depth, urlMatcher.group(), baseURL);
        }
    }
    return false;
}