Use of org.apache.nutch.parse.HtmlParseFilters in the Apache Nutch project.
Class HtmlParser, method setConf.
/**
 * Stores the supplied configuration and re-derives every setting of this
 * parser that depends on it.
 *
 * @param conf the configuration to keep; values are subsequently read back
 *             through {@code getConf()}
 */
@Override
public void setConf(Configuration conf) {
  this.conf = conf;

  // HTML parse filters are built from the configuration just stored.
  this.htmlParseFilters = new HtmlParseFilters(getConf());

  // Parser implementation name; "neko" is used when the property is unset.
  final String implementation = getConf().get("parser.html.impl", "neko");
  this.parserImpl = implementation;

  // Character encoding used as default when the property is unset.
  final String fallbackEncoding =
      getConf().get("parser.character.encoding.default", "windows-1252");
  this.defaultCharEncoding = fallbackEncoding;

  // DOM helper is created directly from the argument, as before.
  this.utils = new DOMContentUtils(conf);

  // Caching policy, defaulting to Nutch.CACHING_FORBIDDEN_CONTENT.
  final String policy =
      getConf().get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT);
  this.cachingPolicy = policy;
}
Use of org.apache.nutch.parse.HtmlParseFilters in the Apache Nutch project.
Class TikaParser, method setConf.
/**
 * Stores the supplied configuration and (re)initialises the Tika
 * configuration, the optional custom HTML mapper, the HTML parse filters and
 * the remaining configuration-driven settings of this parser.
 *
 * <p>If {@code tika.config.file} is set, the named resource is looked up via
 * the configuration and used to build the Tika configuration. When that
 * property is unset, the resource cannot be found, or loading it fails, the
 * default Tika configuration is loaded instead so that {@code tikaConfig} is
 * only left {@code null} if the default configuration cannot be loaded
 * either (both failures are logged).
 *
 * <p>If {@code tika.htmlmapper.classname} is non-blank, the named class is
 * loaded and instantiated through its no-argument constructor.
 *
 * @param conf the configuration to apply
 * @throws RuntimeException if the configured HTML mapper class cannot be
 *         loaded or instantiated, or does not implement {@code HtmlMapper};
 *         the underlying failure is attached as the cause
 */
public void setConf(Configuration conf) {
  this.conf = conf;
  this.tikaConfig = null;

  // do we want a custom Tika configuration file
  // deprecated since Tika 0.7 which is based on
  // a service provider based configuration
  String customConfFile = conf.get("tika.config.file");
  if (customConfFile != null) {
    try {
      // see if a Tika config file can be found in the job file
      URL customTikaConfig = conf.getResource(customConfFile);
      if (customTikaConfig != null) {
        tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
      } else {
        // Previously this case silently left tikaConfig null.
        LOG.warn("Custom Tika configuration " + customConfFile
            + " not found, falling back to default Tika configuration");
      }
    } catch (Exception e1) {
      String message = "Problem loading custom Tika configuration from " + customConfFile;
      LOG.error(message, e1);
    }
  }

  // No custom configuration requested, or it could not be used:
  // fall back to the default configuration.
  if (tikaConfig == null) {
    try {
      tikaConfig = new TikaConfig(this.getClass().getClassLoader());
    } catch (Exception e2) {
      String message = "Problem loading default Tika configuration";
      LOG.error(message, e2);
    }
  }

  // use a custom htmlmapper
  String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
  if (StringUtils.isNotBlank(htmlmapperClassName)) {
    try {
      Class<?> mapperClass = Class.forName(htmlmapperClassName);
      if (!HtmlMapper.class.isAssignableFrom(mapperClass)) {
        throw new RuntimeException("Class " + htmlmapperClassName + " does not implement HtmlMapper");
      }
      // Class.newInstance() is deprecated; go through the no-arg constructor.
      HTMLMapper = (HtmlMapper) mapperClass.getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      // Keep the original message but attach the cause so the real reason
      // (class not found, wrong type, constructor failure) is not lost.
      String message = "Can't generate instance for class " + htmlmapperClassName;
      LOG.error(message, e);
      throw new RuntimeException(message, e);
    }
  }

  this.htmlParseFilters = new HtmlParseFilters(getConf());
  this.utils = new DOMContentUtils(conf);
  this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT);
  this.upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names", true);
}
Aggregations