Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.
Class ExtParser, method getParse.
/**
 * Parses {@code content} by piping its raw bytes through the external command
 * registered for its content type in {@code TYPE_PARAMS_MAP}.
 *
 * <p>The map entry supplies, in order: the command, its timeout and the
 * encoding used to decode the command's standard output. A failed parse result
 * is returned when no command is registered, when the byte count disagrees
 * with the Content-Length metadata, when the command exits with a non-zero
 * value, or when any exception is raised while running it.
 *
 * @param content the fetched content to parse
 * @return a parse result holding the decoded command output as text, an empty
 *         title and the outlinks extracted from that text; or an empty result
 *         carrying the failure status
 */
public ParseResult getParse(Content content) {
  final String contentType = content.getContentType();
  final String[] commandSpec = (String[]) TYPE_PARAMS_MAP.get(contentType);
  if (commandSpec == null) {
    return new ParseStatus(ParseStatus.FAILED, "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
  }

  final String command = commandSpec[0];
  final int timeout = Integer.parseInt(commandSpec[1]);
  final String encoding = commandSpec[2];
  if (LOG.isTraceEnabled()) {
    LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
  }

  String text;
  try {
    final byte[] bytes = content.getContent();

    // Refuse content whose length disagrees with the declared Content-Length.
    final String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
    if (declaredLength != null && Integer.parseInt(declaredLength) != bytes.length) {
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete " + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
    }

    final ByteArrayOutputStream stdout = new ByteArrayOutputStream(BUFFER_SIZE);
    final ByteArrayOutputStream stderr = new ByteArrayOutputStream(BUFFER_SIZE / 4);

    final CommandRunner runner = new CommandRunner();
    runner.setCommand(command + " " + contentType);
    runner.setInputStream(new ByteArrayInputStream(bytes));
    runner.setStdOutputStream(stdout);
    runner.setStdErrorStream(stderr);
    runner.setTimeout(timeout);
    runner.evaluate();

    if (runner.getExitValue() != 0) {
      return new ParseStatus(ParseStatus.FAILED, "External command " + command + " failed with error: " + stderr.toString()).getEmptyParseResult(content.getUrl(), getConf());
    }
    text = stdout.toString(encoding);
  } catch (Exception e) {
    // Anything thrown while preparing or running the command becomes a failed parse.
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // No title is ever extracted from the command output, so it stays empty.
  final String title = "";

  // Collect outlinks from the plain text produced by the command.
  final Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
  final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
  return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.
Class HtmlParser, method getParse.
/**
 * Parses HTML content into text, title, outlinks and metadata.
 *
 * <p>Steps, in order: resolve the base URL, guess the character encoding and
 * build the DOM, read the HTML meta directives, extract text and title unless
 * the noindex directive is set, extract outlinks unless the nofollow directive
 * is set, mark a meta refresh as a redirect, run the HTML parse filters, and
 * finally flag every resulting parse as caching-forbidden when the nocache
 * directive is set.
 *
 * @param content the fetched content to parse
 * @return the filtered parse result, or an empty result carrying the exception
 *         status when the base URL is malformed or DOM parsing fails
 */
public ParseResult getParse(Content content) {
  final HTMLMetaTags metaTags = new HTMLMetaTags();

  final URL base;
  try {
    base = new URL(content.getBaseUrl());
  } catch (MalformedURLException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  final Metadata parseMeta = new Metadata();

  // Build the DOM from the raw bytes using the guessed encoding.
  final DocumentFragment root;
  try {
    final byte[] rawBytes = content.getContent();
    final InputSource source = new InputSource(new ByteArrayInputStream(rawBytes));

    final EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
    final String charset = detector.guessEncoding(content, defaultCharEncoding);

    parseMeta.set(Metadata.ORIGINAL_CHAR_ENCODING, charset);
    parseMeta.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, charset);
    source.setEncoding(charset);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Parsing...");
    }
    root = parse(source);
  } catch (IOException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (DOMException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (SAXException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (Exception e) {
    // Only unexpected exception types are logged before being reported.
    LOG.error("Error: ", e);
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // Read the meta directives and copy the general tags into the parse metadata.
  HTMLMetaProcessor.getMetaTags(metaTags, root, base);
  parseMeta.addAll(metaTags.getGeneralTags());
  if (LOG.isTraceEnabled()) {
    LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
  }

  // Text and title stay empty when indexing is forbidden.
  String text = "";
  String title = "";
  if (!metaTags.getNoIndex()) {
    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting text...");
    }
    final StringBuffer textBuffer = new StringBuffer();
    utils.getText(textBuffer, root);
    text = textBuffer.toString();

    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting title...");
    }
    final StringBuffer titleBuffer = new StringBuffer();
    utils.getTitle(titleBuffer, root);
    title = titleBuffer.toString().trim();
  }

  // Outlinks stay empty when following links is forbidden.
  Outlink[] outlinks = new Outlink[0];
  if (!metaTags.getNoFollow()) {
    final ArrayList<Outlink> found = new ArrayList<Outlink>();

    // Links are resolved against the document's base tag when it yields a
    // valid URL, and against the content's base URL otherwise.
    URL linkBase = base;
    final String baseHref = utils.getBase(root);
    if (baseHref != null) {
      try {
        linkBase = new URL(base, baseHref);
      } catch (MalformedURLException e) {
        linkBase = base;
      }
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting links...");
    }
    utils.getOutlinks(linkBase, found, root);
    outlinks = found.toArray(new Outlink[found.size()]);
    if (LOG.isTraceEnabled()) {
      LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
    }
  }

  // A meta refresh is reported as a redirect with target and delay as arguments.
  final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
  if (metaTags.getRefresh()) {
    status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
    final String[] redirectArgs = {
      metaTags.getRefreshHref().toString(),
      Integer.toString(metaTags.getRefreshTime())
    };
    status.setArgs(redirectArgs);
  }

  final ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), parseMeta);
  final ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

  // Run the HTML parse filters over the freshly built parse.
  final ParseResult filteredParse = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);

  if (metaTags.getNoCache()) {
    // Caching is forbidden: record the caching policy on every parse in the result.
    for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
      entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
  }
  return filteredParse;
}
Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.
Class JSParseFilter, method getParse.
/**
 * Parses JavaScript content: extracts outlinks from the script and uses the
 * start of its first line as the title.
 *
 * <p>Content with a missing or blank content type is accepted. Any other
 * content type must start with {@code application/x-javascript}
 * (case-insensitively), otherwise a failed parse result is returned.
 *
 * @param c the fetched content to parse
 * @return a parse result whose text is the whole script, or an empty result
 *         with a failed / invalid-format status when the content type is not
 *         JavaScript
 */
public ParseResult getParse(Content c) {
  String type = c.getContentType();
  // Locale.ROOT keeps the comparison independent of the JVM default locale
  // (e.g. under a Turkish locale an uppercase 'I' lower-cases to a dotless i).
  if (type != null && !type.trim().equals("") && !type.toLowerCase(java.util.Locale.ROOT).startsWith("application/x-javascript")) {
    // FAILED is the major code and FAILED_INVALID_FORMAT the minor code, matching
    // the (major, minor, message) form used for FAILED_TRUNCATED elsewhere.
    return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
  }

  // NOTE(review): decodes with the platform default charset; confirm whether
  // the content's declared encoding should be used instead.
  String script = new String(c.getContent());

  Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
  if (outlinks == null) {
    outlinks = new Outlink[0];
  }

  // Title: the first line of the script (or the whole script when it has no
  // newline), capped at MAX_TITLE_LEN characters.
  int firstLineEnd = script.indexOf('\n');
  if (firstLineEnd == -1) {
    firstLineEnd = script.length();
  }
  String title = script.substring(0, Math.min(firstLineEnd, MAX_TITLE_LEN));

  ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, c.getMetadata());
  return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.
Class JSParseFilter, method filter.
/**
 * Adds the outlinks collected by {@code walk} over the document to the parse
 * stored under the content's URL.
 *
 * <p>When {@code walk} collects no outlinks the parse result is returned
 * untouched. Otherwise the stored parse is replaced by one with the same
 * status, title, text and metadata, whose outlinks are the newly collected
 * ones followed by the previously existing ones.
 *
 * @param content the content being parsed
 * @param parseResult the parse result to augment
 * @param metaTags the HTML meta directives, passed through to {@code walk}
 * @param doc the parsed document fragment to walk
 * @return the same {@code parseResult} instance, possibly updated in place
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  final Parse parse = parseResult.get(content.getUrl());
  final ArrayList<Outlink> collected = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, content.getBaseUrl(), collected);

  // Nothing collected: leave the parse result as it is.
  if (collected.isEmpty()) {
    return parseResult;
  }

  final ParseData previous = parse.getData();

  // Newly collected links come first, followed by the existing ones.
  collected.addAll(Arrays.asList(previous.getOutlinks()));
  final Outlink[] merged = collected.toArray(new Outlink[collected.size()]);

  final ParseData parseData = new ParseData(previous.getStatus(), previous.getTitle(), merged, previous.getContentMeta(), previous.getParseMeta());

  // Replace the original parse entry with the rebuilt one.
  parseResult.put(content.getUrl(), new ParseText(parse.getText()), parseData);
  return parseResult;
}
Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.
Class ZipTextExtractor, method extractText.
/**
 * Extracts text and outlinks from every parseable file inside a zip stream.
 *
 * <p>Each non-directory entry whose name contains a {@code '.'} is given a
 * content type detected from its file name and is parsed as a separate
 * document under the URL {@code url + "/" + entryName}. The entry name and the
 * parsed text are appended to the result, and the parse's outlinks are copied
 * into {@code outLinksList}. Entries that fail with a {@code ParseException}
 * are logged at info level and skipped.
 *
 * <p>Entry data is read until the end of the entry, so entries whose size is
 * not recorded in the archive (reported as -1) are handled, and the
 * Content-Length metadata reflects the number of bytes actually read.
 *
 * <p>The zip stream wrapper is not closed here, as closing it would also close
 * {@code input}.
 *
 * @param input stream positioned at the start of the zip data
 * @param url URL of the zip archive, used as prefix for the entry URLs
 * @param outLinksList receives the outlinks found in the entries
 * @return the concatenation of {@code "<entry name> <entry text> "} for every
 *         parsed entry; empty when nothing was parsed
 * @throws IOException if reading the zip stream fails or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();
  ZipInputStream zin = new ZipInputStream(input);
  byte[] chunk = new byte[8192];
  // Created on first use and reused for all entries.
  Tika tika = null;
  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }

    String fname = entry.getName();
    String newurl = url + "/" + fname;
    // Built before the extension check so a malformed entry URL still fails the call.
    String base = new URL(newurl).toString();

    // Only entries with a file extension are parsed.
    if (fname.lastIndexOf('.') == -1) {
      continue;
    }

    // Read to the end of the entry instead of trusting entry.getSize(), which
    // is -1 when the size is unknown and may disagree with the actual data.
    // NOTE(review): the whole entry is buffered in memory with no size cap.
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    int read;
    while ((read = zin.read(chunk)) != -1) {
      buffer.write(chunk, 0, read);
    }
    byte[] b = buffer.toByteArray();

    // Resolve the mime type from the file name.
    if (tika == null) {
      tika = new Tika();
    }
    String contentType = tika.detect(fname);

    try {
      Metadata metadata = new Metadata();
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      ParseData theParseData = parse.getData();
      for (Outlink link : theParseData.getOutlinks()) {
        outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
      }
      resultText.append(fname).append(' ').append(parse.getText()).append(' ');
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }
  return resultText.toString();
}
Aggregations