Use of org.apache.nutch.util.CommandRunner in the Apache Nutch project.
Class ExtParser, method getParse.
/**
 * Parses {@code content} by feeding its raw bytes to the external command
 * registered for its content type and using the command's standard output as
 * the extracted text.
 *
 * <p>The command line is the configured command followed by the content type.
 * Outlinks are extracted from the resulting text; the title is always empty
 * because nothing in this method derives one from the command output.
 *
 * <p>Every failure is reported as a failed {@link ParseStatus} wrapped in an
 * empty parse result rather than thrown: no command registered for the content
 * type, a Content-Length header that disagrees with the number of bytes held,
 * a non-zero exit value, or any exception raised while preparing or running
 * the command (including an unparsable timeout value).
 *
 * @param content the fetched content to parse
 * @return the parse result, or an empty parse result carrying the failure status
 */
public ParseResult getParse(Content content) {
  String contentType = content.getContentType();
  String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
  if (params == null) {
    return new ParseStatus(ParseStatus.FAILED, "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
  }

  String command = params[0];
  String encoding = params[2];
  String text;
  try {
    // Parsed inside the guarded region so that a malformed timeout setting
    // yields a failed ParseStatus like every other error here, instead of
    // escaping as an unchecked NumberFormatException.
    int timeout = Integer.parseInt(params[1]);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
    }

    byte[] raw = content.getContent();
    String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
    // A declared length that differs from what we hold means the content was
    // cut short; the external command is not given partial input.
    if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete " + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
    }

    ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
    ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);

    CommandRunner cr = new CommandRunner();
    cr.setCommand(command + " " + contentType);
    cr.setInputStream(new ByteArrayInputStream(raw));
    cr.setStdOutputStream(os);
    cr.setStdErrorStream(es);
    cr.setTimeout(timeout);
    cr.evaluate();

    if (cr.getExitValue() != 0) {
      return new ParseStatus(ParseStatus.FAILED, "External command " + command + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
    }

    // Decode the command's stdout using the encoding configured for this type.
    text = os.toString(encoding);
  } catch (Exception e) {
    // Any runtime or I/O failure is converted into a failed parse status.
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // No title is produced by this parser; it is reported as empty.
  String title = "";

  // collect outlink
  Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
  return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Aggregations