use of com.anotherbigidea.io.InStream in project nutch by apache.
the class SmallStack method getParse.
@Override
public ParseResult getParse(Content content) {
String text = null;
Vector<Outlink> outlinks = new Vector<>();
try {
byte[] raw = content.getContent();
String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf());
}
ExtractText extractor = new ExtractText();
// TagParser implements SWFTags and drives a SWFTagTypes interface
TagParser parser = new TagParser(extractor);
// use this instead to debug the file
// TagParser parser = new TagParser( new SWFTagDumper(true, true) );
// SWFReader reads an input file and drives a SWFTags interface
SWFReader reader = new SWFReader(parser, new InStream(raw));
// read the input SWF file and pass it through the interface pipeline
reader.readFile();
text = extractor.getText();
String atext = extractor.getActionText();
if (atext != null && atext.length() > 0)
text += "\n--------\n" + atext;
// harvest potential outlinks
String[] links = extractor.getUrls();
for (int i = 0; i < links.length; i++) {
Outlink out = new Outlink(links[i], "");
outlinks.add(out);
}
Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
if (olinks != null)
for (int i = 0; i < olinks.length; i++) {
outlinks.add(olinks[i]);
}
} catch (Exception e) {
// run time exception
LOG.error("Error, runtime exception: ", e);
return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
}
if (text == null)
text = "";
Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, content.getMetadata());
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Aggregations