use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestMetadataScoringFilter method passScoreBeforeParsing.
/**
 * Checks that the metadata keys listed under
 * {@link MetadataScoringFilter#METADATA_DATUM} ("parent" and "depth") end up
 * in the {@link Content} metadata, with the values stored on the
 * {@link CrawlDatum}, after {@code passScoreBeforeParsing} has been called.
 */
@Test
public void passScoreBeforeParsing() {
    final String parentKey = "parent";
    final String depthKey = "depth";
    final String expectedParent = "https://nutch.apache.org/";
    final String expectedDepth = "1";

    // Filter under test, configured to forward both keys.
    Configuration configuration = NutchConfiguration.create();
    configuration.set(MetadataScoringFilter.METADATA_DATUM, "parent,depth");
    MetadataScoringFilter filter = new MetadataScoringFilter();
    filter.setConf(configuration);

    // Source datum carrying the two metadata entries.
    CrawlDatum datum = new CrawlDatum();
    datum.getMetaData().put(new Text(parentKey), new Text(expectedParent));
    datum.getMetaData().put(new Text(depthKey), new Text(expectedDepth));

    Text sourceUrl = new Text("https://nutch.apache.org/");
    Content fetched = new Content();
    filter.passScoreBeforeParsing(sourceUrl, datum, fetched);

    Assert.assertEquals(expectedParent, fetched.getMetadata().get(parentKey));
    Assert.assertEquals(expectedDepth, fetched.getMetadata().get(depthKey));
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class HtmlParser method main.
/**
 * Command-line entry point: reads the file named by the first argument,
 * parses it as {@code text/html} with {@link HtmlParser} and prints the
 * resulting parse data and extracted text to standard output.
 *
 * @param args {@code args[0]} is the path of the HTML file to parse
 * @throws Exception if the file cannot be read completely or parsing fails
 */
public static void main(String[] args) throws Exception {
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources closes the stream even if readFully throws,
    // instead of leaking the file handle and suppressing the warning.
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    // The same "file:" URL serves as both the content URL and its base.
    Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText());
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestDOMContentUtils method setup.
/**
 * Prepares the fixtures shared by the tests: parses every entry of
 * {@code testPages} with {@link TikaParser} into a DOM fragment stored in
 * {@code testDOMs}, records the matching base URL in
 * {@code testBaseHrefURLs}, and fills {@code answerOutlinks} with the
 * outlinks expected for each page.
 *
 * @throws Exception declared for the fixture; failures while building a
 *         page's DOM are reported as assertion failures instead
 */
@Before
public void setup() throws Exception {
    conf = NutchConfiguration.create();
    utils = new DOMContentUtils(conf);
    conf.set("plugin.includes", "parse-tika");

    TikaParser tikaParser = new TikaParser();
    tikaParser.setConf(conf);

    for (int page = 0; page < testPages.length; page++) {
        try {
            String baseHref = testBaseHrefs[page];
            testBaseHrefURLs[page] = new URL(baseHref);

            byte[] html = testPages[page].getBytes(StandardCharsets.UTF_8);
            Content pageContent = new Content(baseHref, baseHref, html, "text/html", new Metadata(), conf);

            HTMLDocumentImpl document = new HTMLDocumentImpl();
            document.setErrorChecking(false);
            DocumentFragment fragment = document.createDocumentFragment();
            tikaParser.getParse(pageContent, document, fragment);
            testDOMs[page] = fragment;
        } catch (Exception e) {
            Assert.fail("caught exception: " + e);
        }
    }

    // One row per test page, in the same order as testPages.
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor") },
        { new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots") },
        { new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this") },
        { new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2") },
        { new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html", ""),
          new Outlink("http://www.nutch.org/frames/right.html", "") },
        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "") },
        { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
        {},
        {},
        {},
        { new Outlink("http://www.nutch.org/;x", "anchor1"),
          new Outlink("http://www.nutch.org/g;x", "anchor2"),
          new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
        { // this is tricky - see RFC3986 section 5.4.1 example 7
          new Outlink("http://www.nutch.org/g", "anchor1"),
          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
          new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
          new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
          new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") },
        {}
    };
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestJSParseFilter method getOutlinks.
/**
 * Loads a sample file through a {@code file:} URL, parses it and returns
 * the outlinks found in the parse data.
 *
 * @param sampleFile name of the file, resolved against {@code sampleDir}
 * @return the outlinks reported by the parse of that file
 * @throws ProtocolException declared by the protocol lookup/fetch calls
 * @throws ParseException declared by the parse call
 * @throws IOException declared for I/O failures while reading the sample
 */
public Outlink[] getOutlinks(String sampleFile) throws ProtocolException, ParseException, IOException {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
    LOG.info("Parsing {}", urlString);

    ProtocolFactory protocolFactory = new ProtocolFactory(conf);
    Protocol fileProtocol = protocolFactory.getProtocol(urlString);
    Content content = fileProtocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();

    // The parse result is keyed by URL; pick the entry for this content.
    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parse = parseUtil.parse(content).get(content.getUrl());

    LOG.info(parse.getData().toString());
    return parse.getData().getOutlinks();
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestSWFParser method testIt.
/**
 * Parses every file in {@code sampleFiles} and checks that the extracted
 * text, with whitespace runs collapsed to single spaces and trimmed,
 * equals the corresponding entry of {@code sampleTexts}.
 *
 * @throws ProtocolException declared by the protocol lookup/fetch calls
 * @throws ParseException declared by the parse call
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        // Normalise whitespace so the comparison ignores layout differences.
        String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
        // assertEquals reports expected vs. actual text on failure, unlike
        // assertTrue(expected.equals(actual)), which only says "false".
        Assert.assertEquals("extracted text of " + sampleFiles[i], sampleTexts[i], text);
    }
}
Aggregations