use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class JSParseFilter method main.
/**
 * Command-line entry point: reads a JavaScript file as UTF-8, runs
 * {@code getJSLinks} over its content and prints the extracted outlinks.
 *
 * @param args {@code args[0]} is the path of the JavaScript file to read,
 *             {@code args[1]} is the base URL handed to {@code getJSLinks}
 * @throws Exception if the file cannot be read or link extraction fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
    return;
  }
  StringBuilder sb = new StringBuilder();
  // try-with-resources closes both the stream and the reader even when
  // reading fails part-way through the file.
  try (InputStream in = new FileInputStream(args[0]);
      BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
    String line;
    while ((line = br.readLine()) != null) {
      // readLine() strips the terminator, so every line is re-joined with "\n".
      sb.append(line).append('\n');
    }
  }
  JSParseFilter parseFilter = new JSParseFilter();
  parseFilter.setConf(NutchConfiguration.create());
  Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
  System.out.println("Outlinks extracted: " + links.length);
  for (Outlink link : links) {
    System.out.println(" - " + link);
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class DOMContentUtils method getOutlinks.
/**
 * Converts links extracted by Tika into outlinks and appends them to
 * {@code outlinks}. This one is used by NUTCH-1918.
 *
 * <p>A link is skipped when its type is listed in {@code ignoredTags}, when
 * it has no target URI, when its {@code rel} value is exactly "nofollow"
 * (compared case-insensitively), or when the target cannot be resolved
 * against {@code base} into a valid URL.
 *
 * @param base                  URL against which link targets are resolved
 * @param outlinks              list that receives the accepted outlinks
 * @param tikaExtractedOutlinks links to convert
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) {
  for (Link link : tikaExtractedOutlinks) {
    if (ignoredTags.contains(link.getType())) {
      continue;
    }
    String target = link.getUri();
    if (target == null) {
      continue;
    }
    // Constant-first comparison: a link without a rel value must not abort
    // the extraction of all remaining links with a NullPointerException.
    if ("nofollow".equalsIgnoreCase(link.getRel())) {
      continue;
    }
    // A missing anchor text is treated as empty; otherwise collapse runs of
    // whitespace into single spaces and trim the ends.
    String anchor = link.getText();
    anchor = (anchor == null) ? "" : anchor.replaceAll("\\s+", " ").trim();
    try {
      URL url = URLUtil.resolveURL(base, target);
      outlinks.add(new Outlink(url.toString(), anchor));
    } catch (MalformedURLException e) {
      // Deliberately ignored: an unresolvable target is simply not an outlink.
    }
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class TestDOMContentUtils method setup.
/**
 * Prepares the shared fixtures: creates the configuration and the
 * {@code DOMContentUtils} under test, parses every entry of
 * {@code testPages} into a DOM fragment, builds the matching base URLs and
 * fills in the outlinks expected for each test page.
 */
@Before
public void setup() throws Exception {
  conf = NutchConfiguration.create();
  conf.setBoolean("parser.html.form.use_action", true);
  utils = new DOMContentUtils(conf);

  DOMFragmentParser fragmentParser = new DOMFragmentParser();
  fragmentParser.setFeature(
      "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true);

  for (int page = 0; page < testPages.length; page++) {
    DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
    try {
      ByteArrayInputStream pageBytes = new ByteArrayInputStream(testPages[page].getBytes());
      fragmentParser.parse(new InputSource(pageBytes), fragment);
      testBaseHrefURLs[page] = new URL(testBaseHrefs[page]);
    } catch (Exception e) {
      Assert.fail("caught exception: " + e);
    }
    testDOMs[page] = fragment;
  }

  // One row of expected outlinks per entry of testPages, in the same order.
  answerOutlinks = new Outlink[][] {
      {
          new Outlink("http://www.nutch.org", "anchor")
      },
      {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots")
      },
      {
          new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this")
      },
      {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2")
      },
      {
          new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html", ""),
          new Outlink("http://www.nutch.org/frames/right.html", "")
      },
      {
          new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "")
      },
      {
          new Outlink("http://www.nutch.org/index.html", "whitespace test")
      },
      {},
      {
          new Outlink("http://www.nutch.org/dummy.jsp", "test2")
      },
      {},
      {
          new Outlink("http://www.nutch.org/;x", "anchor1"),
          new Outlink("http://www.nutch.org/g;x", "anchor2"),
          new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
      },
      {
          // this is tricky - see RFC3986 section 5.4.1 example 7
          new Outlink("http://www.nutch.org/g", "anchor1"),
          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
          new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
          new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
          new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
      },
      {
          new Outlink("http://www.nutch.org/movie.mp4", "")
      }
  };
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class ZipTextExtractor method extractText.
/**
 * Walks the entries of a zip stream, parses every file entry whose name
 * contains a '.', and returns the concatenated entry names and parsed texts.
 * Outlinks found in the parsed entries are appended to {@code outLinksList}.
 *
 * <p>Entries that fail to parse are logged at INFO level and skipped.
 *
 * @param input        stream positioned at the start of the zip data; it is
 *                     not closed by this method
 * @param url          URL of the zip file; entry names are appended to it
 *                     (separated by "/") to form the URL of each entry
 * @param outLinksList list that receives a copy of every outlink found
 * @return for every parsed entry, its name, a space, its text and a space
 * @throws IOException if the zip data cannot be read or an entry URL is
 *                     malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();
  // Left open on purpose: closing the wrapper would close the caller's stream.
  ZipInputStream zin = new ZipInputStream(input);
  // One detector facade is enough for all entries of the archive.
  Tika tika = new Tika();
  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }
    // Read until the end of the entry rather than trusting entry.getSize(),
    // which is -1 when the size is unknown and does not fit an int for huge
    // entries.
    byte[] b = readEntryBytes(zin);
    String fname = entry.getName();
    String newurl = url + "/" + fname;
    String base = new URL(newurl).toString();
    if (fname.lastIndexOf('.') == -1) {
      // Entries without a '.' in their name are not parsed.
      continue;
    }
    // Trying to resolve the Mime-Type
    String contentType = tika.detect(fname);
    try {
      Metadata metadata = new Metadata();
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      for (Outlink outlink : parse.getData().getOutlinks()) {
        outLinksList.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
      }
      resultText.append(fname).append(' ').append(parse.getText()).append(' ');
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }
  return resultText.toString();
}

/** Reads the remaining bytes of the zip entry the stream is positioned on. */
private static byte[] readEntryBytes(ZipInputStream zin) throws IOException {
  java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
  byte[] buffer = new byte[8192];
  int read;
  // read(...) returns -1 once the end of the current entry is reached.
  while ((read = zin.read(buffer, 0, buffer.length)) != -1) {
    out.write(buffer, 0, read);
  }
  return out.toByteArray();
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class NaiveBayesParseFilter method filter.
/**
 * Checks the parsed text of a page with {@code filterParse}. When the page
 * is accepted the parse result is returned untouched. Otherwise every
 * outlink of the page is checked with {@code filterUrl} and only the
 * accepted outlinks are kept on the parse data.
 *
 * @param content     fetched content; supplies the lookup URL and base URL
 * @param parseResult parse result holding the parse of {@code content}
 * @param metaTags    not used by this filter
 * @param doc         not used by this filter
 * @return the same {@code parseResult} instance, possibly with a reduced
 *         outlink array
 */
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  Parse parse = parseResult.get(content.getUrl());
  String url = content.getBaseUrl();
  String text = parse.getText();

  if (filterParse(text)) {
    LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
    return parseResult;
  }

  // Second tier: the parent page was found irrelevant, so inspect each
  // outlink individually.
  LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
  LOG.info("Checking outlinks");
  ArrayList<Outlink> keptOutlinks = new ArrayList<Outlink>();
  for (Outlink candidate : parse.getData().getOutlinks()) {
    String toUrl = candidate.getToUrl();
    LOG.info("ParseFilter: NaiveBayes: Outlink to check:: " + toUrl);
    if (filterUrl(toUrl)) {
      keptOutlinks.add(candidate);
      LOG.info("ParseFilter: NaiveBayes: found relevant");
    } else {
      LOG.info("ParseFilter: NaiveBayes: found irrelevant");
    }
  }
  parse.getData().setOutlinks(keptOutlinks.toArray(new Outlink[keptOutlinks.size()]));
  return parseResult;
}
Aggregations