Search in sources :

Example 16 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method main.

/**
 * Command-line entry point: reads a JavaScript file, extracts its outlinks
 * with {@code getJSLinks} and prints them to standard output.
 *
 * <p>Usage: {@code JSParseFilter file.js baseURL}. When fewer than two
 * arguments are given, the usage line is printed to standard error and the
 * method returns without doing anything else.
 *
 * @param args {@code args[0]} is the path of the file to read (decoded as
 *             UTF-8); {@code args[1]} is passed to {@code getJSLinks} as its
 *             third argument (the base URL, per the usage line)
 * @throws Exception if the file cannot be opened or read, or if link
 *                   extraction fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
        return;
    }
    // Single-threaded accumulation: no need for the synchronized StringBuffer.
    StringBuilder sb = new StringBuilder();
    InputStream in = new FileInputStream(args[0]);
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            // readLine() strips the terminator; re-add a uniform '\n'.
            sb.append(line).append('\n');
        }
    } finally {
        // Release the file handle even when reading fails part-way through.
        in.close();
    }
    JSParseFilter parseFilter = new JSParseFilter();
    parseFilter.setConf(NutchConfiguration.create());
    Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
    System.out.println("Outlinks extracted: " + links.length);
    for (Outlink link : links) {
        System.out.println(" - " + link);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) BufferedReader(java.io.BufferedReader)

Example 17 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * Converts links extracted by Tika into outlinks and appends them to
 * {@code outlinks}. This one is used by NUTCH-1918.
 *
 * <p>A link is skipped when its type is contained in {@code ignoredTags},
 * when it has no URI, when its {@code rel} value contains the
 * {@code nofollow} token, or when its URI cannot be resolved against
 * {@code base}. Runs of whitespace in the anchor text are collapsed to a
 * single space and the result is trimmed; a missing anchor text is treated
 * as the empty string.
 *
 * @param base                  URL against which each link URI is resolved
 * @param outlinks              list that receives the accepted outlinks
 * @param tikaExtractedOutlinks links to convert
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) {
    for (Link link : tikaExtractedOutlinks) {
        if (ignoredTags.contains(link.getType())) {
            continue;
        }
        String target = link.getUri();
        if (target == null) {
            continue;
        }
        // rel is a whitespace-separated token list (e.g. "nofollow noopener"),
        // so look for the token instead of comparing the whole value. A null
        // rel simply means "no restriction" rather than a crash.
        boolean noFollow = false;
        String rel = link.getRel();
        if (rel != null) {
            for (String token : rel.trim().split("\\s+")) {
                if ("nofollow".equalsIgnoreCase(token)) {
                    noFollow = true;
                    break;
                }
            }
        }
        if (noFollow) {
            continue;
        }
        try {
            URL url = URLUtil.resolveURL(base, target);
            // clean the anchor; tolerate links that carry no text at all
            String anchor = link.getText();
            anchor = (anchor == null) ? "" : anchor.replaceAll("\\s+", " ").trim();
            outlinks.add(new Outlink(url.toString(), anchor));
        } catch (MalformedURLException e) {
            // don't care: an unresolvable link is deliberately dropped
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) Link(org.apache.tika.sax.Link) URL(java.net.URL)

Example 18 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestDOMContentUtils method setup.

/**
 * Builds the fixtures shared by the tests: a configuration with
 * {@code parser.html.form.use_action} enabled, the {@code DOMContentUtils}
 * under test, one parsed DOM fragment and base URL per entry of
 * {@code testPages}, and the table of expected outlinks.
 *
 * @throws Exception if the parser feature cannot be set or an expected
 *                   outlink cannot be constructed
 */
@Before
public void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);

    DOMFragmentParser fragmentParser = new DOMFragmentParser();
    fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true);

    for (int page = 0; page < testPages.length; page++) {
        DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
        try {
            InputSource source = new InputSource(new ByteArrayInputStream(testPages[page].getBytes()));
            fragmentParser.parse(source, fragment);
            testBaseHrefURLs[page] = new URL(testBaseHrefs[page]);
        } catch (Exception e) {
            // Any parsing or URL problem is reported as a test failure.
            Assert.fail("caught exception: " + e);
        }
        testDOMs[page] = fragment;
    }

    // One row per test page, in the same order as testPages.
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor") },
        {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots")
        },
        {
            new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this")
        },
        {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2")
        },
        {
            new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", "")
        },
        {
            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", "")
        },
        { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
        {},
        { new Outlink("http://www.nutch.org/dummy.jsp", "test2") },
        {},
        {
            new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
        },
        {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
        },
        { new Outlink("http://www.nutch.org/movie.mp4", "") }
    };
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ByteArrayInputStream(java.io.ByteArrayInputStream) DOMFragmentParser(org.cyberneko.html.parsers.DOMFragmentParser) URL(java.net.URL) DOMContentUtils(org.apache.nutch.parse.tika.DOMContentUtils) Before(org.junit.Before)

Example 19 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class ZipTextExtractor method extractText.

/**
 * Parses every file entry of a zip archive and concatenates the extracted
 * text.
 *
 * <p>For each non-directory entry a URL of the form {@code url + "/" + name}
 * is built. Entries whose name contains a {@code '.'} are parsed with
 * {@code ParseUtil} using a content type detected from the entry name; the
 * entry name and parsed text are appended to the result and the parse's
 * outlinks are copied into {@code outLinksList}. Entries that fail to parse
 * are logged at info level and skipped.
 *
 * <p>The stream is not closed here; it remains the caller's to close.
 *
 * @param input        stream positioned at the start of the zip data
 * @param url          URL of the archive, used as prefix for entry URLs
 * @param outLinksList list that receives the outlinks found in the entries
 * @return the concatenated {@code "name text "} segments of all parsed
 *         entries, or the empty string when nothing was parsed
 * @throws IOException if the archive cannot be read or an entry URL is
 *                     malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    StringBuilder resultText = new StringBuilder();
    ZipInputStream zin = new ZipInputStream(input);
    byte[] buffer = new byte[8192];
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (entry.isDirectory()) {
            continue;
        }
        String fname = entry.getName();
        String newurl = url + "/" + fname;
        // Validates the entry URL; a malformed one aborts with an IOException.
        String base = new URL(newurl).toString();
        if (fname.lastIndexOf('.') == -1) {
            // No extension: the entry is not parsed, so there is nothing to read.
            continue;
        }
        // Read up to the end of the entry instead of trusting entry.getSize():
        // the declared size is -1 when unknown (which made the former
        // "new byte[size]" throw) and may not match the actual data.
        java.io.ByteArrayOutputStream data = new java.io.ByteArrayOutputStream();
        int read;
        while ((read = zin.read(buffer)) != -1) {
            data.write(buffer, 0, read);
        }
        byte[] b = data.toByteArray();
        // Trying to resolve the Mime-Type
        Tika tika = new Tika();
        String contentType = tika.detect(fname);
        try {
            Metadata metadata = new Metadata();
            // Report the number of bytes actually read, never a -1 placeholder.
            metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            for (Outlink outlink : theParseData.getOutlinks()) {
                outLinksList.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
            }
            resultText.append(fname).append(' ').append(parse.getText()).append(' ');
        } catch (ParseException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
            }
        }
    }
    return resultText.toString();
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) ZipEntry(java.util.zip.ZipEntry) Metadata(org.apache.nutch.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL) ZipInputStream(java.util.zip.ZipInputStream) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) ParseException(org.apache.nutch.parse.ParseException)

Example 20 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class NaiveBayesParseFilter method filter.

/**
 * Prunes the outlinks of pages whose text {@code filterParse} rejects.
 *
 * <p>When {@code filterParse} accepts the parse text, the parse result is
 * returned untouched. Otherwise each outlink is passed to
 * {@code filterUrl} and only the accepted ones are kept on the parse data.
 *
 * @param content     fetched content; supplies the URL used to look up the
 *                    parse and the base URL used in log messages
 * @param parseResult parse result to inspect and possibly modify in place
 * @param metaTags    not used by this method
 * @param doc         not used by this method
 * @return the same {@code parseResult} instance that was passed in
 */
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    String url = content.getBaseUrl();
    String text = parse.getText();

    if (filterParse(text)) {
        LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
        return parseResult;
    }

    // Second tier: the parent page was found irrelevant, so keep only the
    // outlinks that pass the URL filter.
    LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
    LOG.info("Checking outlinks");
    ArrayList<Outlink> keptOutlinks = new ArrayList<Outlink>();
    for (Outlink candidate : parse.getData().getOutlinks()) {
        String toUrl = candidate.getToUrl();
        LOG.info("ParseFilter: NaiveBayes: Outlink to check:: " + toUrl);
        if (filterUrl(toUrl)) {
            keptOutlinks.add(candidate);
            LOG.info("ParseFilter: NaiveBayes: found relevant");
        } else {
            LOG.info("ParseFilter: NaiveBayes: found irrelevant");
        }
    }
    parse.getData().setOutlinks(keptOutlinks.toArray(new Outlink[keptOutlinks.size()]));
    return parseResult;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Parse(org.apache.nutch.parse.Parse) ArrayList(java.util.ArrayList)

Aggregations

Outlink (org.apache.nutch.parse.Outlink)37 ParseData (org.apache.nutch.parse.ParseData)22 ParseImpl (org.apache.nutch.parse.ParseImpl)17 ParseStatus (org.apache.nutch.parse.ParseStatus)16 URL (java.net.URL)13 Text (org.apache.hadoop.io.Text)13 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)11 Test (org.junit.Test)11 Parse (org.apache.nutch.parse.Parse)10 MalformedURLException (java.net.MalformedURLException)9 Inlinks (org.apache.nutch.crawl.Inlinks)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)9 Metadata (org.apache.nutch.metadata.Metadata)9 ArrayList (java.util.ArrayList)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 Configuration (org.apache.hadoop.conf.Configuration)6 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)6 IOException (java.io.IOException)5 ParseText (org.apache.nutch.parse.ParseText)4 Map (java.util.Map)3