Search in sources :

Example 46 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class Foo, method getProtocolOutput:

/**
 * This is a dummy implementation only. So what we will do is return this
 * structure:
 *
 * <pre>
 * foo://example.com - will contain one directory and one file
 * foo://example.com/a - directory, will contain two files
 * foo://example.com/a/aa.txt - text file
 * foo://example.com/a/ab.txt - text file
 * foo://example.com/a.txt - text file
 * </pre>
 *
 * @param url the URL to fetch; only the fixed foo:// URLs above are known
 * @param datum the associated crawl datum (logged, otherwise unused here)
 * @return a {@link ProtocolOutput} carrying generated content and
 *         STATUS_SUCCESS for known URLs, or empty content and STATUS_GONE
 *         for anything else (including malformed URLs)
 */
@Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    LOG.debug("getProtocolOutput({}, {})", url, datum);
    try {
        String urlstr = String.valueOf(url);
        URL u = new URL(urlstr);
        // Base ("directory") URL of u, used as the content's base for
        // resolving relative links in the generated listings.
        URL base = new URL(u, ".");
        // Defaults apply to unknown URLs: empty payload, dummy type, GONE.
        byte[] bytes = new byte[0];
        String contentType = "foo/something";
        ProtocolStatus status = ProtocolStatus.STATUS_GONE;
        switch(urlstr) {
            case "foo://example.com":
            case "foo://example.com/":
                {
                    // Root listing: one directory ("a/") and one file ("a.txt").
                    String time = HttpDateFormat.toString(System.currentTimeMillis());
                    contentType = "text/html";
                    // StringBuilder: no synchronization needed in this
                    // method-local builder (was StringBuffer).
                    StringBuilder sb = new StringBuilder();
                    sb.append("<html><head>");
                    sb.append("<title>Index of /</title></head>\n");
                    sb.append("<body><h1>Index of /</h1><pre>\n");
                    // add directory
                    sb.append("<a href='a/'>a/</a>\t" + time + "\t-\n");
                    // add file
                    sb.append("<a href='a.txt'>a.txt</a>\t" + time + "\t" + 0 + "\n");
                    sb.append("</pre></html></body>");
                    // Explicit charset: getBytes() without one depends on the
                    // platform default encoding.
                    bytes = sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
                    status = ProtocolStatus.STATUS_SUCCESS;
                    break;
                }
            case "foo://example.com/a/":
                {
                    // Listing of /a/: two text files.
                    String time = HttpDateFormat.toString(System.currentTimeMillis());
                    contentType = "text/html";
                    StringBuilder sb = new StringBuilder();
                    sb.append("<html><head>");
                    sb.append("<title>Index of /a/</title></head>\n");
                    sb.append("<body><h1>Index of /a/</h1><pre>\n");
                    // add file
                    sb.append("<a href='aa.txt'>aa.txt</a>\t" + time + "\t" + 0 + "\n");
                    // add file
                    sb.append("<a href='ab.txt'>ab.txt</a>\t" + time + "\t" + 0 + "\n");
                    sb.append("</pre></html></body>");
                    bytes = sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
                    status = ProtocolStatus.STATUS_SUCCESS;
                    break;
                }
            case "foo://example.com/a.txt":
            case "foo://example.com/a/aa.txt":
            case "foo://example.com/a/ab.txt":
                {
                    // All three files share the same placeholder text body.
                    contentType = "text/plain";
                    bytes = "In publishing and graphic design, lorem ipsum is a filler text or greeking commonly used to demonstrate the textual elements of a graphic document or visual presentation. Replacing meaningful content with placeholder text allows designers to design the form of the content before the content itself has been produced.".getBytes(java.nio.charset.StandardCharsets.UTF_8);
                    status = ProtocolStatus.STATUS_SUCCESS;
                    break;
                }
            default:
                LOG.warn("Unknown url '{}'. This dummy implementation only supports 'foo://example.com'", url);
                // all our default values are set for URLs that do not exist.
                break;
        }
        Metadata metadata = new Metadata();
        Content content = new Content(String.valueOf(url), String.valueOf(base), bytes, contentType, metadata, getConf());
        return new ProtocolOutput(content, status);
    } catch (MalformedURLException mue) {
        // Single parameterized call logs the message and the stack trace
        // together (SLF4J treats a trailing Throwable as the cause).
        LOG.error("Could not retrieve {}", url, mue);
        // claim STATUS_GONE to tell nutch to never ever re-request this URL
        return new ProtocolOutput(null, ProtocolStatus.STATUS_GONE);
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) MalformedURLException(java.net.MalformedURLException) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) URL(java.net.URL) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Example 47 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class TestMetadataScoringFilter, method passScoreBeforeParsing:

@Test
public void passScoreBeforeParsing() {
    // Configure the filter to propagate the "parent" and "depth" keys
    // from the crawl datum's metadata into the fetched content.
    Configuration conf = NutchConfiguration.create();
    conf.set(MetadataScoringFilter.METADATA_DATUM, "parent,depth");
    MetadataScoringFilter filter = new MetadataScoringFilter();
    filter.setConf(conf);

    String parentKey = "parent";
    String depthKey = "depth";
    String parentValue = "https://nutch.apache.org/";
    String depthValue = "1";

    // Seed the crawl datum with both metadata entries.
    CrawlDatum datum = new CrawlDatum();
    datum.getMetaData().put(new Text(parentKey), new Text(parentValue));
    datum.getMetaData().put(new Text(depthKey), new Text(depthValue));

    Text url = new Text("https://nutch.apache.org/");
    Content content = new Content();
    filter.passScoreBeforeParsing(url, datum, content);

    // Both entries must now be visible in the content's metadata.
    Assert.assertEquals(parentValue, content.getMetadata().get(parentKey));
    Assert.assertEquals(depthValue, content.getMetadata().get(depthKey));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Test(org.junit.Test)

Example 48 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class TestEncodingDetector, method testGuessing:

/**
 * Checks {@code EncodingDetector.guessEncoding(Content, String)} under four
 * setups: no clues (falls back to the supplied default), a charset declared
 * in the HTTP Content-Type header, an explicitly added ("sniffed") clue,
 * and auto-detection enabled via the confidence threshold.
 */
@Test
public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);
    Metadata metadata = new Metadata();
    EncodingDetector detector;
    Content content;
    String encoding;

    // No clues at all: the supplied default must be returned.
    content = newPlainTextContent(metadata);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    // no information is available, so it should return default encoding
    // (Locale.ROOT: locale-independent lowercasing, safe for names
    // containing 'I' under e.g. the Turkish default locale)
    Assert.assertEquals("windows-1252", encoding.toLowerCase(java.util.Locale.ROOT));

    // A charset declared in the Content-Type header wins over the default.
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = newPlainTextContent(metadata);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    Assert.assertEquals("utf-16", encoding.toLowerCase(java.util.Locale.ROOT));

    // An explicitly added ("sniffed") clue wins over the default.
    metadata.clear();
    content = newPlainTextContent(metadata);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
    Assert.assertEquals("windows-1254", encoding.toLowerCase(java.util.Locale.ROOT));

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = newPlainTextContent(metadata);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("utf-32", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
    // With sufficient confidence, auto-detection overrides header and clue.
    Assert.assertEquals("utf-8", encoding.toLowerCase(java.util.Locale.ROOT));
}

/**
 * Builds a text/plain {@code Content} around the shared test bytes
 * ({@code contentInOctets}), replacing the fourfold duplication of the
 * six-argument constructor call.
 */
private Content newPlainTextContent(Metadata metadata) {
    return new Content("http://www.example.com", "http://www.example.com/",
            contentInOctets, "text/plain", metadata, conf);
}
Also used : Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) Test(org.junit.Test)

Example 49 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class TestTextProfileSignature, method testGetSignature:

@Test
public void testGetSignature() {
    // TextProfileSignature should be insensitive to word order: two texts
    // made of the same words must yield identical signatures.
    Configuration conf = NutchConfiguration.create();
    Signature sigImpl = new TextProfileSignature();
    sigImpl.setConf(conf);

    String original = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox";
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "Hello World",
            new Outlink[0], new Metadata());

    byte[] first = sigImpl.calculate(new Content(), new ParseImpl(original, parseData));
    Assert.assertNotNull(first);

    // Shuffle the words in place (Arrays.asList is fixed-size but mutable)
    // and recompute the signature on the reordered text.
    List<String> tokens = Arrays.asList(original.split("\\s"));
    Collections.shuffle(tokens);
    String reordered = String.join(" ", tokens);
    byte[] second = sigImpl.calculate(new Content(), new ParseImpl(reordered, parseData));
    Assert.assertNotNull(second);

    Assert.assertEquals(StringUtil.toHexString(first), StringUtil.toHexString(second));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) Test(org.junit.Test)

Example 50 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class IndexingFiltersChecker, method process:

/**
 * Fetches {@code url}, parses the returned content, applies scoring and
 * indexing filters, and appends the resulting document fields to
 * {@code output}; optionally writes the document to the configured index.
 *
 * @param url the URL to check; normalized first when normalizers are set
 * @param output receives one "fieldname :\tvalue" line per indexed value
 * @return 0 on success (also when there is no content or the document is
 *         discarded by an indexing filter), -1 on failure (robots.txt
 *         disallow, fetch failure, unknown content type, missing parse)
 * @throws Exception from protocol, parsing or indexing plugins
 */
protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    LOG.info("fetching: " + url);
    CrawlDatum datum = new CrawlDatum();
    // Copy the user-supplied metadata (field of this checker) into the
    // datum; null values are normalized to empty strings.
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    }
    int maxRedirects = getConf().getInt("http.redirect.max", 3);
    if (followRedirects) {
        if (maxRedirects == 0) {
            // http.redirect.max == 0 would contradict the explicit request
            // to follow redirects, so fall back to the default of 3.
            LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)");
            maxRedirects = 3;
        } else {
            LOG.info("Following max. {} redirects", maxRedirects);
        }
    }
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
    Text turl = new Text(url);
    // Following redirects and not reached maxRedirects?
    int numRedirects = 0;
    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
        // The redirect target is the first argument of the protocol status.
        String[] stuff = protocolOutput.getStatus().getArgs();
        url = stuff[0];
        LOG.info("Follow redirect to {}", url);
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
        numRedirects++;
    }
    // NOTE(review): a null protocolOutput appears to signal a robots.txt
    // disallow; if it could ever be null while checkRobotsTxt == false,
    // the isSuccess() call below would throw an NPE -- confirm against
    // getProtocolOutput's contract.
    if (checkRobotsTxt && protocolOutput == null) {
        System.err.println("Fetch disallowed by robots.txt");
        return -1;
    }
    if (!protocolOutput.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + protocolOutput.getStatus());
        if (protocolOutput.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return -1;
    }
    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }
    String contentType = content.getContentType();
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return -1;
    }
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters; best effort -- a scoring failure must not
    // abort the check, so it is only logged
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }
    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);
    Inlinks inlinks = null;
    // The parse result is keyed by URL; a missing entry means the parser
    // registered its output under a different URL key -- list the keys it
    // does contain before giving up.
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + urlText);
        return -1;
    }
    // Compute the content signature and expose it as a hex string in the
    // parse's content metadata, the document's "digest" field, and (raw)
    // in the crawl datum.
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }
    // Run all configured indexing filters; any of them may discard the
    // document by returning null.
    IndexingFilters indexers = new IndexingFilters(getConf());
    try {
        doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
        e.printStackTrace();
    }
    if (doc == null) {
        output.append("Document discarded by indexing filter\n");
        return 0;
    }
    // Dump every field value; values are truncated to 100 characters
    // unless the dumpText option is set.
    for (String fname : doc.getFieldNames()) {
        List<Object> values = doc.getField(fname).getValues();
        if (values != null) {
            for (Object value : values) {
                String str = value.toString();
                int minText = dumpText ? str.length() : Math.min(100, str.length());
                output.append(fname + " :\t" + str.substring(0, minText) + "\n");
            }
        }
    }
    // For readability if keepClientCnxOpen
    output.append("\n");
    if (doIndex) {
        // Write the document through the configured index writers.
        IndexWriters writers = IndexWriters.get(getConf());
        writers.open(getConf(), "IndexingFilterChecker");
        writers.write(doc);
        writers.close();
    }
    return 0;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) ParseResult(org.apache.nutch.parse.ParseResult) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

Content (org.apache.nutch.protocol.Content)51 Text (org.apache.hadoop.io.Text)30 Parse (org.apache.nutch.parse.Parse)29 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)27 Configuration (org.apache.hadoop.conf.Configuration)23 Metadata (org.apache.nutch.metadata.Metadata)23 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)22 ParseUtil (org.apache.nutch.parse.ParseUtil)20 Test (org.junit.Test)19 Protocol (org.apache.nutch.protocol.Protocol)17 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)16 ParseData (org.apache.nutch.parse.ParseData)8 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)8 ParseResult (org.apache.nutch.parse.ParseResult)7 URL (java.net.URL)6 File (java.io.File)5 FileInputStream (java.io.FileInputStream)5 IOException (java.io.IOException)5 Outlink (org.apache.nutch.parse.Outlink)5 HashMap (java.util.HashMap)4