Search in sources :

Example 6 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class HttpBase method main.

/**
 * Command-line helper: fetches a single URL through the given {@code http}
 * instance and prints the protocol status and, when present, the content
 * type, content length and content body to standard output.
 *
 * <p>The usage string is printed to standard error and the JVM exits with
 * status -1 when no arguments are given, when {@code -timeout} is not followed
 * by a value, when an unrecognized argument appears anywhere but the last
 * position, or when no URL was supplied.
 *
 * @param http
 *          protocol instance used for the fetch; its {@code timeout} field is
 *          overwritten when {@code -timeout N} is given (N is multiplied by
 *          1000 before being stored, presumably seconds to milliseconds)
 * @param args
 *          command line of the form {@code [-verbose] [-timeout N] url}
 * @throws Exception
 *           if the timeout value is not a valid integer, or if any call made
 *           here throws
 */
protected static void main(HttpBase http, String[] args) throws Exception {
    String url = null;
    String usage = "Usage: Http [-verbose] [-timeout N] url";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        // parse command line
        if (args[i].equals("-timeout")) {
            // -timeout needs a value; without this check args[++i] would run
            // past the end of the array when the option comes last
            if (i + 1 >= args.length) {
                System.err.println(usage);
                System.exit(-1);
            }
            http.timeout = Integer.parseInt(args[++i]) * 1000;
        } else if (args[i].equals("-verbose")) {
            // found -verbose option: accepted, but nothing is changed here
        } else if (i != args.length - 1) {
            // anything that is not an option must be the final argument
            System.err.println(usage);
            System.exit(-1);
        } else {
            // url is the required last parameter
            url = args[i];
        }
    }
    if (url == null) {
        // only options were supplied (e.g. "-verbose" or "-timeout 5")
        System.err.println(usage);
        System.exit(-1);
    }
    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
    Content content = out.getContent();
    System.out.println("Status: " + out.getStatus());
    if (content != null) {
        System.out.println("Content Type: " + content.getContentType());
        System.out.println("Content Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
        System.out.println("Content:");
        String text = new String(content.getContent());
        System.out.println(text);
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text)

Example 7 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class HttpBase method getProtocolOutput.

/**
 * Requests {@code url} via {@code getResponse} and translates the response
 * code into a {@link ProtocolOutput}.
 *
 * <p>The response code is always recorded in the datum's metadata under
 * {@code Nutch.PROTOCOL_STATUS_CODE_KEY}; the elapsed request time is recorded
 * under {@code RESPONSE_TIME} only when {@code responseTime} is set. Redirects
 * (3xx) are not followed here: the resolved target is returned inside the
 * status for the caller to handle.
 *
 * @param url
 *          Text containing the url to fetch
 * @param datum
 *          The CrawlDatum passed to the request and updated with metadata
 * @return the content together with a status derived from the response code;
 *         if anything throws, an output with {@code null} content and a status
 *         built from the throwable
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    final String urlString = url.toString();
    try {
        URL requested = new URL(urlString);

        // issue the request, timing it from just before the call
        long begin = System.currentTimeMillis();
        Response response = getResponse(requested, datum, false);
        if (this.responseTime) {
            int millis = (int) (System.currentTimeMillis() - begin);
            datum.getMetaData().put(RESPONSE_TIME, new IntWritable(millis));
        }

        int code = response.getCode();
        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

        byte[] raw = response.getContent();
        String fetched = requested.toString();
        Content c = new Content(fetched, requested.toString(), (raw == null ? EMPTY_CONTENT : raw), response.getHeader("Content-Type"), response.getHeaders(), this.conf);

        if (code == 200) {
            // success: content only
            return new ProtocolOutput(c);
        }

        if (code >= 300 && code < 400) {
            // redirect: resolve the target against the requested URL
            String location = response.getHeader("Location");
            if (location == null) {
                // some broken servers, such as MS IIS, use lowercase header name...
                location = response.getHeader("location");
            }
            if (location == null) {
                location = "";
            }
            URL target = new URL(requested, location);

            int redirectStatus;
            if (code == 302 || code == 303 || code == 307) {
                // found / see other / temporary redirect
                redirectStatus = ProtocolStatus.TEMP_MOVED;
            } else if (code == 304) {
                // not modified
                redirectStatus = ProtocolStatus.NOTMODIFIED;
            } else {
                // 300, 301, 305 and every other 3xx code
                redirectStatus = ProtocolStatus.MOVED;
            }
            // the redirect itself is handled in the higher layer
            return new ProtocolOutput(c, new ProtocolStatus(redirectStatus, target));
        }

        switch (code) {
            case 400:
                // bad request, mark as GONE
                if (logger.isTraceEnabled()) {
                    logger.trace("400 Bad request: " + requested);
                }
                return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, requested));
            case 401:
                // authentication required
                if (logger.isTraceEnabled()) {
                    logger.trace("401 Authentication Required");
                }
                return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString));
            case 404:
                return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, requested));
            case 410:
                // permanently GONE
                return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + requested));
            default:
                return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + requested));
        }
    } catch (Throwable e) {
        logger.error("Failed to get protocol output", e);
        return new ProtocolOutput(null, new ProtocolStatus(e));
    }
}
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) Text(org.apache.hadoop.io.Text) URL(java.net.URL) IntWritable(org.apache.hadoop.io.IntWritable) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Example 8 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class File method getProtocolOutput.

/**
 * Creates a {@link FileResponse} object corresponding to the url and returns a
 * {@link ProtocolOutput} object as per the content received.
 *
 * <p>Response codes 200, 304, 401 and 404 are returned immediately with the
 * matching status. Any other 3xx code is treated as a redirect to the
 * response's {@code Location} header: when {@code symlinksAsRedirects} is set
 * the redirect is returned to the caller as {@code MOVED}; otherwise it is
 * followed here until {@code MAX_REDIRECTS} is reached. All remaining codes
 * are turned into a {@link FileError}.
 *
 * @param url
 *          Text containing the url
 * @param datum
 *          The CrawlDatum object corresponding to the url
 *
 * @return {@link ProtocolOutput} object for the content of the file indicated
 *         by url; on any exception, an output with {@code null} content and a
 *         status built from that exception
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    String urlString = url.toString();
    try {
        URL u = new URL(urlString);
        int redirects = 0;
        while (true) {
            // make a request
            FileResponse response = new FileResponse(u, datum, this, getConf());
            int code = response.getCode();
            if (code == 200) {
                // got a good response: return it
                return new ProtocolOutput(response.toContent());
            } else if (code == 304) {
                // got not modified
                return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED);
            } else if (code == 401) {
                // access denied / no read permissions
                return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED));
            } else if (code == 404) {
                // no such file
                return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);
            } else if (code >= 300 && code < 400) {
                // handle redirect
                u = new URL(response.getHeader("Location"));
                if (LOG.isTraceEnabled()) {
                    LOG.trace("redirect to " + u);
                }
                if (symlinksAsRedirects) {
                    // leave following the redirect to the caller
                    return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.MOVED, u));
                } else if (redirects == MAX_REDIRECTS) {
                    LOG.trace("Too many redirects: {}", url);
                    return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED, u));
                }
                redirects++;
            } else {
                // convert to exception
                throw new FileError(code);
            }
        }
    } catch (Exception e) {
        // report through the logger (with stack trace) instead of stderr
        LOG.error("Could not get protocol output for {}", url, e);
        return new ProtocolOutput(null, new ProtocolStatus(e));
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) URL(java.net.URL) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Example 9 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class TestProtocolFile method setContentType.

/**
 * Fetches a sample file through the protocol resolved for its
 * <code>file:</code> url and verifies that the fetch succeeded and that the
 * expected mime type is reported both as the content type and in the
 * <code>Response.CONTENT_TYPE</code> metadata field.
 *
 * @param testTextFile
 *          name of the file, appended to the sample directory to form the url
 * @throws ProtocolException
 *           declared for the protocol lookup and fetch calls
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    Assert.assertNotNull(urlString);

    // resolve the protocol implementation for the url and fetch through it
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), datum);
    Assert.assertNotNull(output);

    // the fetch must have succeeded
    ProtocolStatus status = output.getStatus();
    int actualCode = status.getCode();
    String failureMessage = "Status code: [" + actualCode + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: [" + status.getArgs() + "]";
    Assert.assertEquals(failureMessage, ProtocolStatus.SUCCESS, actualCode);

    // content type reported on the content itself
    Assert.assertNotNull(output.getContent());
    String reportedType = output.getContent().getContentType();
    Assert.assertNotNull(reportedType);
    Assert.assertEquals(expectedMimeType, reportedType);

    // content type recorded in the metadata
    Assert.assertNotNull(output.getContent().getMetadata());
    Assert.assertEquals(expectedMimeType, output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Example 10 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class Ftp method getProtocolOutput.

/**
 * Builds an {@link FtpResponse} for the url and converts it into a
 * {@link ProtocolOutput}, following redirects (3xx codes) up to
 * {@code MAX_REDIRECTS} times. The response code of every attempt is stored
 * in the datum's metadata under {@code Nutch.PROTOCOL_STATUS_CODE_KEY}.
 *
 * @param url
 *          Text containing the ftp url
 * @param datum
 *          The CrawlDatum object corresponding to the url
 *
 * @return {@link ProtocolOutput} with the content on code 200; otherwise an
 *         output with {@code null} content and a status built from the
 *         exception that ended the fetch
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    final String urlString = url.toString();
    try {
        URL target = new URL(urlString);
        for (int hops = 0; ; hops++) {
            // make a request
            FtpResponse response = new FtpResponse(target, datum, this, getConf());
            int code = response.getCode();
            datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

            if (code == 200) {
                // got a good response: return it
                return new ProtocolOutput(response.toContent());
            }
            if (code < 300 || code >= 400) {
                // neither success nor redirect: convert to exception
                throw new FtpError(code);
            }

            // redirect handling from here on
            if (hops == MAX_REDIRECTS) {
                throw new FtpException("Too many redirects: " + url);
            }
            String loc = response.getHeader("Location");
            try {
                target = new URL(target, loc);
            } catch (MalformedURLException mue) {
                LOG.error("Could not create redirectURL for {} with {}", url, loc);
                return new ProtocolOutput(null, new ProtocolStatus(mue));
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("redirect to " + target);
            }
        }
    } catch (Exception e) {
        LOG.error("Could not get protocol output for {}: {}", url, e.getMessage());
        return new ProtocolOutput(null, new ProtocolStatus(e));
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) MalformedURLException(java.net.MalformedURLException) Text(org.apache.hadoop.io.Text) URL(java.net.URL) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus) IOException(java.io.IOException)

Aggregations

ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)12 Text (org.apache.hadoop.io.Text)11 Content (org.apache.nutch.protocol.Content)7 URL (java.net.URL)6 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)6 ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)5 Protocol (org.apache.nutch.protocol.Protocol)4 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)3 BaseRobotRules (crawlercommons.robots.BaseRobotRules)2 IOException (java.io.IOException)2 MalformedURLException (java.net.MalformedURLException)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 Response (org.apache.nutch.net.protocols.Response)2 ScoringFilters (org.apache.nutch.scoring.ScoringFilters)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 IntWritable (org.apache.hadoop.io.IntWritable)1 Inlinks (org.apache.nutch.crawl.Inlinks)1 URLFilterException (org.apache.nutch.net.URLFilterException)1 Parse (org.apache.nutch.parse.Parse)1