Example 11 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class FetcherThread method run.

@SuppressWarnings("fallthrough")
public void run() {
    // count threads
    activeThreads.incrementAndGet();
    FetchItem fit = null;
    try {
        // checking for the server to be running and fetcher.parse to be true
        if (parsing && NutchServer.getInstance().isRunning())
            reportToNutchServer = true;
        while (true) {
            // creating FetchNode for storing in FetchNodeDb
            if (reportToNutchServer)
                this.fetchNode = new FetchNode();
            else
                this.fetchNode = null;
            // check whether must be stopped
            if (isHalted()) {
                LOG.debug("{} set to halted", getName());
                fit = null;
                return;
            }
            fit = fetchQueues.getFetchItem();
            if (fit == null) {
                if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
                    LOG.debug("{} spin-waiting ...", getName());
                    // spin-wait.
                    spinWaiting.incrementAndGet();
                    try {
                        Thread.sleep(500);
                    } catch (InterruptedException e) {
                        // interrupted while spin-waiting: loop around and re-check the queue
                    }
                    spinWaiting.decrementAndGet();
                    continue;
                } else {
                    // all done, finish this thread
                    LOG.info("{} {} has no more work available", getName(), Thread.currentThread().getId());
                    return;
                }
            }
            lastRequestStart.set(System.currentTimeMillis());
            Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (reprUrlWritable == null) {
                setReprUrl(fit.url.toString());
            } else {
                setReprUrl(reprUrlWritable.toString());
            }
            try {
                // fetch the page
                redirecting = false;
                redirectCount = 0;
                // Publisher event
                if (activatePublisher) {
                    FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
                    publisher.publish(startEvent, conf);
                }
                do {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("{} {} fetching {} (queue crawl delay={}ms)", getName(), Thread.currentThread().getId(), fit.url, fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay);
                    }
                    LOG.debug("redirectCount={}", redirectCount);
                    redirecting = false;
                    Protocol protocol = this.protocolFactory.getProtocol(fit.u);
                    BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
                    if (robotsTxtContent != null) {
                        outputRobotsTxt(robotsTxtContent);
                        robotsTxtContent.clear();
                    }
                    if (rules.isDeferVisits()) {
                        LOG.info("Defer visits for queue {} : {}", fit.queueID, fit.url);
                        // retry the fetch item
                        if (fetchQueues.timelimitExceeded()) {
                            fetchQueues.finishFetchItem(fit, true);
                        } else {
                            fetchQueues.addFetchItem(fit);
                        }
                        // but check whether it's time to cancel the queue
                        int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID(), this.robotsDeferVisitsRetries + 1, this.robotsDeferVisitsDelay);
                        if (killedURLs != 0) {
                            context.getCounter("FetcherStatus", "robots_defer_visits_dropped").increment(killedURLs);
                        }
                        continue;
                    }
                    if (!rules.isAllowed(fit.url.toString())) {
                        // unblock
                        fetchQueues.finishFetchItem(fit, true);
                        LOG.info("Denied by robots.txt: {}", fit.url);
                        output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                        context.getCounter("FetcherStatus", "robots_denied").increment(1);
                        continue;
                    }
                    if (rules.getCrawlDelay() > 0) {
                        if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                            // unblock
                            fetchQueues.finishFetchItem(fit, true);
                            LOG.info("Crawl-Delay for {} too long ({} ms), skipping", fit.url, rules.getCrawlDelay());
                            output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                            context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
                            continue;
                        } else {
                            FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                            long crawlDelay = rules.getCrawlDelay();
                            if (crawlDelay < minCrawlDelay) {
                                LOG.info("Crawl-Delay for {} too short ({} ms), adjusting to {} ms", fit.url, rules.getCrawlDelay(), minCrawlDelay);
                                crawlDelay = minCrawlDelay;
                            }
                            fiq.crawlDelay = crawlDelay;
                            LOG.debug("Crawl delay for queue: {} is set to {} as per robots.txt. url: ", fit.queueID, fiq.crawlDelay, fit.url);
                        }
                    }
                    ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
                    ProtocolStatus status = output.getStatus();
                    Content content = output.getContent();
                    ParseStatus pstatus = null;
                    // unblock queue
                    fetchQueues.finishFetchItem(fit);
                    // used for FetchNode
                    if (fetchNode != null) {
                        fetchNode.setStatus(status.getCode());
                        fetchNode.setFetchTime(System.currentTimeMillis());
                        fetchNode.setUrl(fit.url);
                    }
                    // Publish fetch finish event
                    if (activatePublisher) {
                        FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
                        endEvent.addEventData("status", status.getName());
                        publisher.publish(endEvent, conf);
                    }
                    context.getCounter("FetcherStatus", status.getName()).increment(1);
                    switch(status.getCode()) {
                        case ProtocolStatus.WOULDBLOCK:
                            // retry ?
                            fetchQueues.addFetchItem(fit);
                            break;
                        // got a page
                        case ProtocolStatus.SUCCESS:
                            pstatus = output(fit.url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                            updateStatus(content.getContent().length);
                            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                String newUrl = pstatus.getMessage();
                                int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
                                Text redirUrl = handleRedirect(fit, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                if (redirUrl != null) {
                                    fit = queueRedirect(redirUrl, fit);
                                }
                            }
                            break;
                        // redirect
                        case ProtocolStatus.MOVED:
                        case ProtocolStatus.TEMP_MOVED:
                            int code;
                            boolean temp;
                            if (status.getCode() == ProtocolStatus.MOVED) {
                                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                temp = true;
                            }
                            output(fit.url, fit.datum, content, status, code);
                            String newUrl = status.getMessage();
                            Text redirUrl = handleRedirect(fit, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                            if (redirUrl != null) {
                                fit = queueRedirect(redirUrl, fit);
                            } else {
                                // stop redirecting
                                redirecting = false;
                            }
                            break;
                        case ProtocolStatus.EXCEPTION:
                            logError(fit.url, status.getMessage());
                            int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
                            if (killedURLs != 0)
                                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
                        // retry (control falls through here from EXCEPTION by design)
                        case ProtocolStatus.RETRY:
                        case ProtocolStatus.BLOCKED:
                            output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                            break;
                        // gone
                        case ProtocolStatus.GONE:
                        case ProtocolStatus.NOTFOUND:
                        case ProtocolStatus.ACCESS_DENIED:
                        case ProtocolStatus.ROBOTS_DENIED:
                            output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                            break;
                        case ProtocolStatus.NOTMODIFIED:
                            output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                            break;
                        default:
                            if (LOG.isWarnEnabled()) {
                                LOG.warn("{} {} Unknown ProtocolStatus: {}", getName(), Thread.currentThread().getId(), status.getCode());
                            }
                            output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                    }
                    if (redirecting && redirectCount > maxRedirect) {
                        fetchQueues.finishFetchItem(fit);
                        context.getCounter("FetcherStatus", "redirect_count_exceeded").increment(1);
                        if (LOG.isInfoEnabled()) {
                            LOG.info("{} {} - redirect count exceeded {} ({})", getName(), Thread.currentThread().getId(), fit.url, maxRedirectExceededSkip ? "skipped" : "linked");
                        }
                        // skip the redirect target when the redirect count is exceeded,
                        // otherwise record it as a linked URL
                        if (!maxRedirectExceededSkip) {
                            Text newUrl = new Text(status.getMessage());
                            CrawlDatum newDatum = createRedirDatum(newUrl, fit, CrawlDatum.STATUS_LINKED);
                            output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
                        }
                    }
                } while (redirecting && (redirectCount <= maxRedirect));
            } catch (Throwable t) {
                // unexpected exception
                // unblock
                fetchQueues.finishFetchItem(fit);
                String message;
                if (LOG.isDebugEnabled()) {
                    message = StringUtils.stringifyException(t);
                } else if (logUtil.logShort(t)) {
                    message = t.getClass().getName();
                } else {
                    message = StringUtils.stringifyException(t);
                }
                logError(fit.url, message);
                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
            }
        }
    } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:", e);
        }
    } finally {
        if (fit != null) {
            fetchQueues.finishFetchItem(fit);
        }
        // count threads
        activeThreads.decrementAndGet();
        LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(), Thread.currentThread().getId(), getName(), activeThreads);
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) BaseRobotRules(crawlercommons.robots.BaseRobotRules) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
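
For orientation, the protocol interaction at the heart of the loop above reduces to a handful of calls. The following is a minimal sketch, not part of the Fetcher source: the URL and variable names are illustrative, and it assumes the protocol plugin for the URL scheme is on the classpath.

import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.util.NutchConfiguration;

public class FetchContentSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Text url = new Text("https://nutch.apache.org/");
        CrawlDatum datum = new CrawlDatum();
        // resolve the protocol implementation for the URL scheme
        Protocol protocol = new ProtocolFactory(conf).getProtocol(new URL(url.toString()));
        // fetch the page and unwrap status and raw content, as in the loop above
        ProtocolOutput output = protocol.getProtocolOutput(url, datum);
        if (output.getStatus().isSuccess()) {
            Content content = output.getContent();
            System.out.println(content.getContentType() + ": "
                + content.getContent().length + " bytes");
        }
    }
}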

Example 12 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class HttpRobotRulesParser method addRobotsContent.

/**
 * Append {@link Content} of robots.txt to {@literal robotsTxtContent}
 *
 * @param robotsTxtContent
 *          container to store robots.txt response content
 * @param robotsUrl
 *          robots.txt URL
 * @param robotsResponse
 *          response object to be stored
 */
protected void addRobotsContent(List<Content> robotsTxtContent, URL robotsUrl, Response robotsResponse) {
    byte[] robotsBytes = robotsResponse.getContent();
    if (robotsBytes == null)
        robotsBytes = new byte[0];
    Content content = new Content(robotsUrl.toString(), robotsUrl.toString(), robotsBytes, robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), getConf());
    robotsTxtContent.add(content);
}
Also used : Content(org.apache.nutch.protocol.Content)
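
The Content constructor used here takes the page URL, a base URL, the raw bytes, a content type, the response headers as Metadata, and the configuration. Below is a minimal sketch of building such a record directly; the URL, body, and header values are illustrative, not taken from the parser source.

import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class RobotsContentSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        URL robotsUrl = new URL("https://nutch.apache.org/robots.txt");
        byte[] body = "User-agent: *\nDisallow:\n".getBytes(StandardCharsets.UTF_8);
        Metadata headers = new Metadata();
        headers.set("Content-Type", "text/plain");
        // same constructor as in addRobotsContent(): url, base, bytes, type, headers, conf
        Content content = new Content(robotsUrl.toString(), robotsUrl.toString(),
                body, "text/plain", headers, conf);
        System.out.println(content.getUrl() + " -> " + content.getContent().length + " bytes");
    }
}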

Example 13 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestMetadataScoringFilter method passScoreAfterParsing.

@Test
public void passScoreAfterParsing() {
    Configuration conf = NutchConfiguration.create();
    conf.set(MetadataScoringFilter.METADATA_DATUM, "parent,depth");
    conf.set(MetadataScoringFilter.METADATA_CONTENT, "parent,depth");
    MetadataScoringFilter metadataScoringFilter = new MetadataScoringFilter();
    metadataScoringFilter.setConf(conf);
    CrawlDatum crawlDatum = new CrawlDatum();
    Text from = new Text("https://nutch.apache.org/");
    String PARENT = "parent";
    String DEPTH = "depth";
    String parentMD = "https://nutch.apache.org/";
    String depthMD = "1";
    crawlDatum.getMetaData().put(new Text(PARENT), new Text(parentMD));
    crawlDatum.getMetaData().put(new Text(DEPTH), new Text(depthMD));
    Content content = new Content();
    metadataScoringFilter.passScoreBeforeParsing(from, crawlDatum, content);
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, null, null, content.getMetadata());
    Parse parse = new ParseImpl(from.toString(), parseData);
    metadataScoringFilter.passScoreAfterParsing(from, content, parse);
    Assert.assertEquals(parentMD, parse.getData().getMeta(PARENT));
    Assert.assertEquals(depthMD, parse.getData().getMeta(DEPTH));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Test(org.junit.Test)

Example 14 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class SegmentHandler method handle.

@Override
public void handle(Request req, HttpServletResponse res, String target, int dispatch) throws IOException, ServletException {
    try {
        String uri = req.getUri().toString();
        LOG.info("URI: " + uri);
        addMyHeader(res, "URI", uri);
        Text url = new Text(uri);
        CrawlDatum cd = seg.getCrawlDatum(url);
        if (cd != null) {
            addMyHeader(res, "Res", "found");
            LOG.info("-got " + cd.toString());
            ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
            if (ps != null) {
                Integer trCode = protoCodes.get(ps.getCode());
                if (trCode != null) {
                    res.setStatus(trCode.intValue());
                } else {
                    res.setStatus(HttpServletResponse.SC_OK);
                }
                addMyHeader(res, "ProtocolStatus", ps.toString());
            } else {
                res.setStatus(HttpServletResponse.SC_OK);
            }
            Content c = seg.getContent(url);
            if (c == null) {
                // missing content
                req.setHandled(true);
                res.addHeader("X-Handled-By", getClass().getSimpleName());
                return;
            }
            byte[] data = c.getContent();
            LOG.debug("-data len=" + data.length);
            Metadata meta = c.getMetadata();
            String[] names = meta.names();
            LOG.debug("- " + names.length + " meta");
            for (int i = 0; i < names.length; i++) {
                boolean my = true;
                char ch = names[i].charAt(0);
                if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
                    // pretty good chance it's a standard header
                    my = false;
                }
                String[] values = meta.getValues(names[i]);
                for (int k = 0; k < values.length; k++) {
                    if (my) {
                        addMyHeader(res, names[i], values[k]);
                    } else {
                        res.addHeader(names[i], values[k]);
                    }
                }
            }
            req.setHandled(true);
            res.addHeader("X-Handled-By", getClass().getSimpleName());
            res.setContentType(meta.get(Metadata.CONTENT_TYPE));
            res.setContentLength(data.length);
            OutputStream os = res.getOutputStream();
            os.write(data, 0, data.length);
            res.flushBuffer();
        } else {
            addMyHeader(res, "Res", "not found");
            LOG.info(" -not found " + url);
        }
    } catch (Exception e) {
        e.printStackTrace();
        LOG.warn(StringUtils.stringifyException(e));
        addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
    }
}
Also used : OutputStream(java.io.OutputStream) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ServletException(javax.servlet.ServletException) IOException(java.io.IOException) Content(org.apache.nutch.protocol.Content) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
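
Stripped of the header-classification heuristic, the handler's response path is: pick a status from the stored ProtocolStatus, replay the stored metadata as response headers, then stream the raw bytes. A condensed sketch of that path follows; the writeContent helper is hypothetical, not part of Nutch's SegmentHandler API.

import java.io.OutputStream;

import javax.servlet.http.HttpServletResponse;

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;

public class ContentResponseSketch {

    // hypothetical helper: writes a stored Content record as an HTTP response
    static void writeContent(Content c, HttpServletResponse res) throws Exception {
        byte[] data = c.getContent();
        Metadata meta = c.getMetadata();
        // replay every stored metadata entry as a response header
        for (String name : meta.names()) {
            for (String value : meta.getValues(name)) {
                res.addHeader(name, value);
            }
        }
        res.setContentType(meta.get(Metadata.CONTENT_TYPE));
        res.setContentLength(data.length);
        OutputStream os = res.getOutputStream();
        os.write(data, 0, data.length);
        res.flushBuffer();
    }
}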

Example 15 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class CommonCrawlDataDumper method dump.

/**
 * Dumps the reverse-engineered CBOR content from the provided segment
 * directories. If the parent directory contains more than one segment,
 * all of them are processed; otherwise a single segment can be passed as
 * an argument. If the gzip flag is set, the CBOR output is also zipped.
 *
 * @param outputDir      the directory to dump the raw content to. This
 *                       directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         path to the linkdb.
 * @param gzip           a boolean flag indicating whether the CBOR content
 *                       should also be gzipped.
 * @param mimeTypes      a string array of MIME types to filter on; everything
 *                       else is excluded.
 * @param epochFilename  if {@code true}, output files will be named using the
 *                       epoch time (in milliseconds).
 * @param extension      a file extension to use with output documents.
 * @param warc           if {@code true}, write the output in WARC format.
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
    // get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }
    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }
    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(segmentPart));
            Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();
                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);
                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }
                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;
                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }
                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }
                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
                        outputRelativePath = outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);
                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                boolean filter = (mimeTypes == null);
                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure
                    // there may be duplicates, so using set
                    Set<String> inUrls = null;
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    // TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }
                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }
                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);
                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                // TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            fs.close();
        }
    }
    if (gzip && !warc) {
        closeStream();
    }
    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Metadata(org.apache.nutch.metadata.Metadata) LinkDbReader(org.apache.nutch.crawl.LinkDbReader) Writable(org.apache.hadoop.io.Writable) LinkDbReader(org.apache.nutch.crawl.LinkDbReader) Tika(org.apache.tika.Tika) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) Path(org.apache.hadoop.fs.Path) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Inlinks(org.apache.nutch.crawl.Inlinks) IOException(java.io.IOException) Inlink(org.apache.nutch.crawl.Inlink) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) ParseException(java.text.ParseException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ByteArrayInputStream(java.io.ByteArrayInputStream) Content(org.apache.nutch.protocol.Content) FileOutputStream(java.io.FileOutputStream) ParseException(java.text.ParseException) SimpleDateFormat(com.ibm.icu.text.SimpleDateFormat) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
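
The inner read loop above is the standard way to iterate the Content records stored in a segment. Here is a minimal standalone sketch of just that loop, assuming a local segment; the path below is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class SegmentContentReaderSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // illustrative path: <segment>/content/part-00000/data
        Path part = new Path("crawl/segments/20240101000000/content/part-00000/data");
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(part))) {
            Text key = new Text();
            Content content = new Content();
            while (reader.next(key)) {
                reader.getCurrentValue(content);
                System.out.println(key + " -> " + content.getContentType());
            }
        }
    }
}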

Aggregations

Content (org.apache.nutch.protocol.Content): 51
Text (org.apache.hadoop.io.Text): 30
Parse (org.apache.nutch.parse.Parse): 29
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 27
Configuration (org.apache.hadoop.conf.Configuration): 23
Metadata (org.apache.nutch.metadata.Metadata): 23
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 22
ParseUtil (org.apache.nutch.parse.ParseUtil): 20
Test (org.junit.Test): 19
Protocol (org.apache.nutch.protocol.Protocol): 17
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 16
ParseData (org.apache.nutch.parse.ParseData): 8
ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 8
ParseResult (org.apache.nutch.parse.ParseResult): 7
URL (java.net.URL): 6
File (java.io.File): 5
FileInputStream (java.io.FileInputStream): 5
IOException (java.io.IOException): 5
Outlink (org.apache.nutch.parse.Outlink): 5
HashMap (java.util.HashMap): 4