Search in sources :

Example 16 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class FileDumper method dump.

/**
 * Dumps the raw content stored in the segments found under
 * {@code segmentRootDir} into {@code outputDir}.
 *
 * <p>Every readable sub-directory of {@code segmentRootDir} is treated as a
 * segment. For each segment, every readable part directory below its content
 * directory is read as a sequence file ({@code <part>/data}); the MIME type of
 * each record is detected with Tika and, if accepted, the record's bytes are
 * written to a file. After a segment has been processed, a
 * {@code <segmentName>_filenameToUrl.json} file mapping each output path to
 * its source URL is written to {@code outputDir}.
 *
 * @param outputDir
 *          the directory the raw content and the per-segment mapping files
 *          are written to.
 * @param segmentRootDir
 *          a directory containing one or more segments.
 * @param mimeTypes
 *          an array of mime types we have to dump, all others will be
 *          filtered out; {@code null} accepts every detected mime type.
 * @param flatDir
 *          a boolean flag specifying whether the output directory should contain
 *          only files instead of using nested directories to prevent naming
 *          conflicts.
 * @param mimeTypeStats
 *          a flag indicating whether mimetype stats should be displayed
 *          instead of dumping files.
 * @param reverseURLDump whether to reverse the URLs when they are written to disk
 * @throws Exception if there is a fatal error dumping files to disk
 */
public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
    if (mimeTypes == null) {
        LOG.info("Accepting all mimetypes.");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counts
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration conf = NutchConfiguration.create();
    // One detector for the whole run instead of a fresh instance per record.
    Tika tika = new Tika();
    File[] segmentDirs = segmentRootDir.listFiles(file -> file.canRead() && file.isDirectory());
    if (segmentDirs == null) {
        LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
        return;
    }
    for (File segment : segmentDirs) {
        LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
        Map<String, String> filenameToUrl = new HashMap<>();
        File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
        File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());
        if (partDirs == null) {
            LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
            continue;
        }
        for (File partDir : partDirs) {
            // NOTE(review): fs is never used below; the open/close pair is kept only so
            // that FileSystem handling stays exactly as before - confirm it is needed.
            try (FileSystem fs = FileSystem.get(conf)) {
                String segmentPath = partDir + "/data";
                Path file = new Path(segmentPath);
                if (!new File(file.toString()).exists()) {
                    LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present");
                    continue;
                }
                // try-with-resources so the reader is closed even when a record fails.
                try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
                    Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
                    while (reader.next(key)) {
                        Content content = new Content();
                        reader.getCurrentValue(content);
                        String url = key.toString();
                        String baseName = FilenameUtils.getBaseName(url);
                        String extension = FilenameUtils.getExtension(url);
                        if (extension == null || extension.isEmpty()) {
                            extension = "html";
                        }
                        boolean accepted = false;
                        try {
                            String mimeType = tika.detect(content.getContent());
                            collectStats(typeCounts, mimeType);
                            if (mimeType != null && (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType))) {
                                collectStats(filteredCounts, mimeType);
                                accepted = true;
                            }
                        } catch (Exception e) {
                            LOG.warn("Tika is unable to detect type for: [" + url + "]", e);
                        }
                        // Nothing is written for rejected records or in stats-only mode.
                        if (!accepted || mimeTypeStats) {
                            continue;
                        }
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = outputDir.getAbsolutePath();
                        if (!flatDir && !reverseURLDump) {
                            fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
                        }
                        if (Strings.isNullOrEmpty(fullDir)) {
                            continue;
                        }
                        String outputFullPath;
                        if (reverseURLDump) {
                            String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                            reversedURL[0] = reversedURL[0].replace('.', '/');
                            String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
                            outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
                            // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
                            String[] splitPath = outputFullPath.split("/");
                            File fullOutputDir = new File(org.apache.commons.lang3.StringUtils.join(Arrays.copyOf(splitPath, splitPath.length - 1), "/"));
                            // Fail only when the directory is missing AND cannot be created
                            // (previously a stray ';' made the throw unconditional).
                            if (!fullOutputDir.exists() && !fullOutputDir.mkdirs()) {
                                throw new Exception("Unable to create: [" + fullOutputDir.getAbsolutePath() + "]");
                            }
                        } else {
                            outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                        }
                        filenameToUrl.put(outputFullPath, url);
                        File outputFile = new File(outputFullPath);
                        if (outputFile.exists()) {
                            LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            continue;
                        }
                        LOG.info("Writing: [" + outputFullPath + "]");
                        // A failed write is logged and the dump carries on with the next record.
                        try (FileOutputStream output = new FileOutputStream(outputFile)) {
                            IOUtils.write(content.getContent(), output);
                        } catch (Exception e) {
                            LOG.warn("Write Error: [" + outputFullPath + "]", e);
                        }
                    }
                }
            }
        }
        // save filenameToUrl in a json file for each segment there is one mapping file
        String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(), segment.getName());
        new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
    }
    String stats = "Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts);
    LOG.info(stats);
    if (mimeTypeStats) {
        System.out.println(stats);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) HashMap(java.util.HashMap) DataOutputStream(java.io.DataOutputStream) Writable(org.apache.hadoop.io.Writable) Tika(org.apache.tika.Tika) SequenceFile(org.apache.hadoop.io.SequenceFile) ByteArrayInputStream(java.io.ByteArrayInputStream) Content(org.apache.nutch.protocol.Content) FileSystem(org.apache.hadoop.fs.FileSystem) FileOutputStream(java.io.FileOutputStream) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File) ObjectMapper(org.codehaus.jackson.map.ObjectMapper)

Example 17 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestRobotsMetaProcessor method testRobotsMetaProcessor.

/**
 * Parses each entry of {@code tests} with the Tika parser and checks the
 * robots meta tags (noindex, nofollow, nocache, base href and, for the
 * meta-refresh case, the refresh URL) extracted from the resulting DOM
 * against the expected values in {@code answers} and
 * {@code currURLsAndAnswers}.
 */
@Test
public void testRobotsMetaProcessor() {
    Configuration conf = NutchConfiguration.create();
    TikaParser parser = new TikaParser();
    parser.setConf(conf);
    try {
        // One row per test document: { URL passed to the meta processor, expected base href }.
        currURLsAndAnswers = new URL[][] {
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org"), null },
            { new URL("http://www.nutch.org/foo/"), new URL("http://www.nutch.org/") },
            { new URL("http://www.nutch.org"), new URL("http://www.nutch.org/base/") },
            { new URL("http://www.nutch.org"), null } };
    } catch (Exception e) {
        // Fail with the cause in the message instead of a bare assertTrue(false).
        Assert.fail("couldn't make test URLs! " + e);
    }
    for (int i = 0; i < tests.length; i++) {
        byte[] bytes = tests[i].getBytes();
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        String url = "http://www.nutch.org";
        Content content = new Content(url, url, bytes, "text/html", new Metadata(), conf);
        Parse parse = null;
        try {
            parse = parser.getParse(content, doc, root).get(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        HTMLMetaTags robotsMeta = new HTMLMetaTags();
        HTMLMetaProcessor.getMetaTags(robotsMeta, root, currURLsAndAnswers[i][0]);
        Assert.assertEquals("got noindex wrong on test " + i, answers[i][0], robotsMeta.getNoIndex());
        Assert.assertEquals("got nofollow wrong on test " + i, answers[i][1], robotsMeta.getNoFollow());
        Assert.assertEquals("got nocache wrong on test " + i, answers[i][2], robotsMeta.getNoCache());
        Assert.assertTrue("got base href wrong on test " + i + " (got " + robotsMeta.getBaseHref() + ")", ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) || ((robotsMeta.getBaseHref() != null) && robotsMeta.getBaseHref().equals(currURLsAndAnswers[i][1])));
        if (tests[i].contains("meta-refresh redirect")) {
            // test for NUTCH-2589
            URL metaRefreshUrl = robotsMeta.getRefreshHref();
            Assert.assertNotNull("failed to get meta-refresh redirect", metaRefreshUrl);
            Assert.assertEquals("failed to get meta-refresh redirect", "http://example.com/", metaRefreshUrl.toString());
            // parse stays null when getParse threw above; report that explicitly
            // rather than failing with a NullPointerException on the next line.
            Assert.assertNotNull("no parse available for test " + i, parse);
            Assert.assertEquals("failed to add meta-refresh redirect to parse status", "http://example.com/", parse.getData().getStatus().getArgs()[0]);
        }
    }
}
Also used : HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) URL(java.net.URL) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) Content(org.apache.nutch.protocol.Content) DocumentFragment(org.w3c.dom.DocumentFragment) Test(org.junit.Test)

Example 18 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TikaParserTest method getTextContent.

/**
 * Fetches a sample file through its {@code file:} URL and returns the text
 * produced by parsing it with the {@code parse-tika} extension.
 *
 * @param fileName name of the file, resolved against {@code sampleDir}
 * @return the text of the parse registered under the content's URL
 * @throws ProtocolException propagated from the protocol layer
 * @throws ParseException propagated from the parser
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
    final String fileUrl = "file:" + sampleDir + fileSeparator + fileName;

    // Fetch the raw content via whichever protocol plugin handles the URL.
    final ProtocolFactory factory = new ProtocolFactory(conf);
    final Protocol fetcher = factory.getProtocol(fileUrl);
    final Content fetched = fetcher.getProtocolOutput(new Text(fileUrl), new CrawlDatum()).getContent();

    // Parse explicitly with the Tika plugin and pick the parse for this URL.
    final ParseUtil parseUtil = new ParseUtil(conf);
    final Parse parsed = parseUtil.parseByExtensionId("parse-tika", fetched).get(fetched.getUrl());
    return parsed.getText();
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Example 19 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class SmallStack method main.

/**
 * Parses a single SWF file and prints its parse text and parse data to
 * standard output.
 *
 * @param args arguments are: 0. Name of input SWF file.
 * @throws IOException if there is a fatal error processing the input
 * file
 */
public static void main(String[] args) throws IOException {
    // Read the whole file in one call. The previous code sized the buffer with
    // InputStream.available() and issued a single read(), neither of which is
    // guaranteed to cover the full file, and leaked the stream on failure.
    byte[] buf = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(args[0]));
    String url = "file:" + args[0];
    SWFParser parser = new SWFParser();
    ParseResult parseResult = parser.getParse(new Content(url, url, buf, "application/x-shockwave-flash", new Metadata(), NutchConfiguration.create()));
    Parse p = parseResult.get(url);
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
}
Also used : ParseResult(org.apache.nutch.parse.ParseResult) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) FileInputStream(java.io.FileInputStream)

Example 20 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class ZipTextExtractor method extractText.

/**
 * Extracts text from every file entry of a zip archive.
 *
 * <p>Each non-directory entry whose name contains a '.' is wrapped in a
 * {@link Content} (URL {@code url + "/" + entryName}, content type detected
 * by Tika from the entry name), parsed, and its text appended to the result
 * as {@code "<entryName> <text> "}. Outlinks found in each parsed entry are
 * added to {@code outLinksList}. Entries that fail to parse are logged and
 * skipped.
 *
 * @param input stream positioned at the start of the zip data; it is not
 *          closed by this method
 * @param url URL of the archive, used as the prefix of each entry's URL
 * @param outLinksList list that receives the outlinks of all parsed entries
 * @return the concatenated text of all successfully parsed entries
 * @throws IOException if the archive cannot be read or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    StringBuilder resultText = new StringBuilder();
    ZipInputStream zin = new ZipInputStream(input);
    // One detector for all entries instead of a new instance per entry.
    Tika tika = new Tika();
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (entry.isDirectory()) {
            continue;
        }
        // Read until end-of-entry rather than trusting entry.getSize(), which
        // is -1 when the size is not known.
        byte[] b = readEntryBytes(zin);
        String fname = entry.getName();
        String newurl = url + "/" + fname;
        URL aURL = new URL(newurl);
        String base = aURL.toString();
        if (fname.lastIndexOf('.') == -1) {
            // No extension: entry is skipped, as before.
            continue;
        }
        // Trying to resolve the Mime-Type
        String contentType = tika.detect(fname);
        try {
            Metadata metadata = new Metadata();
            // Use the number of bytes actually read so the header is never "-1".
            metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            for (Outlink outlink : theParseData.getOutlinks()) {
                outLinksList.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
            }
            resultText.append(entry.getName()).append(" ").append(parse.getText()).append(" ");
        } catch (ParseException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
            }
        }
    }
    return resultText.toString();
}

/** Reads the remaining bytes of the current zip entry into a byte array. */
private static byte[] readEntryBytes(ZipInputStream zin) throws IOException {
    java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
    byte[] chunk = new byte[8192];
    int n;
    while ((n = zin.read(chunk, 0, chunk.length)) != -1) {
        out.write(chunk, 0, n);
    }
    return out.toByteArray();
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) ZipEntry(java.util.zip.ZipEntry) Metadata(org.apache.nutch.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL) ZipInputStream(java.util.zip.ZipInputStream) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) ParseException(org.apache.nutch.parse.ParseException)

Aggregations

Content (org.apache.nutch.protocol.Content)51 Text (org.apache.hadoop.io.Text)30 Parse (org.apache.nutch.parse.Parse)29 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)27 Configuration (org.apache.hadoop.conf.Configuration)23 Metadata (org.apache.nutch.metadata.Metadata)23 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)22 ParseUtil (org.apache.nutch.parse.ParseUtil)20 Test (org.junit.Test)19 Protocol (org.apache.nutch.protocol.Protocol)17 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)16 ParseData (org.apache.nutch.parse.ParseData)8 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)8 ParseResult (org.apache.nutch.parse.ParseResult)7 URL (java.net.URL)6 File (java.io.File)5 FileInputStream (java.io.FileInputStream)5 IOException (java.io.IOException)5 Outlink (org.apache.nutch.parse.Outlink)5 HashMap (java.util.HashMap)4