use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class TestMoreIndexingFilter method testNoParts.
/** @since NUTCH-901 */
// Runs MoreIndexingFilter over a minimal parse with the
// moreIndexingFilter.indexMimeTypeParts option switched off and expects the
// document to end up with exactly one type value, text/html.
// NOTE(review): this excerpt is truncated -- the catch body and the closing
// braces of the try/catch and of the method are not visible here.
public void testNoParts() {
Configuration conf = NutchConfiguration.create();
// Disable the indexMimeTypeParts option for this test.
conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
MoreIndexingFilter filter = new MoreIndexingFilter();
// NOTE(review): conf is never handed to the filter in the visible lines;
// presumably a setConf call was lost in extraction -- confirm.
NutchDocument doc = new NutchDocument();
// Parse with some text and empty ParseData, i.e. no metadata is supplied.
ParseImpl parse = new ParseImpl("foo bar", new ParseData());
try {
// Empty URL, default CrawlDatum and empty Inlinks: only the parse is meaningful input.
filter.filter(doc, parse, new Text(""), new CrawlDatum(), new Inlinks());
} catch (Exception e) {
// NOTE(review): as written the assertions below sit directly after the catch
// opener; presumably they belong after the try/catch -- confirm against the
// complete source.
// Exactly one value is expected in the type field ...
Assert.assertEquals(1, doc.getField("type").getValues().size());
// ... and that value must be text/html.
Assert.assertEquals("text/html", doc.getFieldValue("type"));
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class CommonCrawlDataDumper method dump.
/**
 * Dumps the reverse engineered CBOR content from the provided segment
 * directories if a parent directory contains more than one segment,
 * otherwise a single segment can be passed as an argument. If the
 * {@code gzip} argument is set then the CBOR is also zipped.
 *
 * @param outputDir the directory you wish to dump the raw content to. This
 * directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb Path to linkdb; may be {@code null}, in which case no inlinks are looked up.
 * @param gzip a boolean flag indicating whether the CBOR content should also
 * be gzipped.
 * @param mimeTypes the mime types to restrict the dump to; if {@code null}, all file types are encoded.
 * @param epochFilename if {@code true}, output files will be named using the epoch time (in milliseconds).
 * @param extension a file extension to use with output documents.
 * @param warc if {@code true}, the WARC format is requested instead of the Jackson format and the CBOR file output is skipped.
 * @throws Exception if any exception occurs.
 */
// Walks segmentRootDir for segment content part files, reads each one as a
// SequenceFile and, per record, builds a JSON representation (WARC or JACKSON
// format) that is serialized as CBOR and written to a file or added to an
// archive.
// NOTE(review): this excerpt is damaged by extraction -- the logging call
// prefixes in front of several string literals, a few expressions and most
// closing braces are missing. The comments below describe only what the
// visible tokens show.
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
// NOTE(review): the call that consumed this message (presumably a log
// statement) is missing from the excerpt.
if (gzip) {"Gzipping CBOR data has been skipped");
// total file counts
Map<String, Integer> typeCounts = new HashMap<>();
// filtered file counters
Map<String, Integer> filteredCounts = new HashMap<>();
// Resolve the file system that holds the segment root using the Nutch configuration.
Configuration nutchConfig = NutchConfiguration.create();
Path segmentRootPath = new Path(segmentRootDir.toString());
FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
// get all paths
List<Path> parts = new ArrayList<>();
// Recursive listing (second argument true) of everything below the segment root.
RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
// Regex for a data file inside a part-NNNNN directory (five digits) that sits
// under a Content.DIR_NAME directory, built with the platform file separator.
String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
while (files.hasNext()) {
// NOTE(review): the right-hand side of this assignment is missing from the excerpt.
LocatedFileStatus next =;
if (next.isFile()) {
Path path = next.getPath();
// NOTE(review): the body of this match (presumably adding path to parts) is
// not visible here.
if (path.toString().matches(partPattern)) {
// The link database is optional: a reader is only opened when linkdb is given.
LinkDbReader linkDbReader = null;
if (linkdb != null) {
linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
// NOTE(review): parts is assigned a new ArrayList above, so the null half of
// this check can never be true in the visible code.
if (parts == null || parts.size() == 0) {
LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
}"Found {} segment parts", parts.size());
// fileList is only initialised for gzip output without WARC; it is declared
// outside the visible lines.
if (gzip && !warc) {
fileList = new ArrayList<>();
for (Path segmentPart : parts) {"Processing segment Part : [ {} ]", segmentPart);
try {
// NOTE(review): no close of this reader is visible in the excerpt -- confirm
// it is released in the complete source.
SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(segmentPart));
// The key object is created reflectively from the key class recorded in the file.
Writable key = (Writable) reader.getKeyClass().newInstance();
Content content = null;
// NOTE(review): the loop condition is missing from the excerpt.
while ( {
content = new Content();
Metadata metadata = content.getMetadata();
// The record key is used as the URL of the fetched document.
String url = key.toString();
String baseName = FilenameUtils.getBaseName(url);
String extensionName = FilenameUtils.getExtension(url);
// A non-empty caller-supplied extension wins over the one taken from the URL;
// when neither is available the extension falls back to html.
if (!extension.isEmpty()) {
extensionName = extension;
} else if ((extensionName == null) || extensionName.isEmpty()) {
extensionName = "html";
String outputFullPath = null;
String outputRelativePath = null;
String filename = null;
String timestamp = null;
String reverseKey = null;
// Timestamp and reversed-URL key are only computed for epoch file names or
// when the configuration asks for a reverse key.
if (epochFilename || config.getReverseKey()) {
try {
// Parse the Date metadata value (after getDate) into epoch milliseconds.
long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(getDate(metadata.get("Date"))).getTime();
timestamp = String.valueOf(epoch);
// NOTE(review): the catch body is not visible; if it leaves timestamp null,
// the key built below would end in the text null -- confirm.
} catch (ParseException pe) {
reverseKey = reverseUrl(url);
// Key layout: reversed URL with slashes turned into underscores, then the
// SHA-1 hex digest of the URL, then the timestamp, joined by underscores.
config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
// Output locations are only computed when WARC output is not requested.
if (!warc) {
if (epochFilename) {
outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
// NOTE(review): the end index is one less than the position of the last
// separator, so the character just before that separator is dropped as
// well -- confirm this is intended.
outputRelativePath = outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
} else {
// Otherwise the location is derived from the MD5 of the URL via a two-level
// directory helper.
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
outputFullPath = String.format("%s/%s", fullDir, filename);
// The last two components of fullDir are taken as the two directory levels.
String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
// NOTE(review): the two level names are joined without a separator between
// them -- confirm this is intended.
outputRelativePath = firstLevelDirName + secondLevelDirName;
// Encode all filetypes if no mimetypes have been given
Boolean filter = (mimeTypes == null);
String jsonData = "";
try {
// Mime type is detected from the content bytes with Tika.
String mimeType = new Tika().detect(content.getContent());
// Maps file to JSON-based structure
// there may be duplicates, so using set
Set<String> inUrls = null;
// Inlinks are only looked up when a link database reader was opened.
if (linkDbReader != null) {
Inlinks inlinks = linkDbReader.getInlinks((Text) key);
if (inlinks != null) {
Iterator<Inlink> iterator = inlinks.iterator();
inUrls = new LinkedHashSet<>();
// NOTE(review): the loop body is not visible; if each iteration adds one URL,
// the <= bound admits MAX_INLINKS + 1 entries -- confirm.
while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
// TODO: Make this Jackson Format implementation reusable
// The format is opened in try-with-resources; WARC is requested when warc is
// true, JACKSON otherwise.
try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
if (inUrls != null) {
// The format receives a list copy of the collected inlink URLs.
format.setInLinks(new ArrayList<>(inUrls));
jsonData = format.getJsonData(url, content, metadata);
// Every record counts towards the per-type totals.
collectStats(typeCounts, mimeType);
// collects statistics for the given mimetypes
if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
collectStats(filteredCounts, mimeType);
filter = true;
} catch (IOException ioe) {
LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
// CBOR output happens only without WARC and only for records that passed the
// mime type filter.
if (!warc) {
if (filter) {
byte[] byteData = serializeCBORData(jsonData);
if (!gzip) {
// Plain file output: an existing file at the target path is not overwritten.
File outputFile = new File(outputFullPath);
if (outputFile.exists()) {"Skipping writing: [" + outputFullPath + "]: file already exists");
} else {"Writing: [" + outputFullPath + "]");
// NOTE(review): the FileOutputStream is created inline and no close is
// visible in this excerpt -- confirm it is released.
IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
} else {
// Archive output: fileList remembers paths already added so each path is
// handled once.
if (fileList.contains(outputFullPath)) {"Skipping compressing: [" + outputFullPath + "]: file already exists");
} else {
fileList.add(outputFullPath);"Compressing: [" + outputFullPath + "]");
// TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
TarArchiveEntry tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
// A failure is logged as a warning together with the segment part being
// processed; no rethrow is visible in this excerpt.
} catch (Exception e) {
LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
} finally {
// NOTE(review): the body of this cleanup branch is not visible in the excerpt.
if (gzip && !warc) {
// File type statistics are only reported when at least one type was counted.
if (!typeCounts.isEmpty()) {"CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class MimeTypeIndexingFilterTest method testAllowOnlyImages.
// With the allow-images.txt rule file configured, expects the filter to keep
// documents whose mime type contains image and to drop all others.
// NOTE(review): conf, filter, parses and MIME_TYPES are declared outside this
// excerpt, and the closing braces of the method are not visible.
public void testAllowOnlyImages() throws Exception {
// NOTE(review): no call passing conf to the filter is visible after this
// line; presumably it was lost in extraction -- confirm.
conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
// parses[i] is checked against MIME_TYPES[i], so the two arrays are used as
// parallel arrays here.
for (int i = 0; i < parses.length; i++) {
NutchDocument doc = filter.filter(new NutchDocument(), parses[i], new Text(""), new CrawlDatum(), new Inlinks());
if (MIME_TYPES[i].contains("image")) {
// A non-null result means the document passed the filter.
Assert.assertNotNull("Allow only images", doc);
} else {
// A null result means the document was rejected.
Assert.assertNull("Block everything else", doc);
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class MimeTypeIndexingFilterTest method testBlockHTML.
// With the block-html.txt rule file configured, expects the filter to drop
// documents whose mime type contains html and to keep all others.
// NOTE(review): conf, filter, parses and MIME_TYPES are declared outside this
// excerpt, and the closing braces of the method are not visible.
public void testBlockHTML() throws Exception {
// NOTE(review): no call passing conf to the filter is visible after this
// line; presumably it was lost in extraction -- confirm.
conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
// parses[i] is checked against MIME_TYPES[i], so the two arrays are used as
// parallel arrays here.
for (int i = 0; i < parses.length; i++) {
NutchDocument doc = filter.filter(new NutchDocument(), parses[i], new Text(""), new CrawlDatum(), new Inlinks());
if (MIME_TYPES[i].contains("html")) {
// A null result means the document was rejected.
Assert.assertNull("Block only HTML documents", doc);
} else {
// A non-null result means the document passed the filter.
Assert.assertNotNull("Allow everything else", doc);
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class MimeTypeIndexingFilterTest method testMissingConfigFile.
// Without a rule file configured, expects every document to pass the filter.
// NOTE(review): conf, filter and parses are declared outside this excerpt,
// and the closing braces of the method are not visible.
public void testMissingConfigFile() throws Exception {
// Read the rule file property with an empty default so that an unset
// property yields the empty string.
String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
// Precondition of the test: the property must not be set.
Assert.assertEquals(String.format("Property %s must not be present in the the configuration file", MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
// property not set so in this cases all documents must pass the filter
for (int i = 0; i < parses.length; i++) {
NutchDocument doc = filter.filter(new NutchDocument(), parses[i], new Text(""), new CrawlDatum(), new Inlinks());
// A non-null result means the document passed the filter.
Assert.assertNotNull("All documents must be allowed by default", doc);