Use of org.apache.nutch.protocol.Content in the Apache Nutch project.
From class FileDumper, method dump.
/**
 * Dumps the reverse engineered raw content from the provided segment
 * directories if a parent directory contains more than one segment, otherwise
 * a single segment can be passed as an argument.
 *
 * @param outputDir
 *          the directory you wish to dump the raw content to. This directory
 *          will be created.
 * @param segmentRootDir
 *          a directory containing one or more segments.
 * @param mimeTypes
 *          an array of mime types we have to dump, all others will be
 *          filtered out.
 * @param flatDir
 *          a boolean flag specifying whether the output directory should contain
 *          only files instead of using nested directories to prevent naming
 *          conflicts.
 * @param mimeTypeStats
 *          a flag indicating whether mimetype stats should be displayed
 *          instead of dumping files.
 * @param reverseURLDump whether to reverse the URLs when they are written to disk
 * @throws Exception if there is a fatal error dumping files to disk
 */
public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
    if (mimeTypes == null)
        LOG.info("Accepting all mimetypes.");
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counts
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration conf = NutchConfiguration.create();
    int fileCount = 0;
    // One Tika instance is enough; the original built a new one per record.
    Tika tika = new Tika();
    File[] segmentDirs = segmentRootDir.listFiles(file -> file.canRead() && file.isDirectory());
    if (segmentDirs == null) {
        LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
        return;
    }
    for (File segment : segmentDirs) {
        LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
        // One filename -> URL mapping file is written per segment (see below).
        Map<String, String> filenameToUrl = new HashMap<>();
        File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
        File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());
        if (partDirs == null) {
            LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
            continue;
        }
        for (File partDir : partDirs) {
            String segmentPath = partDir + "/data";
            Path file = new Path(segmentPath);
            if (!new File(file.toString()).exists()) {
                LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present");
                continue;
            }
            // NOTE(review): the original obtained (and closed!) an unused
            // FileSystem per part directory; closing the shared cached
            // FileSystem can break other users of the same Configuration, so
            // that call has been removed. It also declared a DataOutputStream
            // that was never assigned; that dead code is gone as well.
            //
            // try-with-resources guarantees the reader is closed even when an
            // exception is thrown mid-loop (the original closed it only on the
            // success path).
            try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
                Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
                Content content = null;
                while (reader.next(key)) {
                    content = new Content();
                    reader.getCurrentValue(content);
                    String url = key.toString();
                    String baseName = FilenameUtils.getBaseName(url);
                    String extension = FilenameUtils.getExtension(url);
                    if (extension == null || extension.isEmpty()) {
                        // No extension in the URL: assume an HTML page.
                        extension = "html";
                    }
                    boolean filter = false;
                    try {
                        // Detect from the raw bytes; the unused
                        // ByteArrayInputStream of the original was removed.
                        String mimeType = tika.detect(content.getContent());
                        collectStats(typeCounts, mimeType);
                        if (mimeType != null && (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType))) {
                            collectStats(filteredCounts, mimeType);
                            filter = true;
                        }
                    } catch (Exception e) {
                        // Log the cause instead of printStackTrace().
                        LOG.warn("Tika is unable to detect type for: [" + url + "]", e);
                    }
                    if (filter && !mimeTypeStats) {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = outputDir.getAbsolutePath();
                        if (!flatDir && !reverseURLDump) {
                            fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
                        }
                        if (!Strings.isNullOrEmpty(fullDir)) {
                            String outputFullPath;
                            if (reverseURLDump) {
                                // e.g. "org.nutch.www:http" -> "org/nutch/www/<SHA256>"
                                String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                                reversedURL[0] = reversedURL[0].replace('.', '/');
                                String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
                                outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
                                // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
                                String[] splitPath = outputFullPath.split("/");
                                File fullOutputDir = new File(org.apache.commons.lang3.StringUtils.join(Arrays.copyOf(splitPath, splitPath.length - 1), "/"));
                                // BUG FIX: the original had a stray semicolon after
                                // "if (!fullOutputDir.mkdirs())", which made the
                                // throw unconditional whenever the directory did
                                // not exist yet. Only fail when mkdirs() fails.
                                if (!fullOutputDir.exists() && !fullOutputDir.mkdirs()) {
                                    throw new Exception("Unable to create: [" + fullOutputDir.getAbsolutePath() + "]");
                                }
                            } else {
                                outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                            }
                            filenameToUrl.put(outputFullPath, url);
                            File outputFile = new File(outputFullPath);
                            if (!outputFile.exists()) {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                // Modified to prevent FileNotFoundException (Invalid Argument)
                                // try-with-resources replaces the manual
                                // flush/close dance; close() flushes.
                                try (FileOutputStream output = new FileOutputStream(outputFile)) {
                                    IOUtils.write(content.getContent(), output);
                                } catch (Exception e) {
                                    LOG.warn("Write Error: [" + outputFullPath + "]", e);
                                }
                                fileCount++;
                            } else {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            }
                        }
                    }
                }
            }
        }
        // save filenameToUrl in a json file for each segment there is one mapping file
        String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(), segment.getName());
        new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
    }
    LOG.info("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    if (mimeTypeStats) {
        System.out.println("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
Use of org.apache.nutch.protocol.Content in the Apache Nutch project.
From class TestRobotsMetaProcessor, method testRobotsMetaProcessor.
@Test
public void testRobotsMetaProcessor() {
    Configuration conf = NutchConfiguration.create();
    TikaParser parser = new TikaParser();
    parser.setConf(conf);
    try {
        // Expected (documentUrl, baseHref) pair per test document; a null
        // second element means no <base href> is expected.
        currURLsAndAnswers = new URL[][] { { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org/foo/"), new URL("http://www.nutch.org/") }, { new URL("http://www.nutch.org"), new URL("http://www.nutch.org/base/") }, { new URL("http://www.nutch.org"), null } };
    } catch (Exception e) {
        // Idiomatic replacement for Assert.assertTrue(msg, false).
        Assert.fail("couldn't make test URLs!");
    }
    for (int i = 0; i < tests.length; i++) {
        byte[] bytes = tests[i].getBytes();
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        String url = "http://www.nutch.org";
        Content content = new Content(url, url, bytes, "text/html", new Metadata(), conf);
        Parse parse = null;
        try {
            parse = parser.getParse(content, doc, root).get(url);
        } catch (Exception e) {
            // NOTE(review): parse stays null here; the meta-refresh branch
            // below would then NPE on parse.getData() — confirm intended.
            e.printStackTrace();
        }
        HTMLMetaTags robotsMeta = new HTMLMetaTags();
        HTMLMetaProcessor.getMetaTags(robotsMeta, root, currURLsAndAnswers[i][0]);
        Assert.assertEquals("got noindex wrong on test " + i, answers[i][0], robotsMeta.getNoIndex());
        Assert.assertEquals("got nofollow wrong on test " + i, answers[i][1], robotsMeta.getNoFollow());
        Assert.assertEquals("got nocache wrong on test " + i, answers[i][2], robotsMeta.getNoCache());
        Assert.assertTrue("got base href wrong on test " + i + " (got " + robotsMeta.getBaseHref() + ")", ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) || ((robotsMeta.getBaseHref() != null) && robotsMeta.getBaseHref().equals(currURLsAndAnswers[i][1])));
        if (tests[i].contains("meta-refresh redirect")) {
            // test for NUTCH-2589
            URL metaRefreshUrl = robotsMeta.getRefreshHref();
            Assert.assertNotNull("failed to get meta-refresh redirect", metaRefreshUrl);
            Assert.assertEquals("failed to get meta-refresh redirect", "http://example.com/", metaRefreshUrl.toString());
            Assert.assertEquals("failed to add meta-refresh redirect to parse status", "http://example.com/", parse.getData().getStatus().getArgs()[0]);
        }
    }
}
Use of org.apache.nutch.protocol.Content in the Apache Nutch project.
From class TikaParserTest, method getTextContent.
/**
 * Fetches a sample file through the {@code file:} protocol and parses it
 * with the parse-tika plugin.
 *
 * @param fileName name of the file below {@code sampleDir}
 * @return the plain text extracted by the parser
 * @throws ProtocolException if the file cannot be fetched
 * @throws ParseException if the content cannot be parsed
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String url = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content rawContent = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parse = parseUtil.parseByExtensionId("parse-tika", rawContent).get(rawContent.getUrl());
    return parse.getText();
}
Use of org.apache.nutch.protocol.Content in the Apache Nutch project.
From class SmallStack, method main.
/**
 * Parses an SWF file from disk and prints the extracted text and parse data.
 *
 * @param args arguments are: 0. Name of input SWF file.
 * @throws IOException if there is a fatal error processing the input
 *           file
 */
public static void main(String[] args) throws IOException {
    // Read the whole file. The original sized the buffer with
    // in.available() (not guaranteed to be the file length), ignored the
    // return value of read() (possible partial read), and leaked the
    // stream when read() threw.
    byte[] buf;
    try (FileInputStream in = new FileInputStream(args[0])) {
        buf = in.readAllBytes();
    }
    String url = "file:" + args[0];
    SWFParser parser = new SWFParser();
    ParseResult parseResult = parser.getParse(new Content(url, url, buf, "application/x-shockwave-flash", new Metadata(), NutchConfiguration.create()));
    Parse p = parseResult.get(url);
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
}
Use of org.apache.nutch.protocol.Content in the Apache Nutch project.
From class ZipTextExtractor, method extractText.
/**
 * Walks every entry of a ZIP archive, parses each file entry with the
 * configured parser, collects its outlinks into {@code outLinksList}, and
 * returns the concatenated "entryName text" pairs of all parsed entries.
 *
 * @param input raw ZIP bytes (the caller owns and closes this stream)
 * @param url URL of the archive; entry URLs are derived as url + "/" + name
 * @param outLinksList receives the outlinks discovered in parsed entries
 * @return concatenated entry names and extracted texts
 * @throws IOException if reading the ZIP stream fails
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    // StringBuilder instead of repeated String concatenation in the loop.
    StringBuilder resultText = new StringBuilder();
    ZipInputStream zin = new ZipInputStream(input);
    // One Tika instance serves every entry in the archive.
    Tika tika = new Tika();
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (!entry.isDirectory()) {
            // BUG FIX: the original pre-sized a buffer from entry.getSize(),
            // which is -1 when the size is not recorded in the archive and
            // threw NegativeArraySizeException; it also read one byte at a
            // time. readAllBytes() consumes exactly the current entry.
            byte[] b = zin.readAllBytes();
            String newurl = url + "/";
            String fname = entry.getName();
            newurl += fname;
            URL aURL = new URL(newurl);
            String base = aURL.toString();
            int i = fname.lastIndexOf('.');
            if (i != -1) {
                // Trying to resolve the Mime-Type
                String contentType = tika.detect(fname);
                try {
                    Metadata metadata = new Metadata();
                    // Bytes actually read; identical to entry.getSize() when
                    // the size is recorded, and correct when it is not.
                    metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
                    metadata.set(Response.CONTENT_TYPE, contentType);
                    Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
                    Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
                    ParseData theParseData = parse.getData();
                    Outlink[] theOutlinks = theParseData.getOutlinks();
                    for (Outlink outlink : theOutlinks) {
                        outLinksList.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
                    }
                    resultText.append(entry.getName()).append(' ').append(parse.getText()).append(' ');
                } catch (ParseException e) {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
                    }
                }
            }
        }
    }
    return resultText.toString();
}
Aggregations